Source: [Harry Potter Dataset](https://www.kaggle.com/datasets/gulsahdemiryurek/harry-potter-dataset)

### Import libraries

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### Load data

In [3]:
data = pd.read_csv('Characters.csv', sep=";")

In [4]:
data.shape

(140, 15)

In [5]:
data.head(3)

Unnamed: 0,Id,Name,Gender,Job,House,Wand,Patronus,Species,Blood status,Hair colour,Eye colour,Loyalty,Skills,Birth,Death
0,1,Harry James Potter,Male,Student,Gryffindor,"11"" Holly phoenix feather",Stag,Human,Half-blood,Black,Bright green,Albus Dumbledore | Dumbledore's Army | Order of the Phoenix | Hogwarts School of Witchcraft and Wizardry,Parseltongue| Defence Against the Dark Arts | Seeker,31 July 1980,
1,2,Ronald Bilius Weasley,Male,Student,Gryffindor,"12"" Ash unicorn tail hair",Jack Russell terrier,Human,Pure-blood,Red,Blue,Dumbledore's Army | Order of the Phoenix | Hogwarts School of Witchcraft and Wizardry,Wizard chess | Quidditch goalkeeping,1 March 1980,
2,3,Hermione Jean Granger,Female,Student,Gryffindor,"10¾"" vine wood dragon heartstring",Otter,Human,Muggle-born,Brown,Brown,Dumbledore's Army | Order of the Phoenix | Hogwarts School of Witchcraft and Wizardry,Almost everything,"19 September, 1979",


# Clean data

From existing columns, I extract clean data:
- split name into first name and last name
- convert gender to boolean values (0 or 1)
- select people who are 'student' and 'teacher' (Headmaster or Professor), 
- split full information about wand into length, wood, core source and core element
- extract only animal species from patronus
- select people who are considered as: human, giant, werewolf, goblin, ghost
- select people with specific blood status: pure, half and muggle-born
- extract only colors (without tone) from hair and eye colours
- select people who are loyal to specific groups: Dumbledore's Army, Order of the Phoenix and Lord Voldemort/Death eaters
- select people with special skills: is highly skilled, is a student, is a prefect, have something in common with quidditch, have something in common with unforgivable curses
- convert Hogwart's houses names to numeric values

In [6]:
new_cols = [col.strip().replace(' ','_').lower() for col in data.columns]
data.rename(columns={key:val for (key,val) in zip(data.columns,new_cols)},inplace=True)

data = data[data.columns[1:]]

In [7]:
print('Data columns types:')
display(data.dtypes)

Data columns types:


name            object
gender          object
job             object
house           object
wand            object
patronus        object
species         object
blood_status    object
hair_colour     object
eye_colour      object
loyalty         object
skills          object
birth           object
death           object
dtype: object

In [8]:
print('Object variables:')
display(data.describe(include = [object]))

Object variables:


Unnamed: 0,name,gender,job,house,wand,patronus,species,blood_status,hair_colour,eye_colour,loyalty,skills,birth,death
count,140,139,121,101,132,130,140,123,123,86,89,113,127,42
unique,140,2,65,6,29,20,10,15,36,25,19,94,112,25
top,Harry James Potter,Male,Student,Gryffindor,Unknown,Unknown,Human,Pure-blood or half-blood,Black,Brown,Order of the Phoenix,Chaser,Pre 976,"2 May, 1998"
freq,1,90,52,38,104,75,105,38,25,16,16,7,4,9


In [9]:
houses = ['Gryffindor', 'Ravenclaw', 'Slytherin', 'Hufflepuff']
data_houses = data[data.house.isin(houses)]

In [10]:
# create a list of columns names which will be use to do the following search
new_columns = []

In [11]:
# split full names into new columns: 'first_name' and 'last_name'
i = data.columns.get_loc('name')
data_houses.insert(i+1, 'first_name', data_houses['name'].str.split().str[0], True)
data_houses.insert(i+2, 'last_name', data_houses['name'].str.split().str[-1], True)
new_columns.append('first_name')
new_columns.append('last_name')

# map gender column:
#   - 0: Male
#   - 1: Female
mapper = {'Male': 0, 'Female': 1}
i = data_houses.columns.get_loc('gender')
data_houses.insert(i+1, 'sex', data_houses['gender'].map(mapper), True)
new_columns.append('sex')

In [12]:
# split 'job' column into two new columns: 'student' and 'teacher' (Headmaster or Professor), 

student = pd.Series(('student' in str(job).lower()) for job in data_houses.job)
teacher = pd.Series((('headmaster' in str(job).lower())|('professor' in str(job).lower())) for job in data_houses.job)

i = data_houses.columns.get_loc('job')
data_houses.insert(i+1, 'student', student, True)
data_houses.insert(i+2, 'teacher', teacher, True)
new_columns.append('student')
new_columns.append('teacher')

In [13]:
# split wand characteristic into new columns: 'length', 'wood' and 'core'
woods = ['acacia','alder','apple','ash','aspen','beech','birch','black','blackthorn','cedar','cherry','chestnut','cypress',
         'dogwood','ebony','elder','elm','english','fir','hawthorn','hazel','holly','hornbeam','larch','laurel','lime',
         'mahogany','maple','oak','olive','pear','pine','poplar','red','redwood','rosewood','rowan','silver','snakewood',
         'spruce','sycamore','vine','walnut','willow','yew']
cores_source = ['basilisk','cat','coral','dittany','dragon','jackalope','kelpie','kneazle','monster','phoenix','river',
                'rougarou','serpent','snallygaster','thestral','thunderbird','troll','unicorn','veela','wampus']
cores_part = ['antler','bone','feather','hair','heartstring','horn','mane','shell','spine','stalk','whisker','white']

wands = data_houses['wand']
wand_length = []
wand_core_src = []
wand_core_part = []
wand_wood = []
for wand in wands:
    if not wand == 'Unknown':
        length = [s for s in wand[:2] if s.isdigit()]
        if len(length)==0:
            wand_length.append(np.nan)
        elif len(length)==1:
            wand_length.append(int(length[0]))
        else:
            wand_length.append(int(length[0]+length[1]))
            
        words = [word.lower().strip(',') for word in wand.split()]
        
        wood = list(set(woods)&set(words))
        if len(wood)<1:
            wand_wood.append(np.nan)
        else:
            wand_wood.append(wood[0])
            
        core_s = list(set(cores_source)&set(words))
        if len(core_s)<1:
            wand_core_src.append(np.nan)
        else:
            wand_core_src.append(core_s[0])
        
        core_p = list(set(cores_part)&set(words))
        if len(core_p)<1:
            wand_core_part.append(np.nan)
        else:
            wand_core_part.append(core_p[0])
    else:
        wand_length.append(np.nan)
        wand_wood.append(np.nan)
        wand_core_src.append(np.nan)
        wand_core_part.append(np.nan)

wand_length = pd.Series(wand_length)
wand_wood = pd.Series(wand_wood)
wand_core_src = pd.Series(wand_core_src)
wand_core_part = pd.Series(wand_core_part)

In [14]:
# convert column with strings values into numeric values
mapper = {}
def str_to_int(col, name):
    unique_val = col.unique().tolist()
    dictonary = {key:val for key,val in zip(unique_val,range(len(unique_val)))}
    dictonary[np.nan] = np.nan
    print(name + ' mapper:')
    print(dictonary)
    mapper[name]=dictonary
    return pd.Series(col).map(dictonary, na_action='ignore')

In [15]:
# convert wand_wood, wand_core_src and wand_core_part strings into numeric values
wand_wood = str_to_int(wand_wood, 'wand_wood')
wand_core_src = str_to_int(wand_core_src, 'wand_core_src')
wand_core_part = str_to_int(wand_core_part, 'wand_core_part')

i = data_houses.columns.get_loc('wand')
data_houses.insert(i+1, 'wand_length', wand_length, True)
data_houses.insert(i+2, 'wand_wood', wand_wood, True)
data_houses.insert(i+3, 'wand_core_src', wand_core_src, True)
data_houses.insert(i+4, 'wand_core_part', wand_core_part, True)
new_columns.append('wand_length')
new_columns.append('wand_wood')
new_columns.append('wand_core_src')
new_columns.append('wand_core_part')

wand_wood mapper:
{'holly': 0, 'ash': 1, 'vine': 2, 'elder': 3, 'oak': 4, 'cherry': 5, nan: nan, 'willow': 7, 'mahogany': 8, 'cypress': 9, 'chestnut': 10, 'fir': 11, 'alder': 12, 'hazel': 13, 'hornbeam': 14, 'hawthorn': 15, 'walnut': 16, 'birch': 17, 'cedar': 18, 'elm': 19, 'yew': 20, 'snakewood': 21}
wand_core_src mapper:
{'phoenix': 0, 'unicorn': 1, 'dragon': 2, 'thestral': 3, nan: nan, 'basilisk': 5}
wand_core_part mapper:
{'feather': 0, 'hair': 1, 'heartstring': 2, nan: nan, 'horn': 4}


In [16]:
# extract only species from patronus column and convert strings into numeric values
patronus = data_houses['patronus'].str.lower().str.split().str[-1]
patronus = str_to_int(patronus, 'patronus')
i = data_houses.columns.get_loc('patronus')
data_houses.insert(i+1, 'patronus_n', patronus, True)
new_columns.append('patronus_n')

patronus mapper:
{'stag': 0, 'terrier': 1, 'otter': 2, 'phoenix': 3, 'none': 4, 'non-corporeal': 5, 'unknown': 6, 'horse': 7, 'fox': 8, 'doe': 9, 'wolf': 10, 'cat': 11, 'weasel': 12, 'swan': 13, 'hare': 14, 'squirrel': 15, nan: nan, 'boar': 17}


In [17]:
# select people who are considered as: 'human', 'giant', 'werewolf', 'goblin', 'ghost' (True or False)
human = pd.Series(('human' in species.lower()) for species in data_houses.species)
giant = pd.Series(('giant' in species.lower()) for species in data_houses.species)
werewolf = pd.Series(('werewolf' in species.lower()) for species in data_houses.species)
goblin = pd.Series(('goblin' in species.lower()) for species in data_houses.species)
ghost = pd.Series(('ghost' in species.lower()) for species in data_houses.species)

i = data_houses.columns.get_loc('species')
data_houses.insert(i+1, 'human', human, True)
data_houses.insert(i+2, 'giant', giant, True)
data_houses.insert(i+3, 'werewolf', werewolf, True)
data_houses.insert(i+4, 'goblin', goblin, True)
data_houses.insert(i+5, 'ghost', ghost, True)
new_columns.append('human')
new_columns.append('giant')
new_columns.append('werewolf')
new_columns.append('goblin')
new_columns.append('ghost')

In [18]:
# select people with specific blood status into new columns: 'pure_blood', 'half_blood' and 'muggle_born' 
# (True or False, if the person is a member of this blood group, or is considered as one)
pure_blood = pd.Series(('pure' in str(blood).lower()) for blood in data_houses.blood_status)
half_blood = pd.Series(('half' in str(blood).lower()) for blood in data_houses.blood_status)
muggle_born = pd.Series(('muggle' in str(blood).lower()) for blood in data_houses.blood_status)  

i = data_houses.columns.get_loc('blood_status')
data_houses.insert(i+1, 'pure_blood', pure_blood, True)
data_houses.insert(i+2, 'half_blood', half_blood, True)
data_houses.insert(i+3, 'muggle_born', muggle_born, True)

new_columns.append('pure_blood')
new_columns.append('half_blood')
new_columns.append('muggle_born')

In [19]:
# extract only colours (without tone) from hair colours
hair_black = pd.Series((('black' in str(hair).lower())|('dark' in str(hair).lower())) for hair in data_houses.hair_colour)
hair_red = pd.Series(('red' in str(hair).lower()) for hair in data_houses.hair_colour)
hair_brown = pd.Series((('brown' in str(hair).lower())|('auburn' in str(hair).lower())) for hair in data_houses.hair_colour)
hair_blond = pd.Series((('blond' in str(hair).lower())|('sandy' in str(hair).lower())) for hair in data_houses.hair_colour)
hair_grey = pd.Series((('grey' in str(hair).lower())|('silver' in str(hair).lower())|('mousy' in str(hair).lower())|
         ('white' in str(hair).lower())) for hair in data_houses.hair_colour)
hair_bald = pd.Series(('bald' in str(hair).lower()) for hair in data_houses.hair_colour)

i = data_houses.columns.get_loc('hair_colour')
data_houses.insert(i+1, 'hair_black', hair_black, True)
data_houses.insert(i+2, 'hair_red', hair_red, True)
data_houses.insert(i+3, 'hair_brown', hair_brown, True)
data_houses.insert(i+4, 'hair_blond', hair_blond, True)
data_houses.insert(i+5, 'hair_grey', hair_grey, True)
data_houses.insert(i+6, 'hair_bald', hair_bald, True)

new_columns.append('hair_black')
new_columns.append('hair_red')
new_columns.append('hair_brown')
new_columns.append('hair_blond')
new_columns.append('hair_grey')
new_columns.append('hair_bald')

In [20]:
# extract only colours (without tone) from eye colours
eye_green = pd.Series((('green' in str(eye).lower())|('gooseberry' in str(eye).lower())) for eye in data_houses.eye_colour)
eye_blue = pd.Series(('blue' in str(eye).lower()) for eye in data_houses.eye_colour)
eye_brown = pd.Series(('brown' in str(eye).lower()) for eye in data_houses.eye_colour)
eye_hazel = pd.Series(('hazel' in str(eye).lower()) for eye in data_houses.eye_colour)
eye_black = pd.Series((('black' in str(eye).lower())|('dark' in str(eye).lower())) for eye in data_houses.eye_colour)
eye_grey = pd.Series((('grey' in str(eye).lower())|('silver' in str(eye).lower())) for eye in data_houses.eye_colour)
eye_scarlet = pd.Series(('scarlet' in str(eye).lower()) for eye in data_houses.eye_colour)

i = data_houses.columns.get_loc('eye_colour')
data_houses.insert(i+1, 'eye_green', eye_green, True)
data_houses.insert(i+2, 'eye_blue', eye_blue, True)
data_houses.insert(i+3, 'eye_brown', eye_brown, True)
data_houses.insert(i+4, 'eye_hazel', eye_hazel, True)
data_houses.insert(i+5, 'eye_black', eye_black, True)
data_houses.insert(i+6, 'eye_grey', eye_grey, True)
data_houses.insert(i+7, 'eye_scarlet', eye_scarlet, True)

new_columns.append('eye_green')
new_columns.append('eye_blue')
new_columns.append('eye_brown')
new_columns.append('eye_hazel')
new_columns.append('eye_black')
new_columns.append('eye_grey')
new_columns.append('eye_scarlet')

In [21]:
# point out info about loyalty into new columns: 'order_of_the_phoenix', 'dumbledores_army' and 'death_eater' 
# (True or False, if the person is loyal to the specific group)

order_of_the_phoenix = pd.Series(('Order of the Phoenix'.lower() in str(loyal).lower()) for loyal in data_houses.loyalty)
dumbledores_army = pd.Series(("Dumbledore's Army".lower() in str(loyal).lower()) for loyal in data_houses.loyalty)
death_eater = pd.Series((('Death Eater'.lower() in str(loyal).lower())|
                         ('Voldemort'.lower() in str(loyal).lower())) for loyal in data_houses.loyalty)

i = data_houses.columns.get_loc('loyalty')
data_houses.insert(i+1, 'order_of_the_phoenix', order_of_the_phoenix, True)
data_houses.insert(i+2, 'dumbledores_army', dumbledores_army, True)
data_houses.insert(i+3, 'death_eater', death_eater, True)

new_columns.append('order_of_the_phoenix')
new_columns.append('dumbledores_army')
new_columns.append('death_eater')

In [22]:
# point out info about characteristic skills into new columns: 
#   - 'skilled' (if this person is highly skilled or not - True/False)
#   - 'prefect' (if this person is/was a prefect or not - True/False)
#   - 'auror' (if this person is/was a auror or not - True/False)
#   - 'quidditch' (if this person is correlated with quidditch or not - True/False)
#   - 'curse' (if this person s correlated with unforgivable curses or not - True/False)

skilled = pd.Series(('skill' in str(skills).lower()) for skills in data_houses.skills)
prefect = pd.Series(('prefect' in str(skills).lower()) for skills in data_houses.skills)
auror = pd.Series(('auror' in str(skills).lower()) for skills in data_houses.skills)

quidditch_words = ['quidditch', 'chaser', 'beater', 'keeper', 'seeker', 'quaffle', 'bludger', 'snitch', 'captain']
quidditch = pd.Series(len(list(set(quidditch_words)&set(str(skills).lower().split())))>0 for skills in data_houses.skills)

unforgivable_curse = ['avada','kedavra','cruciatus','imperius']
curse = pd.Series(len(list(set(unforgivable_curse)&set(str(skills).lower().split())))>0 for skills in data_houses.skills)

i = data_houses.columns.get_loc('skills')
data_houses.insert(i+1, 'skilled', skilled, True)
data_houses.insert(i+2, 'prefect', prefect, True)
data_houses.insert(i+3, 'auror', auror, True)
data_houses.insert(i+4, 'quidditch', quidditch, True)
data_houses.insert(i+5, 'curse', curse, True)

new_columns.append('skilled')
new_columns.append('prefect')
new_columns.append('auror')
new_columns.append('quidditch')
new_columns.append('curse')

In [23]:
# convert Hogwart Houses names to numeric values
house = str_to_int(data_houses.house, 'house')
i = len(data_houses.columns)
data_houses.insert(i, 'Y_house', house, True)
new_columns.append('Y_house')

house mapper:
{'Gryffindor': 0, 'Ravenclaw': 1, 'Slytherin': 2, 'Hufflepuff': 3, nan: nan}


In [24]:
clean_data = data_houses[new_columns]*1
clean_data

Unnamed: 0,first_name,last_name,sex,student,teacher,wand_length,wand_wood,wand_core_src,wand_core_part,patronus_n,human,giant,werewolf,goblin,ghost,pure_blood,half_blood,muggle_born,hair_black,hair_red,hair_brown,hair_blond,hair_grey,hair_bald,eye_green,eye_blue,eye_brown,eye_hazel,eye_black,eye_grey,eye_scarlet,order_of_the_phoenix,dumbledores_army,death_eater,skilled,prefect,auror,quidditch,curse,Y_house
0,Harry,Potter,0.0,1.0,0.0,11.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,Ronald,Weasley,0.0,1.0,0.0,12.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,Hermione,Granger,1.0,1.0,0.0,10.0,2.0,2.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Albus,Dumbledore,0.0,0.0,1.0,15.0,3.0,3.0,1.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Rubeus,Hagrid,0.0,0.0,1.0,16.0,4.0,,,4.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Neville,Longbottom,0.0,1.0,0.0,13.0,5.0,1.0,1.0,5.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Fred,Weasley,0.0,1.0,0.0,,,,,6.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7,George,Weasley,0.0,1.0,0.0,,,,,6.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8,Ginevra,Weasley,1.0,1.0,0.0,,,,,7.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Dean,Thomas,0.0,1.0,0.0,,,,,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### Export clean data to csv file

In [25]:
clean_data.to_csv('hogwart_houses.csv') 

In [26]:
mapper

{'wand_wood': {'holly': 0,
  'ash': 1,
  'vine': 2,
  'elder': 3,
  'oak': 4,
  'cherry': 5,
  nan: nan,
  'willow': 7,
  'mahogany': 8,
  'cypress': 9,
  'chestnut': 10,
  'fir': 11,
  'alder': 12,
  'hazel': 13,
  'hornbeam': 14,
  'hawthorn': 15,
  'walnut': 16,
  'birch': 17,
  'cedar': 18,
  'elm': 19,
  'yew': 20,
  'snakewood': 21},
 'wand_core_src': {'phoenix': 0,
  'unicorn': 1,
  'dragon': 2,
  'thestral': 3,
  nan: nan,
  'basilisk': 5},
 'wand_core_part': {'feather': 0,
  'hair': 1,
  'heartstring': 2,
  nan: nan,
  'horn': 4},
 'patronus': {'stag': 0,
  'terrier': 1,
  'otter': 2,
  'phoenix': 3,
  'none': 4,
  'non-corporeal': 5,
  'unknown': 6,
  'horse': 7,
  'fox': 8,
  'doe': 9,
  'wolf': 10,
  'cat': 11,
  'weasel': 12,
  'swan': 13,
  'hare': 14,
  'squirrel': 15,
  nan: nan,
  'boar': 17},
 'house': {'Gryffindor': 0,
  'Ravenclaw': 1,
  'Slytherin': 2,
  'Hufflepuff': 3,
  nan: nan}}