#### importing modules & reproducibility

In [1]:
# import modules
import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt

# Seed NumPy's global (legacy) random generator so np.random draws are repeatable
np.random.seed(seed=387)

# ***Preprocessing***

In [2]:
# Load the characters table and the three film scripts; every file is a
# ';'-delimited CSV and is read in the order listed below.
characters, hp1, hp2, hp3 = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (
        'datasets/Characters.csv',
        'datasets/Harry Potter 1.csv',
        'datasets/Harry Potter 2.csv',
        'datasets/Harry Potter 3.csv',
    )
)

## ***profiling structure***
### fixing books data

In [3]:
trilogy = [hp1, hp2, hp3]

# Normalise every script to lower case. The headers must end up with the same
# names in all three frames, otherwise concatenating them would create extra
# columns because of the name differences.
for book in trilogy:
    # Lower-case this book's own header (not hp1's), so a file with a different
    # column order or column count is never silently relabelled.
    book.columns = book.columns.str.lower()
    book['character'] = book['character'].str.lower()  # character column
    book['sentence'] = book['sentence'].str.lower()    # sentence column

In [4]:
# Stack the three scripts row-wise and renumber the rows with a fresh index
combined = pd.concat(objs=[hp1, hp2, hp3], axis=0, ignore_index=True)
combined.head(n=5)

Unnamed: 0,character,sentence
0,dumbledore,"i should've known that you would be here, prof..."
1,mcgonagall,"good evening, professor dumbledore."
2,mcgonagall,"are the rumors true, albus?"
3,dumbledore,"i'm afraid so, professor."
4,dumbledore,the good and the bad.


### fixing characters data

In [5]:
# Lower-case the header of the characters table; differently-cased names would
# otherwise be treated as different columns further on.
characters.columns = characters.columns.map(str.lower)

In [6]:
# Convert the two date columns to datetime; with errors='coerce' any value
# that cannot be parsed becomes NaT instead of raising.
for date_col in ('birth', 'death'):
    characters[date_col] = pd.to_datetime(characters[date_col], errors='coerce')

In [7]:
# Split the table by dtype:
#   num     -> every column that is not of object dtype
#   non_num -> every column that is not int64 / float64 / datetime / timedelta
num = characters.select_dtypes(exclude=['object'])
non_num = characters.select_dtypes(
    exclude=['int64', 'float64', 'datetime', 'timedelta'],
)

non_num.columns

Index(['name', 'gender', 'job', 'house', 'wand', 'patronus', 'species',
       'blood status', 'hair colour', 'eye colour', 'loyalty', 'skills'],
      dtype='object')

In [21]:
# Lower-case every non-numeric column in one pass ...
non_num = non_num.apply(lambda text_col: text_col.str.lower())

# ... and write the lower-cased columns back into the original table
characters[non_num.columns] = non_num
characters.head(2)  # inspect

Unnamed: 0,id,name,gender,job,house,wand,patronus,species,blood status,hair colour,eye colour,loyalty,skills,birth,death
0,1,harry james potter,male,student,gryffindor,"11"" holly phoenix feather",stag,human,half-blood,black,bright green,albus dumbledore | dumbledore's army | order o...,parseltongue| defence against the dark arts | ...,1980-07-31,NaT
1,2,ronald bilius weasley,male,student,gryffindor,"12"" ash unicorn tail hair",jack russell terrier,human,pure-blood,red,blue,dumbledore's army | order of the phoenix | hog...,wizard chess | quidditch goalkeeping,1980-03-01,NaT


#### inspecting tables

In [10]:
# Preview the first three rows of the characters table. A bare expression is
# used instead of print() so the notebook renders the frame as a table, the
# same way the other inspection cells do.
characters.head(3)

   id                   name  gender      job       house  \
0   1     Harry James Potter    Male  Student  Gryffindor   
1   2  Ronald Bilius Weasley    Male  Student  Gryffindor   
2   3  Hermione Jean Granger  Female  Student  Gryffindor   

                                 wand              patronus species  \
0         11"  Holly  phoenix feather                  Stag   Human   
1          12" Ash unicorn tail hair   Jack Russell terrier   Human   
2  10¾"  vine wood dragon heartstring                 Otter   Human   

  blood status hair colour    eye colour  \
0   Half-blood       Black  Bright green   
1   Pure-blood         Red          Blue   
2  Muggle-born       Brown         Brown   

                                             loyalty  \
0  Albus Dumbledore | Dumbledore's Army | Order o...   
1  Dumbledore's Army | Order of the Phoenix | Hog...   
2  Dumbledore's Army | Order of the Phoenix | Hog...   

                                              skills      birth deat

# Plots
### Characters with the most sentences
#### top 15 of the first 3 movies 