#### importing modules & reproducibility

In [1]:
# import modules
import pandas as pd
import seaborn as sns
import numpy as np

# Fixing random state for reproducibility: seeds NumPy's legacy global RNG so
# that any later np.random.* call yields the same sequence on every run.
np.random.seed(387)

# ***Preprocessing***

In [2]:
# Load the character sheet and the three scripts; every file is ';'-delimited.
characters, hp1, hp2, hp3 = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (
        'datasets/Characters.csv',
        'datasets/Harry Potter 1.csv',
        'datasets/Harry Potter 2.csv',
        'datasets/Harry Potter 3.csv',
    )
)

# Quick peek: first two script lines and the first character record.
print(f"books database ----> \n {hp1.head(2)!s} \n \n")
print(f"characters database ----> \n {characters.head(1)!s}")

books database ----> 
     Character                                           Sentence
0  Dumbledore  I should've known that you would be here, Prof...
1  McGonagall                Good evening, Professor Dumbledore. 
 

characters database ----> 
    Id                Name Gender      Job       House  \
0   1  Harry James Potter   Male  Student  Gryffindor   

                          Wand Patronus Species Blood status Hair colour  \
0  11"  Holly  phoenix feather     Stag   Human   Half-blood       Black   

     Eye colour                                            Loyalty  \
0  Bright green  Albus Dumbledore | Dumbledore's Army | Order o...   

                                              Skills         Birth Death  
0  Parseltongue| Defence Against the Dark Arts | ...  31 July 1980   NaN  


## ***profiling structure***
### fixing books data

In [3]:
trilogy = [hp1, hp2, hp3]

# Normalise every script to lower case. The column names must match exactly
# across the three frames, otherwise concatenating them later would create
# extra columns instead of stacking rows.
for book in trilogy:
    # Normalise this frame's OWN header. Copying hp1's header onto every book
    # relabels columns by position and would silently mislabel a file whose
    # columns come in a different order.
    book.columns = book.columns.str.strip().str.lower()
    book['character'] = book['character'].str.lower()  # character column
    book['sentence'] = book['sentence'].str.lower()    # sentence column

print(trilogy[0:1])

[       character                                           sentence
0     dumbledore  i should've known that you would be here, prof...
1     mcgonagall                good evening, professor dumbledore.
2     mcgonagall                        are the rumors true, albus?
3     dumbledore                          i'm afraid so, professor.
4     dumbledore                              the good and the bad.
...          ...                                                ...
1582      hagrid  oh, listen, harry, if that dolt of a cousin of...
1583       harry  but hagrid, we're not allowed to do magic away...
1584      hagrid  i do. but your cousin don't, do he? eh? off yo...
1585    hermione        feels strange to be going home, doesn't it?
1586       harry                    i'm not going home. not really.

[1587 rows x 2 columns]]


In [4]:
# Stack the three scripts row-wise (axis 0) and renumber the rows 0..n-1 so
# the per-file indices do not repeat in the combined frame.
combined = pd.concat(
    objs=[hp1, hp2, hp3],
    axis=0,
    ignore_index=True,
)
combined

Unnamed: 0,character,sentence
0,dumbledore,"i should've known that you would be here, prof..."
1,mcgonagall,"good evening, professor dumbledore."
2,mcgonagall,"are the rumors true, albus?"
3,dumbledore,"i'm afraid so, professor."
4,dumbledore,the good and the bad.
...,...,...
4920,hermione,"how fast is it, harry?"
4921,harry,lumos.
4922,harry,i solemnly swear that i am up to no good.
4923,harry,mischief managed.


### fixing characters data

In [11]:
# Summarise the characters table: index range, per-column non-null counts,
# dtypes and memory usage (DataFrame.info() prints this to stdout).
# NOTE(review): the recorded output lists lower-case column names although the
# raw CSV header is capitalised, so this cell appears to have been executed
# after the column-lowercasing cell that follows it — confirm the cell order.
characters.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140 entries, 0 to 139
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            140 non-null    int64 
 1   name          140 non-null    object
 2   gender        139 non-null    object
 3   job           121 non-null    object
 4   house         101 non-null    object
 5   wand          132 non-null    object
 6   patronus      130 non-null    object
 7   species       140 non-null    object
 8   blood status  123 non-null    object
 9   hair colour   123 non-null    object
 10  eye colour    86 non-null     object
 11  loyalty       89 non-null     object
 12  skills        113 non-null    object
 13  birth         127 non-null    object
 14  death         42 non-null     object
dtypes: int64(1), object(14)
memory usage: 16.5+ KB


In [6]:
# Lower-case the column names of the characters table (not the books tables)
# so they follow the same lower-case convention used for the scripts and can
# be referenced consistently, e.g. characters['gender'].
# The operation is idempotent, so re-running this cell is harmless.
characters.columns = characters.columns.str.lower()

In [14]:
# Divide numerical and non-numerical columns.
# Both halves use the same 'number' selector so they are exact complements:
# every column lands in exactly one of the two frames. Selecting by
# "not object" would wrongly count category/bool/datetime columns as
# numerical, and listing only int64/float64 would miss other numeric widths.
numerical     = characters.select_dtypes(include='number')
non_numerical = characters.select_dtypes(exclude='number')

In [16]:
# Changing column dtypes: store gender as a categorical.
# astype() returns a new Series and leaves the frame untouched, so the result
# has to be assigned back for the dtype change to take effect.
characters['gender'] = characters['gender'].astype('category')

# Display the converted column to confirm the new dtype and its categories.
characters['gender']

0        Male
1        Male
2      Female
3        Male
4        Male
        ...  
135    Female
136      Male
137      Male
138      Male
139      Male
Name: gender, Length: 140, dtype: category
Categories (2, object): ['Female', 'Male']

#### inspecting tables

In [17]:
# Show the header of the characters table, a blank line, then the header of
# the first script table.
print(f"{characters.columns!s} \n\n{hp1.columns!s}")

Index(['id', 'name', 'gender', 'job', 'house', 'wand', 'patronus', 'species',
       'blood status', 'hair colour', 'eye colour', 'loyalty', 'skills',
       'birth', 'death'],
      dtype='object') 

Index(['character', 'sentence'], dtype='object')


In [9]:
# (rows, columns) of each of the three script tables, one per output line.
print(f"hp1 shape: {hp1.shape} \n hp2 shape: {hp2.shape} \n hp3 shape: {hp3.shape}")

hp1 shape: (1587, 2) 
 hp2 shape: (1700, 2) 
 hp3 shape: (1638, 2)


# Plots
### Characters with the most sentences
#### top 15 of the first 3 movies 