### Import necessary libraries

In [203]:
import copy
import re
from operator import itemgetter

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

### Load the required data from CSV files into DataFrames

In [204]:
# Load the movie table.
# The path uses forward slashes: the original backslash literal contained the
# invalid escape sequences "\H" and "\M" (a SyntaxWarning on recent Python
# versions) and could only be resolved on Windows.
movie_df = pd.read_csv('../Harry Potter-20220610T024151Z-001/Movies.csv', sep=',', quotechar='"')
movie_df.head(2)

Unnamed: 0,Movie ID,Movie Title,Release Year,Runtime,Budget,Box Office
0,1,Harry Potter and the Philosopher's Stone,2001,152,"$125,000,000","$1,002,000,000"
1,2,Harry Potter and the Chamber of Secrets,2002,161,"$100,000,000","$880,300,000"


In [205]:
# Load the chapter table (the file is decoded as Latin-1).
# The path uses forward slashes: the original backslash literal contained the
# invalid escape sequences "\H" and "\C" (a SyntaxWarning on recent Python
# versions) and could only be resolved on Windows.
chapter_df = pd.read_csv('../Harry Potter-20220610T024151Z-001/Chapters.csv', sep=',', quotechar='"', encoding='Latin-1')
chapter_df.head(2)

Unnamed: 0,Chapter ID,Chapter Name,Movie ID,Movie Chapter
0,1,Doorstep Delivery,1,1
1,2,The Vanishing Glass,1,2


In [206]:
# Load the dialogue table (the file is decoded as Latin-1).
# The path uses forward slashes: the original backslash literal contained the
# invalid escape sequences "\H" and "\D" (a SyntaxWarning on recent Python
# versions) and could only be resolved on Windows.
dialogue_df = pd.read_csv('../Harry Potter-20220610T024151Z-001/Dialogue.csv', sep=',', quotechar='"', encoding='Latin-1')
dialogue_df.head(2)

Unnamed: 0,Dialogue ID,Chapter ID,Place ID,Character ID,Dialogue
0,1,1,8,4,I should have known that you would be here...P...
1,2,1,8,7,"Good evening, Professor Dumbledore. Are the ru..."


In [207]:
# Load the place table (the file is decoded as Latin-1).
# The path uses forward slashes: the original backslash literal contained the
# invalid escape sequences "\H" and "\P" (a SyntaxWarning on recent Python
# versions) and could only be resolved on Windows.
place_df = pd.read_csv('../Harry Potter-20220610T024151Z-001/Places.csv', sep=',', quotechar='"', encoding='Latin-1')
place_df.head(2)

Unnamed: 0,Place ID,Place Name,Place Category
0,1,Flourish & Blotts,Diagon Alley
1,2,Gringotts Wizarding Bank,Diagon Alley


In [208]:
# Load the character table (the file is decoded as Latin-1).
# The path uses forward slashes: the original backslash literal contained the
# invalid escape sequences "\H" and "\C" (a SyntaxWarning on recent Python
# versions) and could only be resolved on Windows.
character_df = pd.read_csv('../Harry Potter-20220610T024151Z-001/Characters.csv', sep=',', quotechar='"', encoding='Latin-1')
character_df.head(2)

Unnamed: 0,Character ID,Character Name,Species,Gender,House,Patronus,Wand (Wood),Wand (Core)
0,1,Harry Potter,Human,Male,Gryffindor,Stag,Holly,Phoenix Feather
1,2,Ron Weasley,Human,Male,Gryffindor,Jack Russell Terrier,,


In [209]:
# Load the spell table and normalise its whitespace in one chain.
# The path uses forward slashes: the original backslash literal contained the
# invalid escape sequences "\H" and "\S" (a SyntaxWarning on recent Python
# versions) and could only be resolved on Windows.
spell_df = (
    pd.read_csv('../Harry Potter-20220610T024151Z-001/Spells.csv', sep=',', quotechar='"')
    # Turn non-breaking spaces into ordinary spaces ...
    .replace('\xa0', ' ', regex=True)
    # ... then collapse every whitespace run into a single space.
    # Raw string: '\s' is a regex class, not a Python string escape.
    .replace(r'\s+', ' ', regex=True)
)
spell_df.head(2)

Unnamed: 0,Spell ID,Incantation,Spell Name,Effect,Light
0,1,Accio,Summoning Charm,Summons an object,
1,2,Aguamenti,Water-Making Spell,Conjures water,Icy blue


### Analyze

#### Generate basic data

In [210]:
# Each incantation must identify exactly one spell before it is searched for
# in the dialogue text.
duplicated_incantations = spell_df.loc[spell_df.duplicated(subset=['Incantation']), 'Incantation']
if not duplicated_incantations.empty:
    # Stop here instead of only printing: otherwise `spell_list` and
    # `dialogue_list` would stay undefined (or stale from an earlier run)
    # for the cells that follow.
    raise ValueError(
        'FOUND DUPLICATED INCANTATION(S): {}'.format(duplicated_incantations.unique().tolist())
    )

# Longest incantations first, so that a longer incantation is tried before any
# shorter one.
spell_list = sorted(
    spell_df.to_dict('records'),
    key=lambda spell: len(spell['Incantation']),
    reverse=True,
)
print(spell_list[:2])

# One plain dict per dialogue row.
dialogue_list = dialogue_df.to_dict('records')
print(dialogue_list[:2])

[{'Spell ID': 39, 'Incantation': 'Peskipiksi Pesternomi', 'Spell Name': 'Peskipiksi Pesternomi', 'Effect': 'Nothing (supposedly captures pixies)', 'Light': nan}, {'Spell ID': 41, 'Incantation': 'Piertotum Locomotor', 'Spell Name': 'Piertotum Locomotor', 'Effect': 'Animates target', 'Light': 'Green'}]
[{'Dialogue ID': 1, 'Chapter ID': 1, 'Place ID': 8, 'Character ID': 4, 'Dialogue': 'I should have known that you would be here...Professor McGonagall.'}, {'Dialogue ID': 2, 'Chapter ID': 1, 'Place ID': 8, 'Character ID': 7, 'Dialogue': 'Good evening, Professor Dumbledore. Are the rumours true, Albus?'}]


In [211]:
def find_spell_casts(dialogue_records, spell_records):
    """Return one merged record per (dialogue, spell) pair found in the text.

    Each dialogue's text is lower-cased and searched for every incantation as
    a whole word (``\\b...\\b``). ``spell_records`` is scanned in the order
    given; after a hit, every occurrence of that incantation is removed from
    the working text before the next spell is tried, so a spell is recorded at
    most once per dialogue.

    Args:
        dialogue_records: list of dicts, each with a ``'Dialogue'`` entry.
        spell_records: list of dicts, each with an ``'Incantation'`` entry.

    Returns:
        A list of new dicts, each holding the keys of the dialogue record
        followed by the keys of the matched spell record. The input records
        are not modified.
    """
    casts = []
    for dialogue_record in dialogue_records:
        dialogue_text = dialogue_record['Dialogue']
        if not isinstance(dialogue_text, str):
            # A missing dialogue is not a string (pandas yields NaN), so there
            # is nothing to search.
            continue
        remaining_text = dialogue_text.lower()
        for spell_record in spell_records:
            needle = spell_record['Incantation'].lower()
            # re.escape keeps any regex metacharacter in an incantation
            # literal instead of letting it alter the pattern.
            if re.search(r'\b{}\b'.format(re.escape(needle)), remaining_text):
                # Build a fresh dict per hit: updating and re-appending the
                # dialogue record itself would make every hit of one dialogue
                # share (and overwrite) the same object.
                casts.append({**dialogue_record, **spell_record})
                # Plain-text removal of the matched incantation.
                remaining_text = remaining_text.replace(needle, '')
    return casts


dialogue_spell_list = find_spell_casts(dialogue_list, spell_list)
print(dialogue_spell_list[:2])

[{'Dialogue ID': 226, 'Chapter ID': 10, 'Place ID': 42, 'Character ID': 3, 'Dialogue': "Are you sure that's a real spell? Well, it's not very good, is it? Of course I've only tried a few simple spells myself, and they've all worked for me. For example...  Oculus Reparo.  That's better, isn't it? Holy Cricket, you're Harry Potter. I'm Hermione Granger...and you are...?", 'Spell ID': 37, 'Incantation': 'Oculus Reparo', 'Spell Name': 'Oculus Reparo', 'Effect': 'Repairs glasses', 'Light': nan}, {'Dialogue ID': 393, 'Chapter ID': 16, 'Place ID': 59, 'Character ID': 3, 'Dialogue': 'Oh, move over!  Alohomora.  Get in.', 'Spell ID': 4, 'Incantation': 'Alohomora', 'Spell Name': 'Unlocking Charm', 'Effect': 'Unlocks target', 'Light': 'Blue'}]


In [212]:
# Turn the matched records into a table, left-join the chapter columns on
# 'Chapter ID', and save the result as a CSV file.
dialogue_spell_df = (
    pd.DataFrame(dialogue_spell_list)
    .merge(chapter_df, on='Chapter ID', how='left')
)
dialogue_spell_df.to_csv('Dialogue spell.csv')
dialogue_spell_df.head(2)

Unnamed: 0,Dialogue ID,Chapter ID,Place ID,Character ID,Dialogue,Spell ID,Incantation,Spell Name,Effect,Light,Chapter Name,Movie ID,Movie Chapter
0,226,10,42,3,"Are you sure that's a real spell? Well, it's n...",37,Oculus Reparo,Oculus Reparo,Repairs glasses,,Ron and Hermione,1,10
1,393,16,59,3,"Oh, move over! Alohomora. Get in.",4,Alohomora,Unlocking Charm,Unlocks target,Blue,Three-Headed Sentinel,1,16


#### Global functions

In [213]:
def combine_2_series(serie_1, serie_2):
    """Build a Series of ``'<text>(<other>)'`` strings from two Series.

    For every index label of ``serie_1``, the stripped value of ``serie_1`` is
    followed by the value ``serie_2`` holds under the same label, wrapped in
    parentheses. The values of ``serie_1`` must provide ``.strip()``.

    Returns a deep copy of ``serie_1`` with the combined strings; ``serie_1``
    itself is left unchanged.
    """
    combined = copy.deepcopy(serie_1)
    for label, text in combined.items():
        combined[label] = f'{text.strip()}({serie_2[label]})'
    return combined

#### Casting-spell count per spell across movies

In [214]:
# Count the matched rows per spell, then left-join the spell columns on
# 'Spell ID'.
groupBy_spell_df = (
    dialogue_spell_df
    .groupby(['Spell ID'])['Spell ID']
    .count()
    .reset_index(name='Casting-spell count')
    .merge(spell_df, on='Spell ID', how='left')
)
groupBy_spell_df.head(2)

Unnamed: 0,Spell ID,Casting-spell count,Incantation,Spell Name,Effect,Light
0,1,6,Accio,Summoning Charm,Summons an object,
1,2,1,Aguamenti,Water-Making Spell,Conjures water,Icy blue


In [215]:
# Chart data: spell names and their 'Casting-spell count' values.
chart_title = 'Casting-spell count per spell across movies'
x_label = 'Spell Name'
x = groupBy_spell_df['Spell Name']
y1_label = 'Casting-spell count'
y1 = groupBy_spell_df['Casting-spell count']

# Single-trace figure (only one axis pair is used).
fig = make_subplots()
fig.add_trace(
    # orientation='h': counts run along the x-axis, names along the y-axis.
    go.Bar(x=y1, y=x, name=x_label, width=.7, orientation='h')
)
fig.update_layout(
    title_text=chart_title,
    height=600,
    barmode='stack',
    font=dict(size=14),
    bargap=0.5,
    # Axis titles reuse the labels defined above so the chart stands alone.
    xaxis={'title': {'text': y1_label}},
    yaxis={'categoryorder': 'total descending', 'title': {'text': x_label}},
)

fig.show()

#### Spell count per character across movies

In [216]:
# Count the matched rows per character, then left-join the character columns
# on 'Character ID'.
groupBy_character_df = (
    dialogue_spell_df
    .groupby(['Character ID'])['Character ID']
    .count()
    .reset_index(name='Casting-spell count')
    .merge(character_df, on='Character ID', how='left')
)
groupBy_character_df.head(2)

Unnamed: 0,Character ID,Casting-spell count,Character Name,Species,Gender,House,Patronus,Wand (Wood),Wand (Core)
0,1,48,Harry Potter,Human,Male,Gryffindor,Stag,Holly,Phoenix Feather
1,2,8,Ron Weasley,Human,Male,Gryffindor,Jack Russell Terrier,,


In [217]:
# Chart data: character names and their 'Casting-spell count' values.
chart_title = 'Spell count per character across movies'
x_label = 'Character Name'
x = groupBy_character_df['Character Name']
y1_label = 'Casting-spell count'
y1 = groupBy_character_df['Casting-spell count']

# Single-trace figure (only one axis pair is used).
fig = make_subplots()
fig.add_trace(
    # orientation='h': counts run along the x-axis, names along the y-axis.
    go.Bar(x=y1, y=x, name=x_label, width=.7, orientation='h')
)
fig.update_layout(
    title_text=chart_title,
    height=600,
    barmode='stack',
    font=dict(size=14),
    bargap=0.5,
    # Axis titles reuse the labels defined above so the chart stands alone.
    xaxis={'title': {'text': y1_label}},
    yaxis={'categoryorder': 'total descending', 'title': {'text': x_label}},
)

fig.show()

#### Spell count per spell and character across movies

In [218]:
# Count the matched rows per (spell, character) pair, then left-join the
# spell columns and the character columns.
groupBy_spell_character_df = (
    dialogue_spell_df
    .groupby(['Spell ID', 'Character ID'])['Character ID']
    .count()
    .reset_index(name='Casting-spell count')
    .merge(spell_df, on='Spell ID', how='left')
    .merge(character_df, on='Character ID', how='left')
)
groupBy_spell_character_df.head(2)

Unnamed: 0,Spell ID,Character ID,Casting-spell count,Incantation,Spell Name,Effect,Light,Character Name,Species,Gender,House,Patronus,Wand (Wood),Wand (Core)
0,1,1,3,Accio,Summoning Charm,Summons an object,,Harry Potter,Human,Male,Gryffindor,Stag,Holly,Phoenix Feather
1,1,14,1,Accio,Summoning Charm,Summons an object,,Fred Weasley,Human,Male,Gryffindor,,,


In [219]:
# Chart data: one row per (spell, character) pair; each bar is positioned by
# character name and carries the spell name as its text label.
chart_title = 'Spell count per spell and character across movies'
x_label = 'Character Name'
x = groupBy_spell_character_df['Character Name']
annotation_text = groupBy_spell_character_df['Spell Name']
y1_label = 'Casting-spell count'
y1 = groupBy_spell_character_df['Casting-spell count']

# Single-trace figure (only one axis pair is used).
fig = make_subplots()
fig.add_trace(
    # orientation='h': counts run along the x-axis, names along the y-axis.
    go.Bar(x=y1, y=x, name=x_label, width=.7, orientation='h', text=annotation_text, insidetextanchor="start")
)
fig.update_layout(
    title_text=chart_title,
    height=800,
    barmode='stack',
    font=dict(size=14),
    bargap=.5,
    # Axis titles reuse the labels defined above so the chart stands alone.
    xaxis={'title': {'text': y1_label}},
    yaxis={'categoryorder': 'total descending', 'title': {'text': x_label}},
)

fig.show()

In [220]:
# Plotly Express version of the (spell, character) breakdown: bars are
# coloured by their count and show the spell name on hover.
# `px` comes from the notebook's import cell. The title is set here explicitly
# so this cell does not depend on a variable left over from an earlier cell.
chart_title = 'Spell count per spell and character across movies'
fig = px.bar(
    groupBy_spell_character_df,
    y="Character Name",
    x="Casting-spell count",
    color="Casting-spell count",
    orientation="h",
    color_continuous_scale='plotly3_r',
    hover_name="Spell Name",
)
fig.update_layout(
    title_text=chart_title,
    height=800,
    font=dict(size=14),
    bargap=.5,
    yaxis={'categoryorder': 'total descending'},
)
fig.show()