### Import necessary libraries

In [132]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import re
import sys
sys.path.append(r'../func')
import combine_obj
import export_obj

### Load the necessary data from CSV files into DataFrames

In [133]:
# Load the movie table.
# Forward slashes keep this relative path valid on Windows and POSIX alike and
# avoid the invalid "\H" escape sequence the backslash-separated literal contained.
movie_df = pd.read_csv('../Harry Potter-20220610T024151Z-001/Movies.csv', sep=',', quotechar='"')
movie_df.head(2)

Unnamed: 0,Movie ID,Movie Title,Release Year,Runtime,Budget,Box Office
0,1,Harry Potter and the Philosopher's Stone,2001,152,"$125,000,000","$1,002,000,000"
1,2,Harry Potter and the Chamber of Secrets,2002,161,"$100,000,000","$880,300,000"


In [134]:
# Load the chapter table (the file is Latin-1 encoded).
# Forward slashes keep this relative path valid on Windows and POSIX alike and
# avoid the invalid "\H" escape sequence the backslash-separated literal contained.
chapter_df = pd.read_csv('../Harry Potter-20220610T024151Z-001/Chapters.csv', sep=',', quotechar='"', encoding='Latin-1')
chapter_df.head(2)

Unnamed: 0,Chapter ID,Chapter Name,Movie ID,Movie Chapter
0,1,Doorstep Delivery,1,1
1,2,The Vanishing Glass,1,2


In [135]:
# Load the dialogue table (the file is Latin-1 encoded).
# Forward slashes keep this relative path valid on Windows and POSIX alike and
# avoid the invalid "\H" escape sequence the backslash-separated literal contained.
dialogue_df = pd.read_csv('../Harry Potter-20220610T024151Z-001/Dialogue.csv', sep=',', quotechar='"', encoding='Latin-1')
dialogue_df.head(2)

Unnamed: 0,Dialogue ID,Chapter ID,Place ID,Character ID,Dialogue
0,1,1,8,4,I should have known that you would be here...Professor McGonagall.
1,2,1,8,7,"Good evening, Professor Dumbledore. Are the rumours true, Albus?"


In [136]:
# Load the place table (the file is Latin-1 encoded).
# Forward slashes keep this relative path valid on Windows and POSIX alike and
# avoid the invalid "\H" escape sequence the backslash-separated literal contained.
place_df = pd.read_csv('../Harry Potter-20220610T024151Z-001/Places.csv', sep=',', quotechar='"', encoding='Latin-1')
place_df.head(2)

Unnamed: 0,Place ID,Place Name,Place Category
0,1,Flourish & Blotts,Diagon Alley
1,2,Gringotts Wizarding Bank,Diagon Alley


In [137]:
# Load the character table (the file is Latin-1 encoded).
# Forward slashes keep this relative path valid on Windows and POSIX alike and
# avoid the invalid "\H" escape sequence the backslash-separated literal contained.
character_df = pd.read_csv('../Harry Potter-20220610T024151Z-001/Characters.csv', sep=',', quotechar='"', encoding='Latin-1')
character_df.head(2)

Unnamed: 0,Character ID,Character Name,Species,Gender,House,Patronus,Wand (Wood),Wand (Core)
0,1,Harry Potter,Human,Male,Gryffindor,Stag,Holly,Phoenix Feather
1,2,Ron Weasley,Human,Male,Gryffindor,Jack Russell Terrier,,


In [138]:
# Load the spell table and normalise its whitespace in a single chained
# expression, so re-running the cell always starts from the raw CSV.
# Forward slashes keep the relative path valid on Windows and POSIX alike and
# avoid the invalid "\H" escape sequence the backslash-separated literal contained.
spell_df = (
    pd.read_csv('../Harry Potter-20220610T024151Z-001/Spells.csv', sep=',', quotechar='"')
    # replace non-breaking spaces with regular spaces
    .replace('\xa0', ' ', regex=True)
    # collapse whitespace runs; raw string so "\s" reaches the regex engine as written
    .replace(r'\s+', ' ', regex=True)
)
spell_df.head(2)

Unnamed: 0,Spell ID,Incantation,Spell Name,Effect,Light
0,1,Accio,Summoning Charm,Summons an object,
1,2,Aguamenti,Water-Making Spell,Conjures water,Icy blue


### Analyze

#### Generate basic data

In [139]:
# check if duplicated incantation
# The matching step below assumes every incantation appears exactly once in
# spell_df. Stop here with an explicit error when that does not hold, instead of
# only printing a message and leaving spell_list / dialogue_list undefined (or
# stale from a previous run) for the following cells.
duplicated_incantation_mask = spell_df.duplicated(subset=['Incantation'])
if duplicated_incantation_mask.any():
    raise ValueError(
        '========>>>>>>>> FOUND DUPLICATED INCANTATION!! {}'.format(
            spell_df.loc[duplicated_incantation_mask, 'Incantation'].tolist()
        )
    )

# Longest incantations first, so that e.g. "Lumos Maxima" is searched for
# before the shorter "Lumos".
spell_list = sorted(spell_df.to_dict('records'), key=lambda x: len(x['Incantation']), reverse=True)
print(spell_list[:2])
dialogue_list = dialogue_df.to_dict('records')
print(dialogue_list[:2])

[{'Spell ID': 39, 'Incantation': 'Peskipiksi Pesternomi', 'Spell Name': 'Peskipiksi Pesternomi', 'Effect': 'Nothing (supposedly captures pixies)', 'Light': nan}, {'Spell ID': 41, 'Incantation': 'Piertotum Locomotor', 'Spell Name': 'Piertotum Locomotor', 'Effect': 'Animates target', 'Light': 'Green'}]
[{'Dialogue ID': 1, 'Chapter ID': 1, 'Place ID': 8, 'Character ID': 4, 'Dialogue': 'I should have known that you would be here...Professor McGonagall.'}, {'Dialogue ID': 2, 'Chapter ID': 1, 'Place ID': 8, 'Character ID': 7, 'Dialogue': 'Good evening, Professor Dumbledore. Are the rumours true, Albus?'}]


In [140]:
# Find every incantation spoken in every dialogue line.
#
# spell_list is ordered longest incantation first; after an incantation has been
# counted it is stripped from the working copy of the dialogue, so a shorter
# incantation that is a prefix of a longer one is not counted a second time.
#
# Each match produces a NEW record (dialogue fields + count + spell fields).
# Building a fresh dict matters: reusing the dialogue record itself would make
# several matches in one dialogue share a single dict (the last spell would
# overwrite the earlier ones) and would also modify dialogue_list in place.

# Compile one case-insensitive-by-lowercasing, word-bounded pattern per spell,
# once, instead of rebuilding it for every dialogue. re.escape guards against
# incantations that contain regex metacharacters.
spell_pattern_list = [
    (spell_dict, re.compile(r'\b{}\b'.format(re.escape(spell_dict['Incantation'].lower()))))
    for spell_dict in spell_list
]

dialogue_spell_list = []
for dialogue_dict in dialogue_list:
    remaining_dialogue = dialogue_dict['Dialogue'].lower()
    for spell_dict, incantation_pattern in spell_pattern_list:
        casting_spell_count = len(incantation_pattern.findall(remaining_dialogue))
        if casting_spell_count == 0:
            continue
        dialogue_spell_list.append({
            **dialogue_dict,
            'Casting-spell count': casting_spell_count,
            **spell_dict,
        })
        # remove the counted occurrences before testing shorter incantations
        remaining_dialogue = incantation_pattern.sub('', remaining_dialogue)
print(dialogue_spell_list[:2])

[{'Dialogue ID': 226, 'Chapter ID': 10, 'Place ID': 42, 'Character ID': 3, 'Dialogue': "Are you sure that's a real spell? Well, it's not very good, is it? Of course I've only tried a few simple spells myself, and they've all worked for me. For example...  Oculus Reparo.  That's better, isn't it? Holy Cricket, you're Harry Potter. I'm Hermione Granger...and you are...?", 'Casting-spell count': 1, 'Spell ID': 37, 'Incantation': 'Oculus Reparo', 'Spell Name': 'Oculus Reparo', 'Effect': 'Repairs glasses', 'Light': nan}, {'Dialogue ID': 393, 'Chapter ID': 16, 'Place ID': 59, 'Character ID': 3, 'Dialogue': 'Oh, move over!  Alohomora.  Get in.', 'Casting-spell count': 1, 'Spell ID': 4, 'Incantation': 'Alohomora', 'Spell Name': 'Unlocking Charm', 'Effect': 'Unlocks target', 'Light': 'Blue'}]


In [141]:
# Turn the matched records into a DataFrame, attach the chapter details of each
# dialogue, and persist the result next to the notebook.
dialogue_spell_df = pd.DataFrame(dialogue_spell_list).merge(
    chapter_df,
    how='left',
    on='Chapter ID',
)
dialogue_spell_df.to_csv('Dialogue spell.csv')
dialogue_spell_df.head(2)

Unnamed: 0,Dialogue ID,Chapter ID,Place ID,Character ID,Dialogue,Casting-spell count,Spell ID,Incantation,Spell Name,Effect,Light,Chapter Name,Movie ID,Movie Chapter
0,226,10,42,3,"Are you sure that's a real spell? Well, it's not very good, is it? Of course I've only tried a few simple spells myself, and they've all worked for me. For example... Oculus Reparo. That's better, isn't it? Holy Cricket, you're Harry Potter. I'm Hermione Granger...and you are...?",1,37,Oculus Reparo,Oculus Reparo,Repairs glasses,,Ron and Hermione,1,10
1,393,16,59,3,"Oh, move over! Alohomora. Get in.",1,4,Alohomora,Unlocking Charm,Unlocks target,Blue,Three-Headed Sentinel,1,16


In [142]:
# Spot-check two dialogues in which the same incantation is repeated
dialogue_spell_df[dialogue_spell_df['Dialogue ID'].isin([2348, 1872])]

Unnamed: 0,Dialogue ID,Chapter ID,Place ID,Character ID,Dialogue,Casting-spell count,Spell ID,Incantation,Spell Name,Effect,Light,Chapter Name,Movie ID,Movie Chapter
28,1872,71,8,1,Lumos Maxima. Lumos Maxima. Lumos Maxima. Lumos Maxima. Lumos Maxima!,5,32,Lumos Maxima,Lumos Maxima,Produces bright light,White,Under Covers Prologue,3,1
41,2348,88,30,1,Just do it. Expecto Patronum! Expecto Patronum!,2,18,Expecto Patronum,Patronus Charm,Conjures a spirit guardian,Silver,The Patronus,3,18


In [143]:
# Number of matched dialogue rows spoken by the character with ID 1
b = dialogue_spell_df[dialogue_spell_df['Character ID'] == 1]
len(b)

48

#### Including repeated incantations within a dialogue

##### Casting-spell count per spell across movies

In [144]:
# Total casts per spell, enriched with the spell's descriptive columns
groupBy_spell_df = (
    dialogue_spell_df
    .groupby(['Spell ID'])['Casting-spell count']
    .sum()
    .reset_index(name='Casting-spell count')
    .merge(spell_df, how='left', on='Spell ID')
)
groupBy_spell_df.head(2)

Unnamed: 0,Spell ID,Casting-spell count,Incantation,Spell Name,Effect,Light
0,1,6,Accio,Summoning Charm,Summons an object,
1,2,1,Aguamenti,Water-Making Spell,Conjures water,Icy blue


In [145]:
# List every matched dialogue for the spell with ID 18
dialogue_spell_df[dialogue_spell_df['Spell ID'] == 18]

Unnamed: 0,Dialogue ID,Chapter ID,Place ID,Character ID,Dialogue,Casting-spell count,Spell ID,Incantation,Spell Name,Effect,Light,Chapter Name,Movie ID,Movie Chapter
38,2337,88,30,11,"Can you do this? Yes. Very well. Close your eyes. Concentrate. Explore your past. Do you have a memory? Allow it to fill you up. Lose yourself within it. Then speak the incantation, Expecto Patronum.",1,18,Expecto Patronum,Patronus Charm,Conjures a spirit guardian,Silver,The Patronus,3,18
39,2338,88,30,1,Expecto Patronum.,1,18,Expecto Patronum,Patronus Charm,Conjures a spirit guardian,Silver,The Patronus,3,18
40,2340,88,30,1,Expecto Patronum! Expecto... Expecto... Expect...,1,18,Expecto Patronum,Patronus Charm,Conjures a spirit guardian,Silver,The Patronus,3,18
41,2348,88,30,1,Just do it. Expecto Patronum! Expecto Patronum!,2,18,Expecto Patronum,Patronus Charm,Conjures a spirit guardian,Silver,The Patronus,3,18
47,2591,96,37,1,Sirius! No. Sirius! Expecto Patronum!,1,18,Expecto Patronum,Patronus Charm,Conjures a spirit guardian,Silver,The Dementor's Kiss,3,26
49,2710,100,37,1,Expecto Patronum!,1,18,Expecto Patronum,Patronus Charm,Conjures a spirit guardian,Silver,"Truly Seen, Truly Free",3,30
53,3518,131,11,1,"""Expecto Patronum!'",1,18,Expecto Patronum,Patronus Charm,Conjures a spirit guardian,Silver,Dudley Demented,5,1
54,3521,131,11,1,"""Expecto Patronum! '",1,18,Expecto Patronum,Patronus Charm,Conjures a spirit guardian,Silver,Dudley Demented,5,1
56,3573,132,8,11,"""Expecto Patronum!'",1,18,Expecto Patronum,Patronus Charm,Conjures a spirit guardian,Silver,The Advance Guard,5,2
101,6198,194,70,1,Expecto PATRONUM!,1,18,Expecto Patronum,Patronus Charm,Conjures a spirit guardian,Silver,Ministry Interior,7,11


In [146]:
# prepare chart data
chart_title = 'Casting-spell count per spell across movies'
x_label = 'Casting-spell count'
y_label = 'Character Name'
# Create figure
fig = px.bar(
    groupBy_spell_df, 
    y="Spell Name", 
    x="Casting-spell count", 
    orientation="h",
    color_continuous_scale='plotly3_r', 
    hover_data=["Incantation"],
)
# Add figure title
fig.update_traces(width=.7)
# Add figure title
fig.update_layout(
    title_text=chart_title,
    xaxis_title="<b>{}</b>".format(x_label),
    font=dict(size=14),
    bargap=.5,
    yaxis={'categoryorder':'total descending'},
)
# Set y-axes titles
fig.update_yaxes(title_text="<b style='color:blue'>{}</b>".format(y_label), secondary_y=False)
fig.show()

In [147]:
# Export the per-spell chart as a standalone HTML page under ../docs,
# together with its description and insight bullet points.
char_id = '5f817172-f1e7-4ca3-8b60-4a7e40c99239'

description_items = [
    {'This chart displays the total times that the spell is casted across the movies:': [
        'The x-axis will be the total times', 
        'The y-axis will be the spell name',
    ]},
    'The count(times) of a spell does include the duplication of a dialogue',
    'When hovering a bar, a text box will appear with the above data as well as the equivalent incantation'
]
insight_items = [
    'The spell name "Patronus Charm" is used the most with 13 times => answering for the question 4 - 1st point'
]

export_obj.export_chart_to_html(
    fig=fig,
    height=600,
    chart_id=char_id,
    chart_title=chart_title,
    path_to_filename='../docs/{}.html'.format(char_id),
    describtion_list=description_items,
    insight_list=insight_items,
)

##### Spell count per character across movies

In [148]:
# Total casts per character, enriched with the character's descriptive columns.
# Note that this reuses (and overwrites) the name `groupBy_spell_df`.
groupBy_spell_df = (
    dialogue_spell_df
    .groupby(['Character ID'])['Casting-spell count']
    .sum()
    .reset_index(name='Casting-spell count')
    .merge(character_df, how='left', on='Character ID')
)
groupBy_spell_df.head(2)

Unnamed: 0,Character ID,Casting-spell count,Character Name,Species,Gender,House,Patronus,Wand (Wood),Wand (Core)
0,1,53,Harry Potter,Human,Male,Gryffindor,Stag,Holly,Phoenix Feather
1,2,8,Ron Weasley,Human,Male,Gryffindor,Jack Russell Terrier,,


##### Spell count per spell and character across movies

In [149]:
# Total casts per (spell, character) pair, enriched first with the spell
# columns and then with the character columns.
groupBy_spell_character_df = (
    dialogue_spell_df
    .groupby(['Spell ID', 'Character ID'])['Casting-spell count']
    .sum()
    .reset_index(name='Casting-spell count')
    .merge(spell_df, how='left', on='Spell ID')
    .merge(character_df, how='left', on='Character ID')
)
groupBy_spell_character_df.head(2)

Unnamed: 0,Spell ID,Character ID,Casting-spell count,Incantation,Spell Name,Effect,Light,Character Name,Species,Gender,House,Patronus,Wand (Wood),Wand (Core)
0,1,1,3,Accio,Summoning Charm,Summons an object,,Harry Potter,Human,Male,Gryffindor,Stag,Holly,Phoenix Feather
1,1,14,1,Accio,Summoning Charm,Summons an object,,Fred Weasley,Human,Male,Gryffindor,,,


In [150]:
# prepare chart data
chart_title = 'Spell count per spell and character across movies'
x_label = 'Casting-spell count'
y_label = 'Character Name'
# Create figure
fig = px.bar(
    groupBy_spell_character_df, 
    y="Character Name", 
    x="Casting-spell count", 
    color="Casting-spell count", 
    orientation="h",
    color_continuous_scale='plotly3_r', 
    hover_name="Spell Name",
    hover_data=["Incantation"],
)
fig.update_traces(width=.7)
# Add figure title
fig.update_layout(
    title_text=chart_title,
    xaxis_title="<b>{}</b>".format(x_label),
    font=dict(size=14),
    bargap=.5,
    yaxis={'categoryorder':'total descending'},
    annotations=[
        {"x": total * 1.01, "y": x, "text": str(total), "showarrow": False}
        for x, total in groupBy_spell_character_df.groupby("Character Name", as_index=False).agg({"Casting-spell count": "sum"}).values
    ]
)
# Set y-axes titles
fig.update_yaxes(title_text="<b style='color:blue'>{}</b>".format(y_label), secondary_y=False)
fig.show()

In [151]:
# prepare chart data
chart_title = 'Spell count per spell and character across movies'
x_label = 'Casting-spell count'
y_label = 'Character Name'
# generate annotations
annotations = []
for x, total_duplicated, total_unique in groupBy_spell_character_df.groupby(['Character Name'])['Casting-spell count'].sum().reset_index(name='Casting-duplicated-spell count').merge(groupBy_spell_character_df.drop_duplicates(subset=['Character Name', 'Spell ID']).groupby(['Character Name'])['Casting-spell count'].count().reset_index(name='Casting-unique-spell count'), how='left' ,left_on='Character Name', right_on='Character Name').values:
    annotations.append({"x": x, "y": total_duplicated + 2, "text": "{}|{}".format(total_duplicated, total_unique), "showarrow": False})
# Create figure
fig = px.bar(
    groupBy_spell_character_df, 
    y="Casting-spell count", 
    x="Character Name", 
    color="Casting-spell count", 
    color_continuous_scale='plotly3_r', 
    hover_name="Spell Name",
    hover_data=["Incantation"],
)
fig.update_traces(width=.7)
# Add figure title
fig.update_layout(
    title_text=chart_title,
    xaxis_title="<b>{}</b>".format(x_label),
    font=dict(size=14),
    bargap=.5,
    xaxis={'categoryorder':'total descending'},
    # annotations=[
    #     {"x": x, "y": total * 1.05, "text": str(total), "showarrow": False}
    #     for x, total in groupBy_spell_character_df.groupby("Character Name", as_index=False).agg({"Casting-spell count": "sum"}).values
    # ]
    annotations=annotations,
)
# Set y-axes titles
fig.update_yaxes(title_text="<b style='color:blue'>{}</b>".format(y_label), secondary_y=False)

fig.show()

In [152]:
# Export the per-character chart as a standalone HTML page under ../docs,
# together with its description and insight bullet points.
char_id = 'c8842f18-45aa-4d0f-a596-a9be6de37705'

description_items = [
    {'This chart displays the total times of a spell casted by a character across the movies:': [
        'The x-axis will be the character name', 
        'The y-axis will be the total times',
    ]},
    'By default, the count(times) of a spell does include the duplication of the spell incantation within/between the dialogue',
    {'At the top of a bar of a character, there will be 2 numbers separated by "|":': [
        'The one before the delimiter is the total times that the character casted the spell including the above duplication',
        'The one after the delimiter is the total number of spells used by the character not including the above duplication'
    ]},
    'When hovering a bar, a text box will appear with the above data as well as the equivalent incantation',
    {'A bar of a character can have many sub-bars, which allows us to indicate the following things:': [
        'How many spells (unique spell name) are used by the character by count the number of sub-bar of the character',
        'How many times that each spell is used by the character by hovering your mouse into its equivalent sub-bar'
    ]}
]
insight_items = [
    {'Some characters have a favorite spell, but the remaining is not. If a character use many spells, and only one of these spells is used most, then we can consider that the character has a favorite spell. Otherwise, we will consider that the character does not => answering for the question 4 - 2nd point​. For example,': [
        'The favorite spell of Harry Potter is Patronus Charm',
        'With Hermione Granger, we cannot determine a favorite spell because there are many spells having the most casting times',
    ]},
    'Harry Potter is one having the most casting times. The next one is Hermione Granger, and the 3rd one is Ron Weasley',
    'As you known that the 3 characters Harry Potter, Hermione Granger and Ron Weasley in many films, but the casting times of them are big different. The most interesting thing is that Hermione Granger has nearly 3 times of casting times than Ron Weasley',
    'Harry Potter used the most unique spells. The next one is Hermione Granger, and the 3rd one is Ron Weasley'
]

export_obj.export_chart_to_html(
    fig=fig,
    height=600,
    chart_id=char_id,
    chart_title=chart_title,
    path_to_filename='../docs/{}.html'.format(char_id),
    describtion_list=description_items,
    insight_list=insight_items,
)

#### Excluding repeated incantations within a dialogue