In [5]:
# import necessary packages
import pandas as pd
import numpy as np
import os
import title_list as tl
import char_replacement as cr
import char_list as cl

In [6]:
# Select only the dialogue files that belong to the movies of interest.
directory = '/Users/macambler/Documents/GitHub/NLP_Final_Project/data/script_scraping/scripts/parsed/dialogue'

# A file is kept when its name, minus the last 13 characters, is one of the
# titles in tl.title_list (presumably the 13 characters are a fixed
# '_dialogue.txt'-style suffix -- confirm against the files on disk).
# The result is sorted alphabetically by filename.
movie_files = sorted(
    candidate
    for candidate in os.listdir(directory)
    if candidate[:-13] in tl.title_list
)

In [7]:
# Load the dialogue from the selected movies into a dataframe with one row
# per line of the source files:
#   title -- the filename minus its last 13 characters
#   line  -- the raw line exactly as read (including the trailing newline)
#
# Rows are collected in a plain list and the dataframe is built once at the
# end; concatenating a one-row frame per line copies the whole frame each
# time and becomes quadratic in the number of lines.
records = []
for filename in movie_files:
    title = filename[:-13]
    # The context manager closes each file as soon as it has been read,
    # instead of leaving one open handle per movie.
    with open(os.path.join(directory, filename), 'r') as file:
        for line in file:
            records.append({'title': title, 'line': line})

# Passing columns= keeps the ['title', 'line'] layout even when no lines were read.
df = pd.DataFrame(records, columns=['title', 'line'])

In [8]:
# Split each raw line into the speaking character and the dialogue, and store
# them in separate dataframe columns. The text before the first '=>' is the
# character and the text after it is the dialogue.
#
# Whole columns are assigned at once. Element-wise chained assignment such as
# df['character'][i] = ... writes through an intermediate object, which pandas
# warns about and which has no effect on df when Copy-on-Write is enabled.

# remove newline characters from the raw lines
stripped_lines = [raw_line.replace('\n', '') for raw_line in df['line']]
split_lines = [stripped.split('=>') for stripped in stripped_lines]

df['line'] = stripped_lines
df['character'] = [parts[0] for parts in split_lines]
# NOTE: only the piece between the first and second '=>' is kept, so dialogue
# that itself contains '=>' is cut there. A line with no '=>' at all raises
# IndexError here.
df['dialogue'] = [parts[1] for parts in split_lines]

In [9]:
# The raw 'line' column is no longer needed once it has been split;
# keep everything else in a new dataframe and display it.
parsed_df = df.drop(labels=['line'], axis='columns')
parsed_df

Unnamed: 0,title,character,dialogue
0,Avatar,VOICE,"When I was lying there in the VA hospital, wit..."
1,Avatar,VOICE,"Sooner or later though, you always have to wak..."
2,Avatar,JAKE,"They can fix a spinal, if you've got the money..."
3,Avatar,PERKY NEWSCASTER,"The Bengal tiger, extinct for over a century, ..."
4,Avatar,JAKE,I became a Marine for the hardship. To be hamm...
...,...,...,...
37856,Wizard-of-Oz-The,DOROTHY,...but most of it was beautiful. But just the ...
37857,Wizard-of-Oz-The,DOROTHY,Doesn't anybody believe me?
37858,Wizard-of-Oz-The,UNCLE HENRY,"Of course we believe you, Dorothy."
37859,Wizard-of-Oz-The,DOROTHY,"Oh, but anyway, Toto, we're home!"


In [10]:
# Normalise character names with the user-built replacement list, so that
# inconsistent spellings and typos end up under one name.
# Each entry supplies the text to look for (index 0) and its replacement
# (index 1); entries are applied in list order.
# NOTE(review): str.replace is called without an explicit regex= argument, so
# whether the search text is treated as a regular expression depends on the
# installed pandas version -- confirm and pin it if any entry contains regex
# metacharacters.
characters = parsed_df['character']
for replacement in cr.char_replacement_ls:
    search_text = replacement[0]
    new_text = replacement[1]
    characters = characters.str.replace(search_text, new_text)
parsed_df['character'] = characters

In [11]:
# Keep only the rows spoken by a character in the user-built character list,
# renumbering the rows from 0.
in_char_list = parsed_df['character'].isin(cl.char_list)
filtered_df = parsed_df.loc[in_char_list].reset_index(drop=True)
filtered_df

Unnamed: 0,title,character,dialogue
0,Avatar,JAKE,"They can fix a spinal, if you've got the money..."
1,Avatar,JAKE,I became a Marine for the hardship. To be hamm...
2,Avatar,JAKE,Let's get it straight up front. I don't want y...
3,Avatar,JAKE,"You want a fair deal, you're on the wrong plan..."
4,Avatar,JAKE,It's just the way things are. And nobody does ...
...,...,...,...
26999,Wizard-of-Oz-The,DOROTHY,"No, Aunt Em -- this was a real, truly live pla..."
27000,Wizard-of-Oz-The,DOROTHY,...but most of it was beautiful. But just the ...
27001,Wizard-of-Oz-The,DOROTHY,Doesn't anybody believe me?
27002,Wizard-of-Oz-The,DOROTHY,"Oh, but anyway, Toto, we're home!"


In [12]:
# Count the rows per character; every remaining column then holds the number
# of non-null entries for that character, i.e. their lines of dialogue.
rows_by_character = filtered_df.groupby('character')
table = rows_by_character.count()
table

Unnamed: 0_level_0,title,dialogue
character,Unnamed: 1_level_1,Unnamed: 2_level_1
AGENT MARIA HILL,22,22
AGENT SMITH,74,74
AGNES,65,65
ALAN,226,226
ALFRED,69,69
...,...,...
WINSTON,31,31
WITCH,49,49
YODA,75,75
YONDU,82,82


In [13]:
# Select the names of the characters that have at least 100 lines of dialogue.
min_line_count = 100
has_enough_lines = table['dialogue'] >= min_line_count
most_lines_chars = table.loc[has_enough_lines].index.get_level_values('character')

In [15]:
# display the character names that met the line-count threshold
most_lines_chars

Index(['ALAN', 'ANAKIN', 'ANDREW', 'ANNA', 'ARAGORN', 'BATMAN', 'BELLA',
       'BELLE', 'BENDER', 'BRIAN', 'C-3PO', 'CAMERON', 'CAPTAIN AMERICA',
       'CARL', 'CARTMAN', 'CLAIRE', 'DONKEY', 'DOROTHY', 'DORY', 'EDWARD',
       'ELIZABETH', 'ELLIE', 'ELSA', 'EVAN', 'FERRIS', 'FINN', 'FIONA',
       'FOGELL', 'FORREST', 'FOX', 'FRODO', 'GAMORA', 'GANDALF', 'GEORGE',
       'GRACE', 'GRANT', 'GRU', 'HAN', 'HARRY', 'HENRY', 'HICCUP', 'HOLLEY',
       'HÉCTOR', 'INDY', 'INIGO', 'IRON MAN', 'JACK', 'JAKE', 'JANE', 'JENNY',
       'KRISTOFF', 'KYLE', 'LEIA', 'LION', 'LLOYD', 'LOKI', 'LUCY', 'LUKE',
       'LUMIÈRE', 'MARLIN', 'MARY', 'MATER', 'MCQUEEN', 'MIGUEL', 'MORPHEUS',
       'NEMO', 'NEO', 'NICK FURY', 'OBI-WAN', 'OLAF', 'PADME', 'PETER', 'PHIL',
       'PO', 'QUILL', 'REY', 'ROCKET', 'RUSSELL', 'SAM', 'SCARECROW', 'SETH',
       'SHIFU', 'SHREK', 'SLOANE', 'STAN', 'STOICK', 'STU', 'T'CHALLA', 'THOR',
       'TIN MAN', 'VALJEAN', 'WILLIE'],
      dtype='object', name='character')

In [16]:
# Restrict the dialogue to the characters selected above and renumber the
# rows from 0.
is_selected_character = filtered_df['character'].isin(most_lines_chars)
final_df = filtered_df.loc[is_selected_character].reset_index(drop=True)
final_df

Unnamed: 0,title,character,dialogue
0,Avatar,JAKE,"They can fix a spinal, if you've got the money..."
1,Avatar,JAKE,I became a Marine for the hardship. To be hamm...
2,Avatar,JAKE,Let's get it straight up front. I don't want y...
3,Avatar,JAKE,"You want a fair deal, you're on the wrong plan..."
4,Avatar,JAKE,It's just the way things are. And nobody does ...
...,...,...,...
18923,Wizard-of-Oz-The,DOROTHY,"No, Aunt Em -- this was a real, truly live pla..."
18924,Wizard-of-Oz-The,DOROTHY,...but most of it was beautiful. But just the ...
18925,Wizard-of-Oz-The,DOROTHY,Doesn't anybody believe me?
18926,Wizard-of-Oz-The,DOROTHY,"Oh, but anyway, Toto, we're home!"


In [17]:
# Save the final dataframe as a csv file.
# to_csv is called with its defaults, so the row index is written as well
# (as an unnamed first column).
output_path = '../data/processed_char_dialogue.csv'
final_df.to_csv(output_path)