# Extraction of Action Descriptions and Dialogue from Movie Scripts

In this notebook, we extract all action descriptions and dialogue contained in the movie scripts of `scriptbase_j`, part of the Scriptbase corpus: https://github.com/EdinburghNLP/scriptbase

In [1]:
# Imports and global setup for the notebook.
# NOTE(review): numpy, itertools, Counter, gender_guesser, VADER, spaCy,
# seaborn and matplotlib are not used by the cells visible in this part of
# the notebook -- confirm whether later cells need them.
from xml.dom import minidom  # DOM parser used to read each script.xml
import glob  # file discovery via wildcard patterns
import pandas as pd
import numpy as np
import itertools, re
from collections import Counter
import gender_guesser.detector as gender
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import spacy 
import seaborn as sns
sns.set_style()
from matplotlib import pyplot as plt
# Use matplotlib's built-in 'ggplot' style sheet for all figures.
plt.style.use('ggplot')
# NOTE(review): 'en' is a legacy spaCy shortcut link; spaCy v3 removed
# shortcut links and expects a full package name (e.g. 'en_core_web_sm').
# Confirm which spaCy version this notebook targets.
nlp = spacy.load('en')

**Load Script Files**

Note that this requires the scriptbase_j repository: https://github.com/EdinburghNLP/scriptbase

In [23]:
# Locate the cleaned plain-text version of every script (one per movie folder).
script_fns = glob.glob(
    '/home/fhopp/projects/templeton/sony/scriptbase/scriptbase_j/*/processed/script_clean.txt'
)
# `files` is kept as a second name bound to the very same list object.
files = script_fns

In [24]:
# Sanity check: report how many script files the glob pattern matched.
print('Total number of movie scripts:', len(script_fns))

Total number of movie scripts: 917


The final data format will be a dataframe that is indexed by movie title and contains, for each movie, all of its dialogue and all of its action descriptions.

Action descriptions and dialogue are identified via the `<description>` and `<speech>` elements of each script's XML file.

In [168]:
# Locate the XML version of every script (one per movie folder).
# glob returns matches in a filesystem-dependent order, so sort the paths to
# make the processing order -- and hence the row order of the exported
# table -- reproducible across machines and runs.
script_fns = sorted(glob.glob('/home/fhopp/projects/templeton/sony/scriptbase/scriptbase_j/*/processed/script.xml'))

Below we extract the dialogue and all action descriptions from each movie. This may take up to 2 hours.

In [173]:
# "gon na" / "wan na": presumably how the corpus tokenizer splits "gonna" and
# "wanna" -- TODO confirm against the data. A blanket replace of " na" would
# also glue every word that merely starts with "na" onto its predecessor
# ("my name" -> "myname"), so only these two contractions are re-joined.
_NA_CONTRACTION = re.compile(r"\b([Gg]on|[Ww]an) na\b")

# Ordered (old, new) clean-up steps applied to the space-joined tokens.
# `old` is either a literal substring or a compiled regular expression.
_DIALOGUE_FIXES = (
    (" ?", "?"),
    (" .", "."),
    (" !", "!"),
    (" '", "'"),
    (" ,", ","),
    (" ;", ";"),
    (" - ", "-"),
    ("- ", "-"),
    (_NA_CONTRACTION, r"\1na"),
    (" n't", "n't"),
)

# NOTE(review): unlike the dialogue list, the action list does not handle
# " ?", "- " or the "na" contractions. Kept as in the original notebook;
# confirm whether the difference is intentional.
_ACTION_FIXES = (
    (" .", "."),
    (" !", "!"),
    (" '", "'"),
    (" ,", ","),
    (" ;", ";"),
    (" - ", "-"),
    (" n't", "n't"),
)


def _collect_words(xml_doc, tag):
    """Collect the text of every <word> inside the <sentence>s of each `tag` element.

    Parameters
    ----------
    xml_doc : xml.dom.minidom.Document
        Parsed script XML.
    tag : str
        Name of the elements to search ('speech' or 'description' below).

    Returns
    -------
    list of str
        The word strings in the order the elements are returned by the DOM.
    """
    words = []
    for element in xml_doc.getElementsByTagName(tag):
        for sentence in element.getElementsByTagName('sentence'):
            for word in sentence.getElementsByTagName('word'):
                node = word.firstChild
                # An empty <word/> has no child (firstChild is None) and a
                # non-text child has no `.data`; skip both instead of raising.
                text = getattr(node, 'data', None)
                if text:
                    words.append(text)
    return words


def _join_tokens(words, fixes):
    """Join `words` with single spaces, then apply the `fixes` steps in order.

    Parameters
    ----------
    words : list of str
        Tokens to join.
    fixes : iterable of (old, new)
        `old` is a literal substring (replaced via ``str.replace``) or a
        compiled regular expression (replaced via ``pattern.sub``).

    Returns
    -------
    str
        The cleaned text.
    """
    text = ' '.join(words)
    for old, new in fixes:
        if isinstance(old, str):
            text = text.replace(old, new)
        else:
            text = old.sub(new, text)
    return text


# One single-row DataFrame per script; they are concatenated in a later cell.
dataframes = []

for script in script_fns:
    # The movie folder is three path components from the end
    # (<movie folder>/processed/script.xml); drop anything from the first "(".
    title = script.split('/')[-3].split('(')[0].strip()

    xml = minidom.parse(script)

    # Dialogue: words inside <speech> elements.
    words_in_dialogue = _collect_words(xml, 'speech')
    clean_dialogue = _join_tokens(words_in_dialogue, _DIALOGUE_FIXES)

    # Action descriptions: words inside <description> elements.
    words_in_action = _collect_words(xml, 'description')
    clean_action = _join_tokens(words_in_action, _ACTION_FIXES)

    df = pd.DataFrame(
        {'dialogue': [clean_dialogue], 'action': [clean_action]},
        index=[title],
        columns=['dialogue', 'action'],
    )
    dataframes.append(df)

In [210]:
# Stack the per-script single-row frames into one table
# (one row per script, indexed by movie title).
scripts = pd.concat(dataframes)

In [211]:
# Export the table as CSV to the current working directory.
scripts.to_csv('scripts4scoring.csv')

In [213]:
# Also store the table as a pickle in the current working directory.
# NOTE: pickle files should only ever be loaded from trusted sources.
scripts.to_pickle('scripts4scoring.pkl')