In [51]:
from collections import defaultdict
import glob
import json
import os
from os.path import join
import re
from typing import Dict, List

In [45]:
# TODO: Change DATA_DIR per your local filepath

DATA_DIR = '/home/mandar/Mandar/Trope Analysis/'

In [62]:
def parse_movie_dialogs(movie_dialogs):
    actors = set()
    dialogs = []
    for line in movie_dialogs:
        line = line.split('\n')[0]
        line = re.sub('[0-9]', '', line).strip()
        if line.startswith('Written'):
            continue
        if len(line) == 0 or line == '.' or re.match('\(([a-zA-Z ]*)\)', line):
            continue
        if line.isupper():
            actors = actors.union([line]) 
        else:
            dialogs.append(line)
    dialogs = ' '.join(dialogs)
    return dialogs

In [63]:
genre = 'Action'
movies_dialog_list = os.listdir(join(DATA_DIR, genre))
movie_dialog_dict = dict()
for movie_with_dialog in movies_dialog_list:
    movie_name = movie_with_dialog.split('_dialog.txt')[0]
    with open(join(DATA_DIR, genre, movie_with_dialog), 'r') as file:
        movie_dialogs = file.readlines()
    
    parsed_movie_dialog = parse_movie_dialogs(movie_dialogs)
    movie_dialog_dict[movie_name] = parsed_movie_dialog

In [64]:
list(movie_dialog_dict.keys())[:5]

['bourneultimatumthe', 'arcticblue', 'wildhogs', 'entrapment', 'swordfish']

In [66]:
with open(join(DATA_DIR, genre, 'bourneultimatumthe_dialog.txt'), 'r') as file:
    movie_dialogs = file.readlines()

In [68]:
movie_dialog_dict['bourneultimatumthe']

'Tony Gilroy, Scott Z. Burns & George Nolfi Based on the novels by Robert Ludlum June , Notice: This material is the property of Beach City Productions LLC (A wholly owned subsidiary of Universal City Studios, Inc.) and is intended and restricted solely for studio use by studio personnel. Distribution or disclosure of the material to unauthorized persons is prohibited. The sale, copying or reproduction of this material in any form is also prohibited. MOTION -- flat out -- it\'s us -- we\'re running -- stumbling -- breathing rushed -- blood in the snow... We are JASON BOURNE and we\'re running down an alley... Supered below:   MOSCOW BLUE LIGHTS -- from the distance -- strobing through the night -- rushing toward us -- POLICE CARS -- three of them - - SIRENS HOWLING as they bear down -- closer -- faster -- until they whip past the alley... Up against the wall -- BOURNE is hidden in the shadows. BOURNE is badly wounded -- shot through the shoulder -- bruises and broken bones from the fin

In [56]:
def get_genre_movie_list(genre: str) -> List[str]:
    """
    This function returns all the json filenames containing movie dialogs
    Args:
        genre (str): String containing genre name.
        
    Returns:
        List[str]: List of strings containing json filenames.
    """
    movie_genre_json_list = []
    movies_per_genre = os.listdir(join(DATA_DIR, 'ScreenPy', 'ParserOutput', genre))
    for movie in movies_per_genre:
        if movie.endswith('.json'):
            movie_genre_json_list.append(movie)
    return movie_genre_json_list

In [57]:
def load_json_movie_dialog_file(genre: str, movie_filename: str) -> List[List[Dict[str, str]]]:
    """
    Loads the json data contained in movie file:
    Args:
        genre (str): String containing genre name.
        movie_filename (str): String containing movie filename.
    
    Returns:
        List[List[Dict[str]]]: List of lists with each nested list containing a dictionary.
    """
    with open(join(DATA_DIR, 'ScreenPy', 'ParserOutput', genre, movie_filename), 'r') as f:
        movie_dialog_json = json.loads(f.read())

In [63]:
def parse_movie_dialog_data(movie_json_data: List[List[Dict[str, str]]]):
    """
    This function parses the movie json data, and collects the following information,
        1. Unique characters with dialogs
        2. Number of dialogs per character
        3. Dialogs of all characters concatenated into a string
    Args:
        movie_json_data (List[List[Dict[str, str]]]): Json data containing movie character names and dialogs.
        
    Returns:
        Dict[str, Any]: Dictionary with movie name as key and various nested dictionaries 
        containing data mentioned in function description.
    """
    movie_characters = set()
    movie_dialogs = list()
    dialogs_per_character = defaultdict(int)
    movie_info_dict = defaultdict()
    for scene_dialogs in movie_dialog_json:
        for dialog_info in scene_dialogs:
            if 'speaker/title' in dialog_info['head_text']:
                dialog_speaker = dialog_info['head_text']['speaker/title']
                print(f"Speaker: {dialog_speaker}")
                print(dialog_info['text'])
                character = dialog_speaker.split('(')[0].strip()
                movie_characters = movie_characters.union([character])
                dialogs_per_character[character] += 1
                movie_dialogs.append(dialog_info['text'])

    movie_info_dict['characters'] = movie_characters
    movie_info_dict['actor_dialog_count'] = dialogs_per_character
    movie_info_dict['dialogs'] = ' '.join(movie_dialogs)
    return movie_info_dict

In [59]:
genre = 'Action'
movie_genre_json_list = get_genre_movie_list(genre)

movie_filename = 'fantasticfour.json'
movie_json_data = load_json_movie_dialog_file(genre, movie_filename)

In [64]:
all_movie_dialogs = defaultdict()
all_movie_dialogs[movie_filename.split('.json')[0]] = parse_movie_dialog_data(movie_json_data)

Speaker: REED
High open space, exposed structural elements.  Obviously aimed at first time visitors to create feelings of... smallness, inadequacy.
Speaker: BEN
Good thing it ain't workin... Reed, what are we doing here?  This guy's fast-food, strip-mall science --
Speaker: REED
This wasn't our first stop, in case you forgot NASA.  And Victor's not that bad.  He's just a little... (seeing the statue) Larger than life.
Speaker: REED
He's financed some of the biggest breakthroughs of this century.
Speaker: BEN
You'd never know it.
Speaker: BEN (CONT'D)
Jesus.  That too?
Speaker: REED
Reed Richards and Ben Grimm to see --
Speaker: FEMALE RECEPTIONIST
Executive elevator, top floor.
Speaker: BEN
What's the price for a smile round here?
Speaker: REED (O.S.)
My research suggests that exposure to a high-energy cosmic storm born on solar winds might have triggered the evolution of early planetary life.
Speaker: REED (CONT'D)
In six weeks another cloud with the same elemental profile will pass E

In [65]:
all_movie_dialogs['fantasticfour']

defaultdict(None,
            {'characters': {'ALICIA',
              "ALICIA (CONT'D)",
              'ALICIA (O.S.)',
              'AUTOMATED VOICE',
              'BEN',
              "BEN (CONT'D)",
              'BEN (INTO RADIO)',
              'BEN (O.S.)',
              'BEN (ON SCREEN)',
              'BEN ACTION FIGURE',
              'BOHEMIAN GIRL #1',
              'BOHEMIAN GIRL #2',
              'BOYFRIEND',
              'BRIDGE COP',
              'BRIDGE COP #1',
              'BRIDGE COP #2',
              'BUSINESSMAN',
              'CHIEF FIREMAN',
              "CHIEF FIREMAN (CONT'D)",
              'COP',
              'DEBBIE',
              'DEBBIE (ON SCREEN)',
              'DOCTOR',
              'DOOM',
              "DOOM (CONT'D)",
              'DOOM (O.S.)',
              'ERNIE',
              'ERNIE (O.S.)',
              'FEMALE RECEPTIONIST',
              'GIRLFRIEND',
              "GIRLFRIEND (CONT'D)",
              'JOHNNY',
              "