In [338]:
from collections import Counter, defaultdict
import glob
import json
import os
from os.path import join
import re
from typing import Dict, List

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC

from xgboost import XGBClassifier

In [2]:
# TODO: Change DATA_DIR per your local filepath

DATA_DIR = '/home/mandar/Data/NCSU/TropeAnalysis/TropesDataset'

In [3]:
def get_genre_movie_list(genre: str) -> List[str]:
    """
    This function returns all the json filenames containing movie dialogs
    Args:
        genre (str): String containing genre name.
        
    Returns:
        List[str]: List of strings containing json filenames.
    """
    movie_genre_json_list = []
    movies_per_genre = os.listdir(join(DATA_DIR, 'ScreenPy', 'ParserOutput', genre))
    for movie in movies_per_genre:
        if movie.endswith('.json'):
            movie_genre_json_list.append(movie)
    return movie_genre_json_list

In [4]:
def load_json_movie_dialog_file(genre: str, movie_filename: str) -> List[List[Dict[str, str]]]:
    """
    Loads the json data contained in movie file:
    Args:
        genre (str): String containing genre name.
        movie_filename (str): String containing movie filename.
    
    Returns:
        List[List[Dict[str]]]: List of lists with each nested list containing a dictionary.
    """
    with open(join(DATA_DIR, 'ScreenPy', 'ParserOutput', genre, movie_filename), 'r') as f:
        movie_dialog_json = json.loads(f.read())
    return movie_dialog_json

In [27]:
def parse_movie_dialog_data(movie_json_data: List[List[Dict[str, str]]], 
                            verbose: bool = False):
    """
    This function parses the movie json data, and collects the following information,
        1. Unique characters with dialogs
        2. Number of dialogs per character
        3. Dialogs of all characters concatenated into a string
    Args:
        movie_json_data (List[List[Dict[str, str]]]): Json data containing movie character names and dialogs.
        verbose (bool): Boolean indicating whether raw dialogs should be printed.
        
    Returns:
        Dict[str, Any]: Dictionary with movie name as key and various nested dictionaries 
        containing data mentioned in function description.
    """
    movie_characters = set()
    movie_dialogs = list()
    dialogs_per_character = defaultdict(int)
    movie_info_dict = defaultdict()
    for scene_dialogs in movie_json_data:
        for dialog_info in scene_dialogs:
            if 'speaker/title' in dialog_info['head_text']:
                dialog_speaker = dialog_info['head_text']['speaker/title']
                if verbose:
                    print(f"Speaker: {dialog_speaker}")
                    print(dialog_info['text'])
                character = dialog_speaker.split('(')[0].strip()
                movie_characters = movie_characters.union([character])
                dialogs_per_character[character] += 1
                movie_dialogs.append(dialog_info['text'])

    movie_info_dict['characters'] = movie_characters
    movie_info_dict['actor_dialog_count'] = dialogs_per_character
    movie_info_dict['dialogs'] = ' '.join(movie_dialogs)
    return movie_info_dict

In [6]:
genre = 'Action'
genre_movie_json_list = get_genre_movie_list(genre)

In [7]:
genre_movie_json_list[:5]

['avatar.json',
 'dune.json',
 'blade.json',
 'machete.json',
 'startrekfirstcontact.json']

In [137]:
with open(join(DATA_DIR, 'films_tropes_20190501.json'), 'rb') as file:
    tvtropes_json_dict = json.load(file)

In [46]:
len(tvtropes_json_dict[action_movie_script_trope_df.Movie_trope[2]])

214

In [67]:
movie_list = [movie.split('.json')[0] for movie in genre_movie_json_list]

In [79]:
action_movie_script_trope_df = pd.read_csv(join(DATA_DIR, 'action_movie_script_trope_match.csv'))

In [80]:
# Find movies that have tropes
movie_match_df = action_movie_script_trope_df.loc[action_movie_script_trope_df.Movie_Script.isin(movie_list)].copy()
len(movie_match_df)

263

In [86]:
all_raw_movie_dialogs = defaultdict()
all_preprocess_movie_dialogs = defaultdict()
movie_trope_dict = defaultdict()

for movie_row in movie_match_df.iterrows():
    movie = movie_row[1].Movie_Script
    movie_filename = movie + '.json'
    movie_json_data = load_json_movie_dialog_file(genre, movie_filename)
    # Parse movie dialogs and preprocess text
    all_movie_dialogs[movie] = parse_movie_dialog_data(movie_json_data)
    all_preprocess_movie_dialogs[movie] = simple_preprocess(all_movie_dialogs[movie]['dialogs'])
    
    # Collect list of tropes for the movie
    movie_trope_dict[movie] = tvtropes_json_dict[movie_row[1].Movie_trope]

In [87]:
len(movie_trope_dict), len(all_preprocess_movie_dialogs)

(263, 263)

In [153]:
# Get list of all unique tropes
unique_tropes_set = list()
for tropes in movie_trope_dict.values():
    unique_tropes_set += list(set(tropes))

In [118]:
tropes_count_dict = Counter(unique_tropes_set)

In [216]:
threshold = 0.5
min_trope_count = 35
tropes_subset_list = list()
for trope, count in tropes_count_dict.items():
    if count > min_trope_count:
        tropes_subset_list.append(trope)

In [217]:
len(tropes_subset_list)

110

In [224]:
movie_tropes_subset_dict = defaultdict()
for movie, trope in movie_trope_dict.items():
    movie_tropes_subset_dict[movie] = list(set(tropes_subset_list).intersection(set(trope)))
    if len(movie_tropes_subset_dict[movie]) == 0:
        print(f'{movie} has no tropes in json file')

avengersthe2012 has no tropes in json file


## Train Doc2Vec model on Movie Scripts

In [31]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(all_preprocess_movie_dialogs.values())]

In [405]:
multi_output = MultiLabelBinarizer()
y = multi_output.fit_transform(list(movie_tropes_subset_dict.values()))

In [415]:
documents.pop(np.where(y.sum(axis=1)==0)[0][0])
y = y[y.sum(axis=1) > 0]
print(len(documents), len(y))

262 262


In [416]:
X_train_docs, X_test_docs, y_train, y_test = train_test_split(documents, y, 
                                                              train_size=0.75)

In [417]:
len(X_train_docs), len(X_test_docs), y_train.shape, y_test.shape

(196, 66, (196, 110), (66, 110))

In [None]:
vector_size = [10, 20, 50, 100, 200, 250, 300]
window_size = [2, 5, 7, 10]
epochs = [10, 20, 30]

for v in vector_size:
    for w in window_size:
        for e in epochs:
            model = Doc2Vec(X_train_docs, vector_size=v, window=w, min_count=5, 
                            epochs=e)
            X_train_doc_vectors = []
            X_test_doc_vectors = []

            for i in range(len(X_train_docs)):
                X_train_doc_vectors.append(model.docvecs[i])

            for i in range(len(X_test_docs)):
                X_test_doc_vectors.append(model.infer_vector(X_test_docs[i][0]))
            X_train_doc_vectors = pd.DataFrame(X_train_doc_vectors)
            X_test_doc_vectors = pd.DataFrame(X_test_doc_vectors)

            _fit = RandomForestClassifier(n_estimators=50)
            model = MultiOutputClassifier(estimator=_fit)
            model.fit(X_train_doc_vectors, y_train)
            
            y_hat = model.predict(X_test_doc_vectors)

            auc_class = np.zeros(len(y_test[0]))

            for i in range(len(y_test[0])):
                auc_class[i] = roc_auc_score(y_test[:, i],y_hat[:, i])
            
            print(f'Vector size: {v}, Window size: {w}, Epoch: {e}')
            print(f'Mean AUC: {np.mean(auc_class)}, Min AUC: {np.min(auc_class)}, Max AUC: {np.max(auc_class)}')

## Split data using 2-fold CV

In [444]:

print(model.score(X_train_doc_vectors, y_train))

0.9642857142857143


(0.49953174225380487, 0.45013979496738116, 0.5384615384615384)