In [1]:
from collections import Counter, defaultdict
import glob
import json
import os
from os.path import join
import re
from typing import Dict, List

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC

from xgboost import XGBClassifier

In [2]:
# TODO: Change DATA_DIR per your local filepath

DATA_DIR = '/home/mandar/Data/NCSU/TropeAnalysis/TropesDataset'

In [3]:
def get_genre_movie_list(genre: str) -> List[str]:
    """
    This function returns all the json filenames containing movie dialogs
    Args:
        genre (str): String containing genre name.
        
    Returns:
        List[str]: List of strings containing json filenames.
    """
    movie_genre_json_list = []
    movies_per_genre = os.listdir(join(DATA_DIR, 'ScreenPy', 'ParserOutput', genre))
    for movie in movies_per_genre:
        if movie.endswith('.json'):
            movie_genre_json_list.append(movie)
    return movie_genre_json_list

In [4]:
def load_json_movie_dialog_file(genre: str, movie_filename: str) -> List[List[Dict[str, str]]]:
    """
    Loads the json data contained in movie file:
    Args:
        genre (str): String containing genre name.
        movie_filename (str): String containing movie filename.
    
    Returns:
        List[List[Dict[str]]]: List of lists with each nested list containing a dictionary.
    """
    with open(join(DATA_DIR, 'ScreenPy', 'ParserOutput', genre, movie_filename), 'r') as f:
        movie_dialog_json = json.loads(f.read())
    return movie_dialog_json

In [5]:
def parse_movie_dialog_data(movie_json_data: List[List[Dict[str, str]]], 
                            verbose: bool = False):
    """
    This function parses the movie json data, and collects the following information,
        1. Unique characters with dialogs
        2. Number of dialogs per character
        3. Dialogs of all characters concatenated into a string
    Args:
        movie_json_data (List[List[Dict[str, str]]]): Json data containing movie character names and dialogs.
        verbose (bool): Boolean indicating whether raw dialogs should be printed.
        
    Returns:
        Dict[str, Any]: Dictionary with movie name as key and various nested dictionaries 
        containing data mentioned in function description.
    """
    movie_characters = set()
    movie_dialogs = list()
    dialogs_per_character = defaultdict(int)
    movie_info_dict = defaultdict()
    for scene_dialogs in movie_json_data:
        for dialog_info in scene_dialogs:
            if 'speaker/title' in dialog_info['head_text']:
                dialog_speaker = dialog_info['head_text']['speaker/title']
                if verbose:
                    print(f"Speaker: {dialog_speaker}")
                    print(dialog_info['text'])
                character = dialog_speaker.split('(')[0].strip()
                movie_characters = movie_characters.union([character])
                dialogs_per_character[character] += 1
                movie_dialogs.append(dialog_info['text'])

    movie_info_dict['characters'] = movie_characters
    movie_info_dict['actor_dialog_count'] = dialogs_per_character
    movie_info_dict['dialogs'] = ' '.join(movie_dialogs)
    return movie_info_dict

In [6]:
# Read all movie script json files for Action genre
genre = 'Action'
genre_movie_json_list = get_genre_movie_list(genre)

In [7]:
genre_movie_json_list[:5]

['avatar.json',
 'dune.json',
 'blade.json',
 'machete.json',
 'startrekfirstcontact.json']

In [8]:
# Read json file contianing movie tropes
with open(join(DATA_DIR, 'films_tropes_20190501.json'), 'rb') as file:
    tvtropes_json_dict = json.load(file)

In [9]:
# Remove .json file extension from movie filenames
movie_list = [movie.split('.json')[0] for movie in genre_movie_json_list]

In [10]:
# Read csv file mapping movie names containing script data with their tropes
action_movie_script_trope_df = pd.read_csv(join(DATA_DIR, 'action_movie_script_trope_match.csv'))

In [11]:
# Find movies that have tropes
movie_match_df = action_movie_script_trope_df.loc[action_movie_script_trope_df.Movie_Script.isin(movie_list)].copy()
len(movie_match_df)

263

In [12]:
# Parse json files with movie scripts and store them in a dict with movie name as key and script as value
# Additionally, preprocess the movie script text by converting them to lowercase 
all_raw_movie_dialogs = defaultdict()
all_preprocess_movie_dialogs = defaultdict()
movie_trope_dict = defaultdict()

for movie_row in movie_match_df.iterrows():
    movie = movie_row[1].Movie_Script
    movie_filename = movie + '.json'
    movie_json_data = load_json_movie_dialog_file(genre, movie_filename)
    # Parse movie dialogs and preprocess text
    all_raw_movie_dialogs[movie] = parse_movie_dialog_data(movie_json_data)
    all_preprocess_movie_dialogs[movie] = simple_preprocess(all_raw_movie_dialogs[movie]['dialogs'])
    
    # Collect list of tropes for the movie
    movie_trope_dict[movie] = tvtropes_json_dict[movie_row[1].Movie_trope]

In [13]:
len(movie_trope_dict), len(all_preprocess_movie_dialogs)

(263, 263)

In [14]:
# Get list of all unique tropes per movie
unique_tropes_set = list()
for tropes in movie_trope_dict.values():
    unique_tropes_set += list(set(tropes))

In [15]:
# Get movie count per trope
tropes_count_dict = Counter(unique_tropes_set)

In [16]:
# Select tropes which appear in at least min_trope_count movies
threshold = 0.5
min_movie_per_trope_count = 100
tropes_subset_list = list()
for trope, count in tropes_count_dict.items():
    if count > min_movie_per_trope_count:
        tropes_subset_list.append(trope)

In [17]:
print(f'Number of tropes present in at least {min_movie_per_trope_count} movies: {len(tropes_subset_list)}')

Number of tropes present in at least 100 movies: 5


In [18]:
# For each movie filter out tropes which appear in less than min_trope_count movies
movie_tropes_subset_dict = defaultdict()
for movie, trope in movie_trope_dict.items():
    movie_tropes_subset_dict[movie] = list(set(tropes_subset_list).intersection(set(trope)))
    if len(movie_tropes_subset_dict[movie]) == 0:
        print(f'{movie} has no tropes in json file')

landofthedead has no tropes in json file
tristanandisolde has no tropes in json file
programthe has no tropes in json file
next has no tropes in json file
getawaythe has no tropes in json file
newyorkminute has no tropes in json file
entrapment has no tropes in json file
warrior has no tropes in json file
crouchingtigerhiddendragon has no tropes in json file
supergirl has no tropes in json file
cradle2thegrave has no tropes in json file
mariachiel has no tropes in json file
deepcover has no tropes in json file
surrogates has no tropes in json file
hostage has no tropes in json file
stuntmanthe has no tropes in json file
ticker has no tropes in json file
avengersthe2012 has no tropes in json file
bountyhunterthe has no tropes in json file
battlelosangeles has no tropes in json file
defiance has no tropes in json file
hardtokill has no tropes in json file
kingkong has no tropes in json file
dune has no tropes in json file
rambofirstbloodiithemission has no tropes in json file
g.i.jane ha

## Train Doc2Vec model on Movie Scripts

In [19]:
# Convert documents to TaggedDocument to train doc2vec models
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(all_preprocess_movie_dialogs.values())]
len(documents)

263

In [20]:
multi_output = MultiLabelBinarizer()
y = multi_output.fit_transform(list(movie_tropes_subset_dict.values()))

In [21]:
np.where(y.sum(axis=1)==0)[0]

array([  1,   8,   9,  10,  33,  38,  47,  49,  53,  75,  78,  81,  82,
       101, 109, 123, 146, 149, 150, 155, 165, 176, 191, 200, 204, 231,
       240, 244, 251, 258])

## Remove documents with no tropes

In [22]:
new_documents = []
documents_with_no_tropes = np.where(y.sum(axis=1)==0)[0]
for idx, document in enumerate(documents):
    if idx not in documents_with_no_tropes:
        new_documents.append(document)
        
y = y[y.sum(axis=1) > 0]
print(f'Number of movies left: {len(new_documents)}')

Number of movies left: 233


## Split new set of documents into train and test set

In [23]:
X_train_docs, X_test_docs, y_train, y_test = train_test_split(new_documents, y, 
                                                              train_size=0.75)

In [24]:
len(X_train_docs), len(X_test_docs), y_train.shape, y_test.shape

(174, 59, (174, 5), (59, 5))

## Train TF-IDF vectors and train xgboost classifier

In [25]:
def train_eval_classifiers(X_train, X_test, y_train, y_test, classifier = 'xgb') -> np.array:
    if classifier == 'xgb':
        _fit = XGBClassifier(eval_metric='logloss', n_estimators=100)
    elif classifier == 'rf':
        _fit = RandomForestClassifier(n_estimators=50)
    mutli_out_classifier = MultiOutputClassifier(estimator=_fit)
    mutli_out_classifier.fit(X_train, y_train)

    y_hat = mutli_out_classifier.predict(X_test)

    auc_class = np.zeros(len(y_test[0]))

    for i in range(len(y_test[0])):
        auc_class[i] = roc_auc_score(y_test[:, i],y_hat[:, i])

    return auc_class

In [26]:
X_tfidf_train_docs = [' '.join(x[0]) for x in X_train_docs]
X_tfidf_test_docs = [' '.join(x[0]) for x in X_test_docs]

In [27]:
max_features = [10, 50, 100, 500, 1000, 1500, 2000, 5000, 10000, 20000, 25000]
for feature in max_features:
    tfidf_fit = TfidfVectorizer(max_features=feature, stop_words='english').fit(raw_documents=X_tfidf_train_docs)
    X_tfidf_train_vec = tfidf_fit.transform(X_tfidf_train_docs)
    X_tfidf_test_vec = tfidf_fit.transform(X_tfidf_test_docs)
    print(X_tfidf_train_vec.shape, X_tfidf_test_vec.shape, len(y_train[0]))
    rf_auc_class = train_eval_classifiers(X_tfidf_train_vec, X_tfidf_test_vec, y_train, y_test, classifier='rf')
    xgb_auc_class = train_eval_classifiers(X_tfidf_train_vec, X_tfidf_test_vec, y_train, y_test)
    print(f'Random Forest with vocabulary size: {feature}')
    print(f"Mean AUC: {round(np.mean(rf_auc_class), 2)}, Median AUC: {round(np.median(rf_auc_class), 2)}, Min AUC: {round(np.min(rf_auc_class), 2)}, Max AUC: {round(np.max(rf_auc_class), 2)}")
    print(f'XGB with vocabulary size: {feature}')
    print(f"Mean AUC: {round(np.mean(xgb_auc_class), 2)}, Median AUC: {round(np.median(xgb_auc_class), 2)}, Min AUC: {round(np.min(xgb_auc_class), 2)}, Max AUC: {round(np.max(xgb_auc_class), 2)}")

(174, 10) (59, 10) 5




Random Forest with vocabulary size: 10
Mean AUC: 0.53, Median AUC: 0.53, Min AUC: 0.43, Max AUC: 0.65
XGB with vocabulary size: 10
Mean AUC: 0.54, Median AUC: 0.53, Min AUC: 0.44, Max AUC: 0.63
(174, 50) (59, 50) 5




Random Forest with vocabulary size: 50
Mean AUC: 0.53, Median AUC: 0.53, Min AUC: 0.49, Max AUC: 0.59
XGB with vocabulary size: 50
Mean AUC: 0.53, Median AUC: 0.55, Min AUC: 0.47, Max AUC: 0.58
(174, 100) (59, 100) 5




Random Forest with vocabulary size: 100
Mean AUC: 0.52, Median AUC: 0.51, Min AUC: 0.5, Max AUC: 0.58
XGB with vocabulary size: 100
Mean AUC: 0.52, Median AUC: 0.53, Min AUC: 0.47, Max AUC: 0.57
(174, 500) (59, 500) 5




Random Forest with vocabulary size: 500
Mean AUC: 0.52, Median AUC: 0.53, Min AUC: 0.45, Max AUC: 0.57
XGB with vocabulary size: 500
Mean AUC: 0.55, Median AUC: 0.55, Min AUC: 0.5, Max AUC: 0.6
(174, 1000) (59, 1000) 5




Random Forest with vocabulary size: 1000
Mean AUC: 0.52, Median AUC: 0.52, Min AUC: 0.45, Max AUC: 0.62
XGB with vocabulary size: 1000
Mean AUC: 0.55, Median AUC: 0.55, Min AUC: 0.52, Max AUC: 0.57
(174, 1500) (59, 1500) 5




Random Forest with vocabulary size: 1500
Mean AUC: 0.54, Median AUC: 0.56, Min AUC: 0.47, Max AUC: 0.57
XGB with vocabulary size: 1500
Mean AUC: 0.55, Median AUC: 0.51, Min AUC: 0.45, Max AUC: 0.73
(174, 2000) (59, 2000) 5




Random Forest with vocabulary size: 2000
Mean AUC: 0.52, Median AUC: 0.5, Min AUC: 0.47, Max AUC: 0.63
XGB with vocabulary size: 2000
Mean AUC: 0.57, Median AUC: 0.55, Min AUC: 0.5, Max AUC: 0.69
(174, 5000) (59, 5000) 5




Random Forest with vocabulary size: 5000
Mean AUC: 0.5, Median AUC: 0.47, Min AUC: 0.43, Max AUC: 0.56
XGB with vocabulary size: 5000
Mean AUC: 0.53, Median AUC: 0.53, Min AUC: 0.44, Max AUC: 0.62
(174, 10000) (59, 10000) 5




Random Forest with vocabulary size: 10000
Mean AUC: 0.51, Median AUC: 0.5, Min AUC: 0.49, Max AUC: 0.55
XGB with vocabulary size: 10000
Mean AUC: 0.56, Median AUC: 0.55, Min AUC: 0.52, Max AUC: 0.59
(174, 20000) (59, 20000) 5




Random Forest with vocabulary size: 20000
Mean AUC: 0.52, Median AUC: 0.53, Min AUC: 0.48, Max AUC: 0.57
XGB with vocabulary size: 20000
Mean AUC: 0.57, Median AUC: 0.56, Min AUC: 0.54, Max AUC: 0.62
(174, 25000) (59, 25000) 5




Random Forest with vocabulary size: 25000
Mean AUC: 0.54, Median AUC: 0.57, Min AUC: 0.48, Max AUC: 0.58
XGB with vocabulary size: 25000
Mean AUC: 0.58, Median AUC: 0.58, Min AUC: 0.54, Max AUC: 0.61


In [None]:

# xgb_auc_class = train_eval_classifiers(X_tfidf_train_vec, X_tfidf_test_vec, y_train, 
#                                        y_test, classifier='xgb')

# print('XGBoost')
# print(f"Mean AUC: {round(np.mean(xgb_auc_class), 2)}, Median AUC: {round(np.median(xgb_auc_class), 2)}, Min AUC: {round(np.min(xgb_auc_class), 2)}, Max AUC: {round(np.max(xgb_auc_class), 2)}")

## Train doc2vec vectors and train xgboost classifier

In [None]:
vector_size = [10, 20, 50, 100, 200]
window_size = [5]
epochs = [10]

for v in vector_size:
    for w in window_size:
        for e in epochs:
            dm_model = Doc2Vec(X_train_docs, vector_size=v, window=w, epochs=e, 
                            min_count=5, dm=1)
            dbow_model = Doc2Vec(X_train_docs, vector_size=v, window=w, epochs=e, 
                            min_count=5, dm=0)
            
            X_train_dm_dv = []
            X_train_dbow_dv = []
            
            X_test_dm_dv = []
            X_test_dbow_dv = []
            
            for i in range(len(X_train_docs)):
                X_train_dm_dv.append(dm_model.docvecs[i])
                X_train_dbow_dv.append(dbow_model.docvecs[i])

            for i in range(len(X_test_docs)):
                X_test_dm_dv.append(dm_model.infer_vector(X_test_docs[i][0]))
                X_test_dbow_dv.append(dbow_model.infer_vector(X_test_docs[i][0]))
                
            X_train_dv = pd.concat([pd.DataFrame(X_train_dm_dv),
                                            pd.DataFrame(X_train_dbow_dv)], axis=1)
            X_test_dv = pd.concat([pd.DataFrame(X_test_dm_dv),
                                             pd.DataFrame(X_test_dbow_dv)], axis=1)

#             _fit = RandomForestClassifier(n_estimators=50)
            auc_class = train_eval_classifiers(X_train_dv, X_test_dv, y_train, y_test)
            print(f'Vector size: {v}, Window size: {w}, Epoch: {e}')
            print(f"Mean AUC: {round(np.mean(auc_class), 2)}, Median AUC: {round(np.median(auc_class), 2)}, Min AUC: {round(np.min(auc_class), 2)}, Max AUC: {round(np.max(auc_class), 2)}")

In [None]:
pd.concat([pd.DataFrame([[1,2,3]]), pd.DataFrame([[4,5,6]])], axis=1)

In [None]:
dm_model.docvecs[0]