In [1]:
from collections import defaultdict
import json
import logging
import os
from os.path import join
import pickle
from typing import Dict, List, Tuple

from fuzzywuzzy import fuzz, process
import numpy as np
import pandas as pd


# Notebook-wide logger that writes INFO-and-above records to the console.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Message layout: timestamp - level - message.
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

# Console handler, also at INFO level.
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)

# logging.getLogger returns the same object on every call, so re-running this
# cell would otherwise stack one more handler each time and print every
# message multiple times. Only attach a handler when none is present yet.
if not logger.handlers:
    logger.addHandler(ch)

In [197]:
# Root folder containing the 'TropesDataset' and 'ScreenPy' directories.
# Set the TROPE_DATA_DIR environment variable to run the notebook on another
# machine; the original location is kept as the default.
DATA_DIR = os.environ.get('TROPE_DATA_DIR', '/home/mandar/Mandar/Trope Analysis/')

## Load the TV Tropes JSON file

In [198]:
# Parse the TV Tropes film dump straight from the open binary handle.
tropes_json_path = join(DATA_DIR, 'TropesDataset', 'films_tropes_20190501.json')
with open(tropes_json_path, 'rb') as file:
    tvtropes_dict = json.load(file)

In [199]:
# Sanity check: number of films and the first five film keys.
len(tvtropes_dict), list(tvtropes_dict)[:5]

(11846,
 ['ABCsOfDeath2',
  'ABeautifulDayInTheNeighborhood',
  'ABeautifulMind',
  'ABetterTomorrow',
  'ABirdersGuideToEverything'])

In [200]:
# Map each lower-cased TV Tropes film key back to its original spelling so
# that script file names can be looked up case-insensitively.
tvtropes_movies_list = list(tvtropes_dict)
movie_name_dict = {title.lower(): title for title in tvtropes_movies_list}

## Read movie script filenames and match them with movies in TV Tropes

In [201]:
# Names of the sub-directories directly under the parser output folder
# (the later cells look up a genre name in this list).
parser_output_dir = join(DATA_DIR, 'ScreenPy/ParserOutput/')

# os.walk ignores scandir errors by default, so a missing or unreadable folder
# yields nothing and a bare next() would fail with an uninformative
# StopIteration. Raise an explicit error that names the path instead.
_top_level = next(os.walk(parser_output_dir), None)
if _top_level is None:
    raise FileNotFoundError(f'Parser output directory not found or unreadable: {parser_output_dir}')
subdirs = _top_level[1]

In [202]:
def fuzzy_string_match(movie_name:str, movie_list: List[str], partial_match=False):
    """Score every candidate in ``movie_list`` against ``movie_name``.

    Args:
        movie_name: The name to look up.
        movie_list: Candidate names to compare against.
        partial_match: When truthy, score with ``fuzz.partial_ratio``;
            otherwise score with ``fuzz.token_sort_ratio``.

    Returns:
        A list of ``(candidate, score)`` tuples ordered from highest to lowest
        score; candidates with equal scores keep their ``movie_list`` order.
    """
    scorer = fuzz.partial_ratio if partial_match else fuzz.token_sort_ratio
    scored_candidates = [(candidate, scorer(movie_name, candidate)) for candidate in movie_list]
    scored_candidates.sort(key=lambda pair: pair[1], reverse=True)
    return scored_candidates

## Fuzzy-match script movie names against TV Tropes movie names

In [203]:
# Match every parsed script of one genre against the TV Tropes film names.
genre = 'Action'
genre_idx = subdirs.index(genre)  # ValueError if the genre folder does not exist
logger.info(f'Genre: {subdirs[genre_idx]}')
movie_tropes_not_found = 0
movie_tropes_found = 0
# Script name -> exact match as a single (name, 100) tuple, or the five best
# candidates as a list of (name, score) tuples.
movie_match_score_1 = dict()  # candidates scored with partial_match=True
movie_match_score_2 = dict()  # candidates scored with the default scorer

# Loop-invariant values, hoisted so they are not rebuilt for every script file.
trope_movie_names = list(movie_name_dict.keys())
genre_dir = join(DATA_DIR, 'ScreenPy', 'ParserOutput', subdirs[genre_idx])

for file in os.listdir(genre_dir):
    if not file.endswith('.pkl'):
        continue
    # Strip only the trailing extension; split('.pkl')[0] would also cut a
    # name that happens to contain '.pkl' in the middle.
    movie_name = file[:-len('.pkl')]
    if movie_name in movie_name_dict:
        movie_match_score_1[movie_name] = (movie_name, 100)
        movie_match_score_2[movie_name] = (movie_name, 100)
    else:
        partial_match_score = fuzzy_string_match(movie_name, trope_movie_names, partial_match=True)
        token_match_score = fuzzy_string_match(movie_name, trope_movie_names)

        movie_match_score_1[movie_name] = partial_match_score[:5]
        movie_match_score_2[movie_name] = token_match_score[:5]

2021-01-11 19:29:00,143 - INFO - Genre: Action


In [205]:
def find_perfect_match_movies(movie_match_score: Dict[str, List[Tuple[str, int]]]) -> Dict[str, str]:
    """Pick the candidate that is a perfect match for each movie, if any.

    Each value of ``movie_match_score`` is either a single ``(name, score)``
    pair or a sequence of such pairs. A candidate is a perfect match when its
    name equals the movie name or, for candidate sequences, when both names
    consist of exactly the same characters (e.g. 'kingdomthe' and
    'thekingdom'). The first qualifying candidate wins.

    Args:
        movie_match_score: Mapping from movie name to its match candidate(s).

    Returns:
        Mapping from movie name to the matched candidate name. Movies without
        a perfect match are omitted.
    """
    perfect_match_movie_dict = dict()
    for movie, movie_match in movie_match_score.items():
        # A bare (name, score) pair has a string in first position. Checking
        # only len(...) == 2 would misread a two-element candidate list as a
        # single pair and silently drop it.
        is_single_pair = len(movie_match) == 2 and isinstance(movie_match[0], str)
        if is_single_pair:
            if movie == movie_match[0]:
                perfect_match_movie_dict[movie] = movie_match[0]
            continue

        for candidate in movie_match:
            candidate_name = candidate[0]
            # Same characters in a different order covers moved articles.
            if movie == candidate_name or sorted(movie) == sorted(candidate_name):
                perfect_match_movie_dict[movie] = candidate_name
                break
    return perfect_match_movie_dict

In [206]:
# Perfect matches among the partial-match candidates (_1) and among the
# default-scorer candidates (_2).
perfect_match_movie_dict_1 = find_perfect_match_movies(movie_match_score_1)
perfect_match_movie_dict_2 = find_perfect_match_movies(movie_match_score_2)

In [207]:
# Compare how many perfect matches each candidate set produced.
len(perfect_match_movie_dict_1), len(perfect_match_movie_dict_2)

(192, 195)

In [208]:
# l1: script-side movie names; l2: the matching TV Tropes titles in their
# original capitalisation, in the same order.
l1 = list(perfect_match_movie_dict_2)
l2 = [movie_name_dict[matched_name] for matched_name in perfect_match_movie_dict_2.values()]

In [209]:
# Two-column table pairing each script name with its TV Tropes title.
movie_match_df = pd.DataFrame({'Movie_Script': l1, 'Movie_trope': l2})

In [210]:
# Preview the first rows of the script-to-trope title mapping.
movie_match_df.head()

Unnamed: 0,Movie_Script,Movie_trope
0,ninjaassassin,NinjaAssassin
1,landofthedead,LandOfTheDead
2,meninblack3,MenInBlack3
3,perfectcreature,PerfectCreature
4,ghostandthedarknessthe,TheGhostAndTheDarkness


In [211]:
# Persist the mapping. The relative file name resolves against the current
# working directory, and pandas writes the row index as an unnamed first
# column by default.
movie_match_df.to_csv('action_movie_script_trope_match_2.csv')

In [212]:
# List the scripts that have no perfect match, together with their candidates,
# for manual inspection.
for movie, movie_match in movie_match_score_2.items():
    if movie in perfect_match_movie_dict_2:
        continue
    print(movie, movie_match)

batman [('antman', 83), ('bataan', 83), ('thebatman', 80), ('batman1989', 75), ('blankman', 71)]
godzilla [('godzilla1954', 80), ('godzilla1998', 80), ('godzilla2000', 80), ('godzilla2014', 80), ('shingodzilla', 80)]
someonetowatchoverme [('iwantsomeonetoeatcheesewith', 64), ('somewhere', 62), ('sweetwater', 60), ('onetwothree', 58), ('onetouchofvenus', 57)]
escapefroml.a. [('escapefromla', 88), ('escaperoom', 78), ('escapefromalcatraz', 71), ('escapefromnewyork', 67), ('escapefromsobibor', 67)]
kingdomthe [('vikingdom', 74), ('kingdomofheaven', 72), ('kingofthieves', 70), ('kingdomofthespiders', 69), ('kingofhearts', 64)]
rush [('crush', 89), ('hush', 75), ('push', 75), ('crash', 67), ('cyrus', 67)]
heavymetal [('whenharrymetsally', 67), ('turbulence3heavymetal', 65), ('heavyweights', 64), ('heavytrip', 63), ('navyseals', 63)]
megamind [('homeagain', 71), ('mermaid', 67), ('meganismissing', 64), ('masterminds', 63), ('theomegaman', 63)]
maskthe [('askfather', 75), ('mash', 73), ('mask

In [242]:
# Spot check: show every TV Tropes title whose lower-cased key contains 'rushhour'.
for movie, trope_movie in movie_name_dict.items():
    if 'rushhour' not in movie:
        continue
    print(movie, trope_movie)

rushhour RushHour


In [244]:
# Spot check: size of the entry stored under this film key
# (presumably its list of tropes — verify against the dataset).
len(tvtropes_dict['TheDarkKnightRises'])

814

In [245]:
def save_movie_dialog_files(source_root=os.path.join('ScreenPy', 'ParserOutput'),
                            target_root='Movie_dialogs'):
    """Re-pickle every ``.pkl`` file of every genre folder into ``target_root``.

    For each sub-directory of ``source_root`` a directory of the same name is
    created under ``target_root``; every ``.pkl`` file inside is unpickled and
    dumped again under the same file name.

    NOTE(review): the loaded object is written back unchanged; no dialog
    extraction happens in this function despite its name — confirm intent.

    Args:
        source_root: Directory holding one sub-directory per genre. Defaults
            to the relative path 'ScreenPy/ParserOutput', resolved against the
            current working directory.
        target_root: Directory that receives the per-genre output folders.
            Defaults to the relative path 'Movie_dialogs'.
    """
    for genre in os.listdir(source_root):
        genre_dir = os.path.join(source_root, genre)
        # Skip stray files in the source root: listing them would raise
        # NotADirectoryError after an output folder was already created.
        if not os.path.isdir(genre_dir):
            continue

        output_dir = os.path.join(target_root, genre)
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(output_dir, exist_ok=True)

        for file_name in os.listdir(genre_dir):
            if not file_name.endswith('.pkl'):
                continue
            # SECURITY: pickle.load executes arbitrary code embedded in the
            # file; only run this on parser output you produced yourself.
            with open(os.path.join(genre_dir, file_name), 'rb') as f:
                movie_dialog = pickle.load(f)
            with open(os.path.join(output_dir, file_name), 'wb') as f:
                pickle.dump(movie_dialog, f)