In [63]:
from collections import defaultdict
import json
import logging
import os
from os.path import join
import pickle
from typing import Dict, List, Tuple

from fuzzywuzzy import fuzz, process
import numpy as np
import pandas as pd


logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Attach the console handler only once. Re-running this cell in a live kernel
# would otherwise stack another handler on the same logger and every message
# would be printed once per run of the cell.
if not logger.handlers:
    # console handler that emits INFO and above
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)

    # timestamp - level - message
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)

    logger.addHandler(ch)

In [2]:
# Root folder holding tvtropes.json and the ScreenPy parser output.
# Set the TROPE_DATA_DIR environment variable to run the notebook on another
# machine; the original location is kept as the default.
DATA_DIR = os.environ.get('TROPE_DATA_DIR', '/home/mandar/Mandar/Trope Analysis/')

## Load the TV Tropes JSON file

In [3]:
logger.info('Loading tvtropes json file')
# Parse directly from the open binary handle instead of reading the bytes first.
tvtropes_json_path = join(DATA_DIR, 'tvtropes.json')
with open(tvtropes_json_path, 'rb') as file:
    tvtropes_dict = json.load(file)

2021-01-03 15:10:27,574 - INFO - Loading tvtropes json file


In [30]:
# Total number of movies that have tropes, plus the first five movie names
len(tvtropes_dict), list(tvtropes_dict)[:5]

(13609,
 ['ABBATheMovie',
  'ABCsOfDeath2',
  'ABNKKBSNPLAko',
  'ABattleOfWits',
  'ABeautifulDayInTheNeighborhood'])

In [5]:
# Map each lower-cased TV Tropes movie name back to its original spelling.
# If two names differ only by case, the later one in the list wins.
tvtropes_movies_list = list(tvtropes_dict.keys())
movie_name_dict = {title.lower(): title for title in tvtropes_movies_list}

## Read movie script filenames and match them with movies in tvtropes

In [6]:
# Names of the directories directly under the parser output folder
# (the first os.walk step yields (dirpath, dirnames, filenames) for the top level).
parser_output_root = join(DATA_DIR, 'ScreenPy/ParserOutput/')
subdirs = next(os.walk(parser_output_root))[1]

In [7]:
def fuzzy_string_match(movie_name: str, movie_list: List[str], partial_match=False):
    """Score every candidate in ``movie_list`` against ``movie_name``.

    Args:
        movie_name: The name to look up.
        movie_list: Candidate names to compare against.
        partial_match: When true, score with ``fuzz.partial_ratio``;
            otherwise score with ``fuzz.token_sort_ratio``.

    Returns:
        A list of ``(candidate, score)`` tuples ordered from highest to
        lowest score. Candidates with equal scores keep their order from
        ``movie_list``.
    """
    def _score(candidate):
        if partial_match:
            return fuzz.partial_ratio(movie_name, candidate)
        return fuzz.token_sort_ratio(movie_name, candidate)

    scored = [(candidate, _score(candidate)) for candidate in movie_list]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored

## Fuzzy-match the movie names that have scripts against the movie names that have tropes

In [8]:

genre = 'Action'
genre_idx = subdirs.index(genre)  # raises ValueError when the genre folder is absent
logger.info(f'Genre: {subdirs[genre_idx]}')
movie_tropes_not_found = 0
movie_tropes_found = 0

# Later cells read these dicts as `movies_match_score_1` / `movies_match_score_2`,
# so they are created under exactly those names here. With the previous
# `movie_match_score_*` spelling a fresh-kernel run failed with NameError.
#   *_1: candidates scored with partial matching
#   *_2: candidates scored with token-sort matching
# Each value is a single (name, 100) tuple for an exact name hit, otherwise a
# list of the five best (name, score) candidates.
movies_match_score_1 = dict()
movies_match_score_2 = dict()

# The candidate names are the same for every script, so build the list once
# rather than twice per file.
tvtropes_names = list(movie_name_dict.keys())

for file in os.listdir(join(DATA_DIR, 'ScreenPy', 'ParserOutput', subdirs[genre_idx])):
    if file.endswith('.pkl'):
        movie_name = file.split('.pkl')[0]
        if movie_name in movie_name_dict:
            movies_match_score_1[movie_name] = (movie_name, 100)
            movies_match_score_2[movie_name] = (movie_name, 100)
        else:
            partial_match_score = fuzzy_string_match(movie_name, tvtropes_names, partial_match=True)
            token_match_score = fuzzy_string_match(movie_name, tvtropes_names)

            movies_match_score_1[movie_name] = partial_match_score[:5]
            movies_match_score_2[movie_name] = token_match_score[:5]

2021-01-03 15:14:32,191 - INFO - Genre: Action


In [24]:
# How many script names have an entry in the score dict
len(movies_match_score_1)

290

In [27]:
def find_perfect_match_movies(movie_match_score: Dict[str, List[Tuple[str, int]]]):
    """Select the movies whose name has an exact counterpart among its candidates.

    Args:
        movie_match_score: Maps a movie name to its match result. A result is
            either one lone ``(name, score)`` pair or a sequence of
            ``(name, score)`` candidate pairs.

    Returns:
        Dict mapping each matched movie name to the matching candidate name.
        A lone pair matches only when its name equals the movie name. In a
        candidate sequence, the first candidate that equals the movie name or
        consists of exactly the same characters in another order is taken.
    """
    perfect_match_movie_dict = dict()
    for movie, movie_match in movie_match_score.items():
        # A lone (name, score) pair carries a string in slot 0, while a
        # candidate sequence carries a pair there. Looking at len() == 2 alone
        # mistook a two-candidate list for a lone pair and never matched it.
        if len(movie_match) == 2 and isinstance(movie_match[0], str):
            if movie == movie_match[0]:
                perfect_match_movie_dict[movie] = movie_match[0]
            continue

        for candidate in movie_match:
            candidate_name = candidate[0]
            # The sorted-characters comparison accepts reordered titles
            # (any rearrangement of the same characters).
            if movie == candidate_name or sorted(movie) == sorted(candidate_name):
                perfect_match_movie_dict[movie] = candidate_name
                break
    return perfect_match_movie_dict

In [28]:
# Run the perfect-match filter over both score dicts, in order.
perfect_match_movie_dict_1, perfect_match_movie_dict_2 = map(
    find_perfect_match_movies,
    (movies_match_score_1, movies_match_score_2),
)

In [29]:
# Number of perfectly matched movies found in each score dict
tuple(map(len, (perfect_match_movie_dict_1, perfect_match_movie_dict_2)))

(192, 192)

In [13]:
# Re-save every parsed script pickle under Movie_dialogs/<genre>/.
# Paths are anchored at DATA_DIR, like the other cells, so the result does not
# depend on the notebook's current working directory.
parser_output_dir = join(DATA_DIR, 'ScreenPy', 'ParserOutput')
movie_dialogs_dir = join(DATA_DIR, 'Movie_dialogs')

# `genre_name` (not `genre`) so the genre chosen for matching is not overwritten.
for genre_name in os.listdir(parser_output_dir):
    genre_src_dir = join(parser_output_dir, genre_name)
    if not os.path.isdir(genre_src_dir):
        # stray files next to the genre folders would make os.listdir fail
        continue

    genre_dst_dir = join(movie_dialogs_dir, genre_name)
    os.makedirs(genre_dst_dir, exist_ok=True)

    for pkl_name in os.listdir(genre_src_dir):
        if not pkl_name.endswith('.pkl'):
            continue
        # NOTE: pickle.load can execute arbitrary code contained in the file;
        # only run this on parser output you produced or trust.
        with open(join(genre_src_dir, pkl_name), 'rb') as f:
            movie_dialog = pickle.load(f)
        with open(join(genre_dst_dir, pkl_name), 'wb') as f:
            pickle.dump(movie_dialog, f)

In [12]:
# Spot-check the stored match result for a single title
movies_match_score_1['batman']

[('batman1989', 100),
 ('batmanandrobin', 100),
 ('batmanandrobinserial', 100),
 ('batmanbegins', 100),
 ('batmanforever', 100)]

## Look at movie names that do not match perfectly

In [52]:
count = 0
# Show the stored candidates for every movie that had no perfect match.
for movie, movie_name_match in movies_match_score_2.items():
    if movie in perfect_match_movie_dict_2:
        continue
    print(movie, movie_name_match)

batman [('antman', 83), ('bataan', 83), ('obaltan', 77), ('batman1989', 75), ('baran', 73)]
godzilla [('godzilla1954', 80), ('godzilla1998', 80), ('godzilla2000', 80), ('godzilla2014', 80), ('shingodzilla', 80)]
someonetowatchoverme [('iwantsomeonetoeatcheesewith', 64), ('somewhere', 62), ('sweetwater', 60), ('seventyoneintothefire', 59), ('onetwothree', 58)]
escapefroml.a. [('escapefromla', 88), ('escaperoom', 78), ('escapefromalcatraz', 71), ('escapefromnewyork', 67), ('escapefromsobibor', 67)]
chaos [('chains', 73), ('cats', 67), ('chariot', 67), ('chicago', 67), ('cops', 67)]
kingdomthe [('vikingdom', 74), ('kingdomofheaven', 72), ('kingofthieves', 70), ('kingdomofthespiders', 69), ('kingofhearts', 64)]
rush [('crush', 89), ('hush', 75), ('push', 75), ('crash', 67), ('cyrus', 67)]
heavymetal [('whenharrymetsally', 67), ('turbulence3heavymetal', 65), ('heavyweights', 64), ('heavytrip', 63), ('navyseals', 63)]
megamind [('homeagain', 71), ('mermaid', 67), ('meganismissing', 64), ('ma

In [61]:
# List every lower-cased TV Tropes movie name containing 'rushhour'.
for movie in filter(lambda name: 'rushhour' in name, movie_name_dict):
    print(movie)

rushhour


In [50]:
# Compare both scorers on one pair of differently ordered spellings of a title.
script_title = 'lordoftheringsfellowshipoftheringthe'
tvtropes_title = 'thelordoftheringsthefellowshipofthering'
print(fuzz.partial_ratio(script_title, tvtropes_title))
print(fuzz.token_sort_ratio(script_title, tvtropes_title))

92
88


In [66]:
# Tally how often each trope occurs across all movies' trope lists.
trope_count_dict = defaultdict(int)
for tvtrope_list in tvtropes_dict.values():
    for trope in tvtrope_list:
        trope_count_dict[trope] = trope_count_dict[trope] + 1

In [69]:
# Largest occurrence count reached by any single trope
max(trope_count_dict.values())

2069