In [1]:
import json
import logging
import os
import pickle
from typing import List

from fuzzywuzzy import fuzz, process
import numpy as np
import pandas as pd


# Notebook-wide logger that prints timestamped INFO-and-above messages to the console.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# logging.getLogger returns the same object for the same name, so re-running
# this cell would otherwise stack one more StreamHandler per run and every
# message would be emitted several times. Attach the handler only once.
if not logger.handlers:
    # create console handler that emits INFO and above
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)

    # create formatter
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

    # add formatter to ch
    ch.setFormatter(formatter)

    # add ch to logger
    logger.addHandler(ch)

### Load TV Tropes JSON file

In [2]:
# Parse the TV Tropes dump straight from the binary file handle; the json
# module decodes the bytes itself.
logger.info('Loading tvtropes json file')
with open('tvtropes.json', 'rb') as file:
    tvtropes_dict = json.load(file)

2020-12-27 23:23:01,284 - INFO - Loading tvtropes json file


In [3]:
# Quick sanity check: number of entries and the first five keys.
len(tvtropes_dict), list(tvtropes_dict)[:5]

(13609,
 ['ABBATheMovie',
  'ABCsOfDeath2',
  'ABNKKBSNPLAko',
  'ABattleOfWits',
  'ABeautifulDayInTheNeighborhood'])

In [5]:
# Map each lower-cased TV Tropes key back to its original spelling so that
# lookups can be done case-insensitively. If two keys differ only by case,
# the one that appears later wins.
tvtropes_movies_list = list(tvtropes_dict.keys())
movie_name_dict = {title.lower(): title for title in tvtropes_movies_list}

### Load movie scripts

In [6]:
# os.walk yields (dirpath, dirnames, filenames) triples top-down; the first
# triple describes the parser-output root itself, so its dirnames entry is
# the list of immediate subdirectories.
parser_output_root = next(os.walk('ScreenPy/ParserOutput/'))
subdirs = parser_output_root[1]

In [9]:
def fuzzy_string_match(movie_name:str, movie_list: List[str], partial_match=False):
    """Score every candidate in ``movie_list`` against ``movie_name`` and rank them.

    Args:
        movie_name: The title to look up.
        movie_list: Candidate titles to compare against.
        partial_match: When true, score with ``fuzz.partial_ratio``;
            otherwise score with ``fuzz.token_sort_ratio``.

    Returns:
        A list of ``(candidate, score)`` tuples ordered by descending score.
        Candidates with equal scores keep their relative order from
        ``movie_list``.
    """
    scorer = fuzz.partial_ratio if partial_match else fuzz.token_sort_ratio
    ranked = [(candidate, scorer(movie_name, candidate)) for candidate in movie_list]
    # list.sort is stable, also with reverse=True, so ties stay in input order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [10]:
# Match every parsed script of one genre against the TV Tropes titles.
#
# Results (keyed by script name):
#   movies_match_score_1 -> partial-ratio candidates
#   movies_match_score_2 -> token-sort-ratio candidates
# An exact hit is stored as a single (name, 100) tuple; otherwise the value is
# a list with the five best (candidate, score) tuples.
genre = 'Action'
# .index() also validates the genre: it raises ValueError when no such
# subdirectory was found.
genre_idx = subdirs.index(genre)
logger.info(f'Genre: {subdirs[genre_idx]}')
movie_tropes_not_found = 0  # scripts without an exact TV Tropes key
movie_tropes_found = 0      # scripts whose name is an exact TV Tropes key
movies_match_score_1 = dict()
movies_match_score_2 = dict()

# Loop-invariant: build the candidate list once instead of twice per file.
tvtropes_keys = list(movie_name_dict.keys())

for file in os.listdir(os.path.join('ScreenPy/ParserOutput/', subdirs[genre_idx])):
    if not file.endswith('.pkl'):
        continue

    # Strip only the trailing extension; splitting on '.pkl' would cut the
    # name at the first occurrence of '.pkl' anywhere in the file name.
    movie_name = file[:-len('.pkl')]

    if movie_name in movie_name_dict:
        movie_tropes_found += 1
        movies_match_score_1[movie_name] = (movie_name, 100)
        movies_match_score_2[movie_name] = (movie_name, 100)
    else:
        movie_tropes_not_found += 1
        partial_match_score = fuzzy_string_match(movie_name, tvtropes_keys, partial_match=True)
        token_match_score = fuzzy_string_match(movie_name, tvtropes_keys)

        movies_match_score_1[movie_name] = partial_match_score[:5]
        movies_match_score_2[movie_name] = token_match_score[:5]

2020-12-27 23:24:22,876 - INFO - Genre: Action


['Music',
 'Short',
 'Mystery',
 'Biography',
 'Musical',
 'History',
 'Drama',
 'Action',
 'Sci-Fi',
 'War',
 'Film-Noir',
 'Romance',
 'Crime',
 'Adventure',
 'Animation',
 'Horror',
 'Family',
 'Comedy',
 'Western',
 'Thriller',
 'Fantasy',
 'Sport']

In [13]:
# Re-serialise every parsed script pickle from ScreenPy/ParserOutput/<genre>/
# into Movie_dialogs/<genre>/ under the same file name.
parser_output_dir = os.path.join('ScreenPy', 'ParserOutput')
for genre in os.listdir(parser_output_dir):
    genre_src_dir = os.path.join(parser_output_dir, genre)
    # os.listdir also returns plain files; listing one of those below would
    # raise NotADirectoryError, so only descend into real directories.
    if not os.path.isdir(genre_src_dir):
        continue

    genre_dst_dir = os.path.join('Movie_dialogs', genre)
    os.makedirs(genre_dst_dir, exist_ok=True)

    for file in os.listdir(genre_src_dir):
        if file.endswith('.pkl'):
            # NOTE(security): pickle.load can execute arbitrary code; only run
            # this on pickles produced by a trusted parser run.
            with open(os.path.join(genre_src_dir, file), 'rb') as f:
                movie_dialog = pickle.load(f)
            with open(os.path.join(genre_dst_dir, file), 'wb') as f:
                pickle.dump(movie_dialog, f)

In [None]:
# The notebook only defines movies_match_score_1 (partial ratio) and
# movies_match_score_2 (token-sort ratio); a plain `movies_match_score`
# does not exist and raised NameError. Show the candidates from both scorers.
{
    'partial_ratio': movies_match_score_1['fugitivethe'],
    'token_sort_ratio': movies_match_score_2['fugitivethe'],
}

In [None]:
# Count the scripts whose stored value has more than two elements. Exact
# matches are stored as a 2-tuple (name, 100), fuzzy results as a list of up
# to five candidates, so this tallies the scripts without an exact match.
# Both score dicts share the same keys and value shapes; the undefined
# `movies_match_score` is replaced by movies_match_score_2.
count = sum(
    1
    for trope_movie_name in movies_match_score_2.values()
    if len(trope_movie_name) > 2
)
count

In [None]:
# Display the current value of `count` (assigned in the preceding cell).
count

In [None]:
sample_movie = 'fugitive'
# For every lower-cased TV Tropes key containing the sample as a substring,
# print the pair and then its token-sort similarity to the sample.
titles_containing_sample = [key for key in movie_name_dict if sample_movie in key]
for movie in titles_containing_sample:
    print(movie, sample_movie)
    print(fuzz.token_sort_ratio(movie, sample_movie))

In [None]:
# `token_sort_ratio` was never assigned in this notebook, so the original
# expression raised NameError. fuzzy_string_match already returns
# (title, score) pairs sorted by descending token-sort score, so take the
# top five for the sample title defined in the preceding cell.
fuzzy_string_match(sample_movie, list(movie_name_dict.keys()))[:5]

In [None]:
# List the TV Tropes keys that contain 'darkknight' as a substring.
for movie in filter(lambda key: 'darkknight' in key, movie_name_dict):
    print(movie)