## Imports

In [27]:
from fuzzywuzzy import fuzz
import numpy as np
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
import pickle
import random
import re
import seaborn as sns
import string
from sklearn.decomposition import NMF
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances

## Open CSV and Clean Columns

In [28]:
# Open dataframe
# Load the movie-plot CSV; the path is relative to the working directory
# the notebook is run from.
movie_df = pd.read_csv('../csvs/wiki_movie_plots.csv')

In [29]:
# Keep the first row for each distinct 'Wiki Page' value,
# then renumber the remaining rows from zero
movie_df = movie_df.drop_duplicates(subset = ['Wiki Page'], keep = 'first')
movie_df = movie_df.reset_index(drop = True)

## Main Functions

### Text Preprocessing

#### Main Cleaning

In [30]:
# Remove apostrophes
def remove_apostrophes(x):
    '''
    Return x with every apostrophe character deleted.
    '''
    return x.replace('\'', '')


# Drop every token that contains a digit
def remove_numbers(x):
    '''
    Return x without any run of word characters that contains a digit.
    Whitespace (including new lines) is collapsed to single spaces.
    '''
    # Raw string: '\w' and '\d' are invalid escapes in a normal string literal
    return ' '.join(re.sub(r'\w*\d\w*', ' ', x).split())


# Remove new line characters
def no_new_line(x):
    '''
    Return x with each new line character replaced by a space.
    '''
    return x.replace('\n', ' ')


# Make them lowercase and remove punctuation
def punc_lower(x):
    '''
    Return x lowercased, with all string.punctuation characters deleted
    and leading/trailing whitespace stripped.
    '''
    return re.sub('[%s]' % re.escape(string.punctuation), '', x.lower()).strip()

#### Part of speech

In [31]:
# Get part of speech for lemmatization
def get_wordnet_pos(word):
    '''
    Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with pos_tag and the first letter of
    the resulting tag selects the constant; any other letter falls back
    to the noun constant.
    '''
    first_letter = pos_tag([word])[0][1][0].upper()
    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag are both treated as nouns
    return wordnet.NOUN

#### Lemmatization

In [32]:
# Lemmatization function
def lemmatizer(text):
    '''
    Return text with every token replaced by its WordNet lemma.

    The text is split with word_tokenize, each token is lemmatized with
    the part of speech reported by get_wordnet_pos, and the lemmas are
    joined back together with single spaces.
    '''
    wnl = WordNetLemmatizer()
    lemmas = []
    for token in word_tokenize(text):
        lemmas.append(wnl.lemmatize(token, get_wordnet_pos(token)))
    return ' '.join(lemmas)

#### Named Entity Recognition

In [33]:
# Named Entity function
def named_entities(text):
    '''
    Apply every substitution in the global entities mapping to text
    and return the result.

    Replacements run one after another in the mapping's iteration order,
    so a later key can match text produced by an earlier replacement.
    '''
    for old in entities:
        text = text.replace(old, entities[old])
    return text

### Modeling EDA

#### Topic Words

In [34]:
# Display topic top words
def display_topic_words(model, feature_names, no_top_words, topic_names=None):
    '''
    Print a heading and the highest-weighted words for each topic.

    model         -- object whose components_ yields one weight row per topic
    feature_names -- sequence mapping a column index to its word
    no_top_words  -- how many words to print per topic
    topic_names   -- optional names looked up by topic number; when it is
                     not given, or the looked-up name is empty, the bare
                     topic number is printed instead
    '''
    for ix, weights in enumerate(model.components_):
        name = topic_names[ix] if topic_names else None
        if name:
            print("\n", ix, "-", name, "\n")
        else:
            print("\nTopic: ", ix)
        # Reverse slice of the ascending argsort: largest weights first
        best = weights.argsort()[:-no_top_words - 1:-1]
        print(", ".join(feature_names[i] for i in best))

#### Topic Movies

In [35]:
def top_movies_for_topic(topic):
    '''
    Return up to 100 titles, ordered from the highest to the lowest
    value in the given column of doc_topic.
    '''
    # NOTE(review): plot_df is not defined anywhere in the visible source
    # (the dataframe used elsewhere is movie_df) -- confirm which is meant.
    column = doc_topic[:, topic]
    strongest_first = np.argsort(column)[::-1]
    return [plot_df["Title"][row] for row in list(strongest_first[0:100])]

#### Topic Words and Movies

In [36]:
# Display topic top words/documents
def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    '''
    Print, for every topic, its top words followed by its top documents.

    H                -- iterable of per-topic word-weight rows
    W                -- 2-D array indexed as W[:, topic] for document weights
    feature_names    -- sequence mapping a column index of H to its word
    documents        -- sequence mapping a row index of W to a document
    no_top_words     -- number of words printed per topic
    no_top_documents -- number of documents printed per topic
    '''
    for topic_idx, word_weights in enumerate(H):
        heading = f"\nTopic: {topic_idx}\n"
        print(heading)
        # Reverse slice of the ascending argsort: largest weights first
        top_word_ids = word_weights.argsort()[:-no_top_words - 1:-1]
        print(" ".join(feature_names[i] for i in top_word_ids))
        print("\n")
        doc_weights = W[:, topic_idx]
        ranked_docs = np.argsort(doc_weights)[::-1]
        for doc_index in ranked_docs[0:no_top_documents]:
            print(documents[doc_index])

#### Movie Topics

In [37]:
# Top topics for movie_id
def movie_topics(movie):
    '''
    Return the topic names for a movie title, ordered from the largest
    to the smallest value in that movie's row of topic_vectors.

    The title is converted to a row position through movie_to_id and the
    column labels are converted to names through topics.
    '''
    row = topic_vectors.iloc[movie_to_id[movie], :]
    ordered_columns = row.sort_values(ascending = False).index
    return [topics[column] for column in ordered_columns]

#### Normalize Vector

In [38]:
# Normalize topic vectors so movie_topics works properly
def normalize_vector(vector):
    '''
    Convert a vector of values into the rank of each value.

    The smallest value receives rank 0 and the largest receives
    len(vector) - 1.
    '''
    order = vector.argsort()
    positions = np.arange(len(vector))
    result = np.empty_like(order)
    # Scatter: the element sorted into position p receives rank p
    result[order] = positions
    return result

In [39]:
def top_movies_for_topic(topic):
    '''
    Return up to 100 titles, ordered from the highest to the lowest
    value in the given column of doc_topic.
    '''
    # NOTE(review): this repeats the identical definition that appears
    # earlier in this file under "Topic Movies"; one of the two can go.
    ranked_rows = list(np.argsort(doc_topic[:, topic])[::-1][0:100])
    return [plot_df["Title"][row] for row in ranked_rows]

### Recommendation

#### Spell Check for Input

In [40]:
def spell_check(movie_input, movie_titles):
    '''
    Return the title in movie_titles most similar to movie_input.

    Similarity is fuzz.ratio(movie_input, title); when several titles
    share the best score, the earliest one in movie_titles wins.

    Returns None when movie_titles is empty or no title scores above 0
    (these cases used to raise UnboundLocalError).
    '''

    # Pre-bind the result so the "nothing matched" case is well defined
    closest_movie = None
    most_similar = 0
    for movie in movie_titles:
        ratio = fuzz.ratio(movie_input, movie)
        # Strict comparison keeps the first of several equally good matches
        if ratio > most_similar:
            most_similar = ratio
            closest_movie = movie
    return closest_movie

#### Recommend Movie

In [41]:
def recommend_movie(movie_input, n_recs = 1):
    '''
    Recommend the movies closest, in topic space, to the given movies.

    movie_input -- comma-separated movie titles. Each entry is stripped
                   of surrounding whitespace and mapped to the nearest
                   known title by spell_check; the matched title is
                   printed. Blank entries are skipped. Every entry is
                   treated as a title, so a bare number is looked up as
                   a movie, not read as a count.
    n_recs      -- how many recommendations to return.

    Returns a list of at most n_recs titles, best first, or a message
    string when n_recs is below 1. An empty list is returned when no
    usable title was supplied.

    For every input movie all other movies are ranked by their distance
    in doc_topic; the ranks are summed per candidate and the lowest
    totals win. The input movies themselves are never recommended.

    NOTE(review): pairwise_distances is called with its default metric,
    which the scikit-learn documentation gives as Euclidean. Pass
    metric='cosine' there if cosine distance was intended.
    '''

    if n_recs < 1:
        return ("You chose to receive 0 recommendations.")

    # Resolve every non-blank entry to the row id of its closest title
    input_ids = []
    for entry in movie_input.split(","):
        entry = entry.strip()
        if not entry:
            continue

        # Returns the closest movie title if typo
        movie = spell_check(entry, movie_titles)
        if movie is None:
            continue
        print(movie)

        # Turn movie string into row index for movie
        input_ids.append(movie_to_id[movie])

    # Input movies must not compete as candidates: each one is absent from
    # its own ranking, which would otherwise give it an unfairly low total
    excluded = set(input_ids)

    # Summed rank per candidate movie id (lower is better)
    rank_dict = {}
    for movie_id in input_ids:
        # Distances from this movie to every movie
        dists = pairwise_distances(doc_topic, doc_topic[movie_id].reshape(1, -1)).ravel()

        # Closest first, with all input movies removed
        ordering = (int(row) for row in np.argsort(dists))
        candidates = (row for row in ordering if row not in excluded)
        for rank, row in enumerate(candidates):
            rank_dict[row] = rank_dict.get(row, 0) + rank

    # sorted() is stable, so ties keep the order in which candidates were first seen
    best_first = sorted(rank_dict, key = rank_dict.get)
    return [id_to_movie[row] for row in best_first[:int(n_recs)]]

## Main Action

### Preprocessing

#### Clean movie titles

In [42]:
# Get rid of whitespace on the ends of titles so that title lookups
# do not depend on stray leading/trailing spaces
movie_df["Title"] = movie_df["Title"].str.strip()

In [43]:
# Fix movie title with typos
# NOTE(review): the row (14640) and column (1) are hard-coded positions;
# presumably column 1 is "Title" -- confirm, and note that the row only
# points at the intended movie for this exact CSV after de-duplication.
movie_df.iloc[14640,1] = "The Conjuring"

#### Punctuation, lowercase

In [44]:
# Clean each plot: strip apostrophes, drop digit-bearing tokens,
# then lowercase and remove punctuation
movie_df["Plot"] = movie_df["Plot"].map(lambda plot: punc_lower(remove_numbers(remove_apostrophes(plot))))

#### Lemmatization

In [46]:
# Lemmatize every plot
movie_df["Plot"] = movie_df["Plot"].apply(lemmatizer)

In [47]:
# Save the cleaned and lemmatized dataframe so later runs can start
# from the "Pickled Starting Point" cell instead of repeating the steps above
with open("pickles/movie_df","wb") as file:
    pickle.dump(movie_df,file)

### Pickled Starting Point

In [48]:
# Open the pickle
# NOTE: pickle.load executes whatever the file contains; only load
# pickle files that this notebook itself produced.
with open("pickles/movie_df","rb") as file:
    movie_df = pickle.load(file)

In [49]:
# Keep only movies whose plot text is longer than 400 characters
# (short plots are dropped to avoid overfitting)
long_enough = movie_df["Plot"].map(len) > 400
movie_df = movie_df[long_enough].reset_index(drop = True)

### Vectorization Prep

#### Named Entity Recognition

In [50]:
# Named Entity Declaration
# Maps a space-delimited phrase to its replacement. Multi-word names are
# joined with an underscore so they survive tokenization as one term, and
# some word forms are folded onto a single spelling.
# Every key and every value keeps a leading and a trailing space so that
# neighbouring words stay separated after the replacement.
entities = {' new york ':' new_york ',
            ' los angeles ':' los_angeles ',
            ' van helsing ':' van_helsing ',
            ' high school ':' high_school ',
            ' united state ':' united_states ',
            ' united states ':' united_states ',
            ' hong kong ':' hong_kong ',
            ' kingdom ':' king ',
            ' world war ':' world_war ',
            # Relies on the previous entry having already produced 'world_war'
            ' world_war ii ': ' world_war ',
            ' gun shot ':' gun_shot ',
            ' performance ':' perform ',
            # This key used to be listed twice; the later value (' suicide ')
            # was the one in effect, so only that one is kept
            ' commit suicide ':' suicide ',
            ' central park ':' central_park ',
            ' police officer ':' police_officer ',
            ' steal money ':' steal_money ',
            ' college student ':' college_student ',
            ' set free ':' set_free ',
            ' haunt house ':' haunted_house ',
            ' marry ':' marriage ',
            ' investigate ':' investigation ',
            ' develops ':' develop ',
            ' teacher ':' teach ',
            ' form story ':' form_story ',
            ' dr ':' doctor ',
            ' best friend ':' best_friend ',
            ' childhood friend ':' childhood_friend ',
            ' close friend ':' close_friend ',
            ' car accident ':' car_accident ',
            ' commits suicide ':' suicide ',
            ' happily ':' happy ',
            ' small town ':' small_town ',
            ' writer ':' write ',
            ' writes ':' write ',
            ' heart attack ':' heart_attack ',
            ' die ':' death ',
            # Leading space restored: without it the preceding word was
            # glued onto 'death'
            ' dead ':' death ',
            ' player ':' play ',
            ' night club ':' night_club ',
            ' singer ':' sing ',
            ' police station ':' police_station ',
            ' destroyed ':' destroy ',
            ' competition ':' compete ',
            ' cross country ':' cross_country ',
            ' marries ':' marriage ',
            ' air force ':' air_force ',
            ' married ':' marriage ',
            ' newly wed ':' newly_wed ',
            ' romantically ':' romantic ',
            ' seek revenge ':' seek_revenge ',
            ' reading ':' read ',
            ' sings ':' sing '
           }

In [51]:
# Named Entity Application: run every plot through named_entities
movie_df["Plot"] = movie_df["Plot"].apply(named_entities)

#### Stopwords

##### Initiate

In [52]:
# NLTK's English stopwords with their apostrophes removed
more_stopwords = [word.replace('\'', '') for word in stopwords.words('english')]

# Joins the stop words above to the standard English list
stop_words = text.ENGLISH_STOP_WORDS.union(more_stopwords)

##### Add

In [53]:
# Project-specific stop words, grouped by category.
# Some entries appear more than once within a list; that is harmless
# because the lists are merged with a set union at the end.

# Misc Category
other = ['rama','later','night','away','manner','door',
         'left','new','away','way','process','purpose','sens',
         'el','pas','section','good','multiple','attractive',
         'favorite','calcutta','interested','repeatedly','thing',
         'von','time','leaf','dinner','babu','big','inside',
         'outside','window','rao','day','hand','hard','end',
         'yearold','face','second','unable','reason','happens'
         ,'meantime','problem','life','true','past','care','sight'
         ,'eventually','year','ago','long','old','lose','present',
         'great','need','age','soon','head','happy','honest','head',
         'arm','role','department','result','room','wall','sudden',
         'suddenly','house','hall','different','elder','beautiful',
         'young','handsome','real','actually','truth','really','mistake',
         'set','large','despite','final','trip','store','east','park',
         'small','social','bad','couple','home','mate','exact','london',
         'india','paris','case','fall']

# Verbs
verb = ['come','leave','stay','say','tell','make','help','meet',
        'know','like','asks','use','want','follow','stake','kill',
        'pull','try','visit','return','let','stop','start','ask',
        'miss','lot','talk','reveals','run','begin','explains',
        'decides','change','open','run','walk','attempt','plan',
        'refuse','complete','decision','inform','pick','confuse',
        'attach','parking','approach','dislike','raise','lift',
        'increase','choose','dy','rest','look','rid','look',
        'realizes','spend','arrives','fail','turn','hold',
        'confronts','turn','realize','chase','knock','grab',
        'cause','throw','agrees','include','cause','manages',
        'arrive','happen','decide','reach','ride','fall','appear',
        'wake','watch','eat','cut','lock','attack','watch','hears',
        'wish','revolves','sends','play','sent','feel','think','focus',
        'described','save','share','attend','board','cross','accompany',
        'grow','save','lead','played','join','involve','involves',
        'receives','love']

# People
people = ['man','woman','girl','boy','sir','madam','professor',
         'guy','doc','boss','mr','person','lady','men']
         
# Names
name = ['michael','peter','sam','john','jane','max','tim',
        'curtis','jimmy','charlie','elizabeth','mike','paul',
        'nick','jimmy','eddie','tony','henry','paul','joes',
        'joe','emily','lily','amy','edward','frank','johnny',
        'helen','ben','diane','frank','johnny','martin','george',
        'anne','lucy','linda','leo','carl','alice','bobby',
        'martha','tom','jerry','rachel','ross','jenny','ann',
        'jennifer','lloyd','raj','walter','james','mary','steve',
        'billy','norman','ann','ray','jonathan','arthur','nikki',
        'frederick','jason','jessica','david','mia','katherine',
        'judy','steven','julie','susan','cynthia','shane','allan',
        'alex','sally','kim','lou','victor','ash','harris','wendy',
        'adam','grace','jim','glen','terry','al','margaret','carrie',
        'danny','alan','robert','christine','jack','thomas','ralph',
        'charlotte','nancy','simon','jake','pete','joseph','jacob',
        'hank','kelly','anna','stephen','dan','sean','larry','sarah',
        'karl','jackie','carter','scott','pete','harry','kate','eve',
        'phil','dean','cole','graham','jordan','phyllis','bob','sue',
        'rita','michelle','diana','mark','daniel','matt','lisa','duke',
        'morgan','marie','raymond','karen','maria','todd','janet','fred',
        'richard','annie','drake','julia','francis','charles','stewart',
        'richards','olivia','lawrence','lee','jeff','ellen','andy','andrew',
        'ruth','ed','miller','jones','taylor','kumar','shankar','ajay',
        'signh','prakash','prasad','joan','rahul','li','chris','singh',
        'khan','mohan','krishna','ravi','rajah','anand','vijay','kapoor',
        'raja','radha','lakshmi']

# Family
family = ['family','son','brother','sister','child','wife','daughter',
          'mother','husband','father','parent','uncle','cousin','grandfather',
          'aunt']

# All custom categories combined into one list
add_stop_words = other + verb + people + name + family

# Joins the custom stop words above to the stop words collected so far
stop_words = stop_words.union(add_stop_words)

### Vectorization

In [54]:
# Create the vectorizer object
# stop_words is handed over as a (sorted, hence reproducible) list: recent
# scikit-learn releases validate this parameter and only accept 'english',
# a list, or None -- not a set or frozenset
vectorizer = TfidfVectorizer(ngram_range = (1,3), stop_words = sorted(stop_words), min_df = .01, binary = False)

# Create the doc_word sparse matrix
doc_word = vectorizer.fit_transform(movie_df["Plot"])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old name on older installs
get_feature_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names

# Create a dataframe for easy labelled viewing
doc_word_df = pd.DataFrame(doc_word.toarray(), columns = get_feature_names())

### NMF Topic Modelling

In [55]:
# Create an NMF object with 35 topics
# random_state is fixed so that repeated runs produce the same topics in
# the same order; the hand-written topic names below are keyed by topic
# index and only make sense for one specific factorization.
# NOTE(review): re-check the topic names against the displayed top words
# after the first run with this seed.
nmf = NMF(n_components = 35, random_state = 42)

# Fit the doc_word sparse matrix and get one topic-weight row per document
doc_topic = nmf.fit_transform(doc_word)

#### Topic Names

In [56]:
# Topic Names: the position of a name in this list is its topic index
topics = dict(enumerate([
    "Relationships / Sex",
    "Marriage",
    "Modern War",
    "Police",
    "Village",
    "Medical",
    "Royalty",
    "Acting",
    "School",
    "Money",
    "Sports",
    "Gangs",
    "Ships / Water",
    "Music",
    "Western",
    "Driving",
    "College / Relationships",
    "Pregnancy / Adoption",
    "Office Life",
    "Tales / Journeys",
    "Writing",
    "Horse-Racing / Ranch",
    "Aliens / Destruction",
    "Train / Travel",
    "Murder / Crime",
    "Bollywood / Mobster",
    "Combat",
    "Airplanes",
    "Prison / Justice",
    "City Life",
    "Heist",
    "Death / Spirits",
    "Animals",
    "Dancing / Performance",
    "Drug Crime",
]))

#### Display Topic Words

In [57]:
# Top 15 terms per topic for evaluation
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old name on older installs
topic_feature_names = (getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names)()
display_topic_words(nmf, topic_feature_names, 15, topic_names = topics)


 0 - Relationships / Sex 

relationship, party, sex, apartment, date, hotel, kiss, affair, wedding, invite, boyfriend, break, divorce, feeling, morning

 1 - Marriage 

marriage, wedding, rich, wealthy, proposal, divorce, arrange, lover, accepts, live, bride, finally, shock, affair, pregnant

 2 - Modern War 

german, army, soldier, war, british, officer, camp, general, american, colonel, troop, order, prisoner, command, nazi

 3 - Police 

police, arrest, inspector, prison, jail, escape, criminal, police_officer, crime, release, drug, commissioner, officer, police inspector, sentence

 4 - Village 

village, villager, temple, land, people, landlord, local, farmer, city, chief, respect, poor, priest, form, teach

 5 - Medical 

doctor, hospital, patient, medical, nurse, treat, cure, surgery, treatment, psychiatrist, mental, death, experiment, work, scientist

 6 - Royalty 

king, prince, queen, princess, palace, lord, castle, royal, minister, sword, forest, court, defeat, order, louis

#### Individual Movie Topics

In [58]:
# Rank-normalized topic vectors: every topic column of doc_topic is
# replaced by the rank of each document within that column
topic_columns = pd.DataFrame(doc_topic)
topic_vectors = pd.DataFrame(
    [normalize_vector(np.array(topic_columns[i])) for i in range(doc_topic.shape[1])]
).transpose()

#### Create Movie IDs

In [59]:
# Row labels and titles of movie_df, in matching order
movie_ids = movie_df["Title"].index.tolist()
movie_titles = movie_df["Title"].tolist()

# Lookup dictionaries in both directions; when a title occurs more than
# once, movie_to_id keeps the id of its last occurrence
movie_to_id = dict(zip(movie_titles, movie_ids))
id_to_movie = dict(zip(movie_ids, movie_titles))

### Pickles

In [60]:
# Save everything recommend_movie needs into a single file.
# The four objects are written one after another, so a reader has to call
# pickle.load four times, in this same order, to get them back.
with open('pickles/jupyter_pickles','wb') as file:
    pickle.dump(movie_to_id, file)
    pickle.dump(id_to_movie, file)
    pickle.dump(movie_titles, file)
    pickle.dump(doc_topic, file)

## Test

In [62]:
# NOTE(review): the trailing '3' is parsed as a third movie title (it is
# echoed as a matched title in the output below), not as a recommendation
# count; pass n_recs = 3 separately if three recommendations were intended.
recommend_movie('get out, the ring, 3')

Get Out
The Ring
3


['Himalaya Singh']