### Imports

In [99]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import seaborn as sns
from fuzzywuzzy import fuzz
import pickle
import re
import string
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from sklearn.feature_extraction import text
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF
from tqdm import tqdm_notebook as tqdm

In [117]:
with open("movie_titles","wb") as file:
    pickle.dump(movie_titles, file)

In [115]:
movie_titles = plot_df["Title"].tolist()

### Main Functions

#### Text preprocessing

In [22]:
# Remove apostrophes
remove_apostrophes = lambda x: x.replace('\'', '')

# Keep only letters
remove_numbers = lambda x: ' '.join(re.sub('\w*\d\w*', ' ', x).split())

# Remove new line characters
no_new_line = lambda x: x.replace('\n',' ')

# Make them lowercase and remove punctuation
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), '', x.lower()).strip()

#### Part of speech

In [23]:
# Get part of speech for lemmatization
def get_wordnet_pos(word):
    ''' 
    Map POS tag to first character lemmatize() accepts.
    '''
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

#### Lemmatization

In [24]:
# Lemmatize function
def lemmatizer(text):
    '''
    Lemmatize a given string.
    '''
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in word_tokenize(text)]
    lemmatized_text = ' '.join(tokens)
    return lemmatized_text

#### Named Entity Recognition

In [25]:
def named_entities(text):
    for k, v in entities.items():
        text = text.replace(k, v)
    return text

#### Display Topic Words

In [26]:
def display_topic_words(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic: ", ix)
        else:
            print("\n", ix+1, "-", topic_names[ix], "\n")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

#### Cosine Similarity

In [27]:
def cos_sim(matrix, row1, row2):
    return cosine_similarity(matrix[row1].reshape(1,-1), matrix[row2].reshape(1,-1))

#### Spell Check for Input

In [87]:
def spell_check(movie_input, movie_titles):
    '''
    Gives you the most likely movie name based on
    what you type in.
    '''
    
    most_similar = 0
    for movie in movie_titles:
        ratio = fuzz.ratio(movie_input, movie)
        if ratio > most_similar:
            most_similar = ratio
            closest_movie = movie
    return closest_movie,  most_similar

### Preprocessing

#### Puncutation, lowercase

In [28]:
#plot_df["Plot"] = plot_df["Plot"].map(remove_apostrophes).map(remove_numbers).map(punc_lower)

#### Lemmatization

In [29]:
#plot_df["Plot"] = plot_df["Plot"].apply(lambda x: lemmatizer(x))

### Pickled Starting Point

In [30]:
with open("pickles/plot_df","rb") as file:
    plot_df = pickle.load(file)

In [31]:
# Drop movies with short plots to avoid overfitting
plot_df = plot_df[plot_df["Plot"].apply(lambda x: len(x) > 400)].reset_index(drop = True)

In [32]:
plot_df = plot_df[plot_df["Release Year"] >= 1980]

In [33]:
plot_df.reset_index(drop = True, inplace = True)

### Vectorization Prep

#### Named Entity Recognition

In [34]:
# Named Entity Declaration
entities = {"new york":"new_york",
            "los angeles":"los_angeles",
            "van helsing":"van_helsing",
            "high school":"high_school",
            "fall in love":"fall_in_love",
            "united state":"united_states",
            "united states":"united_states",
            "hong kong":"hong_kong",
            "kingdom":"king",
            "world war":"world_war",
            "world_war ii": "world_war",
            "gun shot":"gun_shot",
            "performance":"perform",
            "commit suicide":"commit_suicide",
            "central park":"central_park",
            "police officer":"police_officer",
            "steal money":"steal_money",
            "college student":"college_student",
            "set free":"set_free"
           }

In [35]:
# Named Entity Application
plot_df["Plot"] = plot_df["Plot"].apply(lambda x: named_entities(x))

#### Stopwords

##### Initiate

In [36]:
# Add general English stopwords without apostrophes
more_stopwords = []

for word in list(stopwords.words('english')):
    more_stopwords.append(word.replace('\'',''))

# Join's the stop words above to the standard English list
stop_words = text.ENGLISH_STOP_WORDS.union(more_stopwords)

# Peek
list(sorted(stop_words))[:5]

['a', 'about', 'above', 'across', 'after']

##### Add

In [37]:
# Misc Category
other = ['rama','later','night','away','manner','door',
         'left','new','away','way','process','purpose','sens',
         'el','pas','section','good','multiple','attractive',
         'favorite','calcutta','interested','repeatedly','thing',
         'von','time','leaf','dinner','babu','big','inside',
         'outside','window','rao','day','hand','hard','end',
         'yearold','face','second','unable','reason','happens'
         ,'meantime','problem','life','true','past','care','sight'
         ,'eventually','year','ago','long','old','lose','present',
         'great','need','age','soon']

# Verbs
verb = ['come','leave','stay','say','tell','make','help','meet',
        'know','like','asks','use','want','follow','stake','kill',
        'pull','try','visit','return','let','stop','start','ask',
        'miss','lot','talk','reveals','run','begin','explains',
        'decides','change','open','run','walk','attempt','plan',
        'refuse','complete','decision','inform','pick','confuse',
        'attach','parking','approach','dislike','raise','lift',
        'increase','choose','dy','rest','look','rid','look',
        'realizes','spend','arrives','fail','turn','hold',
        'confronts','turn','realize']

# People
people = ['man','woman','girl','boy','sir','madam','professor',
         'guy','doc','boss','mr','person']
         
# Names
name = ['michael','peter','sam','john','jane','max','tim',
        'curtis','jimmy','charlie','elizabeth','mike','paul',
        'nick','jimmy','eddie','tony','henry','paul','joes',
        'joe','emily','lily','amy','edward','frank','johnny',
        'helen','ben','diane','frank','johnny','martin','george',
        'anne','lucy','linda','leo','carl','alice','bobby',
        'martha','tom','jerry','rachel','ross','jenny','ann',
        'jennifer','lloyd','raj','walter','james','mary','steve',
        'billy','norman','ann','ray','jonathan','arthur','nikki',
        'frederick','jason','jessica','david','mia','katherine',
        'judy','steven','julie','susan','cynthia','shane','allan',
        'alex','sally','kim','lou','victor','ash','harris','wendy',
        'adam','grace','jim','glen','terry','al','margaret','carrie',
        'danny','alan','robert','christine','jack','thomas','ralph',
        'charlotte','nancy','simon','jake','pete','joseph','jacob',
        'hank','kelly','anna','stephen','dan','sean','larry','sarah',
        'karl','jackie','carter','scott','pete','harry','kate','eve',
        'phil','dean','cole','graham','jordan','phyllis','bob','sue',
        'rita','michelle','diana','mark','daniel','matt','lisa','duke',
        'morgan','marie','raymond','karen','maria','todd','janet','fred',
        'richard','annie','drake','julia','francis','charles','stewart',
        'richards','olivia','lawrence','lee','jeff','ellen','andy','andrew',
        'ruth','ed','miller','jones','taylor','kumar','shankar','ajay',
        'signh','prakash','prasad','joan','rahul','li','chris']

# Family
family = ['family','son','brother','sister','child','wife','daughter',
          'mother','husband','father','parent','uncle']

add_stop_words = other + verb + people + name + family

# Join's the stop words above to the standard English list
stop_words = stop_words.union(add_stop_words)

# Display the first five alpabetically
list(sorted(stop_words))[:5]

['a', 'about', 'above', 'across', 'adam']

### Vectorization

In [38]:
vectorizer = TfidfVectorizer(ngram_range = (1,3), stop_words = stop_words, min_df = .01, binary = False)

doc_word = vectorizer.fit_transform(plot_df["Plot"])

doc_word_df = pd.DataFrame(doc_word.toarray(), columns = vectorizer.get_feature_names())
doc_word_df.head(2)

Unnamed: 0,abandon,abduct,ability,able,aboard,abroad,abruptly,absence,abuse,abusive,...,wound,wreck,write,writer,writes,wrong,yard,yell,young,youth
0,0.0,0.0,0.0,0.086091,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.079347,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050541,0.0


### NMF Topic Modelling

In [39]:
# Initialize model with X topics
nmf = NMF(n_components = 40)
doc_topic = nmf.fit_transform(doc_word)

#### Document Topic Probabilities

In [40]:
doc_topic_df = pd.DataFrame(doc_topic.round(5),
             index = plot_df["Title"],
             columns = range(40))
doc_topic_df.head(2)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Airplane!,0.01268,0.0,0.00082,0.0,0.0,0.0,0.00335,0.0,0.0,0.01688,...,0.0117,0.02625,0.0,0.0,0.0177,0.0,0.0,0.00013,0.0,0.0
Alligator,0.00132,0.00455,0.02335,0.0,0.0,0.00422,0.0,0.0,0.01441,0.01701,...,0.00193,0.0,0.0199,0.0,0.01175,0.0,0.00059,0.0,0.00175,0.01637


In [41]:
# Top X terms for topic
display_topic_words(nmf, vectorizer.get_feature_names(), 15)


Topic:  0
party, invite, kiss, sex, morning, date, sleep, room, drive, home, upset, drink, phone, drunk, apartment

Topic:  1
marriage, married, marry, wedding, love, arrange, proposal, couple, marries, house, bride, cousin, groom, happy, happily

Topic:  2
murder, case, suspect, killer, detective, investigation, investigate, evidence, murderer, crime, police, victim, serial, solve, death

Topic:  3
defeat, king, battle, fight, power, evil, master, sword, save, princess, magic, prince, capture, warrior, martial

Topic:  4
movie, star, role, play, climax, love, director, actor, lead, scene, actress, happy, act, plot, hero

Topic:  5
body, dead, appear, room, ghost, strange, scream, lock, spirit, blood, house, die, haunt, suddenly, hears

Topic:  6
money, pay, sell, debt, buy, owner, steal, business, bank, rich, loan, million, property, earn, offer

Topic:  7
school, student, teacher, high_school, class, college, study, classmate, principal, teach, university, graduate, education, grade

In [46]:
def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    for topic_idx, topic in enumerate(H):
        print(f"\nTopic: {topic_idx}\n")
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        print("\n")
        top_doc_indices = np.argsort( W[:,topic_idx] )[::-1][0:no_top_documents]
        for doc_index in top_doc_indices:
            print(documents[doc_index])

In [47]:
display_topics(nmf.components_, doc_topic, vectorizer.get_feature_names(), plot_df["Title"], 10, 20)


Topic: 0

party invite kiss sex morning date sleep room drive home


Bachelorette
No Strings Attached
 A Teacher
First Time, TheThe First Time
Weekend
Spectacular Now, TheThe Spectacular Now
 Tadpole
Keep the Lights On
The Hours and Times
Chance
Trainwreck
Elegy
Superbad
Last Vegas
Drinking Buddies
St. Elmo's Fire
Shoot the Moon
The Pallbearer
I Give It a Year
American Reunion

Topic: 1

marriage married marry wedding love arrange proposal couple marries house


Ishtam
Shuva
Thathayya Premaleelalu
Sasirekha Parinayam
Sangama
Mohabbat
Aa Okkati Adakku
Kangaroo
Donga
Ennamma Kannu
Inimey Ippadithan
Kan Simittum Neram
Ogo Bodhu Shundori
Yemaindi Ee Vela
Anjada Gandu
Jallikattu Kaalai
Aan Paavam
Adda
Thaali Pudhusu
5

Topic: 2

murder case suspect killer detective investigation investigate evidence murderer crime


Into the White Night
The Deal
RV: Resurrected Victims
F/X2
Missing You
Badges of Fury
Ulterior Motive
The Accidental Detective
Cruel Doubt
The Silence of the Hams
White Night
T

### Create Movie <-> ID Dictionaries

In [42]:
# Create a list of movie names
movie_name = plot_df["Title"].tolist()

# Create a list of movie ids
movie_id = plot_df["Title"].index.tolist()

# Create dictionarys to access them both days
movie_to_id = {}
id_to_movie = {}

# Populate movie to id
for idx in range(len(movie_name)):
    movie_to_id[movie_name[idx]] = movie_id[idx]

# Populate id to movie
for idx in range(len(movie_name)):
    id_to_movie[movie_id[idx]] = movie_name[idx]

### Recommend Movie

In [49]:
def recommend_movie(movies):
    '''
    Recommends movie based on highest cosine similarity.
    '''
    
    # Empty list of ranks
    ranks = []
    
    # For each movie in the list of inputted movies
    for movie in movies:
        
        # Turn movie string into row index for movie
        movie = movie_to_id[movie]

        # Cosine distances for the given movie to all others
        dists = [dist[0] for dist in pairwise_distances(doc_topic, doc_topic[movie].reshape(1,-1))]

        # Sort the distances from closest to furthest, excluding the movie itself, and retain movie ids
        rec_movie_ids = np.argsort(dists)[1:]
        
        # Add this movie's ranks to the ranks list
        ranks.append(rec_movie_ids)
     
    # Create a dictionary of "average" ranks per movie
    rank_dict = {}
    
    # Loop through each movie and add the ranks up
    for i in range(len(movies)):
        for idx, movie in enumerate(ranks[i]):
            try:
                rank_dict[movie] += idx
            except:
                rank_dict[movie] = idx
    
    # Generate and return movie recommendation
    movie_recommendation = id_to_movie[min(rank_dict, key = rank_dict.get)]
    return movie_recommendation

In [85]:
def main():
    '''
    Main function to run the recommendation tool.
    '''
    
    # Create the empty movie list for the user to populate
    movie_list = []
    
    # User selects number of movies they will input

    n_movies = input("How many movies would you like to input? ")
    
    try:
        n_movies = int(n_movies)
    except:
        n_movies = input("Please enter a number: ")
        n_movies = int(n_movies)
        

    # User fills in the list of movies (customized input strings)
    for i in range(n_movies):
        movie_input = input(f"Movie Selection #{i+1}: ")
        
        # Make sure the movie is in the list (witht spellcheck)
        true_movie, ratio = spell_check(movie_input, plot_df["Title"].tolist())
        
        # If no movie titles are close, keep trying
        while ratio < .5:
            movie_input = input(f"Movie Not Found! Select Another...\nMovie Selection #{i+1}: ")
            true_movie, ratio = spell_check(movie_input, plot_df["Title"].tolist())
        
        print(f"Movie Selection #{i+1}: {true_movie}.")
        # Add the movie to the list
        movie_list.append(true_movie)

    # Generate the recommended movie
    recommended_movie = recommend_movie(movie_list)

    # Print out the movies requested
    print(f"\nYou requested:")
    
    for idx, movie in enumerate(movie_list):
        print(f"   ({idx+1}) - {movie}")
    
    # Printed out the recommended movie
    print(f"\nWe suggest you watch {recommended_movie} :)")

In [86]:
main()

How many movies would you like to input? 3
Movie Selection #1: get out
Movie Selection #1: Get Out.
Movie Selection #2: findinf nemo
Movie Selection #2: Finding Nemo.
Movie Selection #3: the lion, the witch and the wardrobe
Movie Selection #3: The Chronicles of Narnia: The Lion, the Witch and the Wardrobe.

You requested:
   (1) - Get Out
   (2) - Finding Nemo
   (3) - The Chronicles of Narnia: The Lion, the Witch and the Wardrobe

We suggest you watch The Hunger Games :)


### Ratings

In [112]:
# Pull in the 
movies = pd.read_csv("ml-latest/movies.csv")
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [53]:
movies.shape

(3883, 3)

In [54]:
# New columns with movie title and year for merging
plot_df["Movie Title"] = plot_df["Title"].map(str) + " (" + plot_df["Release Year"].map(str) + ")"

In [101]:
plot_df.merge(movies, on = ["Movie Title"]).drop(columns = ["Wiki Page"])

Unnamed: 0,Title,Release Year,Wiki Page,Plot,Movie Title,movie_id,Genre
0,Airplane!,1980,Airplane!,a a parody film airplane tell it story intermi...,Airplane! (1980),2791,Comedy
1,Alligator,1980,Alligator_(film),a teenage girl purchase a baby american alliga...,Alligator (1980),2525,Action|Horror|Sci-Fi
2,American Gigolo,1980,American_Gigolo,julian kaye be a male escort in los_angeles wh...,American Gigolo (1980),3649,Drama
3,Atlantic City,1980,Atlantic_City_(1980_film),sally susan sarandon be a young waitress in an...,Atlantic City (1980),2130,Crime|Drama|Romance
4,Friday the 13th,1980,Friday_the_13th_(1980_film),in the summer of at camp crystal lake two coun...,Friday the 13th (1980),1974,Horror
5,Herbie Goes Bananas,1980,Herbie_Goes_Bananas,loosely pick up where herbie go to monte carlo...,Herbie Goes Bananas (1980),2050,Adventure|Children's|Comedy
6,Melvin and Howard,1980,Melvin_and_Howard,in the opening scene howard hughes loses contr...,Melvin and Howard (1980),2988,Drama
7,My Bodyguard,1980,My_Bodyguard,clifford peache life in an upscale chicago lux...,My Bodyguard (1980),2240,Drama
8,Ordinary People,1980,Ordinary_People,the jarretts be an uppermiddleclass family in ...,Ordinary People (1980),1956,Drama
9,Popeye,1980,Popeye_(film),popeye a sailor arrives at the small coastal t...,Popeye (1980),2088,Adventure|Comedy|Musical
