In [115]:
# Setup environment
from IPython.display import display
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from gensim.models.word2vec import Word2Vec
from collections import Counter
from string import punctuation
from ast import literal_eval


In [165]:
# Pull in cleaned movies, preprocessed movies, and user preferences
movie_data = pd.read_csv('data/preprocessed_movies.csv')
user_data = ['Comedy, Romance, Will Smith, skip, skip']

In [168]:
def make_recommendation(movie_data: pd.DataFrame, user_data: list):
    """
    Args:
        movie_data (df): dataframe containing movie attributes
        user_data (list): list containing user movie preferences
    """
    # Create copy of last row of dataset which will be used for user inputs
    new_row = movie_data.iloc[-1:, :].copy(deep=True)
    
    # Add user input to the new row
    new_row.iloc[-1] = " ".join([str.lower(x) for x in user_data])
    
    # Add new row to the dataset
    movie_data = movie_data.append(new_row)
    
    # Vectorize matrix
    count = CountVectorizer(stop_words='english')
    count_matrix = count.fit_transform(movie_data.soup)
    
    # Cosine Similarity to get a similarity matrix
    cosine_sim = cosine_similarity(count_matrix, count_matrix)
    
    # Sort similarities from highest to lowest
    scores = list(enumerate(cosine_sim[-1,:]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    
    # Match similarities
    ranked_titles = []
    for i in range(1,6):
        indx = scores[i][0]
        ranked_titles.append([movie_data['Series_Title'].iloc[indx], movie_data['Genre'].iloc[indx], movie_data['IMDB_Rating'].iloc[indx]])
    
    return ranked_titles

In [169]:
make_recommendation(movie_data=movie_data, user_data=user_data)

  movie_data = movie_data.append(new_row)


[['It Happened One Night', 'comedy  romance', 8.1],
 ['Roman Holiday', 'comedy  romance', 8.0],
 ['The Philadelphia Story', 'comedy  romance', 7.9],
 ['Vicky Donor', 'comedy  romance', 7.8],
 ['Amélie', 'comedy  romance', 8.3]]