In [276]:
# Setup movie data preprocessing environment
import sys
import pandas as pd
import numpy as np
from IPython.display import display

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from collecting_user_data import User

from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise import SVD

In [277]:
# Ingest cleaned movie data
movies = pd.read_csv("../data/movies.csv")

# Ingest ratings csv file to gather user ratings of movies
ratings = pd.read_csv("../data/ratings_small.csv")

In [278]:
# Convert values to int if possible
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [279]:
# Add rating_id to movies dataframe to look up ratings from other users when using collaborative filtering
id_map = pd.read_csv('../data/links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['rating_id', 'id']
movies = movies.merge(id_map, on="id")

In [280]:
display(movies)

Unnamed: 0,genres,id,overview,popularity,poster_path,release_date,runtime,title,vote_average,keywords,soup,rating_id
0,"['animation', 'comedy', 'family']",862,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,1995-10-30,81.0,Toy Story,7.7,"['jealousy', 'toy', 'boy', 'friendship', 'frie...",jealousy toy boy friendship friends rivalry bo...,1
1,"['adventure', 'fantasy', 'family']",8844,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,1995-12-15,104.0,Jumanji,6.9,"['boardgame', 'disappearance', ""basedonchildre...",boardgame disappearance basedonchildren'sbook ...,2
2,"['romance', 'comedy']",15602,A family wedding reignites the ancient feud be...,11.712900,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,1995-12-22,101.0,Grumpier Old Men,6.5,"['fishing', 'bestfriend', 'duringcreditsstinge...",fishing bestfriend duringcreditsstinger oldmen...,3
3,"['comedy', 'drama', 'romance']",31357,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,1995-12-22,127.0,Waiting to Exhale,6.1,"['basedonnovel', 'interracialrelationship', 's...",basedonnovel interracialrelationship singlemot...,4
4,['comedy'],11862,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,1995-02-10,106.0,Father of the Bride Part II,5.7,"['baby', 'midlifecrisis', 'confidence', 'aging...",baby midlifecrisis confidence aging daughter m...,5
...,...,...,...,...,...,...,...,...,...,...,...,...
7310,"['crime', 'drama', 'thriller', 'western']",338766,A divorced dad and his ex-con brother resort t...,12.565896,/6YOrNBdoXvT8aC5VPLkkN6t5z0V.jpg,2016-08-12,102.0,Hell or High Water,7.2,"['desperation', 'texas', 'bankrobber', 'brothe...",desperation texas bankrobber brother shooting ...,161582
7311,"['drama', 'thriller']",314420,A night out turns deadly when three girls brea...,6.236714,/dw0xE56whflv0OgfEw3lwdWxLRD.jpg,2015-01-25,75.0,Body,5.0,['mansion'],mansion drama thriller,161830
7312,"['comedy', 'horror', 'sciencefiction']",390989,The new installment of the Sharknado franchise...,4.574494,/jcP3HFXF1BIW9LmBrDusbbDZjBG.jpg,2016-07-31,85.0,Sharknado 4: The 4th Awakens,4.3,"['sharkattack', 'sequel', 'farce', 'lasvegas',...",sharkattack sequel farce lasvegas creaturefeat...,161918
7313,['drama'],159550,A man must cope with the loss of his wife and ...,0.038998,/yWp7PgydSlxlhl7benKhTnCvRjN.jpg,2001-09-23,85.0,The Last Brickmaker in America,7.0,"['friendship', 'brickmaking']",friendship brickmaking drama,161944


Train Collaborative Filtering Model

In [315]:
# Create copies of df's so we don't transform the df in unusual ways by accident
ratings_df = ratings.copy(deep=True)
movies_df = movies.copy(deep=True)

# Instantiate reader object and svd object
reader = Reader(rating_scale=(1, 5))
algo = SVD()

# Load data into surprise dataset format with reader
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

svd = SVD(n_epochs=5)
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Train the model on the entire dataset by converting the CF dataset into a Surprice Trainset object
trainset = data.build_full_trainset()
svd.fit(trainset)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9194  0.9165  0.9063  0.9155  0.9144  0.9144  0.0044  
MAE (testset)     0.7090  0.7112  0.7020  0.7090  0.7075  0.7078  0.0031  
Fit time          0.91    0.91    0.90    0.90    0.90    0.91    0.00    
Test time         0.07    0.15    0.07    0.15    0.07    0.11    0.04    


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x29a6d5ee0>

In [293]:
# Define function to apply to each row of a dataframe that would estimate the movie likeness for each user in question
def get_est(similar_users, movie, model):
    total_est = 0
    num_users = len(similar_users)
    if not num_users:
        return total_est
    
    for user in similar_users:
        est = model.predict(uid=user, iid=movie).est
        total_est += est
    
    return total_est / num_users

Make Appropriate Recommendation

In [283]:
# Grabbing the inputs from the user
user = User()
titles = list(movies.title.values)
user.get_searchTerms()
user.get_fav_movie(titles)

In [284]:
print("User search terms: ", user.search_terms)
print("User favorite movie: ", user.fav_movie)

User search terms:  ['comedy romance', 'cute funny christmas']
User favorite movie:  The Holiday


In [316]:
def top_movies_rec(movies = movies):
    ranked_titles = []
    ranked_movies = movies.copy(deep=True)
    ranked_movies = ranked_movies.sort_values("popularity", ascending=False)
    for idx in range(10):
        ranked_titles.append([movies.title.iloc[idx], movies.rating_id.iloc[idx]])
    return ranked_titles

In [286]:
def content_based_rec(movies = movies, search_terms = []):
    ranked_titles = []
    
    df = movies.copy(deep=True)
    
    # Calculate the average "vote_average" to limit the recs to movies that are above average
    rating_avg = round(df.vote_average.mean(), 0)
    df = df[df.vote_average >= rating_avg]
    
    # Creating a copy of the last row of the dataset, which we will use to input the user's input
    new_row = df.iloc[-1,:].copy()
    
    # Adding the input to the new row
    new_row.iloc[-2] = " ".join(search_terms)
  
    # Adding the new row to the dataset
    df = df.append(new_row)
  
    # Vectorizing the entire matrix
    count = CountVectorizer(analyzer = 'word', stop_words='english', ngram_range=(1, 2))
    count_matrix = count.fit_transform(df['soup'])

    # Pairwise cosine similarity and creating the cosine matrix
    cosine_sim2 = cosine_similarity(count_matrix, count_matrix)
  
    # Sorting cosine similarities by highest to lowest
    sim_scores = list(enumerate(cosine_sim2[-1,:]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Append top ranked df based on search terms to ranked_titles
    for i in range(1, 51):
        indx = sim_scores[i][0]
        ranked_titles.append([df['title'].iloc[indx], df['rating_id'].iloc[indx]])
        
    return ranked_titles

In [287]:
def collab_based_rec(movies = movies, ratings = ratings, fav_movie = None):
    pass
    

In [312]:
def hybrid_rec(movies = movies, search_terms = [], fav_movie = None):
    df = movies.copy(deep=True)
    
    # Get content-based recommendations
    content_results = content_based_rec(movies=movies, search_terms=search_terms)
    
    # Transform content based recs to a df for SVD estimations
    content_recs_df = pd.DataFrame(content_results).reset_index()
    content_recs_df.columns = ['content_rank', 'title', 'id']
    
    # Get fav_movie id for the User's favorite movie to analyize
    fav_movie_id = df[df.title == user.fav_movie]['rating_id'].values[0]
    
    # Get similiar users that have highly rated the users fav_movie in the past (top 15 users)
    similar_users = set(ratings[(ratings.movieId == fav_movie_id) & (ratings.rating >= 4.5)]["userId"])

    # Estimate the ratings the User would give each movie based on the similar users (predicted) ratings
    content_recs_df['est'] = content_recs_df.apply(lambda x: get_est(similar_users, x.id, svd), axis=1)
    
    # Sort by est and content_rank to re-rank the movies in the content-based list to be more personalized to the user
    content_recs_df = content_recs_df.sort_values(["est", "content_rank"], ascending=[False, True])
    
    # Collect the top 10 recommendations for the User
    ranked_titles = []
    for i in range(10):
        ranked_titles.append([content_recs_df['title'].iloc[i], content_recs_df['id'].iloc[i]])
        
    return content_recs_df

In [313]:
def make_recommendation(movies = movies, ratings = ratings, search_terms = [], fav_movie = None):
    if (not search_terms or search_terms == ['', '']) and not fav_movie:
        print("top_movie_rec")
        return top_movies_rec(movies)
    elif not fav_movie and search_terms:
        print("content_based_rec")
        return content_based_rec(movies=movies, search_terms=search_terms)
    elif fav_movie and (not search_terms or search_terms == ['', '']):
        print("collab_based_rec")
        return collab_based_rec(movies=movies, ratings=ratings, fav_movie=fav_movie)
    else:
        print("hybrid_rec")
        return hybrid_rec(movies=movies, search_terms=search_terms, fav_movie=fav_movie)

In [314]:
make_recommendation(movies=movies, ratings=ratings, search_terms=user.search_terms, fav_movie=user.fav_movie)

hybrid_rec


  df = df.append(new_row)


Unnamed: 0,content_rank,title,id,est
0,0,The Goodbye Girl,3244,0
1,1,Pajama Party,3924,0
2,2,Hardball,4771,0
3,3,Serial,5242,0
4,4,Chasing Papi,6295,0
5,5,I'm No Angel,7081,0
6,6,Lover Come Back,8385,0
7,7,Mr. Blandings Builds His Dream House,8711,0
8,8,Starting Over,26435,0
9,9,About Last Night,109372,0


In [291]:
display(movies[movies.title == "The Holiday"])

Unnamed: 0,genres,id,overview,popularity,poster_path,release_date,runtime,title,vote_average,keywords,soup,rating_id
5311,"['comedy', 'romance']",1581,"Two women, one (Cameron Diaz) from America and...",14.043416,/ixNtpuq8OVp4IckgzkSJIflFDkw.jpg,2006-12-08,136.0,The Holiday,6.7,"['holiday', 'londonengland', 'filmmaking', 'ch...",holiday londonengland filmmaking christmaspart...,49286


In [292]:
ratings[ratings.movieId == 49286]

Unnamed: 0,userId,movieId,rating,timestamp
2212,15,49286,2.5,1465880683
21337,149,49286,4.0,1437091824
29174,212,49286,2.5,1218401171
29850,213,49286,3.0,1462638263
32939,238,49286,2.5,1459365691
52563,382,49286,3.0,1371778367
53120,384,49286,4.0,1171807726
71364,500,49286,4.0,1233450559
72165,505,49286,3.5,1340407434
80311,547,49286,2.0,1214927078
