In [19]:
# Setup movie data preprocessing environment
import sys
import pandas as pd
import numpy as np
from IPython.display import display

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from collecting_user_data import User

from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise import SVD
from surprise.model_selection import KFold
from surprise import accuracy

import difflib
import random

In [20]:
# Ingest cleaned movie data
movies = pd.read_csv("../data/movies.csv")

# Ingest ratings csv file to gather user ratings of movies
ratings = pd.read_csv("../data/ratings_small.csv")

In [21]:
# Convert values to int if possible
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [22]:
# Add rating_id to movies dataframe to look up ratings from other users when using collaborative filtering
id_map = pd.read_csv('../data/links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['rating_id', 'id']
movies = movies.merge(id_map, on="id")

In [23]:
display(movies)

Unnamed: 0,genres,id,overview,popularity,poster_path,release_date,runtime,title,vote_average,keywords,soup,rating_id
0,"['animation', 'comedy', 'family']",862,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,1995-10-30,81.0,Toy Story,7.7,"['jealousy', 'toy', 'boy', 'friendship', 'frie...",jealousy toy boy friendship friends rivalry bo...,1
1,"['adventure', 'fantasy', 'family']",8844,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,1995-12-15,104.0,Jumanji,6.9,"['boardgame', 'disappearance', ""basedonchildre...",boardgame disappearance basedonchildren'sbook ...,2
2,"['romance', 'comedy']",15602,A family wedding reignites the ancient feud be...,11.712900,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,1995-12-22,101.0,Grumpier Old Men,6.5,"['fishing', 'bestfriend', 'duringcreditsstinge...",fishing bestfriend duringcreditsstinger oldmen...,3
3,"['comedy', 'drama', 'romance']",31357,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,1995-12-22,127.0,Waiting to Exhale,6.1,"['basedonnovel', 'interracialrelationship', 's...",basedonnovel interracialrelationship singlemot...,4
4,['comedy'],11862,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,1995-02-10,106.0,Father of the Bride Part II,5.7,"['baby', 'midlifecrisis', 'confidence', 'aging...",baby midlifecrisis confidence aging daughter m...,5
...,...,...,...,...,...,...,...,...,...,...,...,...
7310,"['crime', 'drama', 'thriller', 'western']",338766,A divorced dad and his ex-con brother resort t...,12.565896,/6YOrNBdoXvT8aC5VPLkkN6t5z0V.jpg,2016-08-12,102.0,Hell or High Water,7.2,"['desperation', 'texas', 'bankrobber', 'brothe...",desperation texas bankrobber brother shooting ...,161582
7311,"['drama', 'thriller']",314420,A night out turns deadly when three girls brea...,6.236714,/dw0xE56whflv0OgfEw3lwdWxLRD.jpg,2015-01-25,75.0,Body,5.0,['mansion'],mansion drama thriller,161830
7312,"['comedy', 'horror', 'sciencefiction']",390989,The new installment of the Sharknado franchise...,4.574494,/jcP3HFXF1BIW9LmBrDusbbDZjBG.jpg,2016-07-31,85.0,Sharknado 4: The 4th Awakens,4.3,"['sharkattack', 'sequel', 'farce', 'lasvegas',...",sharkattack sequel farce lasvegas creaturefeat...,161918
7313,['drama'],159550,A man must cope with the loss of his wife and ...,0.038998,/yWp7PgydSlxlhl7benKhTnCvRjN.jpg,2001-09-23,85.0,The Last Brickmaker in America,7.0,"['friendship', 'brickmaking']",friendship brickmaking drama,161944


Train Collaborative Filtering Model

In [24]:
ratings_df = ratings.copy(deep=True)
movies_df = movies.copy(deep=True)

# Instantiate reader object and svd object
reader = Reader(rating_scale=(1, 5))
algo = SVD()

# Load data into surprise dataset format with reader
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

svd = SVD(verbose=True, n_epochs=5)
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Train the model on the entire dataset by converting the CF dataset into a Surprice Trainset object
trainset = data.build_full_trainset()
svd.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9195  0.9170  0.9100  0.9090  0.9134  0.9138  0.0040  
MAE (testset)     0.7110  0.7088  0.7068  0.7038  0.7094  0.7080  0.0025  
Fit time          0.73    0.75    0.74    0.76    0.73    0.74    0.01    
Test time         0.06    0.10    0.06    0.06    0.10    0.08    0.02    
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x177f8fb80>

In [25]:
(svd.predict(uid = 1, iid = 1029))

Prediction(uid=1, iid=1029, r_ui=None, est=3.300204502831739, details={'was_impossible': False})

In [8]:
display(ratings[ratings.userId == 1])

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


Make Appropriate Recommendation

In [26]:
# Grabbing the inputs from the user
user = User()
titles = list(movies.title.values)
user.get_searchTerms()
user.get_fav_movie(titles)

In [28]:
print("User search terms: ", user.search_terms)
print("User favorite movie: ", user.fav_movie)

User search terms:  ['', '']
User favorite movie:  The Dark Knight


In [29]:
def top_movies_rec(movies = movies):
    ranked_titles = []
    ranked_movies = movies.copy(deep=True)
    ranked_movies = ranked_movies.sort_values("popularity", ascending=False)
    for idx in range(10):
        ranked_titles.append([movies.title.iloc[idx], movies.id.iloc[idx]])
    return ranked_titles

In [54]:
def content_based_rec(movies = movies, search_terms = []):
    ranked_titles = []
    
    df = movies.copy(deep=True)
    
    # Calculate the average "vote_average" to limit the recs to movies that are above average
    rating_avg = round(df.vote_average.mean(), 0)
    df = df[df.vote_average >= rating_avg]
    
    # Creating a copy of the last row of the dataset, which we will use to input the user's input
    new_row = df.iloc[-1,:].copy()
    
    # Adding the input to the new row
    new_row.iloc[-2] = " ".join(search_terms)
  
    # Adding the new row to the dataset
    df = df.append(new_row)
  
    # Vectorizing the entire matrix
    count = CountVectorizer(analyzer = 'word', stop_words='english', ngram_range=(1, 2))
    count_matrix = count.fit_transform(df['soup'])

    # Pairwise cosine similarity and creating the cosine matrix
    cosine_sim2 = cosine_similarity(count_matrix, count_matrix)
  
    # Sorting cosine similarities by highest to lowest
    sim_scores = list(enumerate(cosine_sim2[-1,:]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Append top ranked df based on search terms to ranked_titles
    for i in range(1, 51):
        indx = sim_scores[i][0]
        ranked_titles.append([df['title'].iloc[indx], df['id'].iloc[indx]])
        
    return ranked_titles

In [55]:
def collab_based_rec(movies = movies, ratings = ratings, fav_movie = None):
    
    

IndentationError: expected an indented block (3291132181.py, line 3)

In [56]:
def hybrid_rec(movies = movies, search_terms = [], fav_movie = None):
    return "hybrid_rec"

In [57]:
def make_recommendation(movies = movies, ratings = ratings, search_terms = [], fav_movie = None):
    if (not search_terms or search_terms == ['', '']) and not fav_movie:
        return top_movies_rec(movies)
    elif not fav_movie and search_terms:
        return content_based_rec(movies=movies, search_terms=search_terms)
    elif fav_movie and (not search_terms or search_terms == ['', '']):
        return collab_based_rec(movies=movies, ratings=ratings, fav_movie=fav_movie)
    else:
        return hybrid_rec(movies=movies, search_terms=search_terms, fav_movie=fav_movie)

In [58]:
make_recommendation(movies=movies, ratings=ratings, search_terms=user.search_terms, fav_movie=user.fav_movie)

'The Dark Knight'

In [59]:
content_results = content_based_rec(movies, ['comedy, romance'])
print(content_results)

  df = df.append(new_row)


[['The Goodbye Girl', 14741], ['Pajama Party', 53617], ['Hardball', 20857], ['Serial', 41034], ['Chasing Papi', 24621], ["I'm No Angel", 34456], ['Lover Come Back', 40894], ['Mr. Blandings Builds His Dream House', 32294], ['Starting Over', 54663], ['About Last Night', 222899], ['My Man Godfrey', 13562], ["She's Gotta Have It", 27995], ['Jump Tomorrow', 25985], ['Worth Winning', 32330], ['Happy Accidents', 22230], ['The Owl and the Pussycat', 42597], ['30 YEARS TO LIFE', 41756], ['Butterflies Are Free', 55106], ['Cactus Flower', 28289], ['The Thrill of It All', 22233], ["What's Up, Doc?", 6949], ['Desk Set', 24203], ['Safety Last!', 22596], ['The Bachelor and the Bobby-Soxer', 27437], ['Unfaithfully Yours', 33115], ['The 40 Year Old Virgin', 6957], ['Bunny and the Bull', 35395], ['Obvious Child', 248774], ['The Ugly Truth', 20943], ['Café au Lait', 47507], ['One Fine Day', 7300], ['Twelfth Night', 44705], ['It Happened One Night', 3078], ['One Crazy Summer', 18282], ['Nothing in Common'

In [66]:
content_recs_df = pd.DataFrame(content_results).reset_index()
content_recs_df.columns = ['content_rank', 'title', 'id']
display(content_recs_df)

Unnamed: 0,content_rank,title,id
0,0,The Goodbye Girl,14741
1,1,Pajama Party,53617
2,2,Hardball,20857
3,3,Serial,41034
4,4,Chasing Papi,24621
5,5,I'm No Angel,34456
6,6,Lover Come Back,40894
7,7,Mr. Blandings Builds His Dream House,32294
8,8,Starting Over,54663
9,9,About Last Night,222899


In [46]:
fav_movie_id = movies[movies.title == user.fav_movie]['rating_id'].values[0]

In [47]:
print(fav_movie_id)

58559


In [127]:
similar_users = set(ratings[(ratings.movieId == fav_movie_id) & (ratings.rating >= 4.5)]["userId"][:15])

In [128]:
print(similar_users)

{38, 40, 73, 42, 13, 46, 15, 78, 61, 56, 89, 26, 29, 62, 31}


In [129]:
def get_est(similar_users, movie, model):
    total_est = 0
    num_users = len(similar_users)
    for user in similar_users:
        est = model.predict(uid=user, iid=movie).est
        total_est += est
    
    return total_est / num_users

In [130]:
content_recs_df['est'] = content_recs_df.apply(lambda x: get_est(similar_users, x.id, svd), axis=1)

In [131]:
display(content_recs_df)

Unnamed: 0,content_rank,title,id,est
0,0,The Goodbye Girl,14741,3.676259
1,1,Pajama Party,53617,3.676259
2,2,Hardball,20857,3.676259
3,3,Serial,41034,3.676259
4,4,Chasing Papi,24621,3.676259
5,5,I'm No Angel,34456,3.676259
6,6,Lover Come Back,40894,3.676259
7,7,Mr. Blandings Builds His Dream House,32294,3.676259
8,8,Starting Over,54663,3.676259
9,9,About Last Night,222899,3.676259


In [132]:
display(content_recs_df.sort_values(["est", "content_rank"], ascending=[False, True]))

Unnamed: 0,content_rank,title,id,est
44,44,Addicted to Love,2058,3.754097
25,25,The 40 Year Old Virgin,6957,3.730149
30,30,One Fine Day,7300,3.716078
0,0,The Goodbye Girl,14741,3.676259
1,1,Pajama Party,53617,3.676259
2,2,Hardball,20857,3.676259
3,3,Serial,41034,3.676259
4,4,Chasing Papi,24621,3.676259
5,5,I'm No Angel,34456,3.676259
6,6,Lover Come Back,40894,3.676259
