In [None]:
# Jon's notebook

# remember to add non-visual graphics

In [None]:
# Contents with hyperlinks within document?

# Overview

- business / data understanding - what kind of data are you using? and what makes it well-suited for the business problem
- data prep - why did you choose the data prep steps that you did? and what was the result
- modeling - what modeling packages did you use, which models within the packages, and what tuning steps did you take?
- evaluation - How well did your model perform? Relevant metrics. Validation approach

# Business and Data Understanding

what kind of data are you using? and what makes it well-suited for the business problem

# Data Preparation 

feature engineering, using pipelines or using unsupervised techniques

## Imports

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from surprise import Dataset, Reader, accuracy
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from surprise.prediction_algorithms import SVD, SVDpp, NMF, BaselineOnly, NormalPredictor 

import pickle

## Loading in data files

### Links from MovieLens to IMDB and TMDB

- for IMDB: https://www.imdb.com/tt then movie id
- for TMDB: https://www.themoviedb.org/movie/ then movie id

In [21]:
links = pd.read_csv('data/links.csv')
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [24]:
links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB


In [25]:
links.isna().sum()

movieId    0
imdbId     0
tmdbId     8
dtype: int64

### Movie IDs with Title and Genres

only movies with at least one rating or tag are included, per MovieLens

Movie IDs are consistent across all four data files, and correspond with the __[MovieLens.org/movies/](https://movielens.org/movies/)__ URL

Genres are a pipe-separated list selected from the following:
- Action
- Adventure
- Animation
- Children's
- Comedy
- Crime
- Documentary
- Drama
- Fantasy
- Film-Noir
- Horror
- Musical
- Mystery
- Romance
- Sci-Fi
- Thriller
- War
- Western
- (no genres listed)

In [22]:
movies = pd.read_csv('data/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [26]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [27]:
movies.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

### User Ratings (min. 20 per userid)

user id is the same with tags.csv below too

Users are selected for inclusion at random per MovieLens

Ratings are made on a 5 star scale, with half-star increments (0.5 - 5.0)

In [19]:
ratings = pd.read_csv('data/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [28]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [29]:
ratings.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

### Tags provided by user

Tags are "user-generated metadata about movies...typically a single word or short phrase"

Timestamp represents seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970

In [20]:
tags = pd.read_csv('data/tags.csv')
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [30]:
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


In [31]:
tags.isna().sum()

userId       0
movieId      0
tag          0
timestamp    0
dtype: int64

## Combined dataframe with movies and ratings

In [45]:
df = pd.merge(movies, ratings, on='movieId')
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [46]:
len(df)

100836

In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   movieId    100836 non-null  int64  
 1   title      100836 non-null  object 
 2   genres     100836 non-null  object 
 3   userId     100836 non-null  int64  
 4   rating     100836 non-null  float64
 5   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 5.4+ MB


In [48]:
df.isna().sum()

movieId      0
title        0
genres       0
userId       0
rating       0
timestamp    0
dtype: int64

In [49]:
df.describe()

Unnamed: 0,movieId,userId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,19435.295718,326.127564,3.501557,1205946000.0
std,35530.987199,182.618491,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,1199.0,177.0,3.0,1019124000.0
50%,2991.0,325.0,3.5,1186087000.0
75%,8122.0,477.0,4.0,1435994000.0
max,193609.0,610.0,5.0,1537799000.0


# Modeling

- Rationale, results, limitations & recommendations
- Multiple different packages & model explainability tools

In [None]:
# start with dummy / simple model
# then clear explanation why you went from one model to the next
# cross val scores?
# why is this going to help your model?

# limitations: cold start.  At least talk about.  Maybe hybrid model
# with content-based filtering to start?

## Surprise Data and train/test split

In [50]:
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(df[['movieId', 'userId', 'rating']], reader)

In [51]:
# train/test split at 80/20

trainset, testset = train_test_split(data, test_size=0.2, random_state=24)


In [55]:
print('Number of users: ', trainset.n_users, '\n')
print('Number of items: ', trainset.n_items, '\n')
print('Number of ratings: ', trainset.n_ratings, '\n')

Number of users:  9010 

Number of items:  610 

Number of ratings:  80668 



In [56]:
(trainset.n_ratings) / len(df)

0.7999920663255187

## Baseline Simple Model - Random Rating

The NormalPredictor algorithm predicts a random rating based on the distribution of the training set, which is assumed to be normal.

In [57]:
# from Daniel

# Instantiate and fit model
baseline = NormalPredictor()
baseline.fit(trainset)

# Return test predictions for model fit on trainset
predictions = baseline.test(testset)

# Save RMSE score to variable
baseline_normal = accuracy.rmse(predictions)

RMSE: 1.4215


## Baseline Simple Model - Baseline Estimate

In [59]:
# from Daniel

# Instantiate and fit model
baseline2 = BaselineOnly()
baseline2.fit(trainset)

# Return test predictions for model fit on trainset
predictions = baseline2.test(testset)

# Save RMSE score to variable
baseline_only = accuracy.rmse(predictions)

Estimating biases using als...
RMSE: 0.8723


## SVD and Cross-validating a Simple Model

In [62]:
svd_basic = SVD(random_state=24)
cross_validate(svd_simple, data, measures=['RMSE'], cv=3, n_jobs=-2, verbose=True)

Evaluating RMSE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8808  0.8923  0.8744  0.8825  0.0074  
Fit time          2.81    2.83    2.80    2.81    0.01    
Test time         0.22    0.17    0.19    0.19    0.02    


{'test_rmse': array([0.88082278, 0.89233774, 0.87438007]),
 'fit_time': (2.806281328201294, 2.8297955989837646, 2.799684762954712),
 'test_time': (0.2196483612060547, 0.16658329963684082, 0.19325041770935059)}

In [63]:
# from Daniel

# Fit to trainset and predict on the testset for evaluation
svd_basic.fit(trainset)
predictions = svd_basic.test(testset)
svd_simple = accuracy.rmse(predictions)

RMSE: 0.8740


In [None]:
# svd_simple = SVD(verbose=True, n_epochs=10)
# cross_validate(svd_simple, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

In [None]:
# Canvas - 

# trainset, testset = train_test_split(jokes, test_size=0.2)

In [None]:
# print(len(testset))

## GridSearchCV

In [65]:
# from Canvas


# dictionary of hyperparameters
svd_param_grid = {'n_factors':[20, 100],
                  'n_epochs': [5, 10], 
                  'lr_all': [0.002, 0.005],
                  'reg_all': [0.4, 0.6]}
# instantiate
svd_gs_model = GridSearchCV(SVD,param_grid=svd_param_grid,n_jobs = -2,joblib_verbose=5)

# model is fot and best_params scored by RMSE
svd_gs_model.fit(data)
svd_gs_model.best_params['rmse']

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=-2)]: Done  50 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-2)]: Done  76 out of  80 | elapsed:   20.8s remaining:    1.0s
[Parallel(n_jobs=-2)]: Done  80 out of  80 | elapsed:   21.3s finished


{'n_factors': 20, 'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}

In [None]:
# svd = SVD(n_factors=100, n_epochs=10, lr_all=0.005, reg_all=0.4)
# svd.fit(trainset)
# predictions = svd.test(testset)
# print(accuracy.rmse(predictions))

In [None]:
# towards Data Science blog

# trainset = data.build_full_trainset()
# svd.fit(trainset)

In [None]:
# svd.predict(uid=10, iid=100)

In [None]:
# Returns:

# Prediction(uid=10, iid=100, r_ui=None, est=4.051206489275292, details={'was_impossible': False})

In [None]:
# import difflib
# import random

# def get_book_id(book_title, metadata):
    
#     """
#     Gets the book ID for a book title based on the closest match in the metadata dataframe.
#     """
    
#     existing_titles = list(metadata['title'].values)
#     closest_titles = difflib.get_close_matches(book_title, existing_titles)
#     book_id = metadata[metadata['title'] == closest_titles[0]]['id'].values[0]
#     return book_id

# def get_book_info(book_id, metadata):
    
#     """
#     Returns some basic information about a book given the book id and the metadata dataframe.
#     """
    
#     book_info = metadata[metadata['id'] == book_id][['id', 'isbn', 
#                                                     'authors', 'title', 'original_title']]
#     return book_info.to_dict(orient='records')

# def predict_review(user_id, book_title, model, metadata):
    
#     """
#     Predicts the review (on a scale of 1-5) that a user would assign to a specific book. 
#     """
    
#     book_id = get_book_id(book_title, metadata)
#     review_prediction = model.predict(uid=user_id, iid=book_id)
#     return review_prediction.est

# def generate_recommendation(user_id, model, metadata, thresh=4):
    
#     """
#     Generates a book recommendation for a user based on a rating threshold. Only
#     books with a predicted rating at or above the threshold will be recommended
#     """
    
#     book_titles = list(metadata['title'].values)
#     random.shuffle(book_titles)
    
#     for book_title in book_titles:
#         rating = predict_review(user_id, book_title, model, metadata)
#         if rating >= thresh:
#             book_id = get_book_id(book_title, metadata)
#             return get_book_info(book_id, metadata)

In [None]:
# code for 2D visualization of multi-dimensional recommendation (scatter plot)
# towards data science post

# Evaluation