In [None]:
# Mount Google Drive so the dataset folder is reachable. The google.colab
# package only exists inside Colab; when the notebook runs elsewhere the import
# fails, so skip the mount instead of aborting a "Run All".
try:
    from google.colab import drive
except ImportError:
    drive = None
else:
    drive.mount('/content/drive')

In [1]:
# %matplotlib notebook

# Imports
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from xgboost import XGBRegressor

import os
import pickle
import wordcloud as wc
import scipy.sparse as sparse
import xgboost as xgb
import random
import jenkspy
import statsmodels.tools.tools as stattools
from itertools import combinations

import sklearn
from sklearn import tree
from sklearn.model_selection import KFold,train_test_split
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction import DictVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

import surprise
from surprise import Reader, Dataset
from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering, accuracy  
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

In [2]:
# Detect whether the notebook is running inside Google Colab by probing for the
# google.colab package. Only a failed import means "not Colab"; any other
# exception is a real problem and should surface rather than be swallowed.
try:
  import google.colab
  IN_COLAB = True
except ImportError:
  IN_COLAB = False

In [3]:
# Root folder of the datasets: the mounted Drive folder on Colab, a local
# ./DataSets/ directory otherwise.
Dataset_path = (
    "/content/drive/MyDrive/Colab Notebooks/DataSets/"
    if IN_COLAB
    else "./DataSets/"
)

# Sub-folder holding the ratings file.
ml_path = f"{Dataset_path}ml-latest/"

In [4]:
# Read the ratings table from the ml-latest folder and the per-movie metadata
# table from the dataset root.
ratings = pd.read_csv(f"{ml_path}ratings.csv")
Movies_metadata = pd.read_csv(f"{Dataset_path}MoviesInfo.csv")

In [5]:
# Print the column dtypes, row count and memory usage of the raw ratings frame.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27753444 entries, 0 to 27753443
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 847.0 MB


### Keep only ratings made between 1995 and 2001 (inclusive)

In [6]:
from datetime import datetime  # no longer used here; kept in case a later cell relies on it

# Derive the calendar year of every rating from its Unix timestamp (seconds).
# The conversion is vectorized and done in UTC, so the result does not depend on
# the timezone of the machine running the notebook (datetime.fromtimestamp uses
# local time, which can move ratings made near New Year into a different year).
ratings['year_rated'] = pd.to_datetime(ratings['timestamp'], unit='s').dt.year

# Keep only ratings from 1995 through 2001 (Series.between includes both
# bounds) and renumber the rows from 0 without keeping the old index.
ratings = ratings[ratings['year_rated'].between(1995, 2001)].reset_index(drop=True)

In [7]:
# Summary of the year-filtered ratings. nunique()/min()/max() read the single
# column they need instead of running a full groupby().count() over every
# column just to count or inspect the group keys; the printed text is unchanged.
print(f"number of Ratings : {len(ratings)}")
print(f"number of movies : {ratings['movieId'].nunique()}")
print(f"number of users : {ratings['userId'].nunique()}")
print(f"range of rating : ( {ratings['rating'].min()}, {ratings['rating'].max()})  ")

number of Ratings : 7329482
number of movies : 4937
number of users : 103827
range of rating : ( 1.0, 5.0)  


### Keep only ratings whose user and movie each have more ratings than the threshold

In [9]:
# Activity thresholds: a movie / user is kept only when it has strictly more
# ratings than these values.
min_movie_ratings = 20
min_user_ratings = 20

# Ids of sufficiently rated movies, in value_counts order.
movie_counts = ratings['movieId'].value_counts()
filter_Movies = movie_counts[movie_counts > min_movie_ratings].index.tolist()

# Ids of sufficiently active users, in value_counts order.
user_counts = ratings['userId'].value_counts()
filter_users = user_counts[user_counts > min_user_ratings].index.tolist()

# A rating survives only when both its movie and its user passed the filter.
keep_movie = ratings['movieId'].isin(filter_Movies)
keep_user = ratings['userId'].isin(filter_users)
df_new = ratings[keep_movie & keep_user]

print('The original data frame shape:\t{}'.format(ratings.shape))
print('The new data frame shape:\t{}'.format(df_new.shape))

The original data frame shape:	(7329482, 5)
The new data frame shape:	(6921125, 5)


In [10]:
# Summary of the threshold-filtered ratings. nunique()/min()/max() read the
# single column they need instead of running a full groupby().count() over
# every column; the printed text is unchanged.
print(f"number of movies : {df_new['movieId'].nunique()}")
print(f"number of users : {df_new['userId'].nunique()}")
print(f"range of rating : ( {df_new['rating'].min()}, {df_new['rating'].max()})  ")

number of movies : 4393
number of users : 64140
range of rating : ( 1.0, 5.0)  


## initialize data

In [12]:
# Wrap the (userId, movieId, rating) triples in a Surprise Dataset; the reader
# declares the rating scale as 1 to 5.
# NOTE(review): the threshold-filtered frame `df_new` built earlier is never
# used - the models are trained on the unfiltered `ratings`. Confirm whether
# `df_new` was intended here; the later cells that list movie ids from
# `ratings` would have to be switched together with it.
reader = Reader(rating_scale=(1., 5.))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# `train_test_split` is Surprise's version here (its import comes after, and so
# shadows, the scikit-learn one). Seed it so the 75/25 split, and every score
# computed from it, is the same on each run.
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

In [None]:
# Cross-validate every candidate algorithm with default settings and collect
# one row of averaged results per algorithm.
benchmark = []
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    # 5-fold cross validation, scored by RMSE
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=5, verbose=False)

    # Average every reported column over the folds, then label the row with
    # the algorithm's class name. A plain dict is used because Series.append
    # no longer exists in pandas >= 2.0.
    row = pd.DataFrame.from_dict(results).mean(axis=0).to_dict()
    row['Algorithm'] = type(algorithm).__name__
    benchmark.append(row)

# Best (lowest test RMSE) algorithm first.
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

### User-user Collaborative Filtering
https://surprise.readthedocs.io/en/stable/knn_inspired.html

In [None]:
# Use user_based true/false to switch between user-based or item-based collaborative filtering
algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo.fit(trainset)

In [None]:
# we can now query for specific predicions
uid = str(10)  # raw user id
iid = str(10)  # raw item id

# get a prediction for specific users and items.
pred = algo.predict(uid, iid, r_ui=4, verbose=True)

In [None]:
# run the trained model against the testset
# get RMSE
print("User-based Model : Test Set")
test_pred = algo.test(testset)
accuracy.rmse(test_pred, verbose=True)

# if you wanted to evaluate on the trainset
print("User-based Model : Training Set")
train_pred = algo.test(trainset.build_testset())
accuracy.rmse(train_pred)

### Item-Item Collaborative Filtering

In [None]:
# Use user_based true/false to switch between user-based or item-based collaborative filtering
algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': False})
algo.fit(trainset)

In [None]:
# run the trained model against the testset
# get RMSE
print("Item-based Model : Test Set")
test_pred = algo.test(testset)
accuracy.rmse(test_pred, verbose=True)

# if you wanted to evaluate on the trainset
print("Item-based Model : Training Set")
train_pred = algo.test(trainset.build_testset())
accuracy.rmse(train_pred)

### Matrix Factorization

In [None]:
# —– SVD —– #

param_grid = {'n_factors': [110, 120, 140, 160], 
              'n_epochs': [90, 100, 110], 
              'lr_all': [0.001, 0.003, 0.005, 0.008],
              'reg_all': [0.08, 0.1, 0.15] }

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)
algo = gs.best_estimator['rmse']
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
# Use the new parameters with the train data
svd = SVD(n_factors=160, n_epochs=100, lr_all=0.005, reg_all=0.1)
svd.fit(trainset)
test_pred = svd.test(testset)
print("SVD : Test Set")
accuracy.rmse(test_pred, verbose=True)

In [None]:
import difflib
import random

def get_Movie_info(Movie_id, metadata):
    """
    Look up the basic description of one movie.

    Parameters
    ----------
    Movie_id : value compared against the 'movieId' column.
    metadata : DataFrame containing at least the columns 'movieId', 'genres',
        'title', 'original_title', 'year' and 'duration'.

    Returns
    -------
    list of dict
        One record (restricted to the columns above) per metadata row whose
        'movieId' equals Movie_id; an empty list when nothing matches.
    """
    wanted_columns = ['movieId', 'genres', 'title', 'original_title', 'year', 'duration']
    is_requested_movie = metadata['movieId'] == Movie_id
    matching_rows = metadata.loc[is_requested_movie, wanted_columns]
    return matching_rows.to_dict(orient='records')

def generate_recommendation(user_id, model, metadata, thresh=4, movie_ids=None):
    """
    Recommend one movie whose predicted rating for the user reaches a threshold.

    Candidate movies are visited in random order and the first one whose
    predicted rating is at or above `thresh` is returned.

    Parameters
    ----------
    user_id : raw user id passed to `model.predict`.
    model : fitted object exposing `predict(uid=..., iid=...)` whose result has
        an `est` attribute (the predicted rating).
    metadata : DataFrame handed to `get_Movie_info` to describe the chosen movie.
    thresh : minimum predicted rating that qualifies as a recommendation.
    movie_ids : optional iterable of candidate raw movie ids. When omitted, the
        unique movie ids of the notebook-level `ratings` frame are used, which
        is what the function always did before this parameter existed.

    Returns
    -------
    tuple or None
        `(movie_info, predicted_rating)` for the first qualifying movie, or
        None when no candidate reaches the threshold.
    """
    if movie_ids is None:
        movie_ids = ratings.movieId.unique().tolist()

    # Shuffle a private copy so the caller's sequence is never reordered.
    candidates = list(movie_ids)
    random.shuffle(candidates)

    for Movie_ID in candidates:
        rating = model.predict(uid=user_id, iid=Movie_ID).est
        if rating >= thresh:
            return get_Movie_info(Movie_ID, metadata), rating

    # No candidate reached the threshold.
    return None

In [None]:
# Example: one recommendation for raw user id 51737 from the fitted SVD model.
generate_recommendation(51737, svd, Movies_metadata)

In [None]:
from sklearn.manifold import TSNE

# Project the SVD item-factor matrix (svd.qi, one row per item) to 2-D with t-SNE.
tsne = TSNE(n_components=2, n_iter=500, verbose=3, random_state=1)

Movies_embedding = tsne.fit_transform(svd.qi)

projection = pd.DataFrame(columns=['x', 'y'], data=Movies_embedding)

# Rows of svd.qi follow Surprise's inner item ids, not the row order of
# Movies_metadata, so assigning the metadata column positionally attaches the
# wrong titles. Translate every row back to its raw movieId and look the title
# up by id instead (movies missing from the metadata get NaN).
projection['movieId'] = [
    svd.trainset.to_raw_iid(inner_id) for inner_id in range(len(projection))
]
title_by_movie = (
    Movies_metadata
    .drop_duplicates(subset='movieId')
    .set_index('movieId')['original_title']
)
projection['title'] = projection['movieId'].map(title_by_movie)

In [None]:
import plotly.express as px

# Scatter plot of the 2-D movie embedding, one point per row of `projection`.
fig = px.scatter(projection, x='x', y='y')
fig.show()

In [None]:
import datapane as dp

def plot_Movies(Movies_ID, plot_name):
    """
    Scatter-plot the 2-D embedding of the given movies, labelled by title.

    Parameters
    ----------
    Movies_ID : iterable of raw movie ids to plot.
    plot_name : text shown as the figure title.

    Relies on the notebook-level `svd` (fitted model), `projection` (one row
    per row of svd.qi, with 'x', 'y' and 'title' columns) and `px`.
    """
    # Rows of `projection` are ordered by the model's inner item ids, so map
    # each raw id to its inner id instead of using its position in Movies_ID.
    # Ids that never made it into the training split have no embedding row
    # (to_inner_iid raises ValueError for them) and are skipped.
    Movie_indices = []
    for raw_id in Movies_ID:
        try:
            Movie_indices.append(svd.trainset.to_inner_iid(raw_id))
        except ValueError:
            continue

    Movies_vector_df = projection.iloc[Movie_indices]

    fig = px.scatter(
        Movies_vector_df, x='x', y='y', text='title', title=plot_name,
    )
    fig.show()

In [None]:
# Plot every distinct movie id found in the ratings frame.
Movies_ID = ratings.movieId.unique().tolist()

plot_Movies(Movies_ID, plot_name='Movies_embedding')