In [132]:
import pandas as pd
import numpy as np
import random

import plotly
from plotly.offline import plot, iplot
import plotly.graph_objs as go
from plotly.offline import *

from sklearn.metrics import ndcg_score

from surprise.prediction_algorithms import SVD, KNNWithMeans, KNNBasic, knns
from surprise.model_selection import GridSearchCV, cross_validate, train_test_split
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy

In [35]:
df_init = pd.read_csv('./Movie_data/ratings.csv')
df_init.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
userId       100836 non-null int64
movieId      100836 non-null int64
rating       100836 non-null float64
timestamp    100836 non-null int64
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [36]:
df = df_init.drop(columns='timestamp')
df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [74]:
rating_count = df.groupby(["userId"]).size()
rating_count

userId
1       232
2        29
3        39
4       216
5        44
       ... 
606    1115
607     187
608     831
609      37
610    1302
Length: 610, dtype: int64

In [51]:
unique_rating_count = df.groupby(["userId"])['movieId'].nunique()
(unique_rating_count - rating_count).sum()

0

We can confirm that each user has only rated each movie at most once.

In [121]:
movie_count = df.groupby(["movieId"]).size()
top_movies = movie_count.sort_values(ascending = False)
top_movies[0:5]

movieId
356     329
318     317
296     307
593     279
2571    278
dtype: int64

In [10]:
from surprise import Reader, Dataset
reader = Reader()
df_3 = Dataset.load_from_df(df,reader)

In [11]:
trainset, testset = train_test_split(df_3, test_size=0.2)

In [143]:
ratings_dist = df['rating'].value_counts().sort_index(ascending=False)
trace = go.Bar(x = ratings_dist.index,
               text = ['{:.1f} %'.format(val) for val in (ratings_dist.values / df.shape[0] * 100)],
               textposition = 'auto',
               textfont = dict(color = '#000000'),
               y = ratings_dist.values,
               )
layout = dict(title = 'Distribution of Ratings',
              xaxis = dict(title = 'Rating'),
              yaxis = dict(title = 'Count'))
graph_r = go.Figure(data=[trace], layout=layout)
plotly.offline.plot(graph_r)

'temp-plot.html'

In [148]:
moive_rated_dist = df.groupby(['movieId'])['rating'].count()

# Create trace
trace = go.Histogram(x = moive_rated_dist.values,
                     name = 'Ratings',
                     xbins = dict(start = 0,
                                  end = 500,
                                  size = 2))
# Create layout
layout = go.Layout(title = 'Distribution Of Number of Ratings Per Movie',
                   xaxis = dict(title = 'Number of Ratings Per Movie'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)

# Create plot
fig = go.Figure(data=[trace], layout=layout)
plotly.offline.plot(fig)

'temp-plot.html'

In [147]:
moive_rated_dist = df.groupby(['userId'])['rating'].count()

# Create trace
trace = go.Histogram(x = moive_rated_dist.values,
                     name = 'Ratings',
                     xbins = dict(start = 0,
                                  end = 1000,
                                  size = 2))
# Create layout
layout = go.Layout(title = 'Distribution Of Number of Ratings Per User',
                   xaxis = dict(title = 'Number of Ratings Per User'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)

# Create plot
fig = go.Figure(data=[trace], layout=layout)
plotly.offline.plot(fig)

'temp-plot.html'

In [15]:
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), KNNWithMeans(), KNNBasic()]:
    # Perform cross validation
    results = cross_validate(algorithm, df_3, measures=['RMSE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVD,0.879673,7.706104,0.448233
KNNWithMeans,0.904229,0.131619,2.637263
KNNBasic,0.957331,0.096304,2.421421


In [16]:
svd = SVD(n_factors=100, n_epochs=10, lr_all=0.005, reg_all=0.4)
svd.fit(trainset)
predictions = svd.test(testset)
print(accuracy.rmse(predictions))

RMSE: 0.8806
0.8805897220186978


In [55]:
sim_pearson = {'name':'pearson', 'user_based':False}
knn_means = knns.KNNWithMeans(sim_options=sim_pearson)
knn_means.fit(trainset)
predictions = knn_means.test(testset)
print(accuracy.rmse(predictions))

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.8997
0.8997281871809383


In [139]:
def suggest_movies(x):
    if 5 < rating_count[x] <=15:
        for y in list(range(0, len(df['movieId']))):
            user_prediction = knn_means.predict(x, y)
            pred_rating.update({user_prediction[1]: user_prediction[3]})
        pred_rating = {k: v for k, v in sorted(pred_rating.items(), key=lambda item: item[1])}
        rec = list(pred_rating)[-6:-1]
    elif rating_count[x] > 15:
        pred_rating = {}
        for y in list(range(0, len(df['movieId']))):
            user_prediction = svd.predict(x, y)
            pred_rating.update({user_prediction[1]: user_prediction[3]})
        pred_rating = {k: v for k, v in sorted(pred_rating.items(), key=lambda item: item[1])}
        rec = list(pred_rating)[-6:-1]
    else:
        rec = random.sample(list(top_movies[0:50]), 5) 
    return rec, pred_rating

In [151]:
predictions = suggest_movies(6)[1]

In [152]:
pred = []
n=0
for key, value in predictions.items():
    pred.append(value)
    n = n+1
scores = np.asarray([pred])
true_relevance = np.asarray([df['rating'].tolist()])
ndcg_score(true_relevance, scores)

0.9754626214400527

In [142]:
params = {'n_factors': [60, 65, 75 ],
         'reg_all': [0.04, 0.045, 0.05]}
g_s_svd = GridSearchCV(SVD,param_grid=params,n_jobs=-1)
g_s_svd.fit(df_3)
print(g_s_svd.best_score)
print(g_s_svd.best_params)


{'rmse': 0.8689738364847812, 'mae': 0.6676408262947351}
{'rmse': {'n_factors': 75, 'reg_all': 0.05}, 'mae': {'n_factors': 75, 'reg_all': 0.05}}
