In [22]:
from dataprocessing import *
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# The basic idea is to sort films by 
# 1. the average ratings across users 
# 2. watched time by users 
# 3. the number of genres the movie includes 
# -> all of these are generic features

In [3]:
# Building dataset for non-personalised system
dfn = pd.merge(dfr, dfm, on='movieId')
dfn

Unnamed: 0,userId,movieId,rating,title,genres
0,0,0,2,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,7,0,6,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,9,0,6,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,11,0,10,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,12,0,6,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...
8991508,41175,5854,7,Only the Lonely (1991),Comedy|Romance
8991509,41426,5854,8,Only the Lonely (1991),Comedy|Romance
8991510,42269,5854,6,Only the Lonely (1991),Comedy|Romance
8991511,42335,5854,6,Only the Lonely (1991),Comedy|Romance


In [4]:
train_users, test_users = train_test_split(np.arange(n_userIds), test_size=0.3, random_state=3)

In [5]:
train = dfn[dfn['userId'].isin(train_users)].copy()
train

Unnamed: 0,userId,movieId,rating,title,genres
1,7,0,6,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,12,0,6,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
6,14,0,8,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
7,19,0,8,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
11,27,0,8,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...
8991506,40901,5854,7,Only the Lonely (1991),Comedy|Romance
8991507,41058,5854,7,Only the Lonely (1991),Comedy|Romance
8991509,41426,5854,8,Only the Lonely (1991),Comedy|Romance
8991511,42335,5854,6,Only the Lonely (1991),Comedy|Romance


In [6]:
test = dfn[dfn['userId'].isin(test_users)].copy()
test

Unnamed: 0,userId,movieId,rating,title,genres
0,0,0,2,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,9,0,6,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,11,0,10,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
5,13,0,6,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
8,22,0,6,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...
8991501,40204,5854,8,Only the Lonely (1991),Comedy|Romance
8991502,40354,5854,5,Only the Lonely (1991),Comedy|Romance
8991503,40396,5854,7,Only the Lonely (1991),Comedy|Romance
8991508,41175,5854,7,Only the Lonely (1991),Comedy|Romance


In [7]:
len(test['userId'].unique())

13031

In [8]:
len(train['userId'].unique())

30404

In [9]:
# Count the films that have full rating -> we can see that it's not enough to have on feature for filtering
len(train[train['rating'] == 10])

937832

In [10]:
# Feature data build
# Get the averaage rating  for each film
average_rating = train.groupby('movieId')['rating'].mean()
train['averageRating'] = train['movieId'].map(average_rating)

# Get the number of rating for each film
counts = train['movieId'].value_counts()
train['rateTime'] = train['movieId'].map(counts)

# Get the number of genres for each film
train['genresCount'] = train['genres'].str.count('\\|') + 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['averageRating'] = train['movieId'].map(average_rating)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['rateTime'] = train['movieId'].map(counts)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['genresCount'] = train['genres'].str.count('\\|') + 1


In [11]:
# Drop the irelevant columns to save some space
train = train.drop(['genres', 'rating', 'userId'], axis=1)

In [12]:
# Drop duplicates in the dataset
train = train.drop_duplicates()

In [13]:
# Sort the dataset by the averageRating, ratetime and number of genres in order.
train = train.sort_values(['averageRating', 'rateTime', 'genresCount'], ascending=[False, False, False])

In [14]:
train

Unnamed: 0,movieId,title,averageRating,rateTime,genresCount
5075163,1234,"Shawshank Redemption, The (1994)",8.898349,16901,1
395067,53,"Godfather, The (1972)",8.830375,11832,2
642994,87,"Usual Suspects, The (1995)",8.730222,14208,3
208898,38,Schindler's List (1993),8.702888,14991,2
1051252,151,Double Indemnity (1944),8.665615,1585,3
...,...,...,...,...,...
8620415,4316,Glitter (2001),2.337398,246,3
6110550,1554,Barney's Great Adventure (1998),2.304636,151,2
8913386,5386,Pokemon 4 Ever (a.k.a. Pokémon 4: The Movie) (...,2.280000,150,4
8913849,5388,Pokémon Heroes (2003),2.075472,106,2


In [15]:
ar = 5855 - len(train[['averageRating']].value_counts())
print(f'{ar} films that have the same average rating.\n')
rt = 5855 - len(train[['averageRating', 'rateTime']].value_counts())
print(f'{rt} films that have the same average rating and the same number of rate time.\n')
gc = 5855 - len(train[['averageRating', 'rateTime', 'genresCount']].value_counts())
print(f'{gc} films that have the same average rating, the same number of rate time and the same number of genres.\n')

216 films that have the same average rating.

44 films that have the same average rating and the same number of rate time.

10 films that have the same average rating, the same number of rate time and the same number of genres.



In [16]:
r = train.head(30)

In [18]:
# Build evaluation dataframe
# The Idea is to apply different metrics for evaluation. Metrics used: MSE, RMSE, Recall30 and Precision

In [19]:
# A list of top 30 
movie_list = r['movieId'].values

# Get rows that has the same movie id as the recommended 30 films
e_df = test.loc[test['movieId'].isin(movie_list)]

e_df = e_df.drop(['title', 'genres'], axis = 1)

# Make a new column for predicted rating
e_df['pred'] = e_df['movieId'].map(average_rating)

In [20]:
e_df

Unnamed: 0,userId,movieId,rating,pred
208897,0,38,10,8.702888
208899,3,38,10,8.702888
208900,9,38,8,8.702888
208902,11,38,10,8.702888
208908,25,38,10,8.702888
...,...,...,...,...
8853375,38268,5107,10,8.611940
8853380,39915,5107,8,8.611940
8853381,40652,5107,8,8.611940
8853383,41001,5107,8,8.611940


In [26]:
# Evaluation in the metric of mse and rmse
mse = mean_squared_error(e_df['rating'].values, e_df['pred'].values)
mse
rmse = np.sqrt(mse)
rmse

1.5548202889350093

In [27]:
mse

2.417466130883946

In [43]:
# Evaluation in the metric of mse and rmse
good_rate_threshold = 7
all_rec = len(e_df)
tp = len(e_df[(e_df['rating'] >= good_rate_threshold) & (e_df['pred'] >= good_rate_threshold)])
precision =  tp / all_rec

gt_good = len(test[(test['rating'] >= good_rate_threshold)])
recall30 = tp / gt_good

In [44]:
precision

0.8883418457338302

In [45]:
recall30

0.03625924808339771