In [67]:
!pip install surprise



In [68]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Reader, Dataset
from surprise import accuracy
from surprise.model_selection import GridSearchCV, cross_validate, train_test_split
from surprise import SVD
from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds

import warnings
from collections import OrderedDict
from datetime import date
warnings.filterwarnings('ignore')


In [69]:
df_train = pd.read_csv('train.csv')

In [70]:
df_train.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,5163,57669,4.0,1518349992
1,106343,5,4.5,1206238739
2,146790,5459,5.0,1076215539
3,106362,32296,2.0,1423042565
4,9041,366,3.0,833375837


In [71]:
df_train.rating.unique()

array([4. , 4.5, 5. , 2. , 3. , 1. , 3.5, 2.5, 0.5, 1.5])

In [72]:
df_train.shape

(10000038, 4)

In [73]:
# Sample the data to 50 000 rows for easy computation and generate the same random rows for each iteration.
df_train1 = df_train.sample(n=50000, random_state = 42)

In [74]:
df_train1.shape

(50000, 4)

In [75]:
df_train1.head()

Unnamed: 0,userId,movieId,rating,timestamp
5954604,122380,31445,2.5,1159967140
1460784,22380,56775,4.0,1343936580
6631832,104339,356,2.5,1111529397
5396407,64877,6874,4.0,1513800297
8150654,63164,2762,5.0,1005315064


In [76]:
df_test = pd.read_csv('test.csv')

In [77]:
df_test.head()

Unnamed: 0,userId,movieId
0,5,788
1,68,7438
2,336,40412
3,803,3822
4,547,903


In [78]:
# Drop the timestamp column on the train data
df_train1 = df_train1.drop('timestamp', axis=1)

In [79]:
df_train1.head()

Unnamed: 0,userId,movieId,rating
5954604,122380,31445,2.5
1460784,22380,56775,4.0
6631832,104339,356,2.5
5396407,64877,6874,4.0
8150654,63164,2762,5.0


In [81]:
#Assign X as the original df_train1 dataframe and y as the user_id column of df_train1.
X = df_train1.copy()
y = df_train1['userId']

#Split into training and test datasets
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25,random_state=42)

In [82]:
#create a pivot_table function
p_table = X_train.pivot_table(index='userId', values='rating',  columns='movieId')

p_table.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,202497,202719,203244,203334,203881,204698,204926,205327,205425,205573
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,,,,,,,,,,,...,,,,,,,,,,
12,,,,,,,,,,,...,,,,,,,,,,
18,,,,,,,,,,,...,,,,,,,,,,
21,,,,,,,,,,,...,,,,,,,,,,
31,,,,,,,,,,,...,,,,,,,,,,


In [83]:
#replace the null values with zeros
p_table1 = p_table.copy().fillna(0)

In [84]:
#Compute the cosine similarity matrix using the the imputed data
sim_table = cosine_similarity(p_table1, p_table1)

In [85]:
#Convert into pandas dataframe 
sim_table1 = pd.DataFrame(sim_table, index=p_table.index, columns=p_table.index)

sim_table1.head(10)

userId,2,12,18,21,31,46,64,69,72,80,...,162492,162497,162498,162508,162512,162516,162519,162521,162528,162529
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
46,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
80,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Model 1 - Mean rating

In [86]:
#User Based Collaborative Filter using Mean Ratings
def cf_user_mean(userId, movieId):
    
    #Check if movieId exists in p_table
    if movieId in p_table:
        #Compute the mean of all the ratings given to the movie
        mean_rating = p_table[movieId].mean()
    
    else:
        #Default to a rating of 3.0 in the absence of any information
        mean_rating = 3.0
    
    return mean_rating

# Model 2 - Weigted Mean Rating

In [87]:
#User Based Collaborative Filter using Weighted Mean Ratings
def compute_ratings(id_user, id_movie):
    #Check if userId exists in sim_table
    if id_user in sim_table1:
        #Check if movieId exists in p_table
        if id_movie in p_table:
            sim_scores = sim_table1[id_user]
    
            #Get the user ratings for the movie in question
            m_ratings = p_table[id_movie]
        
            #Extract the indices containing NaN in the m_ratings series
            idx = m_ratings[m_ratings.isnull()].index
        
            #Drop the NaN values from the m_ratings Series
            m_ratings = m_ratings.dropna()
        
            #Drop the corresponding cosine scores from the sim_scores series
            sim_scores = sim_scores.drop(idx)
        
            #Compute the final weighted mean
            movie_rating = np.dot(sim_scores, m_ratings)/ sim_scores.sum()
            #Get the similarity scores for the user in question with every other user
            return movie_rating  
        
    else:
        #Default to a rating of 3.0 in the absence of any information
        return 3.0
    
   

In [88]:
compute_ratings(104339, 356) # predicts rating for user_id and movie_id 104339, 356

4.194514819974226

In [90]:
#Function that computes the root mean squared error (or RMSE)
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [91]:
#Function to compute the RMSE score obtained on the testing set by a model
def score(cf_model):
    
    #Construct a list of user-movie tuples from the testing dataset
    id_pairs = zip(X_test['userId'], X_test['movieId'])
    
    #Predict the rating for every user-movie tuple
    y_pred = np.array([cf_model(user, movie) for (user, movie) in id_pairs])
    
    #Extract the actual ratings given by the users in the test data
    y_true = np.array(X_test['rating'])
    
    #Return the final RMSE score
    return rmse(y_true, y_pred)

In [103]:
#score(cf_user_mean)

In [93]:
#score(compute_ratings)

# Model 3 - SVD Algorithm

In [94]:
reader = Reader(rating_scale = (0.5,5))
#dataset creation
data = Dataset.load_from_df(df_train1, reader)

In [95]:
#Define the SVD algorithm object
svd = SVD()
#Evaluate the performance in terms of RMSE
cross_validate(svd, data, measures=['RMSE'], cv = 3)

{'test_rmse': array([0.99955105, 0.99791348, 1.0027022 ]),
 'fit_time': (2.6784374713897705, 2.439722776412964, 2.4147040843963623),
 'test_time': (0.1509091854095459, 0.22616028785705566, 0.1150822639465332)}

In [96]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x64078fb370>

In [97]:
trainset, testset = train_test_split(data, test_size=.25)

In [98]:

pred_svd = svd.test(testset)

In [99]:
# Then compute RMSE
accuracy.rmse(pred_svd)

RMSE: 0.6932


0.6932082150095444

# Prepare Submission

In [100]:
df_test["rating"] = round(df_test.apply(lambda x: svd.predict(x["userId"], x["movieId"]).est, axis=1),1)
df_test["Id"] = df_test.apply(lambda x: f"{x['userId']:.0f}_{x['movieId']:.0f}", axis=1)
submission = df_test[["Id", "rating"]]

In [101]:
submission.to_csv('Ndimphiwe_submission_2.csv',index=False)

In [102]:
submission.head()

Unnamed: 0,Id,rating
0,5_788,3.1
1,68_7438,3.7
2,336_40412,3.7
3,803_3822,3.8
4,547_903,4.1
