1. Load the movie ratings data (as in the HW3-recommender-system) and use matrix factorization technique(s) and predict the missing ratings from the test data. Measure the RMSE. You should use sklearn library. [10 pts]

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix, csr_matrix
from scipy.spatial.distance import jaccard, cosine 
from pytest import approx

In [2]:
# Load the user table, the movie table, and the train/test rating splits.
MV_users, MV_movies, train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/users.csv',
        'data/movies.csv',
        'data/train.csv',
        'data/test.csv',
    )
)

In [3]:
from collections import namedtuple
# Bundle the four tables into one record so they can be passed around together.
Data = namedtuple('Data', ('users', 'movies', 'train', 'test'))
data = Data(users=MV_users, movies=MV_movies, train=train, test=test)

In [4]:
class RecSys():
    """Base recommender: holds the data, ID lookups, rating matrix and scoring.

    Attributes set by ``__init__``:
        data: record exposing ``users``, ``movies``, ``train`` and ``test``
            data frames (``uID`` / ``mID`` / ``rating`` columns are read).
        allusers, allmovies: lists of all user / movie IDs.
        genres: movie columns other than ``mID``, ``title`` and ``year``.
        mid2idx, uid2idx: ID -> row/column position in the rating matrix.
        Mr: dense rating matrix of shape (#allusers, #allmovies); 0 marks
            a (user, movie) pair that has no rating in ``train``.
        Mm: movie feature matrix, ``None`` here (set by subclasses).
        sim: item-item similarity matrix, initialised to zeros.
    """

    def __init__(self,data):
        self.data=data
        self.allusers = list(self.data.users['uID'])
        self.allmovies = list(self.data.movies['mID'])
        self.genres = list(self.data.movies.columns.drop(['mID', 'title', 'year']))
        self.mid2idx = {mid: idx for idx, mid in enumerate(self.data.movies.mID)}
        self.uid2idx = {uid: idx for idx, uid in enumerate(self.data.users.uID)}
        self.Mr=self.rating_matrix()
        self.Mm=None
        self.sim=np.zeros((len(self.allmovies),len(self.allmovies)))

    def rating_matrix(self):
        """
        Convert the training ratings to a numpy array of shape (#allusers,#allmovies)
        """
        ind_movie = [self.mid2idx[x] for x in self.data.train.mID]
        ind_user = [self.uid2idx[x] for x in self.data.train.uID]
        rating_train = list(self.data.train.rating)
        n_users, n_movies = len(self.allusers), len(self.allmovies)
        sparse = coo_matrix((rating_train, (ind_user, ind_movie)), shape=(n_users, n_movies))
        return sparse.toarray()

    def predict_everything_to_3(self):
        """
        Predict everything to 3 for the test data.
        Returns numpy array of shape (# of rows in testdata,)
        """
        return np.full(len(self.data.test), 3)

    def predict_to_user_average(self):
        """
        Predict each test row as the average training rating of its user.
        Returns numpy array of shape (# of rows in testdata,).

        The average is (sum of the user's ratings) / (number of ratings > 0).
        A user with no rating > 0 gets NaN (``rmse`` later imputes NaN to 3).
        """
        rating_sums = self.Mr.sum(axis=1)
        rating_counts = (self.Mr > 0).sum(axis=1)
        # Pre-fill with NaN and divide only where a rating exists, so users
        # without ratings keep NaN without triggering a 0/0 warning.
        user_avg = np.full(rating_sums.shape, np.nan)
        np.divide(rating_sums, rating_counts, out=user_avg, where=rating_counts > 0)
        test_rows = [self.uid2idx[uid] for uid in self.data.test.uID]
        return user_avg[test_rows]

    def predict_from_sim(self,uid,mid):
        """
        Predict a user rating on a movie given userID and movieID.

        The prediction is the similarity-weighted mean of the user's ratings:
        dot(ratings, sim[movie]) / (sum of sim[movie] over movies rated > 0).
        Returns NaN when that denominator is 0.
        """
        user_ratings = self.Mr[self.uid2idx[uid]]
        movie_sims = self.sim[self.mid2idx[mid]]
        weight_total = np.dot(movie_sims, user_ratings > 0)
        if weight_total == 0:
            # No similarity mass on any rated movie: nothing to average.
            return np.nan
        return np.dot(user_ratings, movie_sims) / weight_total

    def predict(self):
        """
        Predict ratings in the test data. Returns predicted rating in a numpy array of size (# of rows in testdata,)
        """
        test = self.data.test
        # Iterate the ID columns directly (positional) instead of test.loc[i],
        # which is label-based and breaks when the index is not 0..n-1.
        return np.array([self.predict_from_sim(uid, mid)
                         for uid, mid in zip(test.uID, test.mID)])

    def rmse(self,yp):
        """
        Root-mean-squared error of predictions ``yp`` against the test ratings.

        NaN predictions are imputed to 3 before scoring. ``yp`` may be any
        array-like of numbers; it is copied, so the caller's object is left
        unchanged.
        """
        yp = np.asarray(yp, dtype=float)
        yp = np.where(np.isnan(yp), 3.0, yp)
        yt=np.array(self.data.test.rating)
        return np.sqrt(((yt-yp)**2).mean())

    
class ContentBased(RecSys):
    """Content-based recommender using movie genre indicators as features."""

    def __init__(self,data):
        super().__init__(data)
        self.data=data
        self.Mm = self.calc_movie_feature_matrix()

    def calc_movie_feature_matrix(self):
        """
        Create movie feature matrix in a numpy array of shape (#allmovies, #genres)
        """
        return self.data.movies[self.genres].values

    def calc_item_item_similarity(self):
        """
        Create item-item similarity using Jaccard similarity.

        Updates ``self.sim`` with J(A, B) = |A∩B| / |A∪B| computed on the
        non-zero entries of each movie's feature row. A pair whose union is
        empty (neither movie has any feature set) gets similarity 0 rather
        than NaN, so one such movie cannot turn every similarity-weighted
        prediction into NaN.
        """
        # Non-zero feature -> 1, so the matrix product counts shared features.
        has_feature = np.asarray(self.Mm).astype(bool).astype(np.int64)
        inter = has_feature @ has_feature.T
        feature_counts = has_feature.sum(axis=1)
        # |A ∪ B| = |A| + |B| - |A ∩ B|, for all pairs at once via broadcasting.
        union = feature_counts[:, None] + feature_counts[None, :] - inter
        self.sim = np.divide(inter, union,
                             out=np.zeros(inter.shape, dtype=float),
                             where=union > 0)
                
class Collaborative(RecSys):
    """Item-item collaborative filtering on the user x movie rating matrix."""

    def __init__(self,data):
        super().__init__(data)

    def calc_item_item_similarity(self, simfunction, *X):
        """
        Create item-item similarity using similarity function.
        X is an optional transformed matrix of Mr; only its first element is
        passed on to ``simfunction``. The result is stored in ``self.sim``.
        """
        if not X:
            self.sim = simfunction()
        else:
            # *X arrives as a tuple (X,), so X[0] is the actual transformed matrix.
            self.sim = simfunction(X[0])

    def cossim(self):
        """
        Calculates item-item similarity for all pairs of items using cosine similarity (values from 0 to 1) on utility matrix
        Returns a cosine similarity matrix of size (#all movies, #all movies)

        Ratings are centred on each user's mean rating (unrated entries stay
        0), each movie column is scaled to unit length, and the cosine in
        [-1, 1] is mapped to [0, 1] with 0.5 * cos + 0.5. The diagonal of the
        cosine matrix is forced to 1 before that mapping.
        """
        rating_counts = (self.Mr > 0).sum(axis=1)
        # A user without ratings gets mean 0 instead of 0/0 = NaN; a NaN mean
        # would make every column norm NaN and flatten all similarities to 0.5.
        row_means = np.divide(self.Mr.sum(axis=1), rating_counts,
                              out=np.zeros(self.Mr.shape[0]),
                              where=rating_counts > 0)
        # Subtract the user mean only where a rating exists.
        centered = np.where(self.Mr == 0, 0.0, self.Mr - row_means[:, None])
        col_norms = np.sqrt((centered ** 2).sum(axis=0))
        # Columns with zero norm stay all-zero rather than becoming NaN.
        normalized = np.divide(centered, col_norms,
                               out=np.zeros_like(centered),
                               where=col_norms > 0)
        cos = np.dot(normalized.T, normalized)
        np.fill_diagonal(cos, 1)
        return 0.5 * cos + 0.5

    def jacsim(self,Xr):
        """
        Calculates item-item similarity for all pairs of items using jaccard similarity (values from 0 to 1)
        Xr is the transformed rating matrix.

        For two movies, the intersection counts the users who gave both
        movies the same rating value i (for i = 1 .. max(Xr)); the union
        counts the users with a value > 0 for at least one of them. Pairs
        with an empty union get 0 and the diagonal is set to 1.
        """
        n = Xr.shape[1]
        max_Xr = int(Xr.max())
        inter_matrix = np.zeros((n, n), dtype=int)
        for value in range(1, max_Xr + 1):
            # Indicator of "user gave exactly this value"; the product counts
            # users who gave that value to both movies.
            same_value = csr_matrix((Xr == value).astype(int))
            inter_matrix += same_value.T.dot(same_value).toarray()

        rated = csr_matrix((Xr > 0).astype(int))
        both_rated = rated.T.dot(rated).toarray()
        rated_counts = (Xr > 0).sum(axis=0)
        # |A ∪ B| = |A| + |B| - |A ∩ B| for every movie pair via broadcasting.
        union_matrix = rated_counts[:, None] + rated_counts[None, :] - both_rated

        inter_matrix = inter_matrix.astype(np.float64)
        union_matrix = union_matrix.astype(np.float64)
        jac_matrix = np.divide(inter_matrix, union_matrix,
                               out=np.zeros_like(union_matrix),
                               where=union_matrix != 0)
        np.fill_diagonal(jac_matrix, 1)
        return jac_matrix

1.1 Simple Baseline Result

In [5]:
# Build a reduced data set from the first 30000 rows of each rating split.
np.random.seed(42)
sample_train = train.iloc[:30000]
sample_test = test.iloc[:30000]

# Keep only the users and movies that occur in either sampled split.
user_in_sample = MV_users.uID.isin(sample_train.uID) | MV_users.uID.isin(sample_test.uID)
movie_in_sample = MV_movies.mID.isin(sample_train.mID) | MV_movies.mID.isin(sample_test.mID)
sample_MV_users = MV_users[user_in_sample]
sample_MV_movies = MV_movies[movie_in_sample]

sample_data = Data(sample_MV_users, sample_MV_movies, sample_train, sample_test)

# Hidden tests predict_everything_to_3 in class RecSys
# NOTE: the baseline below is scored on the full `data`, not on `sample_data`.
rs = RecSys(data)
yp = rs.predict_everything_to_3()
print(rs.rmse(yp))

1.2585510334053043


1.2 Best Model Result

In [6]:
# Item-item collaborative filtering with Jaccard similarity on the
# integer-cast rating matrix.
cf = Collaborative(data)
Xr = cf.Mr.astype(int)

# Time only the similarity computation.
t0 = time.perf_counter()
cf.calc_item_item_similarity(cf.jacsim, Xr)
t1 = time.perf_counter()
time_sim = t1 - t0
print('similarity calculation time', time_sim)

# Score the similarity-weighted predictions on the test split.
yp = cf.predict()
rmse = cf.rmse(yp)
print(rmse)
assert rmse < 0.96

similarity calculation time 4.543038669973612
0.9509126236828654


1.3 NMF Model Result

In [7]:
from sklearn.decomposition import NMF

In [8]:
# Create a ratings matrix: RecSys builds a dense (#users, #movies) array from
# the training ratings, in which pairs without a training rating are 0.
rs = RecSys(data)
ratings_matrix = rs.Mr

In [9]:
# NMF: configure a rank-10 factorization, then fit it on the ratings matrix.
model = NMF(
    n_components=10,
    init='random',
    solver='cd',
    beta_loss='frobenius',
    max_iter=200,
    random_state=26,
)
model.fit(ratings_matrix)

## Factorize the ratings matrix into two matrices:
## W holds the per-user factors, H the per-movie factors.
W = model.transform(ratings_matrix)
H = model.components_

In [10]:
# Reconstruct the ratings matrix using W and H
reconstruct = model.inverse_transform(W)

# Look up the reconstructed value for every (user, movie) pair in the test set.
test_ratings = rs.data.test
n_test = len(test_ratings)
predicted_ratings = [
    reconstruct[rs.uid2idx[user_id], rs.mid2idx[movie_id]]
    for user_id, movie_id in zip(test_ratings.uID, test_ratings.mID)
]

## Compare against the actual test ratings with RMSE.
actual_ratings = np.array(test_ratings.rating)
squared_errors = (actual_ratings - predicted_ratings) ** 2
nmf_rmse = np.sqrt(squared_errors.mean())
print(nmf_rmse)

2.9140986869248278


2. Discuss the results and why sklearn's non-negative matrix factorization library did not work well compared to the simple baseline or the similarity-based methods we’ve done in Module 3. Can you suggest a way(s) to fix it? [10 pts]

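Reason: The user–movie rating matrix is very sparse: most of its entries are missing ratings that were stored as 0. sklearn's NMF has no notion of a missing value, so it treats every one of those 0s as a real rating and fits the factorization to reproduce them. The reconstructed ratings are therefore biased toward 0, which gives a large error on the test ratings (RMSE ≈ 2.91, versus ≈ 1.26 for the predict-everything-as-3 baseline and ≈ 0.95 for the similarity-based model). The NMF model in sklearn is more suitable for relatively dense matrices; if the matrix is this sparse, additional processing is required before using it.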

Suggestion: We can preprocess the data by filling the unknown ratings with the median rating instead of 0, so that the missing entries no longer pull the reconstruction toward 0 and cause such large errors.