In [None]:
import pandas as pd
import numpy as np
import scipy
from scipy.linalg import sqrtm
from datetime import datetime
import os

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; the dataset
# paths used later in this notebook live under /content/drive/MyDrive.
# NOTE(review): force_remount=True presumably re-mounts even when the drive is
# already attached, so re-running this cell is safe — confirm against Colab docs.
from google.colab import drive
drive.mount("/content/drive",force_remount=True)

Mounted at /content/drive


In [None]:
# Formats one epoch-seconds value as a 'YYYY-MM-DD HH:MM:SS' UTC string.
# It is no longer handed to read_csv (see below) but stays defined in case
# another cell still calls it.
dateparse = lambda x: datetime.utcfromtimestamp(int(x)).strftime('%Y-%m-%d %H:%M:%S')

# Locations of the MovieLens files on the mounted drive.
files_path = '/content/drive/MyDrive/CSE547_Final_Project/ml-25m'
ratings_file = os.path.join(files_path, "ratings.csv")
movies_file = os.path.join(files_path, "movies.csv")
user_movie_ratings_matrix = os.path.join(files_path, "user_movie_ratings_matrix.csv")

# Read the raw ratings first, then convert the epoch-seconds column in one
# vectorised call. The `date_parser=` argument of read_csv is deprecated in
# pandas 2.x, and a scalar parser is applied element by element, which is
# very slow on a file of this size. The result is the same naive (UTC-based)
# datetime64 column as before.
data = pd.read_csv(ratings_file)
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')

movies_df = pd.read_csv(movies_file)

In [None]:
# Treat both identifier columns as opaque string labels rather than numbers.
for id_column in ('userId', 'movieId'):
    data[id_column] = data[id_column].astype('str')

# Distinct identifiers, in order of first appearance in the ratings table.
users = data['userId'].unique()
movies = data['movieId'].unique()

# Quick sanity summary of what was loaded.
print("Number of users", len(users))
print("Number of movies", len(movies))

print(data.head())

Number of users 162541
Number of movies 59047
  userId movieId  rating           timestamp
0      1     296     5.0 2006-05-17 15:34:04
1      1     306     3.5 2006-05-17 12:26:57
2      1     307     5.0 2006-05-17 12:27:08
3      1     665     5.0 2006-05-17 15:13:40
4      1     899     3.5 2006-05-17 12:21:50


In [None]:
test_ratio = 0.2 #fraction of data to be used as test set.

# Per-user chronological hold-out: for every user the most recent
# int(test_ratio * n) ratings go to `test`, all earlier ones to `train`.
# Each rating lands in exactly one of the two frames.

# One global sort by time; the stable sort keeps file order for equal timestamps.
chronological = data.sort_values('timestamp', kind='stable')
per_user = chronological.groupby('userId', sort=False)

# 0-based position of each rating inside its user's timeline, and the
# length of that timeline, both aligned row-by-row with `chronological`.
position = per_user.cumcount()
n_ratings = per_user['timestamp'].transform('size')

# Same truncation as int(test_ratio * n), computed for every row at once.
test_size = (test_ratio * n_ratings).astype(int)
is_test = position >= (n_ratings - test_size)

train = chronological[~is_test].reset_index(drop=True)
test = chronological[is_test].reset_index(drop=True)

# No rating may be lost or duplicated by the split.
assert len(train) + len(test) == len(data)

In [None]:
def create_utility_matrix(data, formatizer = {'user':0, 'item': 1, 'value': 2}):
    """
        Pivot a long table of ratings into a dense user x item table.

        :param data:       DataFrame with one row per rating; columns are read
                           by position through ``.iloc``
        :param formatizer: mapping with the keys 'user', 'item' and 'value'
                           giving the column position of each field
        :return:           tuple ``(X, users_index, items_index)`` where X is a
                           DataFrame (rows = users, columns = items, NaN where
                           no rating exists), users_index maps user id -> row
                           position and items_index maps item id -> column
                           position
    """
    item_pos = formatizer['item']
    user_pos = formatizer['user']
    value_pos = formatizer['value']

    user_column = data.iloc[:, user_pos]
    item_column = data.iloc[:, item_pos]
    value_column = data.iloc[:, value_pos]

    # Distinct ids; the row/column order is whatever the sets yield.
    users = list(set(user_column))
    items = list(set(item_column))

    users_index = {user_id: row for row, user_id in enumerate(users)}

    # One column per item, pre-filled with NaN for every user.
    n_users = len(users)
    columns = {item_id: [np.nan] * n_users for item_id in items}

    # Drop each observed rating into its (user, item) cell; when a pair
    # occurs more than once the last occurrence is the one that remains.
    triples = zip(user_column.tolist(), item_column.tolist(), value_column.tolist())
    for user_id, item_id, rating in triples:
        columns[item_id][users_index[user_id]] = rating

    X = pd.DataFrame(columns)
    X.index = users

    # users_index maps a user id to its row, items_index an item id to its column
    items_index = {item_id: col for col, item_id in enumerate(X.columns)}
    return X, users_index, items_index

In [None]:
def svd(train, k):
    """
        Rank-k reconstruction of a utility matrix via truncated SVD.

        Missing ratings (NaN) are imputed with the mean rating of their item,
        the per-item mean is subtracted from every entry, the centred matrix is
        factorised, only the k largest singular values are kept, and the item
        means are added back to the low-rank product.

        :param train: 2D array-like (users x items) of ratings, NaN = unrated
        :param k:     number of singular values / latent features to keep
        :return:      ndarray with the same shape as ``train`` holding a
                      predicted rating for every (user, item) cell
    """
    ratings = np.array(train, dtype=float)

    # Per-item mean over the observed entries only.
    observed = ~np.isnan(ratings)
    n_observed = observed.sum(axis=0)
    column_sums = np.nansum(ratings, axis=0)

    # An item with no rating at all has no mean of its own; use the mean of
    # every observed rating instead (0.0 when nothing is observed) so that
    # NaN never reaches the factorisation.
    total_observed = n_observed.sum()
    overall_mean = column_sums.sum() / total_observed if total_observed else 0.0
    item_means = np.where(n_observed > 0,
                          column_sums / np.maximum(n_observed, 1),
                          overall_mean)

    # Impute, then centre each item column; imputed cells become exactly zero.
    # Broadcasting the 1D means avoids building a full tiled copy.
    centred = np.where(observed, ratings, item_means) - item_means

    # U and Vt hold the user and item feature directions.
    U, s, Vt = np.linalg.svd(centred, full_matrices=False)

    # Keep the k most significant features. The singular values form a
    # diagonal, so its square root is just the elementwise square root;
    # no dense matrix square root is needed.
    root_s = np.sqrt(s[:k])
    user_factors = U[:, :k] * root_s
    item_factors = root_s[:, np.newaxis] * Vt[:k, :]

    UsV = np.dot(user_factors, item_factors) + item_means
    print("svd done")

    return UsV

In [None]:
def rmse(true, pred):
    """
        Root-mean-square error between two equally long rating sequences.

        :param true: 1D array-like of observed ratings
        :param pred: 1D array-like of predicted ratings, in the same order
        :return:     float, sqrt(mean((true - pred) ** 2))
        :raises ValueError:        if the two inputs differ in shape
        :raises ZeroDivisionError: if the inputs are empty
    """
    # Compare by position: converting to plain arrays keeps pandas from
    # aligning on index labels and lets ordinary lists be passed for either side.
    actual = np.asarray(true, dtype=float)
    predicted = np.asarray(pred, dtype=float)
    if actual.shape != predicted.shape:
        raise ValueError("true and pred must have the same shape")
    if actual.size == 0:
        raise ZeroDivisionError("rmse is undefined for empty input")

    errors = actual - predicted
    # The square root is what turns the mean squared error into an RMSE.
    return float(np.sqrt(np.mean(np.square(errors))))

In [None]:
# Candidate numbers of latent features to compare.
no_of_features = [100, 200, 500, 1000, 5000, 10000]
utilMat, users_index, items_index = create_utility_matrix(train)

# NOTE: the [:1] slice evaluates only the first candidate; remove it to sweep all.
for f in no_of_features[:1]:
    svdout = svd(utilMat, k=f)
    print(svdout.shape)

    # Last-resort prediction when neither the user nor the item was seen in training.
    overall_mean = np.mean(svdout)
    pred = [] #to store the predicted ratings

    # Only the two id columns are needed, so iterate over them directly
    # instead of materialising a Series per row with iterrows().
    for user, item in zip(test['userId'], test['movieId']):
        u_index = users_index.get(user)
        i_index = items_index.get(item)

        if u_index is not None and i_index is not None:
            pred_rating = svdout[u_index, i_index]
        elif u_index is not None:
            # unseen item: average of this user's predicted ratings
            pred_rating = np.mean(svdout[u_index, :])
        elif i_index is not None:
            # unseen user: average predicted rating of this item
            pred_rating = np.mean(svdout[:, i_index])
        else:
            pred_rating = overall_mean

        pred.append(pred_rating)

    print(rmse(test['rating'], pred))

svd done
(610, 8204)
1.0205173744578095
