In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse.linalg import svds
from sklearn.metrics import mean_squared_error
import sys

def svdModel(mtrx_df):
    mtrx = mtrx_df.to_numpy()
    ratings_mean = np.mean(mtrx, axis = 1)
    normalized_mtrx = mtrx - ratings_mean.reshape(-1, 1)

    U, sigma, Vt = svds(normalized_mtrx, k = 2)
    sigma = np.diag(sigma)
    all_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + ratings_mean.reshape(-1, 1)
    preds_df = pd.DataFrame(all_predicted_ratings, columns = ['userId', 'movieId', 'predictedRating'])
    columns_to_convert = ['userId', 'movieId']
    preds_df[columns_to_convert] = preds_df[columns_to_convert].astype(int)
    return preds_df

def _progress(num, users):
    sys.stdout.write('\rRating predictions. Progress status : %.1f%%' % (float(num/len(users))*100.0))
    sys.stdout.flush()

# Load data
ratings = pd.read_csv('/Users/bharath/Documents/Fall 2023/578 - stats ml/project/ratings.csv').iloc[:,:3]


tot_loss, num = 0, 0
users = list(set(list(ratings['userId'])))

for n in users:
    n_5_ratings = ratings[ratings['userId'] == n].iloc[0:5].reset_index(drop=True)
    n_rest_ratings = ratings[ratings['userId'] == n].iloc[5:].reset_index(drop=True)
    not_n_ratings = ratings[ratings['userId'] != n].reset_index(drop=True)
    input_ratings = pd.concat([n_5_ratings, not_n_ratings], ignore_index=True)
    
    prediction_df = svdModel(input_ratings)
    comparison_df = pd.merge(n_rest_ratings, prediction_df,  how='inner', on=['userId','movieId'])

    if not comparison_df.empty:
        true_ratings = list(comparison_df['rating'])
        predicted_ratings = list(comparison_df['predictedRating'])
        tot_loss += mean_squared_error(true_ratings, predicted_ratings) * len(true_ratings)

    num += 1
    _progress(num, users)

print('\nTotal Loss: ', round(tot_loss, 2))
print('Loss per user: ', round(tot_loss/len(users), 2))

#Rating predictions. Progress status : 100.0%
#Total Loss:  2216.75
#Loss per user:  3.63

Rating predictions. Progress status : 100.0%
Total Loss:  2216.75
Loss per user:  3.63
