In [13]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from numpy.linalg import svd

In [19]:
ratings_df = pd.read_csv('ratings.csv', index_col=0)
ratings_df

Unnamed: 0,M1,M2,M3,M4,M5,M6,M7,M8,M9
U1,0,2,2,3,0,3,0,4,0
U2,3,0,1,0,1,4,5,0,2
U3,4,3,4,3,4,3,0,4,3
U4,0,0,0,3,3,2,2,0,3
U5,1,4,5,0,2,0,3,2,5
U6,0,5,4,5,1,1,0,5,1
U7,5,1,3,1,5,4,1,1,0
U8,0,0,2,2,0,0,0,3,0
U9,4,0,2,1,0,3,3,0,4
U10,0,2,0,0,3,0,5,0,1


In [21]:
# Apply SVD with missing values filled
# Replace zeros (missing ratings) with global mean for SVD
mean_rating = ratings_df.replace(0, np.nan).mean().mean()
filled_df = ratings_df.replace(0, mean_rating)
print(mean_rating)
filled_df

2.787477954144621


Unnamed: 0,M1,M2,M3,M4,M5,M6,M7,M8,M9
U1,2.787478,2.0,2.0,3.0,2.787478,3.0,2.787478,4.0,2.787478
U2,3.0,2.787478,1.0,2.787478,1.0,4.0,5.0,2.787478,2.0
U3,4.0,3.0,4.0,3.0,4.0,3.0,2.787478,4.0,3.0
U4,2.787478,2.787478,2.787478,3.0,3.0,2.0,2.0,2.787478,3.0
U5,1.0,4.0,5.0,2.787478,2.0,2.787478,3.0,2.0,5.0
U6,2.787478,5.0,4.0,5.0,1.0,1.0,2.787478,5.0,1.0
U7,5.0,1.0,3.0,1.0,5.0,4.0,1.0,1.0,2.787478
U8,2.787478,2.787478,2.0,2.0,2.787478,2.787478,2.787478,3.0,2.787478
U9,4.0,2.787478,2.0,1.0,2.787478,3.0,3.0,2.787478,4.0
U10,2.787478,2.0,2.787478,2.787478,3.0,2.787478,5.0,2.787478,1.0


In [25]:
# SVD decomposition
U, sigma, Vt = svd(filled_df.values, full_matrices=False)
k = 4  # number of latent factors
U_k = U[:, :k]
sigma_k = np.diag(sigma[:k])
Vt_k = Vt[:k, :]
# Reconstruct
predicted = np.dot(np.dot(U_k, sigma_k), Vt_k)
predicted_df = pd.DataFrame(predicted, index=filled_df.index, columns=filled_df.columns)
predicted_df

Unnamed: 0,M1,M2,M3,M4,M5,M6,M7,M8,M9
U1,2.972774,2.647269,2.607713,2.612495,2.429371,3.084152,3.131799,3.163367,2.527075
U2,2.844953,2.615237,0.926059,2.352814,1.352733,3.766072,5.007159,3.491556,1.944653
U3,3.868203,2.973987,4.140678,3.359193,3.762149,3.375172,2.773479,3.535074,3.035907
U4,2.684841,2.675368,3.308641,2.619645,2.642574,2.458273,1.950012,2.903181,2.795023
U5,2.108645,4.075914,3.70453,2.656431,2.120879,2.541256,1.714449,3.633781,4.820911
U6,1.849296,4.363827,4.334999,5.19451,1.208655,1.381579,3.060642,5.050111,1.155764
U7,4.618126,0.563121,3.155831,1.079035,5.186701,3.773074,1.379303,1.032281,3.038841
U8,2.803589,2.488898,2.337866,2.130045,2.352282,3.060326,2.813315,2.822393,2.8739
U9,3.21994,2.456632,2.126868,1.488005,2.822914,3.796537,2.891808,2.585007,4.010142
U10,3.462348,2.034897,2.278467,3.071631,2.550782,3.240862,4.089111,3.22114,0.865569


In [28]:
# Fill missing ratings with predictions
zero_mask = ratings_df == 0
predicted_ratings = ratings_df.copy().astype(float)
predicted_ratings[zero_mask] = predicted_df[zero_mask]
predicted_ratings

Unnamed: 0,M1,M2,M3,M4,M5,M6,M7,M8,M9
U1,2.972774,2.0,2.0,3.0,2.429371,3.0,3.131799,4.0,2.527075
U2,3.0,2.615237,1.0,2.352814,1.0,4.0,5.0,3.491556,2.0
U3,4.0,3.0,4.0,3.0,4.0,3.0,2.773479,4.0,3.0
U4,2.684841,2.675368,3.308641,3.0,3.0,2.0,2.0,2.903181,3.0
U5,1.0,4.0,5.0,2.656431,2.0,2.541256,3.0,2.0,5.0
U6,1.849296,5.0,4.0,5.0,1.0,1.0,3.060642,5.0,1.0
U7,5.0,1.0,3.0,1.0,5.0,4.0,1.0,1.0,3.038841
U8,2.803589,2.488898,2.0,2.0,2.352282,3.060326,2.813315,3.0,2.8739
U9,4.0,2.456632,2.0,1.0,2.822914,3.0,3.0,2.585007,4.0
U10,3.462348,2.0,2.278467,3.071631,3.0,3.240862,5.0,3.22114,1.0


In [29]:
# Recommend Top-N movies for a specific user (U1)
user_id = 'U1'
user_pred = predicted_df.loc[user_id]
top_n = user_pred.sort_values(ascending=False)[:5].index.tolist()
print(f"Top 5 recommended movies for {user_id}: {top_n}")

Top 5 recommended movies for U1: [' M8', ' M7', ' M6', ' M1', ' M2']


In [30]:
# Calculate RMSE on actual (non-zero) ratings
actual = ratings_df.values[ratings_df.values != 0]
predicted_known = predicted_df.values[ratings_df.values != 0]
rmse = np.sqrt(mean_squared_error(actual, predicted_known))
print(f"RMSE on known ratings: {rmse:.4f}")

RMSE on known ratings: 0.5487


In [None]:
# save predictions to a csv file
predicted_df.round(2).to_csv('svd_predicted_ratings.csv')