In [3]:
#installing surprise which is the library for matrux factorisation
!pip install scikit-surprise

Collecting scikit-surprise
  Using cached scikit_surprise-1.1.4-cp312-cp312-win_amd64.whl
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4


In [5]:
#importing necessary libraries
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
from collections import defaultdict
import warnings
warnings.filterwarnings("ignore")

In [9]:
#loading the dataset
df = pd.read_csv("u.data", sep='\t', names=['userId', 'movieId', 'rating', 'timestamp'])
df.drop(columns=['timestamp'], inplace=True)

In [11]:
#converting to surprise format
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)


In [13]:
#splitting the dataset into training and test dataset
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)


In [15]:
#training the SVD model(Matrix Factorisation)
model = SVD(n_factors=50, n_epochs=20, random_state=42)
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1c397074110>

In [17]:
#predicting on test dataset
predictions = model.test(testset)

In [19]:
#evaluating the model
print("Evaluation Metrics:")
print("RMSE:", round(accuracy.rmse(predictions), 4))
print("MAE :", round(accuracy.mae(predictions), 4))


Evaluation Metrics:
RMSE: 0.9348
RMSE: 0.9348
MAE:  0.7377
MAE : 0.7377


In [21]:
#getting top-n recommendations
def get_top_n(predictions, n=5):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n

In [23]:
top_n = get_top_n(predictions, n=5)

In [25]:
#loading the movie titles
movie_titles_df = pd.read_csv('u.item', sep='|', encoding='latin-1', header=None, usecols=[0, 1], names=['movieId', 'title'])
movie_titles = pd.Series(movie_titles_df.title.values, index=movie_titles_df.movieId).to_dict()

In [27]:
#showing top 5 recommendations
print("Sample Top 5 Recommendations Per User:")
for uid, user_ratings in list(top_n.items())[:5]:
    print(f"\nUser {uid}:")
    for iid, rating in user_ratings:
        title = movie_titles.get(iid, "Unknown Movie")
        print(f"  {title} (ID: {iid}) - Predicted Rating: {rating:.2f}")

Sample Top 5 Recommendations Per User:

User 907:
  Princess Bride, The (1987) (ID: 173) - Predicted Rating: 5.00
  Celluloid Closet, The (1995) (ID: 813) - Predicted Rating: 5.00
  Fugitive, The (1993) (ID: 79) - Predicted Rating: 5.00
  Empire Strikes Back, The (1980) (ID: 172) - Predicted Rating: 5.00
  In the Name of the Father (1993) (ID: 317) - Predicted Rating: 5.00

User 371:
  Dances with Wolves (1990) (ID: 97) - Predicted Rating: 4.28
  Blues Brothers, The (1980) (ID: 186) - Predicted Rating: 4.26
  Indiana Jones and the Last Crusade (1989) (ID: 210) - Predicted Rating: 4.20
  Brazil (1985) (ID: 175) - Predicted Rating: 3.93
  Highlander (1986) (ID: 431) - Predicted Rating: 3.90

User 218:
  Usual Suspects, The (1995) (ID: 12) - Predicted Rating: 4.23
  Chinatown (1974) (ID: 654) - Predicted Rating: 3.76
  This Is Spinal Tap (1984) (ID: 209) - Predicted Rating: 3.58
  Abyss, The (1989) (ID: 164) - Predicted Rating: 3.42
  Clerks (1994) (ID: 42) - Predicted Rating: 3.39

User 