In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import torch
from surprise import Dataset, Reader, SVD, KNNBasic, accuracy

# Setup

In [11]:
DATA_DIR = "D:\\Delft\\ml-100k\\"
ratings = pd.read_csv(f"{DATA_DIR}u.data", sep="\t", names=["user_id", "movie_id", "rating", "timestamp"])

# Get average ratings of user, could be interesting


In [8]:
user_ratings = ratings.groupby('user_id')['rating'].mean().reset_index()
print(user_ratings)

users_rated = ratings['user_id'].value_counts().mean()
print(f"Average movies rated by a user: {users_rated} ")

     user_id    rating
0          1  3.610294
1          2  3.709677
2          3  2.796296
3          4  4.333333
4          5  2.874286
..       ...       ...
938      939  4.265306
939      940  3.457944
940      941  4.045455
941      942  4.265823
942      943  3.410714

[943 rows x 2 columns]
Average movies rated by a user: 106.04453870625663 


# Get 62 users with 100-120 ratings

In [13]:
users_ratings = ratings['user_id'].value_counts()
filtered_users = users_ratings[(users_ratings >= 100) & (users_ratings <= 200)].index

user_ratings = ratings[ratings['user_id'].isin(filtered_users)]
print(f"Total ratings: {len(user_ratings)}")

Total ratings: 30600


# Get results with full set (for metrics lower is better)

In [14]:
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split


reader = Reader(rating_scale=(1, 5))
full_trainset = Dataset.load_from_df(user_ratings[['user_id', 'movie_id', 'rating']], reader).build_full_trainset()
trainset, testset = train_test_split(Dataset.load_from_df(user_ratings[['user_id', 'movie_id', 'rating']], reader), test_size=0.2)

algo = SVD()
algo.fit(trainset)
predictions = algo.test(testset)

accuracy.rmse(predictions)
accuracy.mae(predictions)

RMSE: 0.9616
MAE:  0.7609


0.7608916740556295

# Remove fraction of ratings + plots

In [15]:
from surprise import BaselineOnly
original_ratings = user_ratings.copy()

reader = Reader(rating_scale=(1, 5))
fractions = [0.0, 0.1, 0.2, 0.3, 0.5, 0.7, 0.9]
results = []
results_b = []
for frac in fractions:
    if frac > 0:
        reduced_data_frame = original_ratings.sample(frac=(1-frac), random_state=42)
    else:
        reduced_data_frame = original_ratings.copy()

    data = Dataset.load_from_df(reduced_data_frame[['user_id', 'movie_id', 'rating']], reader)

    trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

    algo = SVD()
    algo_b = BaselineOnly()


    algo.fit(trainset)
    algo_b.fit(trainset)

    predictions = algo.test(testset)
    predictions_b = algo.test(testset)

    rmse = accuracy.rmse(predictions, verbose=False)
    rmse_b = accuracy.rmse(predictions_b, verbose=False)

    mae = accuracy.mae(predictions, verbose=False)
    mae_b = accuracy.mae(predictions_b, verbose=False)

    results.append((1 - frac, rmse, mae))
    results_b.append((1 - frac, rmse_b, mae_b))

    print(f"Kept {int((1-frac)*100)}% of ratings → RMSE: {rmse:.4f}, MAE: {mae:.4f}")
    print(f"Kept baseline {int((1-frac)*100)}% of ratings → RMSE: {rmse_b:.4f}, MAE: {mae_b:.4f}")


Estimating biases using als...
Kept 100% of ratings → RMSE: 0.9556, MAE: 0.7531
Kept baseline 100% of ratings → RMSE: 0.9556, MAE: 0.7531
Estimating biases using als...
Kept 90% of ratings → RMSE: 0.9660, MAE: 0.7644
Kept baseline 90% of ratings → RMSE: 0.9660, MAE: 0.7644
Estimating biases using als...
Kept 80% of ratings → RMSE: 0.9625, MAE: 0.7640
Kept baseline 80% of ratings → RMSE: 0.9625, MAE: 0.7640
Estimating biases using als...
Kept 70% of ratings → RMSE: 0.9709, MAE: 0.7712
Kept baseline 70% of ratings → RMSE: 0.9709, MAE: 0.7712
Estimating biases using als...
Kept 50% of ratings → RMSE: 1.0008, MAE: 0.7886
Kept baseline 50% of ratings → RMSE: 1.0008, MAE: 0.7886
Estimating biases using als...
Kept 30% of ratings → RMSE: 1.0153, MAE: 0.8072
Kept baseline 30% of ratings → RMSE: 1.0153, MAE: 0.8072
Estimating biases using als...
Kept 9% of ratings → RMSE: 1.0129, MAE: 0.8204
Kept baseline 9% of ratings → RMSE: 1.0129, MAE: 0.8204


# Plot results

In [None]:
import matplotlib.pyplot as plt

kept_fractions, rmses, maes = zip(*results)
removed_fractions = [1 - k for k in kept_fractions]

plt.figure(figsize=(10, 6))
plt.plot(removed_fractions, rmses, label='RMSE', marker='o')
plt.plot(removed_fractions, maes, label='MAE', marker='s')
plt.xlabel('Fraction of Ratings Removed')
plt.ylabel('Error')
plt.title('Performance Degradation with Random Rating Removal')
plt.legend()
plt.grid(True)
plt.show()