In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install scikit-surprise



In [5]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, KNNBasic, accuracy
from surprise.model_selection import train_test_split
import gc


df = pd.read_excel('/content/drive/MyDrive/FINAL jester 2006-15.xls', header=None)


df = df.drop(columns=[0])


df.insert(loc=0, column="User ID", value=np.arange(1, len(df.index) + 1))


removed_jokes = {1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 14, 20, 27, 31, 43, 51, 52, 61, 73, 80, 100, 116}
df = df.drop(columns=[col for col in removed_jokes], axis=1)

# Reshape the data (convert wide format to long format)
df = df.melt(id_vars="User ID", var_name="Joke ID", value_name="Rating")

# Remove null ratings (99 corresponds to null ratings)
df = df[df["Rating"] != 99]

# Ensure data is sorted by User ID and Joke ID
df = df.sort_values(by=["User ID", "Joke ID"]).reset_index(drop=True)

# Step 2: use only users with at least 10 ratings
user_ratings_count = df.groupby('User ID').size()
users_with_enough_ratings = user_ratings_count[user_ratings_count >= 10].index
df = df[df['User ID'].isin(users_with_enough_ratings)]

# using only 5000 users and 50 jokes
df = df[df['User ID'] <= 5000]
df = df[df['Joke ID'] <= 75]

# Remove users and jokes with no ratings
df = df[df.groupby('User ID')['Rating'].transform('count') > 0]
df = df[df.groupby('Joke ID')['Rating'].transform('count') > 0]

# Normalize ratings (subtract user mean)
df['Normalized Rating'] = df.groupby('User ID')['Rating'].transform(lambda x: x - x.mean())


#  Re-scaling after normalization
min_rating = df['Normalized Rating'].min()
max_rating = df['Normalized Rating'].max()

df['Rescaled Normalized Rating'] = (df['Normalized Rating'] - min_rating) / (max_rating - min_rating) * (10 - (-10)) - 10

# Step 3: Load data into Surprise's format
reader = Reader(rating_scale=(-10, 10))
data = Dataset.load_from_df(df[['User ID', 'Joke ID', 'Normalized Rating']], reader)

# Step 4: Train-test split
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [7]:
from surprise import KNNBasic, accuracy
from surprise.model_selection import cross_validate
from joblib import Parallel, delayed
import numpy as np

best_mae = float('inf')

# Parallelized hyperparameter search
def train_and_evaluate(k, shrink):
    sim_options = {
        'name': 'pearson_baseline',
        'user_based': True,
        'shrinkage': shrink
    }
    algo = KNNBasic(k=k, sim_options=sim_options, min_k=5)
    algo.fit(trainset)
    predictions = algo.test(testset)
    mae = accuracy.mae(predictions, verbose=False)
    return (k, shrink, mae)

results = Parallel(n_jobs=4)(delayed(train_and_evaluate)(k, shrink) for k in range(5, 30, 5) for shrink in [50, 100, 150, 200])

# Find the best result
best_result = min(results, key=lambda x: x[2])
optimal_k, optimal_shrinkage, best_mae = best_result
print(f"Optimal K: {optimal_k}, Shrinkage: {optimal_shrinkage}, Best MAE: {best_mae:.4f}")


Optimal K: 25, Shrinkage: 50, Best MAE: 3.4080


In [8]:
import numpy as np
from collections import defaultdict

def user_bias_recommender(trainset, testset):
    # Calculate the mean rating for each user
    user_ratings = defaultdict(list)
    for uid, iid, rating in trainset.all_ratings():
        user_ratings[uid].append(rating)

    user_means = {uid: np.mean(ratings) for uid, ratings in user_ratings.items()}

    # Predict the mean rating for each user in the test set
    base_predictions = []
    for uid, iid, true_r in testset:
        predicted_rating = user_means.get(uid, np.mean([rating for (_, _, rating) in trainset.all_ratings()]))  # Fallback to global mean if no user data
        base_predictions.append((uid, iid, predicted_rating, true_r))

    return base_predictions

# Generate baseline predictions and calculate MAE
base_preds = user_bias_recommender(trainset, testset)
base_mae = np.mean([abs(true_r - est) for (_, _, est, true_r) in base_preds])
print(f"User Bias Model MAE: {base_mae}")


User Bias Model MAE: 3.828883073271953


In [11]:
# Step 8: Show test instances with predictions
print("\nTest Data Instances with Predictions:")

# Use the trained model to make predictions on the test set
algo = KNNBasic(k=optimal_k, sim_options={'name': 'pearson_baseline', 'user_based': True, 'shrinkage': optimal_shrinkage}, min_k=5)
algo.fit(trainset)
predictions = algo.test(testset)

# Display test instances with original and predicted ratings
for i, prediction in enumerate(predictions[:10]):
    uid = prediction.uid
    iid = prediction.iid
    true_r = prediction.r_ui
    est = prediction.est
    print(f"User: {uid}, Joke: {iid}, True Rating: {true_r:.2f}, Predicted Rating: {est:.2f}")


Test Data Instances with Predictions:
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
User: 3067, Joke: 74, True Rating: -1.34, Predicted Rating: -2.03
User: 3554, Joke: 26, True Rating: -8.33, Predicted Rating: -1.59
User: 2008, Joke: 41, True Rating: -5.85, Predicted Rating: -2.38
User: 1963, Joke: 53, True Rating: 4.43, Predicted Rating: 2.98
User: 822, Joke: 8, True Rating: -5.34, Predicted Rating: -1.03
User: 1876, Joke: 67, True Rating: 3.73, Predicted Rating: -1.28
User: 3246, Joke: 70, True Rating: -1.47, Predicted Rating: 1.43
User: 1827, Joke: 23, True Rating: -5.85, Predicted Rating: -1.03
User: 2248, Joke: 26, True Rating: 3.82, Predicted Rating: -0.41
User: 2196, Joke: 72, True Rating: 7.88, Predicted Rating: 2.25


In [13]:
def inference_for_user_with_sim_and_predictions(target_user_id, top_k_users=2000):

    target_inner_id = algo.trainset.to_inner_uid(target_user_id)

    # Step 2: Get the similarity matrix from the trained model
    sim_matrix = algo.sim

    # Get the similarity scores for the target user with all other users
    similarities = sim_matrix[target_inner_id]

    # Sort users by similarity 
    similar_users = sorted(enumerate(similarities), key=lambda x: x[1], reverse=True)

    # Get the top K similar users
    top_users_sim = [(algo.trainset.to_raw_uid(inner_id), sim) for inner_id, sim in similar_users[:top_k_users]]

    # Step 3: Print the top K similar users and their similarity scores
    print(f"\nTop {top_k_users} Similar Users for User {target_user_id} with Similarity Measures:")
    for neighbor_id, similarity in top_users_sim[:10]:
        print(f"Neighbor User: {neighbor_id}, Similarity: {similarity:.4f}")

    unrated_jokes = df[~df['Joke ID'].isin(df[df['User ID'] == target_user_id]['Joke ID'])]['Joke ID'].unique()

    predicted_ratings = {}
    for joke_id in unrated_jokes:
        # Use the trained model to predict the rating for this joke
        prediction = algo.predict(target_user_id, joke_id)
        predicted_ratings[joke_id] = prediction.est  # predicted rating

    # Step 5: Print the predicted ratings for the target user
    print(f"\nPredicted Ratings for User {target_user_id} based on the trained model:")
    for joke_id, rating in predicted_ratings.items():
        print(f"Joke ID: {joke_id}, Predicted Rating: {rating:.2f}")

    return predicted_ratings, top_users_sim

# Example: Predict for a specific user
target_user_id = 1
predicted_ratings, top_users = inference_for_user_with_sim_and_predictions(target_user_id=target_user_id)



Top 2000 Similar Users for User 1 with Similarity Measures:
Neighbor User: 1, Similarity: 1.0000
Neighbor User: 1024, Similarity: 0.1892
Neighbor User: 1284, Similarity: 0.1835
Neighbor User: 100, Similarity: 0.1776
Neighbor User: 3289, Similarity: 0.1710
Neighbor User: 471, Similarity: 0.1608
Neighbor User: 4967, Similarity: 0.1597
Neighbor User: 326, Similarity: 0.1571
Neighbor User: 3413, Similarity: 0.1569
Neighbor User: 836, Similarity: 0.1489

Predicted Ratings for User 1:
Joke ID: 28, Predicted Rating: 0.98
Joke ID: 30, Predicted Rating: -0.84
Joke ID: 48, Predicted Rating: 1.27
Joke ID: 33, Predicted Rating: -2.14
Joke ID: 37, Predicted Rating: -1.10
Joke ID: 38, Predicted Rating: 0.69
Joke ID: 39, Predicted Rating: 0.96
Joke ID: 40, Predicted Rating: -0.02
Joke ID: 41, Predicted Rating: -0.64
Joke ID: 44, Predicted Rating: -3.29
Joke ID: 45, Predicted Rating: 0.76
Joke ID: 46, Predicted Rating: 0.61
Joke ID: 47, Predicted Rating: 2.27
Joke ID: 55, Predicted Rating: -0.38
Joke