In [33]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

from IPython.display import Image
np.set_printoptions(precision = 3)

In [125]:
df = pd.read_csv('https://raw.githubusercontent.com/albanda/CE888-2023/main/lab4-recommenders/jester-data-1.csv', header=None)
print(df)

       0      1      2      3      4     5      6     7     8      9    ...  \
0       74  -7.82   8.79  -9.66  -8.16 -7.52  -8.50 -9.85  4.17  -8.98  ...   
1      100   4.08  -0.29   6.36   4.37 -2.38  -9.66 -0.73 -5.34   8.88  ...   
2       49  99.00  99.00  99.00  99.00  9.03   9.27  9.03  9.27  99.00  ...   
3       48  99.00   8.35  99.00  99.00  1.80   8.16 -2.82  6.21  99.00  ...   
4       91   8.50   4.61  -4.17  -5.39  1.36   1.60  7.04  4.61  -0.44  ...   
...    ...    ...    ...    ...    ...   ...    ...   ...   ...    ...  ...   
24978  100   0.44   7.43   9.08   2.33  3.20   6.75 -8.79 -0.53  -8.74  ...   
24979   91   9.13  -8.16   8.59   9.08  0.87  -8.93 -3.50  5.78  -8.11  ...   
24980   39  99.00  99.00  99.00  99.00 -7.77  99.00  6.70 -6.75  99.00  ...   
24981   37  99.00  99.00  99.00  99.00 -9.71  99.00  4.56 -8.30  99.00  ...   
24982   72   2.43   2.67  -3.98   4.27 -2.28   7.33  2.33  4.56   6.75  ...   

         91     92     93     94     95     96     

In [126]:
df.shape

(24983, 101)

In [127]:
# According to Jester dataset description, missing ratings are stored as 99 (or sometimes 0)
# Replace these with NaN
df.replace(99, np.nan, inplace=True)

In [128]:
# Drop the first column (number of rated jokes, not actual ratings)
df = df.drop(columns=[0])
df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
0,-7.82,8.79,-9.66,-8.16,-7.52,-8.50,-9.85,4.17,-8.98,-4.76,...,2.82,,,,,,-5.63,,,
1,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,9.22,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
2,,,,,9.03,9.27,9.03,9.27,,,...,,,,9.08,,,,,,
3,,8.35,,,1.80,8.16,-2.82,6.21,,1.84,...,,,,0.53,,,,,,
4,8.50,4.61,-4.17,-5.39,1.36,1.60,7.04,4.61,-0.44,5.73,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.80,1.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24978,0.44,7.43,9.08,2.33,3.20,6.75,-8.79,-0.53,-8.74,7.23,...,8.83,-1.21,9.22,-6.70,8.45,9.03,6.55,8.69,8.79,7.43
24979,9.13,-8.16,8.59,9.08,0.87,-8.93,-3.50,5.78,-8.11,4.90,...,-1.17,-5.73,-1.46,0.24,9.22,-8.20,-7.23,-8.59,9.13,8.45
24980,,,,,-7.77,,6.70,-6.75,,,...,,,,,,,,,,
24981,,,,,-9.71,,4.56,-8.30,,,...,,,,,,,,,,


In [129]:
# Compute average rating per joke (column-wise mean)
average_ratings = df.mean()

# Find best-rated joke (highest average rating)
best_joke_index = average_ratings.idxmax()
best_joke_rating = average_ratings.max()

# Find worst-rated joke (lowest average rating)
worst_joke_index = average_ratings.idxmin()
worst_joke_rating = average_ratings.min()

# Display results
print(f"Best-rated joke: Joke {best_joke_index} with an average rating of {best_joke_rating:.2f}")
print(f"Worst-rated joke: Joke {worst_joke_index} with an average rating of {worst_joke_rating:.2f}")

Best-rated joke: Joke 50 with an average rating of 3.67
Worst-rated joke: Joke 58 with an average rating of -3.83


In [41]:
# Helper function to replace 10% of known ratings with 99 for validation
def replace_for_validation(orig, percentage=0.1):
    """
    Replaces 'percentage'% of the original known values (not NaN) in 'orig' with 99.
    
    :param orig: Original data array (numpy array)
    :param percentage: Percentage of values to replace (0 < percentage < 1)
    :return: (Modified array, tuple of indices of replaced values, original values of replaced cells)
    """
    new_data = orig.copy()
    
    # Get indices where values are not NaN
    rated = np.where(~np.isnan(orig))
    
    # Get total count of rated values
    n_rated = len(rated[0])
    
    # Select random indices to replace with 99
    idx = np.random.choice(n_rated, size=int(percentage * n_rated), replace=False)
    
    # Save original values for later use
    original_values = orig[rated[0][idx], rated[1][idx]]
    
    # Replace selected values with 99
    new_data[rated[0][idx], rated[1][idx]] = 99

    return new_data, (rated[0][idx], rated[1][idx]), original_values

In [42]:
# Convert dataframe to numpy array
df_copy = df.copy()
arr = df_copy.values


# Apply function to create validation set
new_arr, validation_indices, original_values = replace_for_validation(arr, 0.1)

# Convert back to DataFrame
df_validation = pd.DataFrame(new_arr)

# Check how many NaN values are still present
print("Number of NaN values in validation set:", df_validation.isna().sum().sum())

# Example: Checking the first replaced value
print("Original value at first replaced index:", original_values[0])
print("New value at first replaced index:", new_arr[validation_indices[0][0], validation_indices[1][0]])


Number of NaN values in validation set: 687845
Original value at first replaced index: 7.43
New value at first replaced index: 99.0


In [43]:
df_validation

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-7.82,8.79,-9.66,-8.16,-7.52,-8.50,-9.85,99.00,-8.98,-4.76,...,2.82,,,,,,-5.63,,,
1,4.08,-0.29,6.36,4.37,-2.38,99.00,-0.73,-5.34,8.88,9.22,...,2.82,-4.95,-0.29,7.86,-0.19,99.00,3.06,0.34,-4.32,1.07
2,,,,,9.03,9.27,9.03,9.27,,,...,,,,9.08,,,,,,
3,,99.00,,,1.80,8.16,-2.82,6.21,,1.84,...,,,,0.53,,,,,,
4,8.50,4.61,-4.17,-5.39,1.36,1.60,99.00,4.61,-0.44,5.73,...,5.19,5.58,4.27,99.00,5.73,1.55,3.11,6.55,1.80,99.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24978,0.44,7.43,9.08,2.33,3.20,6.75,-8.79,-0.53,-8.74,7.23,...,8.83,-1.21,9.22,-6.70,8.45,9.03,99.00,8.69,8.79,7.43
24979,9.13,-8.16,99.00,99.00,0.87,-8.93,-3.50,5.78,99.00,4.90,...,-1.17,-5.73,-1.46,0.24,9.22,-8.20,-7.23,-8.59,9.13,8.45
24980,,,,,-7.77,,6.70,-6.75,,,...,,,,,,,,,,
24981,,,,,-9.71,,4.56,-8.30,,,...,,,,,,,,,,


In [44]:
# Load modified dataset (where 99 is used for missing values)
df = df_validation.copy()
df.replace(99, np.nan, inplace=True)  # Treat "99" as NaN (missing)

# Convert to NumPy array for matrix operations
R = df.to_numpy()

# Set hyperparameters
num_users, num_items = R.shape
num_factors = 2  # Number of latent factors
alpha = 0.01  # Learning rate
lambda_reg = 0.1  # Regularization term
num_iterations = 5  # Lower iterations for quick convergence

# Initialize latent factor matrices randomly
latent_user_prefs = np.random.normal(scale=1.0/num_factors, size=(num_users, num_factors))
latent_item_features = np.random.normal(scale=1.0/num_factors, size=(num_items, num_factors))

def sgd(R, U, V):
    """
    Stochastic Gradient Descent for Latent Factor Matrix Factorization.

    Parameters:
    - R: User-item rating matrix
    - U: Latent user preference matrix
    - V: Latent item feature matrix
    - alpha: Learning rate
    - lambda_reg: Regularization term
    - num_iterations: Number of iterations

    Returns:
    - Updated U and V matrices
    """
    for iteration in range(num_iterations):
        for i in range(num_users):
            for j in range(num_items):
                if not np.isnan(R[i, j]):  # Only train on known ratings
                    prediction = np.dot(U[i, :], V[j, :].T)
                    error = R[i, j] - prediction
                    
                    # Update latent factors with gradient descent
                    U[i, :] += alpha * (error * V[j, :] - lambda_reg * U[i, :])
                    V[j, :] += alpha * (error * U[i, :] - lambda_reg * V[j, :])

        print(f"Iteration {iteration + 1}/{num_iterations} completed.")

    return U, V

# Train model
latent_user_prefs, latent_item_features = sgd(R, latent_user_prefs, latent_item_features)

# Predict missing values (including where "99" was originally)
predicted_ratings = np.dot(latent_user_prefs, latent_item_features.T)

# Fill in the missing ratings with predictions
R_filled = np.where(np.isnan(R), predicted_ratings, R)

# Convert back to DataFrame and save
df_filled = pd.DataFrame(R_filled, columns=df.columns)
df_filled.to_csv("jester-inferred-ratings.csv", index=False)

print("Missing ratings inferred and saved.")


  V[j, :] += alpha * (error * U[i, :] - lambda_reg * V[j, :])
  U[i, :] += alpha * (error * V[j, :] - lambda_reg * U[i, :])
  U[i, :] += alpha * (error * V[j, :] - lambda_reg * U[i, :])
  V[j, :] += alpha * (error * U[i, :] - lambda_reg * V[j, :])


Iteration 1/5 completed.
Iteration 2/5 completed.
Iteration 3/5 completed.
Iteration 4/5 completed.
Iteration 5/5 completed.
Missing ratings inferred and saved.


In [53]:
original_values

array([ 7.43, -0.97, -1.46, ..., -3.88, -4.66, -9.22])

In [55]:
y_true = original_values  # Ground truth values
y_pred = new_arr[validation_indices] # Model's predicted values

# Remove NaN values before calculating MSE
valid_mask = ~np.isnan(y_true) & ~np.isnan(y_pred)

# Apply the mask to filter valid values
y_true_filtered = y_true[valid_mask]
y_pred_filtered = y_pred[valid_mask]

# Ensure we have valid data to compute MSE
if len(y_true_filtered) == 0:
    print("No valid data points to evaluate MSE.")
else:
    mse = mean_squared_error(y_true_filtered, y_pred_filtered)
    print("Mean Squared Error on Validation Set:", mse)

Mean Squared Error on Validation Set: 9655.739694997377


In [130]:
df.iloc[974, 68]

np.float64(-9.22)

In [131]:
df_movie = pd.read_excel('https://raw.githubusercontent.com/albanda/CE888-2023/main/lab4-recommenders/movies_latent_factors.xlsx')
df_movie_user = pd.read_excel('https://raw.githubusercontent.com/albanda/CE888-2023/main/lab4-recommenders/movies_latent_factors.xlsx', sheet_name='Users')

print(df_movie)

    Movie ID                                              Title   Factor1  \
0         11          Star Wars: Episode IV - A New Hope (1977) -1.521848   
1         12                                Finding Nemo (2003) -0.342185   
2         13                                Forrest Gump (1994) -2.240888   
3         14                             American Beauty (1999) -0.634531   
4         22  Pirates of the Caribbean: The Curse of the Bla...  0.517348   
..       ...                                                ...       ...   
95      9806                             The Incredibles (2004)  0.159967   
96     10020                        Beauty and the Beast (1991)  1.286288   
97     36657                                       X-Men (2000)  0.811901   
98     36658                            X2: X-Men United (2003)  1.161006   
99     36955                                   True Lies (1994)  1.734008   

     Factor2   Factor3   Factor4   Factor5   Factor6   Factor7   Factor8  \

In [150]:
factor = df_movie[df_movie['Title'] == 'Star Wars: Episode IV - A New Hope (1977)']
factor

Unnamed: 0,Movie ID,Title,Factor1,Factor2,Factor3,Factor4,Factor5,Factor6,Factor7,Factor8,Factor9,Factor10,Factor11,Factor12,Factor13,Factor14,Factor15
0,11,Star Wars: Episode IV - A New Hope (1977),-1.521848,-1.038507,2.027269,0.247933,-0.594548,2.51326,-1.84891,0.47671,-0.224146,-0.760681,-0.973915,0.862379,0.403861,1.129616,-0.248806


In [133]:
factor8_value = df_movie['Factor13'].idxmax()
factor8_value

21

In [134]:
df_movie.loc[21]

Movie ID                       155
Title       The Dark Knight (2008)
Factor1                  -2.169359
Factor2                  -1.131584
Factor3                   -0.23918
Factor4                  -0.556938
Factor5                  -1.473591
Factor6                  -0.739988
Factor7                   1.351486
Factor8                  -1.679131
Factor9                   0.133108
Factor10                 -0.943399
Factor11                 -1.135594
Factor12                 -0.179583
Factor13                  2.193465
Factor14                  0.720142
Factor15                  0.445634
Name: 21, dtype: object

In [135]:
df_movie.info

<bound method DataFrame.info of     Movie ID                                              Title   Factor1  \
0         11          Star Wars: Episode IV - A New Hope (1977) -1.521848   
1         12                                Finding Nemo (2003) -0.342185   
2         13                                Forrest Gump (1994) -2.240888   
3         14                             American Beauty (1999) -0.634531   
4         22  Pirates of the Caribbean: The Curse of the Bla...  0.517348   
..       ...                                                ...       ...   
95      9806                             The Incredibles (2004)  0.159967   
96     10020                        Beauty and the Beast (1991)  1.286288   
97     36657                                       X-Men (2000)  0.811901   
98     36658                            X2: X-Men United (2003)  1.161006   
99     36955                                   True Lies (1994)  1.734008   

     Factor2   Factor3   Factor4   Factor5 

In [136]:
df_movie_user.info

<bound method DataFrame.info of     User   Factor1   Factor2   Factor3   Factor4   Factor5   Factor6  \
0   4768 -0.204024  0.161079 -0.090447  0.138495 -0.162934  0.163894   
1    156 -0.189652 -0.178979 -0.091490 -0.000823 -0.032646  0.177209   
2   5323 -0.115308 -0.090886 -0.053129  0.018472 -0.068081 -0.004828   
3    174 -0.227462 -0.272532 -0.017231  0.054324  0.214755 -0.072639   
4   4529 -0.014616 -0.102218 -0.107935  0.155784 -0.123362 -0.118228   
5    783 -0.020301 -0.031919 -0.036955  0.033690  0.000174 -0.003178   
6   3878 -0.091462  0.215879 -0.180453  0.085408 -0.321094  0.227947   
7    768  0.000819 -0.009229 -0.019228  0.002703  0.012869  0.006655   
8   4469 -0.030528 -0.011537 -0.042822 -0.014378  0.031338  0.012297   
9   1882 -0.083093 -0.029160  0.013748  0.022716 -0.062732 -0.080670   
10  4997 -0.185443 -0.276496 -0.781275 -0.240506  0.029611 -0.271814   
11  2067 -0.083242  0.181171 -0.344267 -0.019713 -0.243199  0.039220   
12  3806 -0.042344 -0.040755 -0.

In [92]:
df_copy

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
0,-7.82,8.79,-9.66,-8.16,-7.52,-8.50,-9.85,4.17,-8.98,-4.76,...,2.82,,,,,,-5.63,,,
1,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,9.22,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
2,,,,,9.03,9.27,9.03,9.27,,,...,,,,9.08,,,,,,
3,,8.35,,,1.80,8.16,-2.82,6.21,,1.84,...,,,,0.53,,,,,,
4,8.50,4.61,-4.17,-5.39,1.36,1.60,7.04,4.61,-0.44,5.73,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.80,1.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24978,0.44,7.43,9.08,2.33,3.20,6.75,-8.79,-0.53,-8.74,7.23,...,8.83,-1.21,9.22,-6.70,8.45,9.03,6.55,8.69,8.79,7.43
24979,9.13,-8.16,8.59,9.08,0.87,-8.93,-3.50,5.78,-8.11,4.90,...,-1.17,-5.73,-1.46,0.24,9.22,-8.20,-7.23,-8.59,9.13,8.45
24980,,,,,-7.77,,6.70,-6.75,,,...,,,,,,,,,,
24981,,,,,-9.71,,4.56,-8.30,,,...,,,,,,,,,,


In [118]:

# Extract user and movie latent factors
user_vector = df_movie_user[df_movie_user['User'] == 2848]  # Row for user 2848
movie_vector = df_movie[df_movie['Movie ID'] == 1572]    # Row for movie 1572


In [119]:
user_vector

Unnamed: 0,User,Factor1,Factor2,Factor3,Factor4,Factor5,Factor6,Factor7,Factor8,Factor9,Factor10,Factor11,Factor12,Factor13,Factor14,Factor15
13,2848,-0.113283,-0.098942,-0.040393,0.04218,-0.041068,-0.039568,0.018291,-0.191254,0.218912,0.017262,-0.087468,0.036894,0.067105,-0.01678,-0.029503


In [None]:
movie_vector
movie_vector_1572 = movie_factors[movie_df[movie_df['Movie ID'] == 1572].index[0]]

Unnamed: 0,Movie ID,Title,Factor1,Factor2,Factor3,Factor4,Factor5,Factor6,Factor7,Factor8,Factor9,Factor10,Factor11,Factor12,Factor13,Factor14,Factor15
74,1572,Die Hard: With a Vengeance (1995),0.861731,-0.415145,-0.059279,-1.168039,-1.101257,0.315561,0.08003,-0.042413,1.075221,1.138518,0.300352,0.255787,0.643441,-0.402201,-0.451662


In [None]:
# Compute predicted rating
# predicted_rating = np.dot(user_vector, movie_vector)

# # Format to two decimal places
# print(f"{predicted_rating:.2f}")

# Movie ID 1572's latent factor vector (replace with the actual vector)
# movie_vector_1572 = movie_factors[movie_df[movie_df['Movie ID'] == 1572].index[0]]

# Compute the predicted rating by taking the dot product
predicted_rating = np.dot(user_vector, movie_vector)

# Round to two decimal places
predicted_rating = round(predicted_rating, 2)

print("Predicted rating for user 2848 on movie 1572:", predicted_rating)

Unnamed: 0,Movie ID,Title,Factor1,Factor2,Factor3,Factor4,Factor5,Factor6,Factor7,Factor8,Factor9,Factor10,Factor11,Factor12,Factor13,Factor14,Factor15
74,1572,Die Hard: With a Vengeance (1995),0.861731,-0.415145,-0.059279,-1.168039,-1.101257,0.315561,0.08003,-0.042413,1.075221,1.138518,0.300352,0.255787,0.643441,-0.402201,-0.451662
