In [60]:
pip install tensorflow d2l mxnet



# Task 0: Download Dataset, and import important libraries

In [61]:
import os
import pandas as pd
from mxnet import gluon, np
from d2l import mxnet as d2l

In [62]:
d2l.DATA_HUB['ml-100k'] = (
    'http://files.grouplens.org/datasets/movielens/ml-100k.zip',
    'cd4dcac4241c8a4ad7badc7ca635da8a69dddb83')

def read_data_ml100k():
    data_dir = d2l.download_extract('ml-100k')
    names = ['user_id', 'item_id', 'rating', 'timestamp']
    data = pd.read_csv(os.path.join(data_dir, 'u.data'), sep='\t', names=names, engine='python')
    num_users = data.user_id.unique().shape[0]
    num_items = data.item_id.unique().shape[0]
    return data, num_users, num_items

In [63]:
data, num_users, num_items = read_data_ml100k()
sparsity = 1 - len(data) / (num_users * num_items)
print(f'number of users: {num_users}, number of items: {num_items}')
print(f'matrix sparsity: {sparsity:f}')
print(data.head(5))

number of users: 943, number of items: 1682
matrix sparsity: 0.936953
   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596


# Task 1: Construct a user-item matrix
Construct a user-item matrix, in which each row represents a user and each column represents a movie, and the value of the corresponding cell represents the user’s rating on the movie.

In [64]:
# Task 1: Construct a user-item matrix
user_item_matrix = data.pivot_table(index='user_id', columns='item_id', values='rating', fill_value=0)

user_item_matrix.head()

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5,3,4,3,3,5,4,1,5,3,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,4,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Task 2: User-based neighborhood method

A commonly used user-based neighborhood method is defined by the formula provided below, where “a” represents the active user, “n” represents the movie id, and “bar x” represents the mean value of ratings given by the user. Please use the user-based neighborhood method to predict movie ratings for user #1. Particularly, we consider the neighborhood size, M=10, and the similarity between two users is defined by Pearson
correlation.

In [65]:
from scipy.stats import pearsonr
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Task 2: User-based neighborhood method to predict movie ratings for user #1
def get_recommended_ratings_for_user(active_user_id, user_item_matrix, neighborhood_size):

  # Get the ratings of the active user
  active_user_ratings = user_item_matrix.loc[active_user_id]

  # Calculate Pearson correlation between the active user and all other users
  pearson_correlations = {}
  for user_id, ratings in user_item_matrix.iterrows():
      if user_id != active_user_id:
          common_movies = active_user_ratings.index.intersection(ratings.index)
          if len(common_movies) > 0:
              pearson_correlation, _ = pearsonr(active_user_ratings[common_movies], ratings[common_movies])
              pearson_correlations[user_id] = pearson_correlation

  # Sort users by decreasing Pearson correlation and select the top M=10 neighbors
  neighborhood = sorted(pearson_correlations, key=pearson_correlations.get, reverse=True)[:neighborhood_size]

  # Predict ratings for movies not yet rated by the active user
  predicted_ratings = {}
  actual_ratings = {}

  for movie_id in user_item_matrix.columns:
      if pd.isna(active_user_ratings[movie_id]) or active_user_ratings[movie_id] == 0:
          weighted_sum = 0
          sum_of_weights = 0
          actual_rating = active_user_ratings[movie_id]
          for neighbor_id in neighborhood:
              if not pd.isna(user_item_matrix.loc[neighbor_id, movie_id]) and user_item_matrix.loc[neighbor_id, movie_id] != 0:
                  neighbor_rating = user_item_matrix.loc[neighbor_id, movie_id]
                  neighbor_mean = user_item_matrix.loc[neighbor_id].mean()
                  weighted_sum += pearson_correlations[neighbor_id] * (neighbor_rating - neighbor_mean)
                  sum_of_weights += abs(pearson_correlations[neighbor_id])
          if sum_of_weights > 0:
              predicted_rating = active_user_ratings.mean() + weighted_sum / sum_of_weights
              predicted_ratings[movie_id] = predicted_rating
              actual_ratings[movie_id] = actual_rating

  return predicted_ratings, actual_ratings

# Task 3: Evaluate the user-based neighborhood method by MSE and MAE.

In [66]:
# Task 3: Evaluate the User-Based Neighborhood Method by MSE and MAE

predicted_ratings, actual_ratings = get_recommended_ratings_for_user(1, user_item_matrix, 10)

# Calculate MAE and MSE
mae = mean_absolute_error(list(actual_ratings.values()), list(predicted_ratings.values()))
mse = mean_squared_error(list(actual_ratings.values()), list(predicted_ratings.values()))

print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")

Mean Absolute Error (MAE): 3.0194
Mean Squared Error (MSE): 10.0822


These values, for the range it can represent, are actually not that good, and need improvements.

# Task 4: Feedforward Neural Network

Develop a feedforward neural network (NN) with two hidden layers to predict ratings for user #1. Particularly, in data pre-processing phase, please use the above user-based neighborhood method (with neighborhood size =2) to fill up the missing ratings except the ratings corresponding to the active users and movies (Here, we define the active user and movies as those we (the recommender system) aim to predict. For exam, in the above formula, “a” is the active user and “n” is the active movie). To streamline the training process, you may temporarily use a constant value to fill in missing data, and then use the user-based neighborhood method’s prediction. Evaluate the performance of the NN model by MSE and MAE.

In [71]:
active_user_id=1
neighborhood_size=2

predicted_ratings, actual_ratings = get_recommended_ratings_for_user(active_user_id, user_item_matrix, neighborhood_size)

# Step 1: Replace missing ratings in the original user-item matrix with predicted ratings
for movie_id, rating in predicted_ratings.items():
    if pd.isna(user_item_matrix.loc[active_user_id, movie_id]) or user_item_matrix.loc[active_user_id, movie_id] == 0:
        user_item_matrix.loc[active_user_id, movie_id] = rating

# Step 2: Use the mean rating for remaining missing values
mean_ratings = user_item_matrix.mean(axis=1)
user_item_matrix.loc[active_user_id] = user_item_matrix.loc[active_user_id].replace(0, mean_ratings[active_user_id])

# Identify active movies (movies to predict) for the active user
active_movies = data[data['user_id'] == active_user_id]['item_id'].unique()

# Create a copy of the original DataFrame to store the modified ratings
processed_df = data.copy()

# Iterate through active movies and use the user-based neighborhood method to fill missing ratings
for movie_id in active_movies:
    missing_ratings = processed_df[(processed_df['user_id'] != active_user_id) &
                                   (processed_df['item_id'] == movie_id) &
                                   (processed_df['rating'].isnull())]

    for _, row in missing_ratings.iterrows():
        predicted_rating = get_recommended_ratings_for_user(active_user_id, movie_id, neighborhood_size)
        processed_df.at[row.name, 'rating'] = predicted_rating

In [72]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Split the dataset into training and testing sets
train_df, test_df = train_test_split(processed_df, test_size=0.2, random_state=42)

# Define the neural network model
model = Sequential()
model.add(Dense(64, input_dim=neighborhood_size, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='linear'))

# Compile the model
model.compile(optimizer='adam', loss='mse')

# Prepare the training data
X_train = train_df[['user_id', 'item_id']].values
y_train = train_df['rating'].values

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Prepare the testing data
X_test = test_df[['user_id', 'item_id']].values
y_test_true = test_df['rating'].values

# Make predictions using the trained model
y_test_pred = model.predict(X_test).flatten()

# Evaluate the model performance
mse = mean_squared_error(y_test_true, y_test_pred)
mae = mean_absolute_error(y_test_true, y_test_pred)

print(f'Mean Squared Error (MSE): {mse}')
print(f'Mean Absolute Error (MAE): {mae}')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Mean Squared Error (MSE): 1.6570231558822481
Mean Absolute Error (MAE): 1.0612854012131692


This model is actually a very good model for this, as it minimizes the error close to zero! It starts at a high value, but very quickly comes down and plateus in the first couple of epochs.

# Task 5: Categorize the movies into two classes based on a threshold rating of 4

Categorize the movies into two classes based on a threshold rating of 4 Define any movie with a user rating of 4 or higher as 'Like.' Similarly, define movies with ratings below 4 as 'Dislike.' Develop a neural network (NN) model with two hidden layers specifically to predict movie preferences for user #1.

In [74]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Categorize movies into two classes based on the threshold rating of 4
data['preference'] = np.where(data['rating'] >= 4, 'Like', 'Dislike')

# Filter data for the active user
user_data = data[data['user_id'] == active_user_id]

# Split the dataset into training and testing sets
train_data, test_data = train_test_split(user_data, test_size=0.2, random_state=42)

# Define features and target variable
X_train = train_data[['item_id']].values
y_train = (train_data['preference'] == 'Like').astype(int)

X_test = test_data[['item_id']].values
y_test_true = (test_data['preference'] == 'Like').astype(int)

# Standardize features (optional, but often beneficial for neural networks)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the neural network model
model = Sequential()
model.add(Dense(64, input_dim=1, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Make predictions using the trained model
y_test_pred_prob = model.predict(X_test)
y_test_pred = (y_test_pred_prob >= 0.5).astype(int)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.5454545454545454


# Task 6: Accuracy

Use “Accuracy = percentage of correct predictions” to evaluate the performance of the NN model.

In [75]:
# Evaluate the model performance
accuracy = accuracy_score(y_test_true, y_test_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.5454545454545454


This model, unfortunately, is not very accurate! Half of the time, it will guess incorrectly on if a user will like or dislike a movie. Ideally, it would be more closer to 95% at the minimum! I think the cause of this may be found in a pre-processing stage, and more accurately predicting the user-item matrix, as this is what powers the learning for this model.