# DIS Project 2: Recommender Systems

In this project, we aim to build a recommender system using matrix factorization techniques for predicting user ratings for books based on historical data.

In [7]:
# Imports

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, KFold

import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from IPython.display import clear_output

import warnings
warnings.filterwarnings('ignore')

# Check device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cpu


In [8]:
import random

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # For CUDA
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

### Data loading and preprocessing

In [11]:
data_dir = 'data/'

train_df = pd.read_csv(f'{data_dir}train.csv')
test_df = pd.read_csv(f'{data_dir}test.csv')
books_df = pd.read_csv(f'{data_dir}books.csv')

# Display first few rows of each DataFrame
print("Train Data:")
display(train_df.head())

print("\nTest Data:")
display(test_df.head())

print("\nBooks Data:")
display(books_df.head())

Train Data:


Unnamed: 0,book_id,user_id,rating
0,7260,20145,3.5
1,243238,85182,4.0
2,9135,45973,1.0
3,18671,63554,3.0
4,243293,81002,5.0



Test Data:


Unnamed: 0,id,book_id,user_id
0,0,3786,40484
1,1,1985,47039
2,2,2290,60111
3,3,118657,64447
4,4,1560,2953



Books Data:


Unnamed: 0,ISBN,book_id
0,2005018,1
1,374157065,3
2,399135782,5
3,440234743,18
4,452264464,19


Mapping users and books to indices

In [12]:
# Get unique user_ids and book_ids from training data
unique_users = train_df['user_id'].unique()
unique_books = train_df['book_id'].unique()

# Create mappings
user2idx = {user_id: idx for idx, user_id in enumerate(unique_users)}
book2idx = {book_id: idx for idx, book_id in enumerate(unique_books)}

# Apply mappings to train and test data
train_df['user_idx'] = train_df['user_id'].map(user2idx)
train_df['book_idx'] = train_df['book_id'].map(book2idx)

test_df['user_idx'] = test_df['user_id'].map(user2idx)
test_df['book_idx'] = test_df['book_id'].map(book2idx)

# Handle users/books in test set not seen in training
# Assign a special index for unknown users/books
unknown_user_idx = len(user2idx)
unknown_book_idx = len(book2idx)

test_df['user_idx'] = test_df['user_idx'].fillna(unknown_user_idx).astype(int)
test_df['book_idx'] = test_df['book_idx'].fillna(unknown_book_idx).astype(int)

# Update mappings to include unknown user and book
if unknown_user_idx not in user2idx.values():
    user2idx['unknown'] = unknown_user_idx

if unknown_book_idx not in book2idx.values():
    book2idx['unknown'] = unknown_book_idx

print(f"Number of unique users: {len(user2idx)}")
print(f"Number of unique books: {len(book2idx)}")

Number of unique users: 18906
Number of unique books: 15713


In [13]:
# Split the original training data into training and validation sets
train_data, val_data = train_test_split(train_df, test_size=0.1, random_state=42)

print(f"Training Set Size: {len(train_data)}")
print(f"Validation Set Size: {len(val_data)}")

Training Set Size: 90470
Validation Set Size: 10053


Creating a class BookRatingsDataset allows us to encapsulate data, promote code reuse, improve maintainability, and model real-world entities in a structured and intuitive way.

In [14]:
class BookRatingsDataset(Dataset):
    def __init__(self, dataframe, is_train=True):
        self.is_train = is_train
        self.user = torch.tensor(dataframe['user_idx'].values, dtype=torch.long)
        self.book = torch.tensor(dataframe['book_idx'].values, dtype=torch.long)
        if self.is_train:
            self.rating = torch.tensor(dataframe['rating'].values, dtype=torch.float32)
        else:
            self.id = torch.tensor(dataframe['id'].values, dtype=torch.long)

    def __len__(self):
        return len(self.user)

    def __getitem__(self, idx):
        if self.is_train:
            return self.user[idx], self.book[idx], self.rating[idx]
        else:
            return self.id[idx], self.user[idx], self.book[idx]

In [15]:
default_batch_size = 1408

# Create Dataset objects
train_dataset = BookRatingsDataset(train_data, is_train=True)
val_dataset = BookRatingsDataset(val_data, is_train=True)
test_dataset = BookRatingsDataset(test_df, is_train=False)

# Create DataLoaders with num_workers=0 and pin_memory=True
train_loader = DataLoader(train_dataset, batch_size=default_batch_size, shuffle=True, num_workers=0, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=default_batch_size, shuffle=False, num_workers=0, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=default_batch_size, shuffle=False, num_workers=0, pin_memory=True)

In [17]:
class MatrixFactorization(nn.Module):
    def __init__(self, num_users, num_books, embedding_size=100, dropout_rate=0.1, device='cpu'):
        super(MatrixFactorization, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.book_embedding = nn.Embedding(num_books, embedding_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.book_bias = nn.Embedding(num_books, 1)
        self.global_bias = nn.Parameter(torch.tensor([3.0], device=device))  # Initialize on device
        self.dropout = nn.Dropout(dropout_rate)
        self.activation = nn.Sigmoid()

        self._init_weights()

    def _init_weights(self):
        nn.init.normal_(self.user_embedding.weight, mean=0, std=0.1)
        nn.init.normal_(self.book_embedding.weight, mean=0, std=0.1)
        nn.init.constant_(self.user_bias.weight, 0.0)
        nn.init.constant_(self.book_bias.weight, 0.0)

    def forward(self, user, book):
        user_emb = self.user_embedding(user)
        book_emb = self.book_embedding(book)
        user_b = self.user_bias(user).squeeze()
        book_b = self.book_bias(book).squeeze()
        interaction = (user_emb * book_emb).sum(1)
        pred = interaction + user_b + book_b + self.global_bias
        pred = self.activation(pred)
        pred = self.dropout(pred) * 4.0 + 1.0  # Scale to [1.0, 5.0]
        return pred

In [None]:
# Define model parameters
num_users = len(user2idx)
num_books = len(book2idx)
embedding_size = 145
dropout_rate = 0.1

# Initialize the model with device parameter
model = MatrixFactorization(num_users, num_books, embedding_size, dropout_rate, device=device)
model = model.to(device)  # Ensure the model is on the correct device
print(model)

MatrixFactorization(
  (user_embedding): Embedding(18906, 145)
  (book_embedding): Embedding(15713, 145)
  (user_bias): Embedding(18906, 1)
  (book_bias): Embedding(15713, 1)
  (dropout): Dropout(p=0.1, inplace=False)
  (activation): Sigmoid()
)


Training loop

In [19]:
# Define training parameters
learning_rate = 0.009968354230012818
epochs = 100
patience = 10  # Number of epochs to wait for improvement before stopping

# Define loss function and optimizer with L2 regularization (weight_decay)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)

# Initialize variables for Early Stopping
best_val_loss = float('inf')
patience_counter = 0

# Initialize a list to store the last 5 log messages
recent_logs = []
max_logs = 5  # Maximum number of logs to display

In [None]:
for epoch in range(1, epochs + 1):
    model.train()
    train_losses = []

    for user, book, rating in tqdm(train_loader, desc=f"Epoch {epoch}/{epochs} - Training"):
        # Move data to GPU
        user = user.to(device, non_blocking=True)
        book = book.to(device, non_blocking=True)
        rating = rating.to(device, non_blocking=True)

        optimizer.zero_grad()
        outputs = model(user, book)
        loss = criterion(outputs, rating)
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())

    avg_train_loss = np.mean(train_losses)

    # Validation Phase
    model.eval()
    val_losses = []
    with torch.no_grad():
        for user, book, rating in tqdm(val_loader, desc=f"Epoch {epoch}/{epochs} - Validation"):
            # Move data to GPU
            user = user.to(device, non_blocking=True)
            book = book.to(device, non_blocking=True)
            rating = rating.to(device, non_blocking=True)

            outputs = model(user, book)
            loss = criterion(outputs, rating)
            val_losses.append(loss.item())

    avg_val_loss = np.mean(val_losses)

    # Logging
    log_message = f"Epoch {epoch}: Train Loss = {avg_train_loss:.4f}, Validation Loss = {avg_val_loss:.4f}"
    recent_logs.append(log_message)
    if len(recent_logs) > max_logs:
        recent_logs.pop(0)

    clear_output(wait=True)
    print("\n".join(recent_logs))

    # Early Stopping Check
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        # Save the best model
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

Epoch 37: Train Loss = 0.3850, Validation Loss = 0.7237
Epoch 38: Train Loss = 0.3842, Validation Loss = 0.7228
Epoch 39: Train Loss = 0.3957, Validation Loss = 0.7230
Epoch 40: Train Loss = 0.3810, Validation Loss = 0.7282
Epoch 41: Train Loss = 0.3759, Validation Loss = 0.7250
Early stopping triggered.


### Evaluation and submission

In [21]:
# Load the best model
model.load_state_dict(torch.load('best_model.pth'))
model = model.to(device)
model.eval()

MatrixFactorization(
  (user_embedding): Embedding(18906, 145)
  (book_embedding): Embedding(15713, 145)
  (user_bias): Embedding(18906, 1)
  (book_bias): Embedding(15713, 1)
  (dropout): Dropout(p=0.1, inplace=False)
  (activation): Sigmoid()
)

In [22]:
def calculate_rmse(model, data_loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for user, book, rating in data_loader:
            user = user.to(device)
            book = book.to(device)
            rating = rating.to(device)  # Ensure ratings are on the correct device
            outputs = model(user, book)
            predictions.extend(outputs.cpu().numpy())
            actuals.extend(rating.cpu().numpy())  # Move to CPU for metric calculation
    mse = mean_squared_error(actuals, predictions)
    rmse = np.sqrt(mse)
    return rmse

def calculate_mae(model, data_loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for user, book, rating in data_loader:
            user = user.to(device)
            book = book.to(device)
            rating = rating.to(device)  # Ensure ratings are on the correct device
            outputs = model(user, book)
            predictions.extend(outputs.cpu().numpy())
            actuals.extend(rating.cpu().numpy())  # Move to CPU for metric calculation
    mae = mean_absolute_error(actuals, predictions)
    return mae

In [23]:
# Calculate RMSE and MAE on validation set
val_rmse = calculate_rmse(model, val_loader)
val_mae = calculate_mae(model, val_loader)

print(f"Validation RMSE: {val_rmse:.4f}")
print(f"Validation MAE: {val_mae:.4f}")

Validation RMSE: 0.8576
Validation MAE: 0.6664


Generating predictions for test set

In [24]:
def generate_test_predictions(model, data_loader):
    model.eval()
    predictions = []
    ids = []
    with torch.no_grad():
        for id_batch, user, book in tqdm(data_loader, desc="Generating Predictions"):
            user = user.to(device)  # Move user tensor to the model's device
            book = book.to(device)  # Move book tensor to the model's device
            id_batch = id_batch.to(device)  # Ensure id_batch is also moved if necessary

            outputs = model(user, book)
            preds = outputs.cpu().numpy()
            predictions.extend(preds)
            ids.extend(id_batch.cpu().numpy())
    return ids, predictions

# Generate predictions
test_ids, test_predictions = generate_test_predictions(model, test_loader)

Generating Predictions:   0%|          | 0/21 [00:00<?, ?it/s]

Preparing the Submission File

In [25]:
# Create a DataFrame for submission
submission_df = pd.DataFrame({
    'id': test_ids,
    'rating': test_predictions
})

# Ensure ratings are within the 1.0 to 5.0 range
submission_df['rating'] = submission_df['rating'].clip(1.0, 5.0)

# Display the first few predictions
submission_df.head()

Unnamed: 0,id,rating
0,0,2.281063
1,1,1.886247
2,2,1.527935
3,3,2.54508
4,4,2.424813


In [26]:
# Save the submission file
submission_df.to_csv('submission.csv', index=False)
print("Submission file 'submission.csv' has been created.")

Submission file 'submission.csv' has been created.
