# Hyperparameter optimization with Optuna library

This notebook serves the purpose of finding a good hyperparameter configuration for our matrix factorization approach. We will use Optuna for hyperparameter optimization to improve the performance of our recommendation system. The notebook includes data loading, preprocessing, model training, evaluation, and submission preparation steps.

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, KFold

import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from IPython.display import clear_output

import warnings
warnings.filterwarnings('ignore')

# Check device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cpu


In [3]:
import random

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # For CUDA
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Set the seed
set_seed(42)

### Data loading and preprocessing

In [4]:
data_dir = 'data/'

train_df = pd.read_csv(f'{data_dir}train.csv')
test_df = pd.read_csv(f'{data_dir}test.csv')
books_df = pd.read_csv(f'{data_dir}books.csv')

# Display first few rows of each DataFrame
print("Train Data:")
display(train_df.head())

print("\nTest Data:")
display(test_df.head())

print("\nBooks Data:")
display(books_df.head())

Train Data:


Unnamed: 0,book_id,user_id,rating
0,7260,20145,3.5
1,243238,85182,4.0
2,9135,45973,1.0
3,18671,63554,3.0
4,243293,81002,5.0



Test Data:


Unnamed: 0,id,book_id,user_id
0,0,3786,40484
1,1,1985,47039
2,2,2290,60111
3,3,118657,64447
4,4,1560,2953



Books Data:


Unnamed: 0,ISBN,book_id
0,2005018,1
1,374157065,3
2,399135782,5
3,440234743,18
4,452264464,19


Mapping users and books to indices

In [5]:
# Get unique user_ids and book_ids from training data
unique_users = train_df['user_id'].unique()
unique_books = train_df['book_id'].unique()

# Create mappings
user2idx = {user_id: idx for idx, user_id in enumerate(unique_users)}
book2idx = {book_id: idx for idx, book_id in enumerate(unique_books)}

# Apply mappings to train and test data
train_df['user_idx'] = train_df['user_id'].map(user2idx)
train_df['book_idx'] = train_df['book_id'].map(book2idx)

test_df['user_idx'] = test_df['user_id'].map(user2idx)
test_df['book_idx'] = test_df['book_id'].map(book2idx)

# Handle users/books in test set not seen in training
# Assign a special index for unknown users/books
unknown_user_idx = len(user2idx)
unknown_book_idx = len(book2idx)

test_df['user_idx'] = test_df['user_idx'].fillna(unknown_user_idx).astype(int)
test_df['book_idx'] = test_df['book_idx'].fillna(unknown_book_idx).astype(int)

# Update mappings to include unknown user and book
if unknown_user_idx not in user2idx.values():
    user2idx['unknown'] = unknown_user_idx

if unknown_book_idx not in book2idx.values():
    book2idx['unknown'] = unknown_book_idx

print(f"Number of unique users: {len(user2idx)}")
print(f"Number of unique books: {len(book2idx)}")

Number of unique users: 18906
Number of unique books: 15713


In [6]:
# Split the original training data into training and validation sets
train_data, val_data = train_test_split(train_df, test_size=0.1, random_state=42)

print(f"Training Set Size: {len(train_data)}")
print(f"Validation Set Size: {len(val_data)}")

Training Set Size: 90470
Validation Set Size: 10053


In [7]:
class BookRatingsDataset(Dataset):
    def __init__(self, dataframe, is_train=True):
        self.is_train = is_train
        self.user = torch.tensor(dataframe['user_idx'].values, dtype=torch.long)
        self.book = torch.tensor(dataframe['book_idx'].values, dtype=torch.long)
        if self.is_train:
            self.rating = torch.tensor(dataframe['rating'].values, dtype=torch.float32)
        else:
            self.id = torch.tensor(dataframe['id'].values, dtype=torch.long)

    def __len__(self):
        return len(self.user)

    def __getitem__(self, idx):
        if self.is_train:
            return self.user[idx], self.book[idx], self.rating[idx]
        else:
            return self.id[idx], self.user[idx], self.book[idx]

In [8]:
# Define default batch size
default_batch_size = 1024  # This will be optimized later

# Create Dataset objects
train_dataset = BookRatingsDataset(train_data, is_train=True)
val_dataset = BookRatingsDataset(val_data, is_train=True)
test_dataset = BookRatingsDataset(test_df, is_train=False)

# Create DataLoaders with num_workers=0
train_loader = DataLoader(train_dataset, batch_size=default_batch_size, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=default_batch_size, shuffle=False, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=default_batch_size, shuffle=False, num_workers=0)

In [9]:
class MatrixFactorization(nn.Module):
    def __init__(self, num_users, num_books, embedding_size=100, dropout_rate=0.1):
        super(MatrixFactorization, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.book_embedding = nn.Embedding(num_books, embedding_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.book_bias = nn.Embedding(num_books, 1)
        self.global_bias = nn.Parameter(torch.tensor([3.0]))
        self.dropout = nn.Dropout(dropout_rate)
        self.activation = nn.Sigmoid()

        self._init_weights()

    def _init_weights(self):
        nn.init.normal_(self.user_embedding.weight, mean=0, std=0.1)
        nn.init.normal_(self.book_embedding.weight, mean=0, std=0.1)
        nn.init.constant_(self.user_bias.weight, 0.0)
        nn.init.constant_(self.book_bias.weight, 0.0)

    def forward(self, user, book):
        user_emb = self.user_embedding(user)
        book_emb = self.book_embedding(book)
        user_b = self.user_bias(user).squeeze()
        book_b = self.book_bias(book).squeeze()
        dot = (user_emb * book_emb).sum(1)
        pred = dot + user_b + book_b + self.global_bias
        pred = self.activation(pred)
        pred = self.dropout(pred) * 4.0 + 1.0  # Scale to [1.0, 5.0]
        return pred

In [10]:
# Define model parameters
num_users = len(user2idx)
num_books = len(book2idx)
embedding_size = 100
dropout_rate = 0.1

# Initialize the model
model = MatrixFactorization(num_users, num_books, embedding_size, dropout_rate)
model = model.to(device)
print(model)

MatrixFactorization(
  (user_embedding): Embedding(18906, 100)
  (book_embedding): Embedding(15713, 100)
  (user_bias): Embedding(18906, 1)
  (book_bias): Embedding(15713, 1)
  (dropout): Dropout(p=0.1, inplace=False)
  (activation): Sigmoid()
)


In [11]:
# Define training parameters
learning_rate = 0.001
epochs = 100
patience = 10  # Number of epochs to wait for improvement before stopping

# Define loss function and optimizer with L2 regularization (weight_decay)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)

# Initialize variables for Early Stopping
best_val_loss = float('inf')
patience_counter = 0

# Initialize a list to store the last 5 log messages
recent_logs = []
max_logs = 5  # Maximum number of logs to display

Training loop

In [12]:
for epoch in range(1, epochs + 1):
    model.train()
    train_losses = []

    for user, book, rating in tqdm(train_loader, desc=f"Epoch {epoch}/{epochs} - Training"):
        user = user.to(device)
        book = book.to(device)
        rating = rating.to(device)

        optimizer.zero_grad()
        outputs = model(user, book)
        loss = criterion(outputs, rating)
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())

    avg_train_loss = np.mean(train_losses)

    # Validation Phase
    model.eval()
    val_losses = []
    with torch.no_grad():
        for user, book, rating in tqdm(val_loader, desc=f"Epoch {epoch}/{epochs} - Validation"):
            user = user.to(device)
            book = book.to(device)
            rating = rating.to(device)

            outputs = model(user, book)
            loss = criterion(outputs, rating)
            val_losses.append(loss.item())

    avg_val_loss = np.mean(val_losses)

    # Logging
    log_message = f"Epoch {epoch}: Train Loss = {avg_train_loss:.4f}, Validation Loss = {avg_val_loss:.4f}"
    recent_logs.append(log_message)
    if len(recent_logs) > max_logs:
        recent_logs.pop(0)

    clear_output(wait=True)
    print("\n".join(recent_logs))

    # Early Stopping Check
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        # Save the best model
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

Epoch 96: Train Loss = 0.3429, Validation Loss = 0.7633
Epoch 97: Train Loss = 0.3366, Validation Loss = 0.7622
Epoch 98: Train Loss = 0.3385, Validation Loss = 0.7609
Epoch 99: Train Loss = 0.3505, Validation Loss = 0.7601
Epoch 100: Train Loss = 0.3497, Validation Loss = 0.7592


### HPO with Optuna

In [12]:
import optuna
from optuna.trial import TrialState
import joblib

In [13]:
def objective(trial):
    emb_size = trial.suggest_int('embedding_size', 75, 200, step=1)
    lr = trial.suggest_loguniform('learning_rate', 1e-4, 1e-2)
    batch_size = trial.suggest_int('batch_size', 512, 2048, step=128)

    # Create DataLoaders with the suggested batch size and set num_workers=0
    train_loader_hpo = DataLoader(BookRatingsDataset(train_data, is_train=True), batch_size=batch_size, shuffle=True, num_workers=0)
    val_loader_hpo = DataLoader(BookRatingsDataset(val_data, is_train=True), batch_size=batch_size, shuffle=False, num_workers=0)

    # Initialize the model with the suggested embedding size
    model_hpo = MatrixFactorization(num_users, num_books, embedding_size=emb_size, dropout_rate=0.1)
    model_hpo = model_hpo.to(device)

    # Define loss function and optimizer with the suggested learning rate
    criterion_hpo = nn.MSELoss()
    optimizer_hpo = optim.Adam(model_hpo.parameters(), lr=lr, weight_decay=1e-5)

    # Training parameters
    epochs_hpo = 100
    patience_hpo = 10

    # Initialize Early Stopping variables
    best_val_loss_hpo = float('inf')
    patience_counter_hpo = 0

    for epoch in range(1, epochs_hpo + 1):
        model_hpo.train()
        train_losses_hpo = []

        for user, book, rating in train_loader_hpo:
            user = user.to(device)
            book = book.to(device)
            rating = rating.to(device)

            optimizer_hpo.zero_grad()
            outputs = model_hpo(user, book)
            loss = criterion_hpo(outputs, rating)
            loss.backward()
            optimizer_hpo.step()

            train_losses_hpo.append(loss.item())

        avg_train_loss_hpo = np.mean(train_losses_hpo)

        # Validation Phase
        model_hpo.eval()
        val_losses_hpo = []
        with torch.no_grad():
            for user, book, rating in val_loader_hpo:
                user = user.to(device)
                book = book.to(device)
                rating = rating.to(device)

                outputs = model_hpo(user, book)
                loss = criterion_hpo(outputs, rating)
                val_losses_hpo.append(loss.item())

        avg_val_loss_hpo = np.mean(val_losses_hpo)

        # Early Stopping Check
        if avg_val_loss_hpo < best_val_loss_hpo:
            best_val_loss_hpo = avg_val_loss_hpo
            patience_counter_hpo = 0
            # Save the best model for this trial
            torch.save(model_hpo.state_dict(), 'best_model_hpo.pth')
        else:
            patience_counter_hpo += 1
            if patience_counter_hpo >= patience_hpo:
                break  # Early stopping

    return best_val_loss_hpo

In [14]:
# Create an Optuna study
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))

# Optimize the objective function
study.optimize(objective, n_trials=50, timeout=None)  # Adjust n_trials as needed

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2024-12-09 11:57:39,153] A new study created in memory with name: no-name-9de22257-b6cd-4081-b1ec-f6f0a5d3c453
[I 2024-12-09 11:58:54,314] Trial 0 finished with value: 0.7312820383480617 and parameters: {'embedding_size': 122, 'learning_rate': 0.007969454818643935, 'batch_size': 1664}. Best is trial 0 with value: 0.7312820383480617.
[I 2024-12-09 12:02:39,201] Trial 1 finished with value: 1.6670062116214208 and parameters: {'embedding_size': 150, 'learning_rate': 0.0002051338263087451, 'batch_size': 768}. Best is trial 0 with value: 0.7312820383480617.
[I 2024-12-09 12:03:59,138] Trial 2 finished with value: 0.7219854518771172 and parameters: {'embedding_size': 82, 'learning_rate': 0.005399484409787433, 'batch_size': 1408}. Best is trial 2 with value: 0.7219854518771172.
[I 2024-12-09 12:06:15,655] Trial 3 finished with value: 5.384723949432373 and parameters: {'embedding_size': 164, 'learning_rate': 0.00010994335574766199, 'batch_size': 2048}. Best is trial 2 with value: 0.72198545

Number of finished trials:  50
Best trial:
  Value: 0.714960440993309
  Params: 
    embedding_size: 145
    learning_rate: 0.009968354230012818
    batch_size: 1408


In [15]:
# Visualize optimization history
optuna.visualization.plot_optimization_history(study)
plt.show()

In [16]:
# Visualize parameter importances
optuna.visualization.plot_param_importances(study)
plt.show()

In [17]:
# Save the study
joblib.dump(study, 'optuna_study.pkl')

# To load the study later
# study = joblib.load('optuna_study.pkl')

['optuna_study.pkl']

### Evaluation and submission

In [18]:
trial = study.best_trial


# Extract best hyperparameters
best_embedding_size = int(trial.params['embedding_size'])
best_learning_rate = trial.params['learning_rate']
best_batch_size = int(trial.params['batch_size'])

print(f"Best Hyperparameters:\nEmbedding Size: {best_embedding_size}\nLearning Rate: {best_learning_rate}\nBatch Size: {best_batch_size}")

Best Hyperparameters:
Embedding Size: 145
Learning Rate: 0.009968354230012818
Batch Size: 1408


In [19]:
# Create DataLoaders with the best batch size and set num_workers=0
train_loader_best = DataLoader(BookRatingsDataset(train_data, is_train=True), batch_size=best_batch_size, shuffle=True, num_workers=0)
val_loader_best = DataLoader(BookRatingsDataset(val_data, is_train=True), batch_size=best_batch_size, shuffle=False, num_workers=0)
test_loader_best = DataLoader(BookRatingsDataset(test_df, is_train=False), batch_size=best_batch_size, shuffle=False, num_workers=0)

In [20]:
# Initialize the final model with best embedding size
final_model = MatrixFactorization(num_users, num_books, embedding_size=best_embedding_size, dropout_rate=0.1)
final_model = final_model.to(device)

# Define loss function and optimizer with best learning rate
criterion_final = nn.MSELoss()
optimizer_final = optim.Adam(final_model.parameters(), lr=best_learning_rate, weight_decay=1e-5)

# Training parameters
epochs_final = 100
patience_final = 10

# Initialize Early Stopping variables
best_val_loss_final = float('inf')
patience_counter_final = 0

# Initialize recent_logs for logging
recent_logs_final = []
max_logs_final = 5  # Maximum number of logs to display

In [21]:
for epoch in range(1, epochs_final + 1):
    final_model.train()
    train_losses_final = []

    for user, book, rating in tqdm(train_loader_best, desc=f"Epoch {epoch}/{epochs_final} - Training"):
        user = user.to(device)
        book = book.to(device)
        rating = rating.to(device)

        optimizer_final.zero_grad()
        outputs = final_model(user, book)
        loss = criterion_final(outputs, rating)
        loss.backward()
        optimizer_final.step()

        train_losses_final.append(loss.item())

    avg_train_loss_final = np.mean(train_losses_final)

    # Validation Phase
    final_model.eval()
    val_losses_final = []
    with torch.no_grad():
        for user, book, rating in tqdm(val_loader_best, desc=f"Epoch {epoch}/{epochs_final} - Validation"):
            user = user.to(device)
            book = book.to(device)
            rating = rating.to(device)

            outputs = final_model(user, book)
            loss = criterion_final(outputs, rating)
            val_losses_final.append(loss.item())

    avg_val_loss_final = np.mean(val_losses_final)

    # Logging
    log_message_final = f"Epoch {epoch}: Train Loss = {avg_train_loss_final:.4f}, Validation Loss = {avg_val_loss_final:.4f}"
    recent_logs_final.append(log_message_final)
    if len(recent_logs_final) > max_logs_final:
        recent_logs_final.pop(0)

    clear_output(wait=True)
    print("\n".join(recent_logs_final))

    # Early Stopping Check
    if avg_val_loss_final < best_val_loss_final:
        best_val_loss_final = avg_val_loss_final
        patience_counter_final = 0
        # Save the best model
        torch.save(final_model.state_dict(), 'best_final_model.pth')
    else:
        patience_counter_final += 1
        if patience_counter_final >= patience_final:
            print("Early stopping triggered.")
            break

Epoch 34: Train Loss = 0.3820, Validation Loss = 0.7202
Epoch 35: Train Loss = 0.3815, Validation Loss = 0.7218
Epoch 36: Train Loss = 0.3927, Validation Loss = 0.7202
Epoch 37: Train Loss = 0.3854, Validation Loss = 0.7239
Epoch 38: Train Loss = 0.3908, Validation Loss = 0.7219
Early stopping triggered.


In [22]:
# Load the best model
final_model.load_state_dict(torch.load('best_final_model.pth'))
final_model = final_model.to(device)

In [23]:
# Define function to calculate RMSE
def calculate_rmse(model, data_loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for user, book, rating in data_loader:
            user = user.to(device)
            book = book.to(device)
            outputs = model(user, book)
            predictions.extend(outputs.cpu().numpy())
            actuals.extend(rating.numpy())
    mse = mean_squared_error(actuals, predictions)
    rmse = np.sqrt(mse)
    return rmse

# Define function to calculate MAE
def calculate_mae(model, data_loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for user, book, rating in data_loader:
            user = user.to(device)
            book = book.to(device)
            outputs = model(user, book)
            predictions.extend(outputs.cpu().numpy())
            actuals.extend(rating.numpy())
    mae = mean_absolute_error(actuals, predictions)
    return mae

In [24]:
# Calculate RMSE and MAE on validation set
val_rmse = calculate_rmse(final_model, val_loader_best)
val_mae = calculate_mae(final_model, val_loader_best)

print(f"Validation RMSE: {val_rmse:.4f}")
print(f"Validation MAE: {val_mae:.4f}")

Validation RMSE: 0.8595
Validation MAE: 0.6718


Generating predictions for test set

In [25]:
# Define a function to generate predictions for test set
def generate_test_predictions(model, data_loader):
    model.eval()
    predictions = []
    ids = []
    with torch.no_grad():
        for id_batch, user, book in tqdm(data_loader, desc="Generating Predictions"):
            user = user.to(device)
            book = book.to(device)
            outputs = model(user, book)
            preds = outputs.cpu().numpy()
            predictions.extend(preds)
            ids.extend(id_batch.numpy())
    return ids, predictions

In [26]:
# Generate predictions
test_ids, test_predictions = generate_test_predictions(final_model, test_loader_best)

Generating Predictions:   0%|          | 0/21 [00:00<?, ?it/s]

Preparing the Submission File

In [27]:
# Create a DataFrame for submission
submission_df = pd.DataFrame({
    'id': test_ids,
    'rating': test_predictions
})

# Ensure ratings are within the 1.0 to 5.0 range
submission_df['rating'] = submission_df['rating'].clip(1.0, 5.0)

# Display the first few predictions
submission_df.head()

Unnamed: 0,id,rating
0,0,2.210467
1,1,1.919093
2,2,1.469288
3,3,2.478331
4,4,2.343541


In [28]:
# Save the submission file
submission_df.to_csv('submission.csv', index=False)
print("Submission file 'submission.csv' has been created.")

Submission file 'submission.csv' has been created.
