In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, random_split

In [2]:
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7e76778d29d0>

In [3]:
EMBED_DIM = 200    
LR = 0.01       
EPOCHS = 50       
BATCH_SIZE = 4096
WEIGHT_DECAY = 1e-4

In [4]:
TRAIN_PATH = "/kaggle/input/dis-project2-datasets/train.csv"
TEST_PATH = "/kaggle/input/dis-project2-datasets/test.csv"
BOOKS_PATH = "/kaggle/input/dis-project2-datasets/books.csv"
SUBMISSION_OUTPUT = "submission.csv"

In [5]:
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

In [6]:
unique_users = train_df['user_id'].unique()
unique_books = train_df['book_id'].unique()

user_to_idx = {u: i for i, u in enumerate(unique_users)}
book_to_idx = {b: i for i, b in enumerate(unique_books)}

train_df['user_idx'] = train_df['user_id'].map(user_to_idx)
train_df['book_idx'] = train_df['book_id'].map(book_to_idx)

test_df['user_idx'] = test_df['user_id'].map(user_to_idx)
test_df['book_idx'] = test_df['book_id'].map(book_to_idx)

global_mean = train_df['rating'].mean()

test_df = test_df.dropna(subset=['user_idx', 'book_idx']).reset_index(drop=True)
test_df['user_idx'] = test_df['user_idx'].astype(int)
test_df['book_idx'] = test_df['book_idx'].astype(int)

train_users = train_df['user_idx'].values
train_items = train_df['book_idx'].values
train_ratings = train_df['rating'].values.astype(np.float32)

test_users = test_df['user_idx'].values
test_items = test_df['book_idx'].values

num_users = len(unique_users)
num_items = len(unique_books)

In [7]:
class RatingsDataset(Dataset):
    def __init__(self, users, items, ratings):
        self.users = users
        self.items = items
        self.ratings = ratings
    
    def __len__(self):
        return len(self.ratings)
    
    def __getitem__(self, idx):
        return (
            self.users[idx],
            self.items[idx],
            self.ratings[idx]
        )

dataset = RatingsDataset(train_users, train_items, train_ratings)

In [8]:
val_ratio = 0.1
val_size = int(len(dataset) * val_ratio)
train_size = len(dataset) - val_size
train_data, val_data = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False)

In [9]:
class MFModel(nn.Module):
    def __init__(self, num_users, num_items, emb_size=50, global_mean=3.0):
        super(MFModel, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)
        
        # Initialize embeddings
        nn.init.normal_(self.user_emb.weight, std=0.01)
        nn.init.normal_(self.item_emb.weight, std=0.01)
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)
        
        self.global_mean = global_mean

    def forward(self, user_ids, item_ids):
        u_emb = self.user_emb(user_ids)
        i_emb = self.item_emb(item_ids)
        u_b = self.user_bias(user_ids).squeeze()
        i_b = self.item_bias(item_ids).squeeze()
        
        # Dot product
        pred = (u_emb * i_emb).sum(dim=1)
        pred = pred + u_b + i_b + self.global_mean
        return pred

model = MFModel(num_users, num_items, emb_size=EMBED_DIM, global_mean=global_mean)

In [10]:
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

def rmse(y_true, y_pred):
    return torch.sqrt(((y_true - y_pred)**2).mean())

best_val_rmse = float('inf')
for epoch in range(EPOCHS):
    model.train()
    train_losses = []
    for users_batch, items_batch, ratings_batch in train_loader:
        users_batch = users_batch.long()
        items_batch = items_batch.long()
        ratings_batch = ratings_batch.float()

        optimizer.zero_grad()
        preds = model(users_batch, items_batch)
        loss = criterion(preds, ratings_batch)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())
    
    # Validation
    model.eval()
    val_losses = []
    with torch.no_grad():
        for u_val, i_val, r_val in val_loader:
            u_val = u_val.long()
            i_val = i_val.long()
            r_val = r_val.float()
            val_preds = model(u_val, i_val)
            v_loss = rmse(r_val, val_preds)
            val_losses.append(v_loss.item())
    avg_val_rmse = np.mean(val_losses)
    
    print(f"Epoch {epoch+1}/{EPOCHS}, Train Loss: {np.mean(train_losses):.4f}, Val RMSE: {avg_val_rmse:.4f}")
    if avg_val_rmse < best_val_rmse:
        best_val_rmse = avg_val_rmse
        torch.save(model.state_dict(), "best_mf_model.pth")

model.load_state_dict(torch.load("best_mf_model.pth"))

Epoch 1/50, Train Loss: 1.1608, Val RMSE: 1.0418
Epoch 2/50, Train Loss: 0.9738, Val RMSE: 0.9953
Epoch 3/50, Train Loss: 0.8228, Val RMSE: 0.9700
Epoch 4/50, Train Loss: 0.7295, Val RMSE: 0.9537
Epoch 5/50, Train Loss: 0.6705, Val RMSE: 0.9415
Epoch 6/50, Train Loss: 0.6229, Val RMSE: 0.9325
Epoch 7/50, Train Loss: 0.5889, Val RMSE: 0.9247
Epoch 8/50, Train Loss: 0.5688, Val RMSE: 0.9185
Epoch 9/50, Train Loss: 0.5471, Val RMSE: 0.9120
Epoch 10/50, Train Loss: 0.5333, Val RMSE: 0.9083
Epoch 11/50, Train Loss: 0.5252, Val RMSE: 0.9047
Epoch 12/50, Train Loss: 0.5111, Val RMSE: 0.9021
Epoch 13/50, Train Loss: 0.5076, Val RMSE: 0.8993
Epoch 14/50, Train Loss: 0.4998, Val RMSE: 0.8980
Epoch 15/50, Train Loss: 0.4915, Val RMSE: 0.8962
Epoch 16/50, Train Loss: 0.4882, Val RMSE: 0.8943
Epoch 17/50, Train Loss: 0.4844, Val RMSE: 0.8936
Epoch 18/50, Train Loss: 0.4771, Val RMSE: 0.8928
Epoch 19/50, Train Loss: 0.4776, Val RMSE: 0.8914
Epoch 20/50, Train Loss: 0.4736, Val RMSE: 0.8910
Epoch 21/

  model.load_state_dict(torch.load("best_mf_model.pth"))


<All keys matched successfully>

In [11]:
model.eval()
test_ids = test_df['id'].values
all_preds = []
BATCH_TEST = 4096
for start in range(0, len(test_users), BATCH_TEST):
    end = start + BATCH_TEST
    u_batch = torch.tensor(test_users[start:end], dtype=torch.long)
    i_batch = torch.tensor(test_items[start:end], dtype=torch.long)
    with torch.no_grad():
        pred_batch = model(u_batch, i_batch)
    pred_batch = torch.clamp(pred_batch, 1.0, 5.0)
    all_preds.append(pred_batch.numpy())
all_preds = np.concatenate(all_preds)

In [12]:
submission = pd.DataFrame({"id": test_ids, "rating": all_preds})
submission.to_csv(SUBMISSION_OUTPUT, index=False)
print("Submission file saved:", SUBMISSION_OUTPUT)

Submission file saved: submission.csv
