In [11]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [12]:
# Create a dataset class
class InteractionDataset(Dataset):
    """Map-style dataset pairing each (user, item) index pair with its rating.

    Args:
        user_item_pairs: Sized, indexable collection whose ``idx``-th entry is
            the (user index, item index) pair of interaction ``idx``.
        ratings: Sized, indexable collection whose ``idx``-th entry is the
            rating of interaction ``idx``.

    Raises:
        ValueError: If the two collections do not have the same length.
    """

    def __init__(self, user_item_pairs, ratings):
        # Fail fast: __len__ is driven by `ratings` alone, so a mismatch would
        # otherwise only show up as an IndexError part-way through an epoch
        # (too few pairs) or as silently ignored pairs (too many).
        if len(user_item_pairs) != len(ratings):
            raise ValueError(
                f"user_item_pairs and ratings must have the same length, "
                f"got {len(user_item_pairs)} and {len(ratings)}"
            )
        self.user_item_pairs = user_item_pairs
        self.ratings = ratings

    def __len__(self):
        """Return the number of interactions."""
        return len(self.ratings)

    def __getitem__(self, idx):
        """Return ``(user_item_pair, rating)`` for interaction ``idx``."""
        return self.user_item_pairs[idx], self.ratings[idx]


# Define the NCF model
class NCF(nn.Module):
    """Neural collaborative filtering model that regresses a rating.

    A user embedding and an item embedding are concatenated and passed through
    a small MLP (Linear -> BatchNorm -> ReLU -> Dropout, twice) ending in a
    single linear output unit.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        latent_dim: Size of each embedding vector.
    """

    def __init__(self, num_users, num_items, latent_dim):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, latent_dim)
        self.item_embedding = nn.Embedding(num_items, latent_dim)
        self.fc_layers = nn.Sequential(
            # Input is the concatenation of both embeddings, hence latent_dim * 2.
            nn.Linear(latent_dim * 2, 128),
            nn.BatchNorm1d(128),  # Batch normalization
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(64, 1)
        )

    def forward(self, user, item):
        """Predict one rating per (user, item) pair.

        Args:
            user: Integer tensor of user indices, shape ``(batch,)``.
            item: Integer tensor of item indices, shape ``(batch,)``.

        Returns:
            Float tensor of shape ``(batch,)`` with the predicted ratings.
        """
        user_embed = self.user_embedding(user)
        item_embed = self.item_embedding(item)
        interaction = torch.cat([user_embed, item_embed], dim=-1)
        # Drop only the trailing size-1 output dimension. A bare squeeze()
        # would also remove the batch dimension when the batch holds a single
        # pair, yielding a 0-d tensor that no longer matches a (1,) target.
        return self.fc_layers(interaction).squeeze(-1)

In [13]:
# Prepare data
train_df = pd.read_csv("../data/train.csv")
# Raw (user_id, book_id) pairs; training itself uses the re-indexed pairs built below.
user_item_pairs = train_df[["user_id", "book_id"]].values

# Map the raw user_id / book_id values to contiguous 0-based indices (in order
# of first appearance) so they can address rows of the embedding tables.
user_to_index = {user_id: idx for idx, user_id in enumerate(train_df['user_id'].unique())}
item_to_index = {book_id: idx for idx, book_id in enumerate(train_df['book_id'].unique())}
# Series.map with a dict is the vectorised form of a per-row dict lookup.
# The int64 cast raises if any id failed to map (map leaves NaN for those),
# so a bad lookup cannot slip through as a float index.
train_df['user_idx'] = train_df['user_id'].map(user_to_index).astype("int64")
train_df['book_idx'] = train_df['book_id'].map(item_to_index).astype("int64")
user_item_index_pairs = train_df[["user_idx", "book_idx"]].values
ratings = train_df["rating"].values

# Define constants
num_users = len(user_to_index)
num_items = len(item_to_index)
latent_dim = 5
batch_size = 256
epochs = 18
seed = 42

# Seed PyTorch's global RNG before anything random is created so that weight
# initialisation, DataLoader shuffling and dropout masks repeat on a fresh run.
torch.manual_seed(seed)

# DataLoader
dataset = InteractionDataset(user_item_index_pairs, ratings)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Train the model
model = NCF(num_users, num_items, latent_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)

In [14]:
for epoch in range(epochs):
    # Put the model in training mode at the start of every epoch.
    model.train()
    batch_losses = []
    for user_item, rating in data_loader:
        # Column 0 carries the user indices, column 1 the item indices.
        user = user_item[:, 0].long()
        item = user_item[:, 1].long()
        rating = rating.float()

        # Standard optimisation step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        predictions = model(user, item)
        loss = criterion(predictions, rating)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    # Report the mean of the per-batch losses for this epoch.
    total_loss = sum(batch_losses)
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(data_loader):.4f}")

Epoch 1, Loss: 2.1484
Epoch 2, Loss: 1.5056
Epoch 3, Loss: 1.4081
Epoch 4, Loss: 1.3333
Epoch 5, Loss: 1.2644
Epoch 6, Loss: 1.1895
Epoch 7, Loss: 1.1097
Epoch 8, Loss: 1.0308
Epoch 9, Loss: 0.9549
Epoch 10, Loss: 0.8906
Epoch 11, Loss: 0.8334
Epoch 12, Loss: 0.7883
Epoch 13, Loss: 0.7381
Epoch 14, Loss: 0.6978
Epoch 15, Loss: 0.6571
Epoch 16, Loss: 0.6200
Epoch 17, Loss: 0.5809
Epoch 18, Loss: 0.5530


In [15]:
# Read the test set that the trained model will score.
test_df = pd.read_csv(filepath_or_buffer="../data/test.csv")


# Imputation (predict missing values)
def predict_missing_values(model, test_df, user_to_index, item_to_index, batch_size=4096):
    """Predict a rating for every (user_id, book_id) row of ``test_df``.

    Inference runs in mini-batches under a single ``torch.no_grad()`` context
    instead of one forward pass per row.

    Args:
        model: Module called as ``model(user_indices, item_indices)`` with two
            1-D int64 tensors and returning one prediction per pair. It is
            switched to eval mode and left in eval mode.
        test_df: DataFrame with ``user_id`` and ``book_id`` columns. It is not
            modified.
        user_to_index: Dict mapping raw user ids to user indices.
        item_to_index: Dict mapping raw book ids to item indices.
        batch_size: Number of pairs scored per forward pass.

    Returns:
        A list with one single-element list ``[prediction]`` per row of
        ``test_df``, in row order.

    Raises:
        KeyError: If any ``user_id`` or ``book_id`` is missing from its mapping.
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Vectorised dict lookup on local Series, so the caller's frame is left untouched.
    user_idx = test_df['user_id'].map(user_to_index)
    item_idx = test_df['book_id'].map(item_to_index)

    # map() leaves NaN for ids without an entry; surface those as a KeyError
    # that names some offending ids rather than failing on the first lookup.
    unseen_users = user_idx.isna().to_numpy()
    unseen_items = item_idx.isna().to_numpy()
    if unseen_users.any() or unseen_items.any():
        sample_users = test_df['user_id'].to_numpy()[unseen_users][:5].tolist()
        sample_items = test_df['book_id'].to_numpy()[unseen_items][:5].tolist()
        raise KeyError(
            f"{int((unseen_users | unseen_items).sum())} row(s) reference ids missing "
            f"from the index mappings (e.g. user_id {sample_users}, book_id {sample_items})"
        )

    users = torch.as_tensor(user_idx.to_numpy(dtype="int64"))
    items = torch.as_tensor(item_idx.to_numpy(dtype="int64"))

    model.eval()
    submission = []
    with torch.no_grad():
        for start in range(0, len(users), batch_size):
            stop = start + batch_size
            batch_predictions = model(users[start:stop], items[start:stop])
            # reshape(-1) copes with models that return a 0-d tensor for a
            # single-pair batch as well as those that return shape (batch,).
            submission.extend([value] for value in batch_predictions.reshape(-1).tolist())
    return submission


# Score every row of the test set with the trained model.
submission = predict_missing_values(
    model,
    test_df,
    user_to_index,
    item_to_index,
)

# Save the submission: a single 'rating' column, with the row index written alongside it.
submission_df = pd.DataFrame(data=submission, columns=['rating'])
submission_df.to_csv("submission.csv", index=True)

### Kaggle results

time : 40 sec

score : 0.81