In [None]:
from tqdm import tqdm
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

In [None]:
# Default locations of the competition CSV files, relative to the notebook.
test_path = "../data/test.csv"
train_path = "../data/train.csv"

In [None]:
# Colab-only setup: mount Google Drive and point the data paths at it.
# The import is guarded so the cell is safe to execute everywhere (e.g. during
# "Restart & Run All" on a local machine): outside Colab the `google.colab`
# package is missing, nothing is mounted and the previously defined paths
# are left untouched.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive')
    train_path = "/content/drive/MyDrive/DIS_recomander_sys/train.csv"
    test_path = "/content/drive/MyDrive/DIS_recomander_sys/test.csv"

In [None]:
# Create a dataset class
class InteractionDataset(Dataset):
    """Map-style dataset pairing each (user, item) index row with its rating."""

    def __init__(self, user_item_pairs, ratings):
        # Both containers are stored as given; __getitem__ reads them with the
        # same position, so ratings[i] is expected to belong to
        # user_item_pairs[i].
        self.ratings = ratings
        self.user_item_pairs = user_item_pairs

    def __len__(self):
        """Number of samples, taken from the ratings container."""
        n_samples = len(self.ratings)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(pair, rating)`` tuple stored at position ``idx``."""
        pair = self.user_item_pairs[idx]
        target = self.ratings[idx]
        return pair, target


# Define the NCF model
class NCF(nn.Module):
    """Neural collaborative filtering regressor.

    Looks up one embedding per user and per item, concatenates the two vectors
    and feeds them through a small MLP (Linear -> BatchNorm -> ReLU -> Dropout,
    twice) that ends in a single linear output unit.

    Args:
        num_users: number of rows in the user embedding table.
        num_items: number of rows in the item embedding table.
        latent_dim: size of each user / item embedding vector.
    """

    def __init__(self, num_users, num_items, latent_dim):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, latent_dim)
        self.item_embedding = nn.Embedding(num_items, latent_dim)
        self.fc_layers = nn.Sequential(
            nn.Linear(latent_dim * 2, 128),
            nn.BatchNorm1d(128),  # Batch normalization
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(64, 1)
        )

    def forward(self, user, item):
        """Score (user, item) index pairs.

        Args:
            user: integer tensor of user indices.
            item: integer tensor of item indices, same shape as ``user``.

        Returns:
            One prediction per pair; for 1-D inputs of length ``B`` the result
            has shape ``(B,)``, including when ``B == 1``.
        """
        user_embed = self.user_embedding(user)
        item_embed = self.item_embedding(item)
        interaction = torch.cat([user_embed, item_embed], dim=-1)
        # Drop only the trailing size-1 output axis. A bare squeeze() would
        # also collapse the batch axis of a single-sample batch into a 0-d
        # tensor, which no longer lines up with a (1,)-shaped target.
        return self.fc_layers(interaction).squeeze(-1)

In [None]:
# Prepare data
train_df = pd.read_csv(train_path)
user_item_pairs = train_df[["user_id", "book_id"]].values
# Map each raw user_id / book_id to a contiguous index (order of first
# appearance) so the ids can be used as rows of the embedding tables.
user_to_index = {user_id: idx for idx, user_id in enumerate(train_df['user_id'].unique())}
item_to_index = {book_id: idx for idx, book_id in enumerate(train_df['book_id'].unique())}
# Series.map with a dict is the vectorised form of a per-row dict lookup;
# every id is present because the dicts were built from these same columns.
train_df['user_idx'] = train_df['user_id'].map(user_to_index)
train_df['book_idx'] = train_df['book_id'].map(item_to_index)
user_item_index_pairs = train_df[["user_idx", "book_idx"]].values
ratings = train_df["rating"].values

# Define constants
num_users = len(user_to_index)
num_items = len(item_to_index)
latent_dim = 5
batch_size = 256
epochs = 18
seed = 42

# Seed torch's global RNG so that the train/validation split, the weight
# initialisation and the shuffling order are repeatable across kernel restarts.
torch.manual_seed(seed)

# DataLoader
dataset = InteractionDataset(user_item_index_pairs, ratings)

# Split into training and validation sets (80-20 split)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Build the model, loss and optimizer on the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NCF(num_users, num_items, latent_dim).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)

In [None]:
# Print, in order: whether CUDA is usable, then the values reported by
# torch.cuda.memory_allocated() and torch.cuda.memory_reserved().
for cuda_probe in (
    torch.cuda.is_available,
    torch.cuda.memory_allocated,
    torch.cuda.memory_reserved,
):
    print(cuda_probe())

In [None]:
def _batch_to_device(user_item, rating):
    """Split a loader batch into user indices, item indices and float ratings.

    Column 0 of ``user_item`` holds the user index and column 1 the item
    index; all three returned tensors are moved to the module-level ``device``.
    """
    user = user_item[:, 0].long().to(device)
    item = user_item[:, 1].long().to(device)
    return user, item, rating.float().to(device)


for epoch in tqdm(range(epochs), desc="Epochs", unit="epoch"):
    # ---- training pass: one optimizer step per mini-batch ----
    model.train()
    total_train_loss = 0
    for user_item, rating in train_loader:
        user, item, rating = _batch_to_device(user_item, rating)

        optimizer.zero_grad()
        predictions = model(user, item)
        loss = criterion(predictions, rating)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    # ---- validation pass: eval mode, no gradient tracking ----
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for user_item, rating in val_loader:
            user, item, rating = _batch_to_device(user_item, rating)
            predictions = model(user, item)
            loss = criterion(predictions, rating)
            total_val_loss += loss.item()

    # Per-epoch figures are the mean of the per-batch losses.
    avg_train_loss = total_train_loss / len(train_loader)
    avg_val_loss = total_val_loss / len(val_loader)

    print(f"Epoch {epoch + 1}/{epochs} - Average Training Loss: {avg_train_loss:.4f} - Average Validation Loss: {avg_val_loss:.4f}")


In [None]:
# Read the CSV at test_path into a DataFrame; these are the rows to be scored.
test_df = pd.read_csv(test_path)


# Imputation (predict missing values)
def predict_missing_values(model, test_df, user_to_index, item_to_index, batch_size=4096):
    """Predict a rating for every (user_id, book_id) row of ``test_df``.

    The model is switched to eval mode and the rows are scored in
    mini-batches on the device the model's parameters live on. ``test_df``
    itself is not modified.

    Args:
        model: callable module taking ``(user_indices, item_indices)`` long
            tensors and returning one prediction per pair.
        test_df: DataFrame with ``user_id`` and ``book_id`` columns.
        user_to_index: dict mapping raw user ids to embedding indices.
        item_to_index: dict mapping raw book ids to embedding indices.
        batch_size: number of rows scored per forward pass.

    Returns:
        A list with one single-element list ``[prediction]`` per row of
        ``test_df``, in row order.

    Raises:
        KeyError: if ``test_df`` contains a user or book id that has no entry
            in the corresponding index dict.
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Series.map leaves NaN where an id is missing from the dict.
    user_idx = test_df['user_id'].map(user_to_index)
    book_idx = test_df['book_id'].map(item_to_index)

    # Fail loudly, with the same exception type a plain dict lookup raises,
    # and name the offending column plus a few example ids.
    for column, idx in (('user_id', user_idx), ('book_id', book_idx)):
        missing = idx.isna()
        if missing.any():
            examples = test_df.loc[missing, column].unique()[:5].tolist()
            raise KeyError(f"{column} values without a training index, e.g. {examples}")

    # Use the model's own device instead of relying on a notebook global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    users = torch.as_tensor(user_idx.to_numpy(dtype="int64"))
    items = torch.as_tensor(book_idx.to_numpy(dtype="int64"))

    model.eval()
    submission = []
    with torch.no_grad():
        for start in range(0, len(users), batch_size):
            stop = start + batch_size
            batch_pred = model(
                users[start:stop].to(target_device),
                items[start:stop].to(target_device),
            )
            # reshape(-1) flattens 0-d, (B,) and (B, 1) outputs alike, so a
            # final batch of one row is handled like any other.
            submission.extend([value] for value in batch_pred.reshape(-1).tolist())
    return submission


# Score every row of the test set with the trained model.
submission = predict_missing_values(
    model=model,
    test_df=test_df,
    user_to_index=user_to_index,
    item_to_index=item_to_index,
)

# Save the submission; index=True writes the row index as the first CSV column.
submission_df = pd.DataFrame(data=submission, columns=['rating'])
submission_df.to_csv(path_or_buf="submission.csv", index=True)

### Kaggle results

Time: 3 min

Score: 0.88