In [9]:
from tqdm import tqdm
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

google_colab = False
with_validation = True
latent_dim_gmf = 30
latent_dim_mlp = 3
batch_size = 256
epochs = 30
LR = 0.001
WEIGHT_DECAY = 1e-4

In [10]:
if google_colab:
    from google.colab import drive

    drive.mount('/content/drive')
    train_path = "/content/drive/MyDrive/DIS_recomander_sys/train.csv"
    test_path = "/content/drive/MyDrive/DIS_recomander_sys/test.csv"
else:
    train_path = "../data/train.csv"
    test_path = "../data/test.csv"

In [11]:
# Create a dataset class
class InteractionDataset(Dataset):
    def __init__(self, user_item_pairs, ratings):
        self.user_item_pairs = user_item_pairs
        self.ratings = ratings

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        return self.user_item_pairs[idx], self.ratings[idx]


# Define the NCF model
class NCF(nn.Module):
    def __init__(self, num_users, num_items, latent_dim_gmf, latent_dim_mlp):
        super(NCF, self).__init__()
        # Embeddings for MLP part
        self.user_embedding_mlp = nn.Embedding(num_users, latent_dim_mlp)
        self.item_embedding_mlp = nn.Embedding(num_items, latent_dim_mlp)

        # Embeddings for GMF part
        self.user_embedding_gmf = nn.Embedding(num_users, latent_dim_gmf)
        self.item_embedding_gmf = nn.Embedding(num_items, latent_dim_gmf)

        # Fully connected layers for MLP
        self.fc_layers = nn.Sequential(
            nn.Linear(latent_dim_mlp * 2, 128),
            nn.BatchNorm1d(128),  # Batch normalization
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, 80),
            nn.BatchNorm1d(80),
            nn.ReLU(),
            nn.Dropout(0.5)
        )

        # Final output layer (after concatenating GMF and MLP outputs)
        self.final_layer = nn.Linear(latent_dim_gmf + 80, 1)  # GMF (latent_dim) + MLP output (64)

    def forward(self, user, item):
        # MLP embeddings
        user_embed_mlp = self.user_embedding_mlp(user)  # Shape: [batch_size, latent_dim]
        item_embed_mlp = self.item_embedding_mlp(item)  # Shape: [batch_size, latent_dim]

        # GMF embeddings
        user_embed_gmf = self.user_embedding_gmf(user)  # Shape: [batch_size, latent_dim]
        item_embed_gmf = self.item_embedding_gmf(item)  # Shape: [batch_size, latent_dim]

        # GMF interaction (element-wise product)
        gmf_output = torch.mul(user_embed_gmf, item_embed_gmf)  # Shape: [batch_size, latent_dim]

        # MLP interaction (concatenation)
        mlp_input = torch.cat([user_embed_mlp, item_embed_mlp], dim=-1)  # Shape: [batch_size, latent_dim * 2]
        mlp_output = self.fc_layers(mlp_input)  # Shape: [batch_size, 64]

        # Concatenate GMF and MLP outputs
        combined = torch.cat([gmf_output, mlp_output], dim=-1)  # Shape: [batch_size, latent_dim + 64]

        # Final prediction layer
        output = self.final_layer(combined).squeeze()  # Shape: [batch_size]

        return output


In [12]:
# Prepare data
train_df = pd.read_csv(train_path)
user_item_pairs = train_df[["user_id", "book_id"]].values
#Map the user_id and book_id to a unique index
user_to_index = {user_id: idx for idx, user_id in enumerate(train_df['user_id'].unique())}
item_to_index = {book_id: idx for idx, book_id in enumerate(train_df['book_id'].unique())}
train_df['user_idx'] = train_df['user_id'].apply(lambda x: user_to_index[x])
train_df['book_idx'] = train_df['book_id'].apply(lambda x: item_to_index[x])
user_item_index_pairs = train_df[["user_idx", "book_idx"]].values
ratings = train_df["rating"].values

# Define constants
num_users = len(user_to_index)
num_items = len(item_to_index)
print(num_users, num_items)

# DataLoader
dataset = InteractionDataset(user_item_index_pairs, ratings)

# Split into training and validation sets (80-20 split)
if with_validation:
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
else:
    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    val_loader = None

# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NCF(num_users, num_items, latent_dim_gmf, latent_dim_mlp).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

18905 15712


In [13]:
if google_colab:
    print(torch.cuda.is_available())
    print(torch.cuda.memory_allocated())
    print(torch.cuda.memory_reserved())

In [14]:
for epoch in tqdm(range(epochs), desc="Epochs", unit="epoch"):
    model.train()
    total_train_loss = 0

    # Training loop
    for user_item, rating in train_loader:
        user, item = user_item[:, 0].long().to(device), user_item[:, 1].long().to(device)
        rating = rating.float().to(device)

        optimizer.zero_grad()
        predictions = model(user, item)
        loss = criterion(predictions, rating)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    # Validation loop
    if with_validation:
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for user_item, rating in val_loader:
                user, item = user_item[:, 0].long().to(device), user_item[:, 1].long().to(device)
                rating = rating.float().to(device)
                predictions = model(user, item)
                loss = criterion(predictions, rating)
                total_val_loss += loss.item()
        print(
            f"Epoch {epoch + 1}, Train Loss: {total_train_loss / len(train_loader):.4f}, Val Loss: {total_val_loss / len(val_loader):.4f}")
    else:
        print(f"Epoch {epoch + 1}, Train Loss: {total_train_loss / len(train_loader):.4f}")


Epochs:   3%|▎         | 1/30 [00:03<01:44,  3.59s/epoch]

Epoch 1, Train Loss: 2.0643, Val Loss: 1.4776


Epochs:   7%|▋         | 2/30 [00:06<01:31,  3.27s/epoch]

Epoch 2, Train Loss: 1.4483, Val Loss: 1.3356


Epochs:  10%|█         | 3/30 [00:09<01:26,  3.21s/epoch]

Epoch 3, Train Loss: 1.3876, Val Loss: 1.2439


Epochs:  13%|█▎        | 4/30 [00:12<01:20,  3.11s/epoch]

Epoch 4, Train Loss: 1.3401, Val Loss: 1.1455


Epochs:  17%|█▋        | 5/30 [00:15<01:18,  3.15s/epoch]

Epoch 5, Train Loss: 1.2907, Val Loss: 1.0695


Epochs:  20%|██        | 6/30 [00:18<01:13,  3.08s/epoch]

Epoch 6, Train Loss: 1.2416, Val Loss: 1.0086


Epochs:  23%|██▎       | 7/30 [00:22<01:14,  3.25s/epoch]

Epoch 7, Train Loss: 1.1885, Val Loss: 0.9468


Epochs:  27%|██▋       | 8/30 [00:25<01:12,  3.30s/epoch]

Epoch 8, Train Loss: 1.1266, Val Loss: 0.8966


Epochs:  30%|███       | 9/30 [00:29<01:11,  3.38s/epoch]

Epoch 9, Train Loss: 1.0699, Val Loss: 0.8575


Epochs:  33%|███▎      | 10/30 [00:32<01:07,  3.37s/epoch]

Epoch 10, Train Loss: 1.0098, Val Loss: 0.8308


Epochs:  37%|███▋      | 11/30 [00:36<01:04,  3.38s/epoch]

Epoch 11, Train Loss: 0.9537, Val Loss: 0.7995


Epochs:  40%|████      | 12/30 [00:39<00:59,  3.31s/epoch]

Epoch 12, Train Loss: 0.9068, Val Loss: 0.7801


Epochs:  43%|████▎     | 13/30 [00:42<00:54,  3.20s/epoch]

Epoch 13, Train Loss: 0.8579, Val Loss: 0.7606


Epochs:  47%|████▋     | 14/30 [00:45<00:49,  3.11s/epoch]

Epoch 14, Train Loss: 0.8191, Val Loss: 0.7486


Epochs:  50%|█████     | 15/30 [00:48<00:45,  3.04s/epoch]

Epoch 15, Train Loss: 0.7798, Val Loss: 0.7376


Epochs:  53%|█████▎    | 16/30 [00:50<00:42,  3.00s/epoch]

Epoch 16, Train Loss: 0.7454, Val Loss: 0.7265


Epochs:  57%|█████▋    | 17/30 [00:53<00:38,  2.99s/epoch]

Epoch 17, Train Loss: 0.7138, Val Loss: 0.7229


Epochs:  60%|██████    | 18/30 [00:57<00:37,  3.14s/epoch]

Epoch 18, Train Loss: 0.6774, Val Loss: 0.7238


Epochs:  63%|██████▎   | 19/30 [01:00<00:34,  3.14s/epoch]

Epoch 19, Train Loss: 0.6570, Val Loss: 0.7118


Epochs:  67%|██████▋   | 20/30 [01:03<00:31,  3.10s/epoch]

Epoch 20, Train Loss: 0.6222, Val Loss: 0.7111


Epochs:  70%|███████   | 21/30 [01:06<00:27,  3.07s/epoch]

Epoch 21, Train Loss: 0.6016, Val Loss: 0.7273


Epochs:  73%|███████▎  | 22/30 [01:09<00:24,  3.11s/epoch]

Epoch 22, Train Loss: 0.5806, Val Loss: 0.7111


Epochs:  77%|███████▋  | 23/30 [01:12<00:21,  3.05s/epoch]

Epoch 23, Train Loss: 0.5611, Val Loss: 0.7303


Epochs:  80%|████████  | 24/30 [01:15<00:18,  3.01s/epoch]

Epoch 24, Train Loss: 0.5420, Val Loss: 0.7324


Epochs:  83%|████████▎ | 25/30 [01:18<00:14,  3.00s/epoch]

Epoch 25, Train Loss: 0.5268, Val Loss: 0.7347


Epochs:  87%|████████▋ | 26/30 [01:21<00:12,  3.02s/epoch]

Epoch 26, Train Loss: 0.5123, Val Loss: 0.7487


Epochs:  90%|█████████ | 27/30 [01:24<00:09,  3.05s/epoch]

Epoch 27, Train Loss: 0.4961, Val Loss: 0.7529


Epochs:  93%|█████████▎| 28/30 [01:27<00:06,  3.07s/epoch]

Epoch 28, Train Loss: 0.4774, Val Loss: 0.7521


Epochs:  97%|█████████▋| 29/30 [01:30<00:03,  3.07s/epoch]

Epoch 29, Train Loss: 0.4657, Val Loss: 0.7486


Epochs: 100%|██████████| 30/30 [01:34<00:00,  3.14s/epoch]

Epoch 30, Train Loss: 0.4540, Val Loss: 0.7441





In [15]:
test_df = pd.read_csv(test_path)


# Imputation (predict missing values)
def predict_missing_values(model, test_df, user_to_index, item_to_index):
    test_df['user_idx'] = test_df['user_id'].apply(lambda x: user_to_index[x])
    test_df['book_idx'] = test_df['book_id'].apply(lambda x: item_to_index[x])
    user_item_pairs = test_df[['user_idx', 'book_idx']].values

    model.eval()
    submission = []
    for user, item in user_item_pairs:
        with torch.no_grad():
            user_tensor = torch.tensor([user]).to(device)
            item_tensor = torch.tensor([item]).to(device)
            prediction = model(user_tensor, item_tensor).item()
        submission.append([prediction])
    return submission


submission = predict_missing_values(model, test_df, user_to_index, item_to_index)
#save the submission
submission_df = pd.DataFrame(submission, columns=['rating'])
submission_df.to_csv("submission.csv", index=True)

### Kaggle results

time : 3min

score : 0.88