In [1]:
from google.colab import drive
drive.mount('/mnt/drive')

Mounted at /mnt/drive


In [2]:

!pip install datatable

Collecting datatable
  Downloading datatable-1.1.0-cp310-cp310-manylinux_2_35_x86_64.whl (82.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.0/82.0 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: datatable
Successfully installed datatable-1.1.0


In [17]:
# import pandas as pd
import datatable as dt

df_dt = dt.fread('../data/train.csv')
# num_rows_to_keep = int(df_dt.nrows * 0.00001)
# print(num_rows_to_keep)
# # Keep only the first 10% of the rows
# df_dt_trimmed = df_dt[:num_rows_to_keep, :]
# # Convert to pandas (if needed for complex operations)
# df = df_dt_trimmed.to_pandas()
df = df_dt.to_pandas()
# df = pd.read_csv('/mnt/drive/MyDrive/Colab Notebooks/preprocessed_data/train.csv')
print(df.head())

301
       t_dat                                        customer_id  article_id  \
0 2018-09-20  000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...   663713001   
1 2018-09-20  a8c746a9230ef3a1d535949d6f74c5922a88591b1d4c17...   562245034   
2 2018-09-20  a8c746a9230ef3a1d535949d6f74c5922a88591b1d4c17...   615367008   
3 2018-09-20  a8cf43214284500a1e492dddbc23fe48f8fb42cd313f6d...   508227002   
4 2018-09-20  a8cf43214284500a1e492dddbc23fe48f8fb42cd313f6d...   573017003   

      price  sales_channel_id  
0  0.050831                 2  
1  0.012186                 2  
2  0.025407                 2  
3  0.016932                 2  
4  0.020322                 2  


In [5]:
user_sequences = df.groupby('customer_id')['article_id'].apply(list)

In [28]:
print(user_sequences)

customer_id
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318                                          [663713001]
a8188c1cb46aa845a7812607e03d6d9b060ae84f82cdd38834e8f241c31ae7e4                                          [504152012]
a81bc4968aa241ef9dd0546d61a84e8d19bb5d663af29deca2810d315130dec1    [693211001, 529008008, 580000001, 677967009, 6...
a81dbea670c04e5a9dbe45475415d9ae13ca31c1e101c76383bec153ff647385                               [457892005, 457892003]
a8244824d1e5161d15582b3ad3027d75c32878ff4a315adc80b2dc8d01351f89         [458239017, 467302100, 676979003, 467302099]
                                                                                          ...                        
a9f57e2cc46cd78cf82c2d6dc0a8154329b44e933c0e2cc072543efbd5346be5                               [567884001, 549265001]
a9f7afb19c8222d80e1ec0793c1f3eefe7d7e572c0001ae0498b56512e69b09a                                          [562820006]
a9f7b76cdfb71a480af60d43f3803111c2a0f4b7766e

In [6]:

unique_articles = df['article_id'].unique()
article_to_index = {article: index + 1 for index, article in enumerate(unique_articles)}  # +1 if 0 is for padding
article_to_index['<PAD>'] = 0  # Padding token


In [7]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    inputs, targets = zip(*batch)
    inputs_padded = pad_sequence(inputs, batch_first=True, padding_value=0)
    targets_padded = pad_sequence(targets, batch_first=True, padding_value=0)
    return inputs_padded, targets_padded


In [8]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class PurchaseDataset(Dataset):
    def __init__(self, sequences, article_to_index):
        self.sequences = [[article_to_index[article] for article in seq] for seq in sequences]

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = torch.tensor(self.sequences[idx])
        return sequence[:-1], sequence[1:]


dataset = PurchaseDataset(user_sequences, article_to_index)
data_loader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)



In [9]:
import torch.nn as nn

class GRU4RecModel(nn.Module):
    def __init__(self, n_items, embed_dim, hidden_size):
        super(GRU4RecModel, self).__init__()
        self.item_embedding = nn.Embedding(num_embeddings=len(article_to_index), embedding_dim=embed_dim)

        self.gru = nn.GRU(embed_dim, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, n_items)

    def forward(self, x):
        embedded = self.item_embedding(x)
        output, hidden = self.gru(embedded)
        return self.out(output)

# Initialize the model
n_items = len(df['article_id'].unique()) + 1  # number of unique items
model = GRU4RecModel(n_items=n_items, embed_dim=128, hidden_size=256)


In [10]:
!pip install accelerate

Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0


In [11]:
from accelerate import Accelerator
import torch.optim as optim

torch.cuda.empty_cache()

# Initialize Accelerator
accelerator = Accelerator()

# Model, optimizer, and dataloaders
model = GRU4RecModel(n_items=n_items, embed_dim=32, hidden_size=64)  # Adjust model parameters here

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Prepare everything
model, optimizer, data_loader = accelerator.prepare(model, optimizer, data_loader)
num_epochs = 1

# Training loop
for epoch in range(num_epochs):
    model.train()
    for inputs, targets in data_loader:
      assert targets.max() < n_items, f"Target index {targets.max()} exceeds number of classes {n_items}"
      outputs = model(inputs)
      loss = criterion(outputs.view(-1, n_items), targets.view(-1))

      accelerator.backward(loss)

      optimizer.step()
      optimizer.zero_grad()

    print(f'Epoch {epoch}, Loss: {loss.item()}')

# Don't forget to save your model
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
accelerator.save(unwrapped_model.state_dict(), 'gru4rec_model.pth')


Epoch 0, Loss: 5.0465474128723145


In [35]:
test_df = dt.fread('../data/test.csv').to_pandas()

# Convert article_id in test data using the same mapping as training data




In [36]:
print(test_df.head())

       t_dat                                        customer_id  article_id  \
0 2018-09-20  000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...   663713001   
1 2018-09-20  a8c746a9230ef3a1d535949d6f74c5922a88591b1d4c17...   562245034   
2 2018-09-20  a8c746a9230ef3a1d535949d6f74c5922a88591b1d4c17...   615367008   
3 2018-09-20  a8cf43214284500a1e492dddbc23fe48f8fb42cd313f6d...   508227002   
4 2018-09-20  a8cf43214284500a1e492dddbc23fe48f8fb42cd313f6d...   573017003   

      price  sales_channel_id  
0  0.050831                 2  
1  0.012186                 2  
2  0.025407                 2  
3  0.016932                 2  
4  0.020322                 2  


In [37]:

test_unique_articles = test_df['article_id'].unique()
test_article_to_index = {article: index + 1 for index, article in enumerate(test_unique_articles)}  # +1 if 0 is for padding
test_article_to_index['<PAD>'] = 0  # Padding token


In [38]:
test_user_sequences = test_df.groupby('customer_id')['article_id'].apply(list)

# Convert the Series to a list of sequences
test_sequences_list = [seq for seq in test_user_sequences]

# Convert to PyTorch Dataset
test_dataset = PurchaseDataset(test_sequences_list, test_article_to_index)


In [39]:
test_data_loader = DataLoader(test_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)

In [40]:
model.eval()  # Set the model to evaluation mode
total_correct = 0
total_samples = 0

with torch.no_grad():
    for inputs, targets in test_data_loader:
        outputs = model(inputs)
        # Assuming your model outputs raw logits, you might need to apply a softmax and argmax
        predicted = outputs.argmax(dim=2)
        total_correct += (predicted == targets).sum().item()
        total_samples += targets.numel()

accuracy = total_correct / total_samples
print(f"Accuracy on test data: {accuracy:.4f}")


Accuracy on test data: 0.6667


In [None]:
from pynvml import *
nvmlInit()
h = nvmlDeviceGetHandleByIndex(0)
info = nvmlDeviceGetMemoryInfo(h)
print(f'total    : {info.total}')
print(f'free     : {info.free}')
print(f'used     : {info.used}')

total    : 16106127360
free     : 1443692544
used     : 14662434816
