# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import torch

def format_pytorch_version(version):
  return version.split('+')[0]

TORCH_version = torch.__version__
TORCH = format_pytorch_version(TORCH_version)

def format_cuda_version(version):
  return 'cu' + version.replace('.', '')

CUDA_version = torch.version.cuda
CUDA = format_cuda_version(CUDA_version)

!pip install -q torch-scatter     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install -q torch-sparse      -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install -q torch-cluster     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install -q torch-spline-conv -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install -q torch-geometric

[K     |████████████████████████████████| 9.4 MB 39.2 MB/s 
[K     |████████████████████████████████| 4.5 MB 27.2 MB/s 
[K     |████████████████████████████████| 3.2 MB 29.6 MB/s 
[K     |████████████████████████████████| 873 kB 36.1 MB/s 
[K     |████████████████████████████████| 564 kB 26.7 MB/s 
[K     |████████████████████████████████| 280 kB 62.2 MB/s 
[?25h  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone


In [3]:
import pandas as pd, numpy as np
from itertools import product
import io, os, json

import time

from sklearn.metrics import mean_squared_error

import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
pio.templates.default = "plotly_white"

import torch
import torch.nn.functional as F
import torch_geometric.transforms as T
from torch_geometric.datasets import MovieLens
from torch_geometric.nn import to_hetero
from torch_geometric.nn import HeteroConv, GCNConv, SAGEConv, GATConv, Linear, GraphConv
from torch_geometric.data import Data, HeteroData

!pip install -q sentence-transformers

%matplotlib inline

[K     |████████████████████████████████| 85 kB 3.6 MB/s 
[K     |████████████████████████████████| 5.8 MB 59.4 MB/s 
[K     |████████████████████████████████| 1.3 MB 43.2 MB/s 
[K     |████████████████████████████████| 182 kB 69.8 MB/s 
[K     |████████████████████████████████| 7.6 MB 15.6 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


# Load Competition Dataset

In [4]:

from google.colab import files
files.upload() # expire any previous token(s) and upload recreated token
!rm -r ~/.kaggle
!mkdir ~/.kaggle
!mv ./kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets list


Saving kaggle.json to kaggle.json
rm: cannot remove '/root/.kaggle': No such file or directory
ref                                                             title                                             size  lastUpdated          downloadCount  voteCount  usabilityRating  
--------------------------------------------------------------  -----------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
meirnizri/covid19-dataset                                       COVID-19 Dataset                                   5MB  2022-11-13 15:47:17          11703        346  1.0              
michals22/coffee-dataset                                        Coffee dataset                                    24KB  2022-12-15 20:02:12           2374         63  1.0              
thedevastator/jobs-dataset-from-glassdoor                       Salary Prediction                                  3MB  2022-11-16 13:52:31           7269        155

In [5]:
!kaggle datasets download 'radek1/otto-full-optimized-memory-footprint' -p /content/kaggle/ --unzip

Downloading otto-full-optimized-memory-footprint.zip to /content/kaggle
 99% 1.08G/1.09G [00:08<00:00, 147MB/s]
100% 1.09G/1.09G [00:08<00:00, 131MB/s]


In [6]:
#!kaggle competitions download -c otto-recommender-system -p /content/dataset/

In [7]:
#!unzip '/content/dataset/otto-recommender-system.zip'

## Files
- **train.jsonl** - the training data, which contains full session data
  * `session` - the unique session id
  * `events` - the time ordered sequence of events in the session
    * `aid` - the article id (product code) of the associated event
    * `ts` - the Unix timestamp of the event
    * `type` - the event type, i.e., whether a product was clicked, added to the user's cart, or ordered during the session
###### {'clicks': 0, 'carts': 1, 'orders': 2}
- **test.jsonl** - the test data, which contains truncated session data
  * your task is to predict the next `aid` clicked after the session truncation, as well as the the remaining `aids` that are added to `carts` and `orders`; you may predict up to 20 values for each session `type`
- **sample_submission.csv** - a sample submission file in the correct format

In [8]:
#chunks = pd.read_parquet('/content/kaggle/train.parquet')

In [9]:
#df = chunks.iloc[:628747]

In [12]:
df = pd.read_parquet('/content/kaggle/train.parquet')

In [13]:
# sample 10%(frac=0.1) of data
df = df.sample(frac=0.001, replace=False)

In [14]:
df

Unnamed: 0,session,aid,ts,type
146545218,6231853,1282769,1660647647,1
87101289,2817898,773484,1660673406,0
17389096,366172,658224,1660574617,0
69246336,2038226,24318,1660340255,1
193986673,10353414,428581,1660932252,2
...,...,...,...,...
148378610,6359534,1388637,1660417339,0
2092495,39662,378864,1659815849,0
41753662,1059033,1470005,1661501703,0
158588211,7141431,521275,1661104644,0


In [15]:
df.isna().sum()

session    0
aid        0
ts         0
type       0
dtype: int64

In [16]:
df['type'].unique()

array([1, 0, 2], dtype=uint8)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 216716 entries, 146545218 to 197980447
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   session  216716 non-null  int32
 1   aid      216716 non-null  int32
 2   ts       216716 non-null  int32
 3   type     216716 non-null  uint8
dtypes: int32(3), uint8(1)
memory usage: 4.3 MB


# Construct Graph Data (memory-efficient optimized)

### Construct `edge_index`

In [18]:
def to_tuple(row):
    return tuple(row)

In [19]:
# also drop the ts and type column
connectivity = df.drop(columns=['ts', 'type']).apply(to_tuple, axis=1)

In [43]:
connectivity

146545218     (6231853, 1282769)
87101289       (2817898, 773484)
17389096        (366172, 658224)
69246336        (2038226, 24318)
193986673     (10353414, 428581)
                    ...         
148378610     (6359534, 1388637)
2092495          (39662, 378864)
41753662      (1059033, 1470005)
158588211      (7141431, 521275)
197980447    (10754999, 1138386)
Length: 216716, dtype: object

In [35]:
# Try hard new code but not finished yet
#dummy_session_idx = dict(zip([sess[0] for sess in connectivity], [i for i in range(len(connectivity))]))

In [45]:
# Old code

# session index dict
#session = sorted(df['session'].unique())
session = df['session'].unique()
session_nodes_idx = {id:idx for idx, id in enumerate(session)}

# aid(article id) index dict
#aid = sorted(df['aid'].unique())
aid = df['aid'].unique()
aid_nodes_idx = {id:idx for idx, id in enumerate(aid)}


In [None]:
#aid_nodes_idx

In [None]:
# use this code if the above cause memory crash
# very slow but memory good
'''
def get_node_indices(data, key):
  for id in data[key].unique():
    yield id, next(i for i, v in enumerate(data[key]) if v == id)

session_nodes_idx = dict(get_node_indices(df, 'session'))
aid_nodes_idx = dict(get_node_indices(df, 'aid'))
'''

"\ndef get_node_indices(data, key):\n  for id in data[key].unique():\n    yield id, next(i for i, v in enumerate(data[key]) if v == id)\n\nsession_nodes_idx = dict(get_node_indices(df, 'session'))\naid_nodes_idx = dict(get_node_indices(df, 'aid'))\n"

In [None]:
# /!\ this cause memory crashed in a very large data
'''
i_session = []
i_aid = []
for session, aid in connectivity_list:
  i_session.append(user_nodes_idx[user])
  i_aid.append(item_nodes_idx[item])

indice = [i_session, i_aid]
edge_index = torch.Tensor(indice).type(torch.long)
'''

'\ni_session = []\ni_aid = []\nfor session, aid in connectivity_list:\n  i_session.append(user_nodes_idx[user])\n  i_aid.append(item_nodes_idx[item])\n\nindice = [i_session, i_aid]\nedge_index = torch.Tensor(indice).type(torch.long)\n'

In [47]:
# Work but still got memory crash in very large data
def get_indices(connectivity_list, user_nodes_idx, item_nodes_idx):
  for user, item in connectivity_list:
    yield user_nodes_idx[user], item_nodes_idx[item]

edge_index = torch.Tensor(list(get_indices(connectivity, session_nodes_idx, aid_nodes_idx))).type(torch.int64).t()

In [48]:
edge_index

tensor([[     0,      1,      2,  ..., 208226, 208227, 208228],
        [     0,      1,      2,  ...,  14377, 127891, 127892]])

In [49]:
edge_index.shape

torch.Size([2, 216716])

### Nodes and Edges Attribute

In [50]:
## Nodes
session_num_nodes = df['session'].nunique()
aid_num_nodes = df['aid'].nunique()
aid_features = torch.rand((aid_num_nodes, 300)) # Create (random) article features with shape [num_node_aid, dimensions]

## Edges
edge_index = edge_index
edge_label = torch.tensor(df['type'].values).type(torch.int64)

In [51]:
edge_label

tensor([1, 0, 0,  ..., 0, 0, 0])

In [52]:
edge_index.shape

torch.Size([2, 216716])

In [53]:
aid_features.shape

torch.Size([127893, 300])

In [54]:
edge_label.shape

torch.Size([216716])

### Construct HeteroData

In [55]:
node_types = {
    'session': {
        'num_nodes': session_num_nodes
    },
    'aid': {
        'x': aid_features
    }
}

edge_types = {
    ('session','event', 'aid'): {
        'edge_index': edge_index,
        'edge_label': edge_label
    }
}

In [56]:
data = HeteroData({**node_types, **edge_types})

In [57]:
data

HeteroData(
  [1msession[0m={ num_nodes=208229 },
  [1maid[0m={ x=[127893, 300] },
  [1m(session, event, aid)[0m={
    edge_index=[2, 216716],
    edge_label=[216716]
  }
)

In [58]:
data.metadata()

(['session', 'aid'], [('session', 'event', 'aid')])

In [59]:
node_types, edge_types = data.metadata()
print('Node types:', node_types)
print('Edge types:',edge_types)

Node types: ['session', 'aid']
Edge types: [('session', 'event', 'aid')]


In [60]:
print('Isolated nodes?', data.has_isolated_nodes())
print('Self loops?', data.has_self_loops())
print('Undirected graph? ', data.is_undirected())

Isolated nodes? False
Self loops? False
Undirected graph?  False


# Calculate Weight

In [61]:
data['session', 'aid'].edge_label.shape

torch.Size([216716])

In [62]:
# We have an unbalanced dataset with many labels for rating 3 and 4, and very
# few for 0 and 1. Therefore we use a weighted MSE loss.

counts = torch.bincount(data['session', 'aid'].edge_label)

# Set weights normalized by (max count/each count)
weight = counts.max() / counts

In [63]:
counts

tensor([194705,  17064,   4947])

In [64]:
weight

tensor([ 1.0000, 11.4103, 39.3582])

In [65]:
data_dict = {'type': (counts, '# rows','coral'), 'weights': (weight, 'weights','royalblue')}

fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Scatter(x=np.arange(6), y=counts.detach().cpu().numpy(),
               name = 'nb rows', line_color= 'coral'))
fig.add_trace(
    go.Scatter(x=np.arange(6), y=weight.detach().cpu().numpy(),
               name = 'weights', line_color= 'royalblue'),  secondary_y=True)


fig.update_yaxes(title_text="# rows", secondary_y=False)
fig.update_yaxes(title_text="weights", secondary_y=True)
fig.update_xaxes(title_text="Type")
fig

# Graph-based Modeling

### Construct Undirected Graph

In [66]:
# add seesion features
data['session'].x = torch.rand(data['session'].num_nodes, 300)

In [67]:
data = T.ToUndirected()(data)

In [68]:
del data['aid', 'rev_event', 'session'].edge_label  # Remove "reverse" label.

In [69]:
data

HeteroData(
  [1msession[0m={
    num_nodes=208229,
    x=[208229, 300]
  },
  [1maid[0m={ x=[127893, 300] },
  [1m(session, event, aid)[0m={
    edge_index=[2, 216716],
    edge_label=[216716]
  },
  [1m(aid, rev_event, session)[0m={ edge_index=[2, 216716] }
)

### Train/Val/Test Link Level Splits

In [70]:
# Perform a link-level split into training, validation, and test edges:
train_data, val_data, test_data = T.RandomLinkSplit(
    num_val=0.2,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[('session', 'event', 'aid')],
    rev_edge_types=[('aid', 'rev_event', 'session')],
)(data)

In [71]:
train_data['session','aid']

{'edge_index': tensor([[96758, 43432, 14364,  ..., 22687, 67340, 20135],
        [ 3843, 35257,  4487,  ..., 19809, 51273, 17807]]), 'edge_label': tensor([0, 0, 0,  ..., 0, 0, 0]), 'edge_label_index': tensor([[96758, 43432, 14364,  ..., 22687, 67340, 20135],
        [ 3843, 35257,  4487,  ..., 19809, 51273, 17807]])}

In [72]:
train_data['session','aid'].edge_label

tensor([0, 0, 0,  ..., 0, 0, 0])

In [73]:
def weighted_mse_loss(pred, target, weight=None):
    weight = 1. if weight is None else weight[target].to(pred.dtype)
    return (weight * (pred - target.to(pred.dtype)).pow(2)).mean()

In [74]:
class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels, conv):
        super().__init__()
        # conv(#in_channels, #out_channels)
        '''
        in_channels (int or tuple): 
            Size of each input sample, or :obj:`-1` to
            derive the size from the first input(s) to the forward method.
            A tuple corresponds to the sizes of source and target
            dimensionalities.
        '''
        self.conv1 = conv((-1, -1), hidden_channels)
        self.conv2 = conv((-1, -1), out_channels)
        self.linear1 = Linear(-1, out_channels)
        self.linear2 = Linear(-1, out_channels)

    def forward(self, x, edge_index):
        x0 = self.linear1(x)
        x2 = self.conv1(x0, edge_index).relu()
        x3 = self.conv2(x2, edge_index)
        x4 = self.linear2(x2 + x3)
        # Add combined layer to reduce over-smoothing
        return x4

class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        z = torch.cat([z_dict['session'][row], z_dict['aid'][col]], dim=-1)
        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1)

class Model(torch.nn.Module):
    def __init__(self, hidden_channels,  conv=SAGEConv):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels,  conv) # Initialize GNNEncoder
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels) # Initialze EdgeDecoder

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        z_dict = self.encoder(x_dict, edge_index_dict) # Here the call from model.encoder(...)
        return self.decoder(z_dict, edge_label_index)

In [75]:
train_data['session', 'aid'].edge_label.dtype

torch.int64

In [76]:
train_data['session', 'aid'].edge_label.shape

torch.Size([151702])

In [77]:
def train(train_data, model, optimizer, loss=weighted_mse_loss):
    model.train()
    optimizer.zero_grad()
    pred = model(train_data.x_dict, train_data.edge_index_dict,
                 train_data['session', 'aid'].edge_label_index)
    target = train_data['session', 'aid'].edge_label
    loss = weighted_mse_loss(pred, target, weight)
    loss.backward()
    optimizer.step()
    return float(loss.sqrt())

In [78]:
## set pred.clamp
@torch.no_grad()
def test(data, model, metric=F.mse_loss):
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict,
                 data['session', 'aid'].edge_label_index)
    pred = pred.clamp(min=0, max=2)
    target = data['session', 'aid'].edge_label.float()
    rmse = F.mse_loss(pred, target).sqrt()
    return float(rmse) # Return RMSE loss

In [79]:
from tqdm import tqdm
from IPython.display import clear_output

In [80]:
def train_test(model, model_params, learning_rate=0.01, e_patience = 10, min_acc= 0.05, n_epochs=500):
    t0 = time.time()

    model = model(**model_params) # Define the model

    # Due to lazy initialization, we need to run one model step so the number
    # of parameters can be inferred:
    with torch.no_grad():
        model.encoder(train_data.x_dict, train_data.edge_index_dict) # Run once with torch.no_grad() to get parameter for optimizer below

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    k=0
    loss, train_rmse, val_rmse, test_rmse = [], [], [], []
    train_wrmse, val_wrmse, test_wrmse = [], [], []
    for epoch in tqdm(range(n_epochs)):
        # Call train fuction here >> return loss
        loss += [train(train_data, model, optimizer, loss=weighted_mse_loss)]
        
        # Call test function here >> return RMSE loss
        train_wrmse += [test(train_data, model, metric=weighted_mse_loss)]
        train_rmse += [test(train_data, model, metric=F.mse_loss)]
        
        val_wrmse += [test(val_data, model, metric=weighted_mse_loss)]
        val_rmse += [test(val_data, model, metric=F.mse_loss)]
        
        test_wrmse += [test(test_data, model, metric=weighted_mse_loss)]
        test_rmse += [test(test_data, model, metric=F.mse_loss)]

        if epoch+1 %10==0:
            print(f'Epoch: {epoch+1:03d}, Loss: {loss[-1]:.4f}, Train: {train_rmse[-1]:.4f}, '
                  f'Val: {val_rmse[-1]:.4f}, Test: {test_rmse[-1]:.4f}')

        results = pd.DataFrame({
            'loss': loss,
            'train_rmse': train_rmse, 'val_rmse': val_rmse, 'test_rmse': test_rmse,
            'train_wrmse': train_wrmse, 'val_wrmse': val_wrmse, 'test_wrmse': test_wrmse,
            'time':(time.time()-t0)/60
        })
        
        ## Debugging
        #clear_output()
        '''
        print('\nloss: ', loss, 
              '\ntrain_rmse: ', train_rmse, 
              '\nval_rmse: ', val_rmse, 
              '\ntest_rmse: ', test_rmse,
              '\ntrain_wrmse: ', train_wrmse, 
              '\nval_wrmse: ', val_wrmse, 
              '\ntest_wrmse: ', test_wrmse,
              '\ntime: ', (time.time()-t0)/60)
        '''
        #visualize_loss(results, metric='wrmse').show()
        #print(results.to_string())

        # enable early stopping
        if (epoch > 1) and abs(loss[-1]/loss[-2]-1) < min_acc :
            k += 1
        if k> e_patience:
            print('Early stopping')
            break

    return results, model

In [81]:
def visualize_loss(results, metric='rmse'):
    fig = go.Figure()

    fig.add_trace(go.Scatter(x=results.index, y=results['train_'+metric], name = 'train_'+metric))
    fig.add_trace(go.Scatter(x=results.index, y=results['val_'+metric], name = 'val_'+metric))
    fig.add_trace(go.Scatter(x=results.index, y=results['test_'+metric], name = 'test_'+metric))
    fig.add_trace(go.Scatter(x=results.index, y=results['loss'], name = 'loss'))

    fig.update_yaxes(title_text=metric.upper())
    fig.update_xaxes(title_text="Epoch")

    return fig

In [82]:
N_EPOCHS = 300
E_PATIENCE = 50
LEARNING_RATE = 0.01

model_params = {"hidden_channels":32, 'conv':SAGEConv}

results, trained_model = train_test(
    Model, model_params, learning_rate=LEARNING_RATE, e_patience = E_PATIENCE, n_epochs=N_EPOCHS)

 21%|██        | 62/300 [05:12<19:59,  5.04s/it]

Early stopping





In [83]:
visualize_loss(results, metric='wrmse')

In [92]:
trained_model.encoder(test_data.x_dict, test_data.edge_index_dict)['session']

tensor([[-0.0202,  0.1865, -1.5817,  ...,  0.3076, -0.5708, -0.3113],
        [ 0.5542,  1.7736, -3.6844,  ...,  1.0820, -3.4425, -0.9355],
        [ 0.5612,  1.8848, -3.6680,  ...,  1.0439, -3.4606, -0.8596],
        ...,
        [ 0.6097,  1.9271, -3.8425,  ...,  1.1255, -3.6148, -0.9310],
        [-0.0196,  0.1864, -1.5662,  ...,  0.3088, -0.5734, -0.3177],
        [ 0.6241,  1.8300, -3.6237,  ...,  1.0752, -3.3341, -0.8680]],
       grad_fn=<AddmmBackward0>)

In [87]:
trained_model.state_dict()['encoder.linear2.session.weight']

tensor([[-0.0422,  0.1216,  0.0835,  ...,  0.1315,  0.1706, -0.0522],
        [ 0.1098,  0.0852,  0.1251,  ...,  0.0553,  0.0312, -0.0077],
        [-0.0142,  0.0712, -0.0888,  ..., -0.1421, -0.0150, -0.0069],
        ...,
        [ 0.0493,  0.1519,  0.0811,  ...,  0.1542, -0.1844,  0.0976],
        [-0.0150,  0.1030, -0.1174,  ..., -0.1944, -0.0111,  0.0048],
        [ 0.1059,  0.0325, -0.0114,  ..., -0.0815,  0.1416, -0.0457]])