# Import Libraries

In [1]:
import dask.dataframe as dd

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#import vaex

In [3]:
import torch

def format_pytorch_version(version):
  return version.split('+')[0]

TORCH_version = torch.__version__
TORCH = format_pytorch_version(TORCH_version)

def format_cuda_version(version):
  return 'cu' + version.replace('.', '')

CUDA_version = torch.version.cuda
CUDA = format_cuda_version(CUDA_version)

!pip install -q torch-scatter     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install -q torch-sparse      -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install -q torch-cluster     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install -q torch-spline-conv -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install -q torch-geometric

In [4]:
import pandas as pd, numpy as np
from itertools import product
import io, os, json

import time

from sklearn.metrics import mean_squared_error

import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
pio.templates.default = "plotly_white"

import torch
import torch.nn.functional as F
import torch_geometric.transforms as T
from torch_geometric.datasets import MovieLens
from torch_geometric.nn import to_hetero
from torch_geometric.nn import HeteroConv, GCNConv, SAGEConv, GATConv, Linear, GraphConv
from torch_geometric.data import Data, HeteroData

!pip install -q sentence-transformers

%matplotlib inline

# Load Competition Dataset

In [5]:
'''
from google.colab import files
files.upload() # expire any previous token(s) and upload recreated token
!rm -r ~/.kaggle
!mkdir ~/.kaggle
!mv ./kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets list
'''

'\nfrom google.colab import files\nfiles.upload() # expire any previous token(s) and upload recreated token\n!rm -r ~/.kaggle\n!mkdir ~/.kaggle\n!mv ./kaggle.json ~/.kaggle/\n!chmod 600 ~/.kaggle/kaggle.json\n\n!kaggle datasets list\n'

In [6]:
#!kaggle datasets download 'radek1/otto-full-optimized-memory-footprint' -p /content/kaggle/ --unzip

In [7]:
#!kaggle competitions download -c otto-recommender-system -p /content/dataset/

In [8]:
#!unzip '/content/dataset/otto-recommender-system.zip'

## Files
- **train.jsonl** - the training data, which contains full session data
  * `session` - the unique session id
  * `events` - the time ordered sequence of events in the session
    * `aid` - the article id (product code) of the associated event
    * `ts` - the Unix timestamp of the event
    * `type` - the event type, i.e., whether a product was clicked, added to the user's cart, or ordered during the session
###### {'clicks': 0, 'carts': 1, 'orders': 2}
- **test.jsonl** - the test data, which contains truncated session data
  * your task is to predict the next `aid` clicked after the session truncation, as well as the the remaining `aids` that are added to `carts` and `orders`; you may predict up to 20 values for each session `type`
- **sample_submission.csv** - a sample submission file in the correct format

In [9]:
df = pd.read_parquet('/content/kaggle/train.parquet')

In [10]:
#df = dd.read_parquet('/content/kaggle/train.parquet')

In [11]:
# sample 10%(frac=0.1) of data
df = df.sample(frac=0.001, replace=False)

In [12]:
df

Unnamed: 0,session,aid,ts,type
37512576,927006,1503106,1659376124,1
205953871,11602020,1051312,1661278327,0
28233506,657056,1582085,1659865707,0
40411745,1016076,741952,1660833246,0
110370196,4025269,114892,1661454435,0
...,...,...,...,...
70396312,2087833,691399,1660824099,0
148347259,6357101,1806778,1660850880,0
106282515,3803790,606300,1660537616,0
8071118,154681,1484209,1659332876,0


In [13]:
df.isna().sum()

session    0
aid        0
ts         0
type       0
dtype: int64

In [14]:
df['type'].unique()

array([1, 0, 2], dtype=uint8)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 216716 entries, 37512576 to 105009115
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   session  216716 non-null  int32
 1   aid      216716 non-null  int32
 2   ts       216716 non-null  int32
 3   type     216716 non-null  uint8
dtypes: int32(3), uint8(1)
memory usage: 4.3 MB


# Construct Graph Data (memory-efficient optimized)

### Construct `edge_index`

In [16]:
def to_tuple(row):
    return tuple(row)

In [17]:
# also drop the ts and type column
connectivity_list = df.drop(columns=['ts', 'type']).apply(to_tuple, axis=1)

In [18]:
connectivity_list

37512576       (927006, 1503106)
205953871    (11602020, 1051312)
28233506       (657056, 1582085)
40411745       (1016076, 741952)
110370196      (4025269, 114892)
                    ...         
70396312       (2087833, 691399)
148347259     (6357101, 1806778)
106282515      (3803790, 606300)
8071118        (154681, 1484209)
105009115      (3739910, 921304)
Length: 216716, dtype: object

In [19]:
# session index dict
session = sorted(df['session'].unique())
#session = df['session'].unique()
session_nodes_idx = {id:idx for idx, id in enumerate(session)}

# aid(article id) index dict
aid = sorted(df['aid'].unique())
#aid = df['aid'].unique()
aid_nodes_idx = {id:idx for idx, id in enumerate(aid)}

In [20]:
# use this code if the above cause memory crash
# very slow but memory good
'''
def get_node_indices(data, key):
  for id in data[key].unique():
    yield id, next(i for i, v in enumerate(data[key]) if v == id)

session_nodes_idx = dict(get_node_indices(df, 'session'))
aid_nodes_idx = dict(get_node_indices(df, 'aid'))
'''

"\ndef get_node_indices(data, key):\n  for id in data[key].unique():\n    yield id, next(i for i, v in enumerate(data[key]) if v == id)\n\nsession_nodes_idx = dict(get_node_indices(df, 'session'))\naid_nodes_idx = dict(get_node_indices(df, 'aid'))\n"

In [21]:
# /!\ this cause memory crashed in a very large data
'''
i_session = []
i_aid = []
for session, aid in connectivity_list:
  i_session.append(user_nodes_idx[user])
  i_aid.append(item_nodes_idx[item])

indice = [i_session, i_aid]
edge_index = torch.Tensor(indice).type(torch.long)
'''

'\ni_session = []\ni_aid = []\nfor session, aid in connectivity_list:\n  i_session.append(user_nodes_idx[user])\n  i_aid.append(item_nodes_idx[item])\n\nindice = [i_session, i_aid]\nedge_index = torch.Tensor(indice).type(torch.long)\n'

In [22]:
# Work but still got memory crash in very large data
def get_indices(connectivity_list, user_nodes_idx, item_nodes_idx):
  for user, item in connectivity_list:
    yield user_nodes_idx[user], item_nodes_idx[item]

edge_index = torch.Tensor(list(get_indices(connectivity_list, session_nodes_idx, aid_nodes_idx))).type(torch.int64).t()

In [23]:
edge_index

tensor([[ 35409, 197604,  26706,  ..., 100741,   7512,  99494],
        [102946,  71880, 108501,  ...,  41351, 101644,  63008]])

In [24]:
edge_index.shape

torch.Size([2, 216716])

### Nodes and Edges Attribute

In [25]:
## Nodes
session_num_nodes = df['session'].nunique()
aid_num_nodes = df['aid'].nunique()
aid_features = torch.rand((aid_num_nodes, 300)) # Create (random) article features with shape [num_node_aid, dimensions]

## Edges
edge_index = edge_index
edge_label = torch.tensor(df['type'].values).type(torch.int64)

In [26]:
edge_label

tensor([1, 0, 0,  ..., 0, 0, 2])

In [27]:
edge_index.shape

torch.Size([2, 216716])

In [28]:
aid_features.shape

torch.Size([127639, 300])

In [29]:
edge_label.shape

torch.Size([216716])

### Construct HeteroData

In [30]:
node_types = {
    'session': {
        'num_nodes': session_num_nodes
    },
    'aid': {
        'x': aid_features
    }
}

edge_types = {
    ('session','event', 'aid'): {
        'edge_index': edge_index,
        'edge_label': edge_label
    }
}

In [31]:
data = HeteroData({**node_types, **edge_types})

In [32]:
data

HeteroData(
  [1msession[0m={ num_nodes=208333 },
  [1maid[0m={ x=[127639, 300] },
  [1m(session, event, aid)[0m={
    edge_index=[2, 216716],
    edge_label=[216716]
  }
)

In [33]:
data.metadata()

(['session', 'aid'], [('session', 'event', 'aid')])

In [34]:
node_types, edge_types = data.metadata()
print('Node types:', node_types)
print('Edge types:',edge_types)

Node types: ['session', 'aid']
Edge types: [('session', 'event', 'aid')]


In [35]:
print('Isolated nodes?', data.has_isolated_nodes())
print('Self loops?', data.has_self_loops())
print('Undirected graph? ', data.is_undirected())

Isolated nodes? False
Self loops? False
Undirected graph?  False


# Calculate Weight

In [36]:
data['session', 'aid'].edge_label.shape

torch.Size([216716])

In [37]:
# We have an unbalanced dataset with many labels for rating 3 and 4, and very
# few for 0 and 1. Therefore we use a weighted MSE loss.

counts = torch.bincount(data['session', 'aid'].edge_label)

# Set weights normalized by (max count/each count)
weight = counts.max() / counts

In [38]:
counts

tensor([194747,  16886,   5083])

In [39]:
weight

tensor([ 1.0000, 11.5330, 38.3134])

In [40]:
data_dict = {'type': (counts, '# rows','coral'), 'weights': (weight, 'weights','royalblue')}

fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Scatter(x=np.arange(6), y=counts.detach().cpu().numpy(),
               name = 'nb rows', line_color= 'coral'))
fig.add_trace(
    go.Scatter(x=np.arange(6), y=weight.detach().cpu().numpy(),
               name = 'weights', line_color= 'royalblue'),  secondary_y=True)


fig.update_yaxes(title_text="# rows", secondary_y=False)
fig.update_yaxes(title_text="weights", secondary_y=True)
fig.update_xaxes(title_text="Type")
fig

# Graph-based Modeling

### Construct Undirected Graph

In [41]:
# add seesion features
data['session'].x = torch.rand(data['session'].num_nodes, 300)

In [42]:
data = T.ToUndirected()(data)

In [43]:
del data['aid', 'rev_event', 'session'].edge_label  # Remove "reverse" label.

In [44]:
data

HeteroData(
  [1msession[0m={
    num_nodes=208333,
    x=[208333, 300]
  },
  [1maid[0m={ x=[127639, 300] },
  [1m(session, event, aid)[0m={
    edge_index=[2, 216716],
    edge_label=[216716]
  },
  [1m(aid, rev_event, session)[0m={ edge_index=[2, 216716] }
)

### Train/Val/Test Link Level Splits

In [45]:
# Perform a link-level split into training, validation, and test edges:
train_data, val_data, test_data = T.RandomLinkSplit(
    num_val=0.2,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[('session', 'event', 'aid')],
    rev_edge_types=[('aid', 'rev_event', 'session')],
)(data)

In [46]:
train_data['session','aid']

{'edge_index': tensor([[132749,   4128,  31550,  ...,  61417,  93875, 169813],
        [101455,  30154,  90625,  ...,  44192,  71272,  96024]]), 'edge_label': tensor([0, 0, 0,  ..., 1, 0, 0]), 'edge_label_index': tensor([[132749,   4128,  31550,  ...,  61417,  93875, 169813],
        [101455,  30154,  90625,  ...,  44192,  71272,  96024]])}

In [47]:
train_data['session','aid'].edge_label

tensor([0, 0, 0,  ..., 1, 0, 0])

In [48]:
def weighted_mse_loss(pred, target, weight=None):
    weight = 1. if weight is None else weight[target].to(pred.dtype)
    return (weight * (pred - target.to(pred.dtype)).pow(2)).mean()

In [49]:
class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels, conv):
        super().__init__()
        # conv(#in_channels, #out_channels)
        '''
        in_channels (int or tuple): 
            Size of each input sample, or :obj:`-1` to
            derive the size from the first input(s) to the forward method.
            A tuple corresponds to the sizes of source and target
            dimensionalities.
        '''
        self.conv1 = conv((-1, -1), hidden_channels)
        self.conv2 = conv((-1, -1), out_channels)
        self.linear1 = Linear(-1, out_channels)
        self.linear2 = Linear(-1, out_channels)

    def forward(self, x, edge_index):
        x0 = self.linear1(x)
        x2 = self.conv1(x0, edge_index).relu()
        x3 = self.conv2(x2, edge_index)
        x4 = self.linear2(x2 + x3)
        # Add combined layer to reduce over-smoothing
        return x4

class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        z = torch.cat([z_dict['session'][row], z_dict['aid'][col]], dim=-1)
        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1)

class Model(torch.nn.Module):
    def __init__(self, hidden_channels,  conv=SAGEConv):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels,  conv) # Initialize GNNEncoder
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels) # Initialze EdgeDecoder

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        z_dict = self.encoder(x_dict, edge_index_dict) # Here the call from model.encoder(...)
        return self.decoder(z_dict, edge_label_index)

In [50]:
train_data['session', 'aid'].edge_label.dtype

torch.int64

In [51]:
train_data['session', 'aid'].edge_label.shape

torch.Size([151702])

In [52]:
def train(train_data, model, optimizer, loss=weighted_mse_loss):
    model.train()
    optimizer.zero_grad()
    pred = model(train_data.x_dict, train_data.edge_index_dict,
                 train_data['session', 'aid'].edge_label_index)
    target = train_data['session', 'aid'].edge_label
    loss = weighted_mse_loss(pred, target, weight)
    loss.backward()
    optimizer.step()
    return float(loss.sqrt())

In [53]:
## set pred.clamp
@torch.no_grad()
def test(data, model, metric=F.mse_loss):
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict,
                 data['session', 'aid'].edge_label_index)
    pred = pred.clamp(min=0, max=2)
    target = data['session', 'aid'].edge_label.float()
    rmse = F.mse_loss(pred, target).sqrt()
    return float(rmse) # Return RMSE loss

In [54]:
from tqdm import tqdm
from IPython.display import clear_output

In [55]:
def train_test(model, model_params, learning_rate=0.01, e_patience = 10, min_acc= 0.05, n_epochs=500):
    t0 = time.time()

    model = model(**model_params) # Define the model

    # Due to lazy initialization, we need to run one model step so the number
    # of parameters can be inferred:
    with torch.no_grad():
        model.encoder(train_data.x_dict, train_data.edge_index_dict) # Run once with torch.no_grad() to get parameter for optimizer below

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    k=0
    loss, train_rmse, val_rmse, test_rmse = [], [], [], []
    train_wrmse, val_wrmse, test_wrmse = [], [], []
    for epoch in tqdm(range(n_epochs)):
        # Call train fuction here >> return loss
        loss += [train(train_data, model, optimizer, loss=weighted_mse_loss)]
        
        # Call test function here >> return RMSE loss
        train_wrmse += [test(train_data, model, metric=weighted_mse_loss)]
        train_rmse += [test(train_data, model, metric=F.mse_loss)]
        
        val_wrmse += [test(val_data, model, metric=weighted_mse_loss)]
        val_rmse += [test(val_data, model, metric=F.mse_loss)]
        
        test_wrmse += [test(test_data, model, metric=weighted_mse_loss)]
        test_rmse += [test(test_data, model, metric=F.mse_loss)]

        if epoch+1 %10==0:
            print(f'Epoch: {epoch+1:03d}, Loss: {loss[-1]:.4f}, Train: {train_rmse[-1]:.4f}, '
                  f'Val: {val_rmse[-1]:.4f}, Test: {test_rmse[-1]:.4f}')

        results = pd.DataFrame({
            'loss': loss,
            'train_rmse': train_rmse, 'val_rmse': val_rmse, 'test_rmse': test_rmse,
            'train_wrmse': train_wrmse, 'val_wrmse': val_wrmse, 'test_wrmse': test_wrmse,
            'time':(time.time()-t0)/60
        })
        
        ## Debugging
        #clear_output()
        '''
        print('\nloss: ', loss, 
              '\ntrain_rmse: ', train_rmse, 
              '\nval_rmse: ', val_rmse, 
              '\ntest_rmse: ', test_rmse,
              '\ntrain_wrmse: ', train_wrmse, 
              '\nval_wrmse: ', val_wrmse, 
              '\ntest_wrmse: ', test_wrmse,
              '\ntime: ', (time.time()-t0)/60)
        '''
        #visualize_loss(results, metric='wrmse').show()
        #print(results.to_string())

        # enable early stopping
        if (epoch > 1) and abs(loss[-1]/loss[-2]-1) < min_acc :
            k += 1
        if k> e_patience:
            print('Early stopping')
            break

    return results, model

In [56]:
def visualize_loss(results, metric='rmse'):
    fig = go.Figure()

    fig.add_trace(go.Scatter(x=results.index, y=results['train_'+metric], name = 'train_'+metric))
    fig.add_trace(go.Scatter(x=results.index, y=results['val_'+metric], name = 'val_'+metric))
    fig.add_trace(go.Scatter(x=results.index, y=results['test_'+metric], name = 'test_'+metric))
    fig.add_trace(go.Scatter(x=results.index, y=results['loss'], name = 'loss'))

    fig.update_yaxes(title_text=metric.upper())
    fig.update_xaxes(title_text="Epoch")

    return fig

In [64]:
N_EPOCHS = 300
E_PATIENCE = 50
LEARNING_RATE = 0.01

model_params = {"hidden_channels":32, 'conv':SAGEConv}

results, trained_model = train_test(
    Model, model_params, learning_rate=LEARNING_RATE, e_patience = E_PATIENCE, n_epochs=N_EPOCHS)

 19%|█▉        | 57/300 [05:16<22:29,  5.56s/it]

Early stopping





In [65]:
visualize_loss(results, metric='wrmse')

In [66]:
data.x_dict

{'session': tensor([[0.5847, 0.1464, 0.2928,  ..., 0.2151, 0.3417, 0.8224],
         [0.1528, 0.4135, 0.8896,  ..., 0.1158, 0.1947, 0.7028],
         [0.3903, 0.5135, 0.3806,  ..., 0.7144, 0.8384, 0.8290],
         ...,
         [0.7663, 0.7573, 0.5902,  ..., 0.7656, 0.1255, 0.6207],
         [0.2058, 0.1803, 0.5983,  ..., 0.8376, 0.7079, 0.4223],
         [0.6442, 0.0270, 0.8847,  ..., 0.2764, 0.2636, 0.0074]]),
 'aid': tensor([[0.8777, 0.7547, 0.3473,  ..., 0.5942, 0.5942, 0.2161],
         [0.6943, 0.0477, 0.1764,  ..., 0.2534, 0.5470, 0.9246],
         [0.2059, 0.0198, 0.8377,  ..., 0.5170, 0.7153, 0.5540],
         ...,
         [0.0732, 0.8948, 0.7060,  ..., 0.4959, 0.9232, 0.5172],
         [0.5611, 0.8449, 0.6297,  ..., 0.2463, 0.5351, 0.7553],
         [0.8340, 0.3269, 0.5220,  ..., 0.8878, 0.5874, 0.5975]])}

In [71]:
trained_model.encoder(data.x_dict, data.edge_index_dict)

{'session': tensor([[ 3.5620,  2.2107,  1.9964,  ..., -1.1181,  4.8102, -5.1099],
         [ 3.6567,  2.3546,  2.1354,  ..., -1.1670,  4.9637, -5.3549],
         [ 3.7181,  2.3710,  2.1148,  ..., -1.1814,  5.0499, -5.4280],
         ...,
         [ 3.4364,  2.2857,  2.0189,  ..., -1.0862,  4.7376, -5.1184],
         [ 3.5750,  2.2860,  2.0347,  ..., -1.1847,  4.8684, -5.2354],
         [ 3.6064,  2.3901,  2.0803,  ..., -1.1513,  4.9769, -5.3917]],
        grad_fn=<AddmmBackward0>),
 'aid': tensor([[-1.2894,  2.1410,  1.5678,  ...,  1.9609,  1.4239,  0.7837],
         [-1.4979,  2.3668,  1.4113,  ...,  1.9839,  1.5305,  0.8046],
         [-1.4194,  2.2243,  1.4928,  ...,  1.9381,  1.4664,  0.8593],
         ...,
         [-1.4334,  2.2126,  1.5482,  ...,  2.0071,  1.4700,  0.8322],
         [-1.3958,  2.2550,  1.4411,  ...,  1.9880,  1.3210,  0.6664],
         [-1.4170,  2.2889,  1.3340,  ...,  1.9188,  1.4279,  0.7589]],
        grad_fn=<AddmmBackward0>)}