<a href="https://colab.research.google.com/github/p4zaa/OTTO-Multi-Objective-Recommender-System/blob/main/%5BDEVELOP1_5%5D_OTTO_%E2%80%93_Multi_Objective_Recommender_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Version Logs
* [View in my Github](https://github.com/p4zaa/OTTO-Multi-Objective-Recommender-System)

# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import torch

def format_pytorch_version(version):
  return version.split('+')[0]

TORCH_version = torch.__version__
TORCH = format_pytorch_version(TORCH_version)

def format_cuda_version(version):
  return 'cu' + version.replace('.', '')

CUDA_version = torch.version.cuda
CUDA = format_cuda_version(CUDA_version)

!pip install -q torch-scatter     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install -q torch-sparse      -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install -q torch-cluster     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install -q torch-spline-conv -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
!pip install -q torch-geometric

[K     |████████████████████████████████| 9.4 MB 1.7 MB/s 
[K     |████████████████████████████████| 4.5 MB 6.2 MB/s 
[K     |████████████████████████████████| 3.2 MB 14.5 MB/s 
[K     |████████████████████████████████| 873 kB 13.7 MB/s 
[K     |████████████████████████████████| 564 kB 4.2 MB/s 
[K     |████████████████████████████████| 280 kB 71.1 MB/s 
[?25h  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone


In [3]:
import pandas as pd, numpy as np
from itertools import product
import io, os, json

import time

from sklearn.metrics import mean_squared_error

import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
pio.templates.default = "plotly_white"

import torch
import torch.nn.functional as F
import torch_geometric.transforms as T
from torch_geometric.datasets import MovieLens
from torch_geometric.nn import to_hetero
from torch_geometric.nn import HeteroConv, GCNConv, SAGEConv, GATConv, Linear, GraphConv
from torch_geometric.data import Data, HeteroData

!pip install -q sentence-transformers

%matplotlib inline

[K     |████████████████████████████████| 85 kB 2.5 MB/s 
[K     |████████████████████████████████| 5.8 MB 22.3 MB/s 
[K     |████████████████████████████████| 1.3 MB 37.9 MB/s 
[K     |████████████████████████████████| 182 kB 52.2 MB/s 
[K     |████████████████████████████████| 7.6 MB 23.5 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


# Load Competition Dataset

In [4]:

from google.colab import files
files.upload() # expire any previous token(s) and upload recreated token
!rm -r ~/.kaggle
!mkdir ~/.kaggle
!mv ./kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets list


Saving kaggle.json to kaggle.json
rm: cannot remove '/root/.kaggle': No such file or directory
ref                                                             title                                           size  lastUpdated          downloadCount  voteCount  usabilityRating  
--------------------------------------------------------------  ---------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
meirnizri/covid19-dataset                                       COVID-19 Dataset                                 5MB  2022-11-13 15:47:17          14622        418  1.0              
thedevastator/analyzing-credit-card-spending-habits-in-india    Credit Card Spending Habits in India           319KB  2022-12-14 07:30:37           1366         50  1.0              
michals22/coffee-dataset                                        Coffee dataset                                  24KB  2022-12-15 20:02:12           3506         78  1.0     

In [5]:
!kaggle datasets download 'radek1/otto-full-optimized-memory-footprint' -p /content/kaggle/ --unzip

Downloading otto-full-optimized-memory-footprint.zip to /content/kaggle
100% 1.09G/1.09G [00:48<00:00, 29.9MB/s]
100% 1.09G/1.09G [00:48<00:00, 23.9MB/s]


## Files
- **train.jsonl** - the training data, which contains full session data
  * `session` - the unique session id
  * `events` - the time ordered sequence of events in the session
    * `aid` - the article id (product code) of the associated event
    * `ts` - the Unix timestamp of the event
    * `type` - the event type, i.e., whether a product was clicked, added to the user's cart, or ordered during the session
###### {'clicks': 0, 'carts': 1, 'orders': 2}
- **test.jsonl** - the test data, which contains truncated session data
  * your task is to predict the next `aid` clicked after the session truncation, as well as the the remaining `aids` that are added to `carts` and `orders`; you may predict up to 20 values for each session `type`
- **sample_submission.csv** - a sample submission file in the correct format

In [6]:
df = pd.read_parquet('/content/kaggle/train.parquet')

In [7]:
# sample 10%(frac=0.1) of data
df = df.sample(frac=0.001, replace=False)

In [8]:
df

Unnamed: 0,session,aid,ts,type
178431972,8874432,1666901,1660571057,1
168730771,8000950,226839,1660857489,0
73797448,2222821,1294636,1659502585,0
87901738,2856696,72748,1660476952,0
126407384,4927474,335903,1660582578,0
...,...,...,...,...
123253187,4751973,1383529,1660335234,0
121076270,4625071,226628,1660409032,0
143991820,6043013,1236345,1660059941,0
31319654,744959,794192,1659695074,0


In [9]:
df.isna().sum()

session    0
aid        0
ts         0
type       0
dtype: int64

In [10]:
df['type'].unique()

array([1, 0, 2], dtype=uint8)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 216716 entries, 178431972 to 84836289
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   session  216716 non-null  int32
 1   aid      216716 non-null  int32
 2   ts       216716 non-null  int32
 3   type     216716 non-null  uint8
dtypes: int32(3), uint8(1)
memory usage: 4.3 MB


# Construct Graph Data (memory-efficient optimized)

### [Use new code instead] Construct `edge_index`

In [12]:
#def to_tuple(row):
#    return tuple(row)

In [13]:
# also drop the ts and type column
#connectivity = df.drop(columns=['ts', 'type']).apply(to_tuple, axis=1)

In [14]:
#connectivity

In [15]:
# Old code
'''
# session index dict
#session = sorted(df['session'].unique())
session = df['session'].unique()
session_nodes_idx = {id:idx for idx, id in enumerate(session)}

# aid(article id) index dict
#aid = sorted(df['aid'].unique())
aid = df['aid'].unique()
aid_nodes_idx = {id:idx for idx, id in enumerate(aid)}
'''

"\n# session index dict\n#session = sorted(df['session'].unique())\nsession = df['session'].unique()\nsession_nodes_idx = {id:idx for idx, id in enumerate(session)}\n\n# aid(article id) index dict\n#aid = sorted(df['aid'].unique())\naid = df['aid'].unique()\naid_nodes_idx = {id:idx for idx, id in enumerate(aid)}\n"

In [16]:
# use this code if the above cause memory crash
# very slow but memory good
'''
def get_node_indices(data, key):
  for id in data[key].unique():
    yield id, next(i for i, v in enumerate(data[key]) if v == id)

session_nodes_idx = dict(get_node_indices(df, 'session'))
aid_nodes_idx = dict(get_node_indices(df, 'aid'))
'''

"\ndef get_node_indices(data, key):\n  for id in data[key].unique():\n    yield id, next(i for i, v in enumerate(data[key]) if v == id)\n\nsession_nodes_idx = dict(get_node_indices(df, 'session'))\naid_nodes_idx = dict(get_node_indices(df, 'aid'))\n"

In [17]:
# /!\ this cause memory crashed in a very large data
'''
i_session = []
i_aid = []
for session, aid in connectivity_list:
  i_session.append(user_nodes_idx[user])
  i_aid.append(item_nodes_idx[item])

indice = [i_session, i_aid]
edge_index = torch.Tensor(indice).type(torch.long)
'''

'\ni_session = []\ni_aid = []\nfor session, aid in connectivity_list:\n  i_session.append(user_nodes_idx[user])\n  i_aid.append(item_nodes_idx[item])\n\nindice = [i_session, i_aid]\nedge_index = torch.Tensor(indice).type(torch.long)\n'

In [18]:
# Work but still got memory crash in very large data
'''
def get_indices(connectivity_list, user_nodes_idx, item_nodes_idx):
  for user, item in connectivity_list:
    yield user_nodes_idx[user], item_nodes_idx[item]

edge_index = torch.Tensor(list(get_indices(connectivity, session_nodes_idx, aid_nodes_idx))).type(torch.int64).t()
'''

'\ndef get_indices(connectivity_list, user_nodes_idx, item_nodes_idx):\n  for user, item in connectivity_list:\n    yield user_nodes_idx[user], item_nodes_idx[item]\n\nedge_index = torch.Tensor(list(get_indices(connectivity, session_nodes_idx, aid_nodes_idx))).type(torch.int64).t()\n'

### `edge_index` new code construct

In [19]:
# session index dict
session = df['session'].unique()
source_idx = {id:idx for idx, id in enumerate(session)}

# aid(article id) index dict
aid = df['aid'].unique()
target_idx = {id:idx for idx, id in enumerate(aid)}

In [20]:
connected = df[['session', 'aid']]
connected['session'] = connected['session'].map(source_idx)
connected['aid'] = connected['aid'].map(target_idx)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  connected['session'] = connected['session'].map(source_idx)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  connected['aid'] = connected['aid'].map(target_idx)


In [21]:
source = connected['session']
target = connected['aid']
edge_index = torch.tensor((source.values, target.values))

  edge_index = torch.tensor((source.values, target.values))


### Nodes and Edges Attribute

In [22]:
## Nodes Atrributes
session_num_nodes = df['session'].nunique()
aid_num_nodes = df['aid'].nunique()
aid_features = torch.rand((aid_num_nodes, 300)) # Create (random) article features with shape [num_node_aid, dimensions]

## Edges Atrributes
edge_index = edge_index
edge_label = torch.tensor(df['type'].values).type(torch.int64)

In [23]:
edge_label

tensor([1, 0, 0,  ..., 0, 0, 0])

In [24]:
edge_index.shape

torch.Size([2, 216716])

In [25]:
aid_features.shape

torch.Size([127796, 300])

In [26]:
edge_label.shape

torch.Size([216716])

### Construct HeteroData

In [27]:
node_types = {
    'session': {
        'num_nodes': session_num_nodes
    },
    'aid': {
        'x': aid_features
    }
}

edge_types = {
    ('session', 'event', 'aid'): {
        'edge_index': edge_index,
        'edge_label': edge_label
    }#,
    #('session', 'cart', 'aid'): {
        
    #},
    #('session', 'buy', 'aid'): {
        
    #}
}

In [28]:
data = HeteroData({**node_types, **edge_types})

In [29]:
data

HeteroData(
  [1msession[0m={ num_nodes=208219 },
  [1maid[0m={ x=[127796, 300] },
  [1m(session, event, aid)[0m={
    edge_index=[2, 216716],
    edge_label=[216716]
  }
)

In [30]:
data.metadata()

(['session', 'aid'], [('session', 'event', 'aid')])

In [31]:
node_types, edge_types = data.metadata()
print('Node types:', node_types)
print('Edge types:',edge_types)

Node types: ['session', 'aid']
Edge types: [('session', 'event', 'aid')]


In [32]:
print('Isolated nodes?', data.has_isolated_nodes())
print('Self loops?', data.has_self_loops())
print('Undirected graph? ', data.is_undirected())

Isolated nodes? False
Self loops? False
Undirected graph?  False


# Graph-based Modeling [follow [this](https://github.com/pyg-team/pytorch_geometric/blob/master/examples/hetero/hetero_link_pred.py) sample]

In [33]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Construct Undirected Graph

In [34]:
# add sesion features for message passing:
data['session'].x = torch.rand(data['session'].num_nodes, 300)

In [35]:
# Add user node features for message passing:
#data['session'].x = torch.eye(data['session'].num_nodes, device=device)
#del data['session'].num_nodes

In [36]:
# Add a reverse ('movie', 'rev_rates', 'user') relation for message passing:
data = T.ToUndirected()(data)

In [37]:
del data['aid', 'rev_event', 'session'].edge_label  # Remove "reverse" label.

In [38]:
data

HeteroData(
  [1msession[0m={
    num_nodes=208219,
    x=[208219, 300]
  },
  [1maid[0m={ x=[127796, 300] },
  [1m(session, event, aid)[0m={
    edge_index=[2, 216716],
    edge_label=[216716]
  },
  [1m(aid, rev_event, session)[0m={ edge_index=[2, 216716] }
)

### Calculate Weight

In [39]:
data['session', 'aid'].edge_label.shape

torch.Size([216716])

In [40]:
# We have an unbalanced dataset with many labels for rating 3 and 4, and very
# few for 0 and 1. Therefore we use a weighted MSE loss.

counts = torch.bincount(data['session', 'aid'].edge_label)

# Set weights normalized by (max count/each count)
weight = counts.max() / counts

In [41]:
counts

tensor([194613,  17035,   5068])

In [42]:
weight

tensor([ 1.0000, 11.4243, 38.4004])

In [43]:
data_dict = {'type': (counts, '# rows','coral'), 'weights': (weight, 'weights','royalblue')}

fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Scatter(x=np.arange(6), y=counts.detach().cpu().numpy(),
               name = 'nb rows', line_color= 'coral'))
fig.add_trace(
    go.Scatter(x=np.arange(6), y=weight.detach().cpu().numpy(),
               name = 'weights', line_color= 'royalblue'),  secondary_y=True)


fig.update_yaxes(title_text="# rows", secondary_y=False)
fig.update_yaxes(title_text="weights", secondary_y=True)
fig.update_xaxes(title_text="Type")
fig

### [Follow [this sample](https://github.com/pyg-team/pytorch_geometric/blob/master/examples/hetero/hetero_link_pred.py)] Train/Val/Test Link Level Splits

In [44]:
# Perform a link-level split into training, validation, and test edges:
train_data, val_data, test_data = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[('session', 'event', 'aid')],
    rev_edge_types=[('aid', 'rev_event', 'session')],
)(data)

In [45]:
train_data['session','aid']

{'edge_index': tensor([[ 62423,   9991,  27914,  ...,  33016, 198085,  23915],
        [ 47901,   9226,  23652,  ...,  27468,   6702,   9892]]), 'edge_label': tensor([0, 0, 0,  ..., 0, 0, 0]), 'edge_label_index': tensor([[ 62423,   9991,  27914,  ...,  33016, 198085,  23915],
        [ 47901,   9226,  23652,  ...,  27468,   6702,   9892]])}

In [46]:
train_data['session','aid'].edge_label

tensor([0, 0, 0,  ..., 0, 0, 0])

### [New weight calculation code]

In [47]:
# We have an unbalanced dataset with many labels for rating 3 and 4, and very
# few for 0 and 1. Therefore we use a weighted MSE loss.
weight = torch.bincount(train_data['session', 'aid'].edge_label)
weight = weight.max() / weight

In [48]:
weight

tensor([ 1.0000, 11.4673, 38.6405])

### Model and loss functions

#### Loss function

In [49]:
def weighted_mse_loss(pred, target, weight=None):
    weight = 1. if weight is None else weight[target].to(pred.dtype)
    return (weight * (pred - target.to(pred.dtype)).pow(2)).mean()

#### Model

In [50]:
class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x


class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        z = torch.cat([z_dict['session'][row], z_dict['aid'][col]], dim=-1)

        z = self.lin1(z).relu()
        z = self.lin2(z)

        return z.view(-1)


class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)

In [51]:
model = Model(hidden_channels=32).to(device)

In [52]:
# Due to lazy initialization, we need to run one model step so the number
# of parameters can be inferred:
with torch.no_grad():
    model.encoder(train_data.x_dict, train_data.edge_index_dict)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [53]:
def train():
    model.train()
    optimizer.zero_grad()
    pred = model(train_data.x_dict, train_data.edge_index_dict,
                 train_data['session', 'aid'].edge_label_index)
    target = train_data['session', 'aid'].edge_label
    loss = weighted_mse_loss(pred, target, weight)
    loss.backward()
    optimizer.step()
    return float(loss)

In [54]:
@torch.no_grad()
def test(data):
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict,
                 data['session', 'aid'].edge_label_index)
    pred = pred.clamp(min=-1, max=2)
    target = data['session', 'aid'].edge_label.float()
    rmse = F.mse_loss(pred, target).sqrt()
    return float(rmse)

In [55]:
for epoch in range(0, 30):
    loss = train()
    train_rmse = test(train_data)
    val_rmse = test(val_data)
    test_rmse = test(test_data)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_rmse:.4f}, '
          f'Val: {val_rmse:.4f}, Test: {test_rmse:.4f}')

Epoch: 000, Loss: 5.1650, Train: 1.9162, Val: 1.5072, Test: 1.5297
Epoch: 001, Loss: 20.5864, Train: 0.4403, Val: 0.4473, Test: 0.4484
Epoch: 002, Loss: 4.8751, Train: 0.4503, Val: 0.4562, Test: 0.4571
Epoch: 003, Loss: 5.0090, Train: 0.4040, Val: 0.4138, Test: 0.4143
Epoch: 004, Loss: 3.4761, Train: 0.5509, Val: 0.4303, Test: 0.4328
Epoch: 005, Loss: 2.4483, Train: 0.9131, Val: 0.5743, Test: 0.5816
Epoch: 006, Loss: 1.8065, Train: 1.3928, Val: 0.8120, Test: 0.8247
Epoch: 007, Loss: 2.3710, Train: 1.4948, Val: 0.8744, Test: 0.8883
Epoch: 008, Loss: 2.6646, Train: 1.2891, Val: 0.7669, Test: 0.7787
Epoch: 009, Loss: 2.1332, Train: 1.0013, Val: 0.6272, Test: 0.6359
Epoch: 010, Loss: 1.8047, Train: 0.7801, Val: 0.5311, Test: 0.5372
Epoch: 011, Loss: 1.9093, Train: 0.6687, Val: 0.4862, Test: 0.4909
Epoch: 012, Loss: 2.1021, Train: 0.6274, Val: 0.4702, Test: 0.4743
Epoch: 013, Loss: 2.2037, Train: 0.6385, Val: 0.4733, Test: 0.4775
Epoch: 014, Loss: 2.1745, Train: 0.6949, Val: 0.4937, Test: 0

In [56]:
model

Model(
  (encoder): GraphModule(
    (conv1): ModuleDict(
      (session__event__aid): SAGEConv((-1, -1), 32, aggr=mean)
      (aid__rev_event__session): SAGEConv((-1, -1), 32, aggr=mean)
    )
    (conv2): ModuleDict(
      (session__event__aid): SAGEConv((-1, -1), 32, aggr=mean)
      (aid__rev_event__session): SAGEConv((-1, -1), 32, aggr=mean)
    )
  )
  (decoder): EdgeDecoder(
    (lin1): Linear(64, 32, bias=True)
    (lin2): Linear(32, 1, bias=True)
  )
)

In [57]:
data

HeteroData(
  [1msession[0m={
    num_nodes=208219,
    x=[208219, 300]
  },
  [1maid[0m={ x=[127796, 300] },
  [1m(session, event, aid)[0m={
    edge_index=[2, 216716],
    edge_label=[216716]
  },
  [1m(aid, rev_event, session)[0m={ edge_index=[2, 216716] }
)

In [58]:
model.encoder(data.x_dict, data.edge_index_dict)

{'session': tensor([[-3.5903, -0.6269,  0.8789,  ..., -0.0389, -3.3460,  2.3984],
         [-3.7468, -0.6740,  0.9323,  ..., -0.0577, -3.4996,  2.4976],
         [-3.7805, -0.7761,  0.9828,  ..., -0.1320, -3.5745,  2.4874],
         ...,
         [-3.7491, -0.6924,  0.9406,  ..., -0.0715, -3.5099,  2.4930],
         [-3.8528, -0.6636,  0.9506,  ..., -0.0380, -3.5844,  2.5793],
         [-3.8346, -0.6761,  0.9520,  ..., -0.0496, -3.5747,  2.5617]],
        grad_fn=<AddBackward0>),
 'aid': tensor([[ 0.5448, -1.9583, -1.7577,  ...,  2.4216,  1.0269, -1.8583],
         [ 0.5748, -2.1003, -1.8678,  ...,  2.5483,  1.0993, -1.9665],
         [ 0.5725, -2.0724, -1.8366,  ...,  2.4797,  1.0972, -1.9127],
         ...,
         [ 0.5617, -2.0295, -1.8080,  ...,  2.4627,  1.0694, -1.8958],
         [ 0.5518, -1.9613, -1.7436,  ...,  2.3486,  1.0494, -1.8042],
         [ 0.5624, -2.0436, -1.8250,  ...,  2.5031,  1.0689, -1.9273]],
        grad_fn=<AddBackward0>)}

# Test dataset

In [None]:
#df = pd.read_parquet('/content/kaggle/test.parquet')

In [None]:
#df = df.sample(frac=0.01, replace=False)

In [None]:
#df

## Construct heterogenous graph for test

In [None]:
'''
# session index dict
session = df['session'].unique()
source_idx = {id:idx for idx, id in enumerate(session)}

# aid(article id) index dict
aid = df['aid'].unique()
target_idx = {id:idx for idx, id in enumerate(aid)}
'''

In [None]:
'''
connected = df[['session', 'aid']]
connected['session'] = connected['session'].map(source_idx)
connected['aid'] = connected['aid'].map(target_idx)

source = connected['session']
target = connected['aid']
edge_index = torch.tensor((source.values, target.values)).type(torch.int64)
'''

In [None]:
'''
## Nodes Atrributes
session_num_nodes = df['session'].nunique()
aid_num_nodes = df['aid'].nunique()
aid_features = torch.rand((aid_num_nodes, 300)) # Create (random) article features with shape [num_node_aid, dimensions]

## Edges Atrributes
edge_index = edge_index
edge_label = torch.tensor(df['type'].values).type(torch.int64)
'''

In [None]:
'''
node_types = {
    'session': {
        'num_nodes': session_num_nodes
    },
    'aid': {
        'x': aid_features
    }
}

edge_types = {
    ('session', 'event', 'aid'): {
        'edge_index': edge_index,
        'edge_label': edge_label,
        'edge_label_index': edge_index
    }#,
    #('session', 'cart', 'aid'): {
        
    #},
    #('session', 'buy', 'aid'): {
        
    #}
}
'''

In [None]:
#Rtest_data = HeteroData({**node_types, **edge_types})

In [None]:
#data

In [None]:
#Rtest_data

In [None]:
'''
# add sesion features for message passing:
Rtest_data['session'].x = torch.rand(Rtest_data['session'].num_nodes, 300)

Rtest_data = T.ToUndirected()(Rtest_data)
del Rtest_data['aid', 'rev_event', 'session'].edge_label  # Remove "reverse" label.
'''

In [None]:
#Rtest_data

## Testing

In [None]:
'''
with torch.no_grad():
    model.eval()
    gg = model(Rtest_data.x_dict, Rtest_data.edge_index_dict, Rtest_data['session', 'aid'].edge_label_index)
'''

In [None]:
#data['session', 'aid'].edge_label

tensor([0, 0, 0,  ..., 0, 0, 0])

In [None]:
#Rtest_data['session', 'aid'].edge_label

tensor([0, 0, 0,  ..., 0, 0, 0])

In [None]:
#gg

tensor([0.9765, 0.9353, 0.9445,  ..., 0.9290, 0.9750, 0.9879])

In [None]:
#data

HeteroData(
  [1msession[0m={
    num_nodes=208318,
    x=[208318, 300]
  },
  [1maid[0m={ x=[128140, 300] },
  [1m(session, event, aid)[0m={
    edge_index=[2, 216716],
    edge_label=[216716]
  },
  [1m(aid, rev_event, session)[0m={ edge_index=[2, 216716] }
)

# [TEMPORARY DROP]

In [None]:
# Temporary comment
'''
class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels, conv):
        super().__init__()
        # conv(#in_channels, #out_channels)
        ''''''
        in_channels (int or tuple): 
            Size of each input sample, or :obj:`-1` to
            derive the size from the first input(s) to the forward method.
            A tuple corresponds to the sizes of source and target
            dimensionalities.
        ''''''
        self.conv1 = conv((-1, -1), hidden_channels)
        self.conv2 = conv((-1, -1), out_channels)
        self.linear1 = Linear(-1, out_channels)
        self.linear2 = Linear(-1, out_channels)

    def forward(self, x, edge_index):
        x0 = self.linear1(x)
        x2 = self.conv1(x0, edge_index).relu()
        x3 = self.conv2(x2, edge_index)
        x4 = self.linear2(x2 + x3)
        # Add combined layer to reduce over-smoothing
        return x4

class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        z = torch.cat([z_dict['session'][row], z_dict['aid'][col]], dim=-1)
        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1)

class Model(torch.nn.Module):
    def __init__(self, hidden_channels,  conv=SAGEConv):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels,  conv) # Initialize GNNEncoder
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels) # Initialze EdgeDecoder

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        z_dict = self.encoder(x_dict, edge_index_dict) # Here the call from model.encoder(...)
        return self.decoder(z_dict, edge_label_index)
'''

"\nclass GNNEncoder(torch.nn.Module):\n    def __init__(self, hidden_channels, out_channels, conv):\n        super().__init__()\n        # conv(#in_channels, #out_channels)\n        \n        in_channels (int or tuple): \n            Size of each input sample, or :obj:`-1` to\n            derive the size from the first input(s) to the forward method.\n            A tuple corresponds to the sizes of source and target\n            dimensionalities.\n        \n        self.conv1 = conv((-1, -1), hidden_channels)\n        self.conv2 = conv((-1, -1), out_channels)\n        self.linear1 = Linear(-1, out_channels)\n        self.linear2 = Linear(-1, out_channels)\n\n    def forward(self, x, edge_index):\n        x0 = self.linear1(x)\n        x2 = self.conv1(x0, edge_index).relu()\n        x3 = self.conv2(x2, edge_index)\n        x4 = self.linear2(x2 + x3)\n        # Add combined layer to reduce over-smoothing\n        return x4\n\nclass EdgeDecoder(torch.nn.Module):\n    def __init__(self, hid

In [None]:
'''
def train(train_data, model, optimizer, loss=weighted_mse_loss):
    model.train()
    optimizer.zero_grad()
    pred = model(train_data.x_dict, train_data.edge_index_dict,
                 train_data['session', 'aid'].edge_label_index)
    target = train_data['session', 'aid'].edge_label
    loss = weighted_mse_loss(pred, target, weight)
    loss.backward()
    optimizer.step()
    return float(loss.sqrt())
'''

"\ndef train(train_data, model, optimizer, loss=weighted_mse_loss):\n    model.train()\n    optimizer.zero_grad()\n    pred = model(train_data.x_dict, train_data.edge_index_dict,\n                 train_data['session', 'aid'].edge_label_index)\n    target = train_data['session', 'aid'].edge_label\n    loss = weighted_mse_loss(pred, target, weight)\n    loss.backward()\n    optimizer.step()\n    return float(loss.sqrt())\n"

In [None]:
## set pred.clamp
'''
@torch.no_grad()
def test(data, model, metric=F.mse_loss):
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict,
                 data['session', 'aid'].edge_label_index)
    pred = pred.clamp(min=0, max=2)
    target = data['session', 'aid'].edge_label.float()
    rmse = F.mse_loss(pred, target).sqrt()
    return float(rmse) # Return RMSE loss
'''

"\n@torch.no_grad()\ndef test(data, model, metric=F.mse_loss):\n    model.eval()\n    pred = model(data.x_dict, data.edge_index_dict,\n                 data['session', 'aid'].edge_label_index)\n    pred = pred.clamp(min=0, max=2)\n    target = data['session', 'aid'].edge_label.float()\n    rmse = F.mse_loss(pred, target).sqrt()\n    return float(rmse) # Return RMSE loss\n"

In [None]:
#from tqdm import tqdm
#from IPython.display import clear_output

In [None]:
"""
def train_test(model, model_params, learning_rate=0.01, e_patience = 10, min_acc= 0.05, n_epochs=500):
    t0 = time.time()

    model = model(**model_params) # Define the model

    # Due to lazy initialization, we need to run one model step so the number
    # of parameters can be inferred:
    with torch.no_grad():
        model.encoder(train_data.x_dict, train_data.edge_index_dict) # Run once with torch.no_grad() to get parameter for optimizer below

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    k=0
    loss, train_rmse, val_rmse, test_rmse = [], [], [], []
    train_wrmse, val_wrmse, test_wrmse = [], [], []
    for epoch in tqdm(range(n_epochs)):
        # Call train fuction here >> return loss
        loss += [train(train_data, model, optimizer, loss=weighted_mse_loss)]
        
        # Call test function here >> return RMSE loss
        train_wrmse += [test(train_data, model, metric=weighted_mse_loss)]
        train_rmse += [test(train_data, model, metric=F.mse_loss)]
        
        val_wrmse += [test(val_data, model, metric=weighted_mse_loss)]
        val_rmse += [test(val_data, model, metric=F.mse_loss)]
        
        test_wrmse += [test(test_data, model, metric=weighted_mse_loss)]
        test_rmse += [test(test_data, model, metric=F.mse_loss)]

        if epoch+1 %10==0:
            print(f'Epoch: {epoch+1:03d}, Loss: {loss[-1]:.4f}, Train: {train_rmse[-1]:.4f}, '
                  f'Val: {val_rmse[-1]:.4f}, Test: {test_rmse[-1]:.4f}')

        results = pd.DataFrame({
            'loss': loss,
            'train_rmse': train_rmse, 'val_rmse': val_rmse, 'test_rmse': test_rmse,
            'train_wrmse': train_wrmse, 'val_wrmse': val_wrmse, 'test_wrmse': test_wrmse,
            'time':(time.time()-t0)/60
        })
        
        ## Debugging
        #clear_output()
        '''
        print('\nloss: ', loss, 
              '\ntrain_rmse: ', train_rmse, 
              '\nval_rmse: ', val_rmse, 
              '\ntest_rmse: ', test_rmse,
              '\ntrain_wrmse: ', train_wrmse, 
              '\nval_wrmse: ', val_wrmse, 
              '\ntest_wrmse: ', test_wrmse,
              '\ntime: ', (time.time()-t0)/60)
        '''
        #visualize_loss(results, metric='wrmse').show()
        #print(results.to_string())

        # enable early stopping
        if (epoch > 1) and abs(loss[-1]/loss[-2]-1) < min_acc :
            k += 1
        if k> e_patience:
            print('Early stopping')
            break

    return results, model
"""

"\ndef train_test(model, model_params, learning_rate=0.01, e_patience = 10, min_acc= 0.05, n_epochs=500):\n    t0 = time.time()\n\n    model = model(**model_params) # Define the model\n\n    # Due to lazy initialization, we need to run one model step so the number\n    # of parameters can be inferred:\n    with torch.no_grad():\n        model.encoder(train_data.x_dict, train_data.edge_index_dict) # Run once with torch.no_grad() to get parameter for optimizer below\n\n    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)\n\n    k=0\n    loss, train_rmse, val_rmse, test_rmse = [], [], [], []\n    train_wrmse, val_wrmse, test_wrmse = [], [], []\n    for epoch in tqdm(range(n_epochs)):\n        # Call train fuction here >> return loss\n        loss += [train(train_data, model, optimizer, loss=weighted_mse_loss)]\n        \n        # Call test function here >> return RMSE loss\n        train_wrmse += [test(train_data, model, metric=weighted_mse_loss)]\n        train_rmse +=

In [None]:
'''
def visualize_loss(results, metric='rmse'):
    fig = go.Figure()

    fig.add_trace(go.Scatter(x=results.index, y=results['train_'+metric], name = 'train_'+metric))
    fig.add_trace(go.Scatter(x=results.index, y=results['val_'+metric], name = 'val_'+metric))
    fig.add_trace(go.Scatter(x=results.index, y=results['test_'+metric], name = 'test_'+metric))
    fig.add_trace(go.Scatter(x=results.index, y=results['loss'], name = 'loss'))

    fig.update_yaxes(title_text=metric.upper())
    fig.update_xaxes(title_text="Epoch")

    return fig
'''

'\ndef visualize_loss(results, metric=\'rmse\'):\n    fig = go.Figure()\n\n    fig.add_trace(go.Scatter(x=results.index, y=results[\'train_\'+metric], name = \'train_\'+metric))\n    fig.add_trace(go.Scatter(x=results.index, y=results[\'val_\'+metric], name = \'val_\'+metric))\n    fig.add_trace(go.Scatter(x=results.index, y=results[\'test_\'+metric], name = \'test_\'+metric))\n    fig.add_trace(go.Scatter(x=results.index, y=results[\'loss\'], name = \'loss\'))\n\n    fig.update_yaxes(title_text=metric.upper())\n    fig.update_xaxes(title_text="Epoch")\n\n    return fig\n'

In [None]:
'''
N_EPOCHS = 300
E_PATIENCE = 50
LEARNING_RATE = 0.01

model_params = {"hidden_channels":32, 'conv':SAGEConv}

results, trained_model = train_test(
    Model, model_params, learning_rate=LEARNING_RATE, e_patience = E_PATIENCE, n_epochs=N_EPOCHS)
'''

'\nN_EPOCHS = 300\nE_PATIENCE = 50\nLEARNING_RATE = 0.01\n\nmodel_params = {"hidden_channels":32, \'conv\':SAGEConv}\n\nresults, trained_model = train_test(\n    Model, model_params, learning_rate=LEARNING_RATE, e_patience = E_PATIENCE, n_epochs=N_EPOCHS)\n'

In [None]:
#visualize_loss(results, metric='wrmse')

In [None]:
#trained_model.encoder(test_data.x_dict, test_data.edge_index_dict)['session']

In [None]:
#trained_model.state_dict()['encoder.linear2.session.weight']

# Recommendation

In [None]:
'''
@torch.no_grad()
def recommendation(user_id, model, x_dict, edge_index_dict):
  # Get model decoder
  #model = Model(**model_params)
  with torch.no_grad():
    encoder = model.encoder(data.x_dict, data.edge_index_dict)

  # Get node representations for users and movies
  user_representations = encoder['session']
  movie_representations = encoder['aid']

  # Compute the dot product between user and movie representations to get edge weights
  edge_weights = user_representations.mm(movie_representations.T)

  # Make predictions for each user by taking the top k largest edge weights
  k = 20  # number of recommendations to make
  _, top_k_indices = edge_weights.topk(k, dim=1)
  recommendations = top_k_indices.numpy()

  # Print recommendations for the first user
  print(f'Recommendations for user {user_id}: {recommendations[user_id]}')
'''

In [None]:
#session = 1
#recommendation(session, model, Rtest_data.x_dict, Rtest_data.edge_index_dict)

In [None]:
#session = 2
#recommendation(session, model, data.x_dict, data.edge_index_dict)

In [None]:
#session = 999
#recommendation(session, model, data.x_dict, data.edge_index_dict)

# Submission

Submission File
For each `session` id and `type` combination in the test set, you must predict the `aid` values in the `label` column, which is space delimited. You can predict up to 20 `aid` values per row. The file should contain a header and have the following format:

```
session_type,labels
12906577_clicks,135193 129431 119318 ...
12906577_carts,135193 129431 119318 ...
12906577_orders,135193 129431 119318 ...
12906578_clicks, 135193 129431 119318 ...
etc.
```