# Graph Neural Networks, [yoochoose](https://2015.recsyschallenge.com/challenge.html)

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import os
import csv
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric

In [3]:
from tqdm import tqdm
tqdm.monitor_interval = 0

In [4]:
np.random.seed(9)

## Set Configs

In [5]:
BATCH_SIZE = 1024
LR = 0.005
EMBEDDING_DIM = 128

In [6]:
# decide which device we want to run on
device = torch.device('cuda:0' if (torch.cuda.is_available()) else 'cpu')

## Load Datasets

The challenge to solve from this datasets is as the following:

>1. Predict whether there will be a buy event followed by a sequence of clicks (solved in this kernel)
>2. Predict which item will be bought

#### 1. User Clicks Data

In [7]:
# containing click events
df_clicks = pd.read_csv('./datasets/yoochoose-data/yoochoose-clicks.dat')
df_clicks.columns=['session_id','timestamp','item_id','category']

In [8]:
df_clicks.head()

Unnamed: 0,session_id,timestamp,item_id,category
0,1,2014-04-07T10:54:09.868Z,214536500,0
1,1,2014-04-07T10:54:46.998Z,214536506,0
2,1,2014-04-07T10:57:00.306Z,214577561,0
3,2,2014-04-07T13:56:37.614Z,214662742,0
4,2,2014-04-07T13:57:19.373Z,214662742,0


In [9]:
from sklearn.preprocessing import LabelEncoder

# item_ids are categorically encoded to ensure the encoded item_ids
# later will be mapped to an embedding matrix, starts at 0
le = LabelEncoder()
df_clicks['item_id'] = le.fit_transform(df_clicks.item_id)

In [10]:
df_clicks.head()

Unnamed: 0,session_id,timestamp,item_id,category
0,1,2014-04-07T10:54:09.868Z,2052,0
1,1,2014-04-07T10:54:46.998Z,2054,0
2,1,2014-04-07T10:57:00.306Z,9876,0
3,2,2014-04-07T13:56:37.614Z,19448,0
4,2,2014-04-07T13:57:19.373Z,19448,0


In [11]:
# since the data is quite large, we subsample it for easier demonstration
# randomly sample a couple of them
sampled_session_id = np.random.choice(df_clicks.session_id.unique(), 1000000, replace=False)
df_clicks = df_clicks.loc[df_clicks.session_id.isin(sampled_session_id)]
df_clicks.nunique()

session_id    1000000
timestamp     3568794
item_id         35677
category          242
dtype: int64

#### 2. User Buyings Data

In [12]:
# containing buy events
df_buys = pd.read_csv('./datasets/yoochoose-data/yoochoose-buys.dat', header=None)
df_buys.columns=['session_id','timestamp','item_id','price','quantity']

In [13]:
df_buys.head()

Unnamed: 0,session_id,timestamp,item_id,price,quantity
0,420374,2014-04-06T18:44:58.314Z,214537888,12462,1
1,420374,2014-04-06T18:44:58.325Z,214537850,10471,1
2,281626,2014-04-06T09:40:13.032Z,214535653,1883,1
3,420368,2014-04-04T06:13:28.848Z,214530572,6073,1
4,420368,2014-04-04T06:13:28.858Z,214835025,2617,1


## Create Custom Datasets

In [14]:
from torch_geometric.data import Data
from torch_geometric.data import InMemoryDataset

In [15]:
class YooChooseBinaryDataset(InMemoryDataset):
    
    def __init__(self, root, transform=None, pre_transform=None):
        super(YooChooseBinaryDataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])
    
    # returns a list that shows a list of raw, unprocessed data
    @property
    def raw_file_names(self):
        return []
    
    # returns a list containing the file names of all the processed data
    @property
    def processed_file_names(self):
        return ['yoochoose_click_binary_1M_sess.dataset']
    
    # download the data you are working on to the directory specified
    def download(self):
        pass
    
    # gather the data into a list of Data objects
    def process(self):
        
        data_list = []
        
        grouped_clicks = df_clicks.groupby('session_id')
        
        for session_id, group in tqdm(grouped_clicks):
            try:
                sess_item_id = LabelEncoder().fit_transform(group.item_id)
                group = group.reset_index(drop=True)
                group['sess_item_id'] = sess_item_id

                node_features = group.loc[group.session_id==session_id, ['sess_item_id', 'item_id']].sort_values('sess_item_id').item_id.drop_duplicates().values
                node_features = torch.LongTensor(node_features).unsqueeze(1)

                target_nodes = group.sess_item_id.values[1:]
                source_nodes = group.sess_item_id.values[:-1]

                # create the graph connectivity/ relations
                edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

                x = node_features
                y = torch.FloatTensor([group.category.values[0]])

                # create graph from our datasets and its defined edges
                data = Data(x=x, edge_index=edge_index, y=y)
                data_list.append(data)
            except:
                continue
            
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

In [16]:
datasets = YooChooseBinaryDataset(root='./datasets/yoochoose-data/')

Processing...


100%|██████████| 1000000/1000000 [57:13<00:00, 291.26it/s]


Done!


In [19]:
datasets = datasets.shuffle()
train_datasets = datasets[:250000]
valid_datasets = datasets[250000:375000]
test_datasets = datasets[375000:]

print('Train Datasets:',len(train_datasets))
print('Valid Datasets:',len(valid_datasets))
print('Test Datasets:',len(test_datasets))

Train Datasets: 250000
Valid Datasets: 125000
Test Datasets: 43328


## Create Data Loader

In [20]:
from torch_geometric.data import DataLoader

In [28]:
# feed data by batch into the model effortlessly yield batch aggregating x, y, and edge_index 
train_loader = DataLoader(train_datasets, batch_size=BATCH_SIZE)
valid_loader = DataLoader(valid_datasets, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_datasets, batch_size=BATCH_SIZE)

num_items = df_clicks.max().values[0] + 1
print('Total Items: ',str(num_items))

Total Items:  11562137


## Build [Graph Network](https://arxiv.org/pdf/1812.08434.pdf) Architecture

In [29]:
# refers into this work: https://github.com/rusty1s/pytorch_geometric/blob/master/examples/enzymes_topk_pool.py

In [30]:
from torch.nn import Sequential, Linear, ReLU
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import remove_self_loops, add_self_loops

In [31]:
class SAGEConv(MessagePassing):
    
    def __init__(self, in_channels, out_channels):
        super(SAGEConv, self).__init__(aggr='max')
        self.fc_layer = torch.nn.Linear(in_channels, out_channels)
        self.update_fc_layer = torch.nn.Linear(in_channels + out_channels, in_channels, bias=False)
        self.relu = torch.nn.ReLU()

    def forward(self, x, edge_index):
        
        edge_index, _ = remove_self_loops(edge_index)
        edge_index, _ = add_self_loops(edge_index, num_nodes=x.size(0))
        
        # call message describing how node embeddings are learned and update
        return self.propagate(edge_index, size=(x.size(0), x.size(0)), x=x)
    
    # construct “message” for each of the node pair
    def message(self, x_j):
        
        x_j = self.fc_layer(x_j)
        x_j = self.relu(x_j)
        
        return x_j
    
    # the aggregated message and the current node embedding is aggregated
    # takes in the aggregated message and other arguments passed into propagate, assigning a new embedding value for each node
    def update(self, aggr_out, x):
        
        new_embedding = torch.cat([aggr_out, x], dim=1)
        new_embedding = self.update_fc_layer(new_embedding)
        new_embedding = self.relu(new_embedding)
        
        return new_embedding

In [32]:
from torch_geometric.nn import GraphConv, TopKPooling, GatedGraphConv
from torch_geometric.nn import global_mean_pool as gap, global_max_pool as gmp

In [41]:
class GNN(nn.Module):
    
    def __init__(self):
        super(GNN, self).__init__()
        
        self.conv_layer = SAGEConv(EMBEDDING_DIM, 128)
        self.pool_layer = TopKPooling(128, ratio=0.8)
        self.item_embedding = torch.nn.Embedding(num_embeddings=df_clicks.item_id.max()+1, embedding_dim=EMBEDDING_DIM)
        self.fc1_layer = torch.nn.Linear(256, 128)
        self.fc2_layer = torch.nn.Linear(128, 64)
        self.fc3_layer = torch.nn.Linear(64, 1)
        self.bn1_layer = torch.nn.BatchNorm1d(128)
        self.bn2_layer = torch.nn.BatchNorm1d(64)
        self.relu = torch.nn.ReLU()
        
    def forward(self, data):
        
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = self.item_embedding(x)
        x = x.squeeze(1)
        
        x = F.relu(self.conv_layer(x, edge_index))
        x, edge_index, _, batch, _, _ = self.pool_layer(x, edge_index, None, batch)
        x1 = torch.cat([gmp(x, batch), gap(x, batch)], dim=1)
        
        x = F.relu(self.conv_layer(x, edge_index))
        x, edge_index, _, batch, _, _ = self.pool_layer(x, edge_index, None, batch)
        x2 = torch.cat([gmp(x, batch), gap(x, batch)], dim=1)
        
        x = F.relu(self.conv_layer(x, edge_index))
        x, edge_index, _, batch, _, _ = self.pool_layer(x, edge_index, None, batch)
        x3 = torch.cat([gmp(x, batch), gap(x, batch)], dim=1)
        
        x = x1 + x2 + x3
        
        x = self.relu(self.fc1_layer(x))
        x = self.relu(self.fc2_layer(x))
        x = F.dropout(x, p=0.5, training=self.training)
        x = torch.sigmoid(self.fc3_layer(x)).squeeze(1)
        
        return x

#### Initialize The Network

In [42]:
gnn = GNN().to(device)

## Set Loss Function

In [43]:
bce_loss = torch.nn.BCELoss()

## Set Optimizer

In [44]:
optimizer = torch.optim.Adam(gnn.parameters(), lr=LR)

## Train The Network

In [45]:
from sklearn.metrics import roc_auc_score

In [46]:
for epoch in range(3):
    
    gnn.train()
    
    train_loss = 0
    for data in train_loader:
        
        data = data.to(device)
        label = data.y.to(device)
        
        optimizer.zero_grad()
        result = gnn(data)
        loss = bce_loss(result, label)
        loss.backward()
        optimizer.step()
        
        train_loss += data.num_graphs * loss.item()
    
    gnn.eval()
    
    result_all = []
    label_all = []
    
    with torch.no_grad():
        for data in valid_loader:
            
            data = data.to(device)
            label = data.y
            label = label.detach().cpu().numpy()
            label_all.append(label)
            
            result = gnn(data)
            result = result.detach().cpu().numpy()
            result_all.append(result)
       
    label_all = np.hstack(label_all)
    result_all = np.hstack(result_all)
    
    # the label is highly unbalanced
    # Area Under Curve (AUC) is a better metric for this task as it only cares if the positive examples are scored higher than the negative examples
    valid_roc_auc = roc_auc_score(label_all, result_all)
    print('Epoch: {:03d}, Train Loss: {:.5f}, Valid ROC/AUC: {:.5f}'.format(epoch, train_loss, valid_roc_auc))       

ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

---