## Importing libraries

In [None]:
!pip install dgl-cu102

Collecting dgl-cu102
  Downloading dgl_cu102-0.6.1-cp37-cp37m-manylinux1_x86_64.whl (36.8 MB)
[K     |████████████████████████████████| 36.8 MB 22 kB/s 
Installing collected packages: dgl-cu102
Successfully installed dgl-cu102-0.6.1


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import random

import dgl
import dgl.nn as dglnn
import dgl.function as fn

import torch
import torch.nn as nn
import torch.nn.functional as F


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


DGL backend not selected or invalid.  Assuming PyTorch for now.
Using backend: pytorch


In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

## Defining paths

In [None]:
# All input files live in one Google Drive folder
root_dir = '/content/drive/MyDrive/Hackathon/Hack2'
train_data_dir = f'{root_dir}/Training Data.csv'
train_target_dir = f'{root_dir}/Training Data Target.csv'
test_data_dir = f'{root_dir}/Test Data.csv'
sample_sub_file = f'{root_dir}/Sample Submission.csv'

## Loading the data into pandas DataFrames and inspecting it

In [None]:
# Read the four CSV files, in the same order as the names on the left
train_df, train_target_df, test_df, sample_sub_df = [
  pd.read_csv(csv_path)
  for csv_path in (train_data_dir, train_target_dir, test_data_dir, sample_sub_file)
]

In [None]:
# Preview the training purchases and compare row count with distinct users
print('Train data')
print(train_df.head(10))

total_train_samples = len(train_df['user_id'])
total_unique_train_samples = len(train_df['user_id'].unique())

# These counts describe the training set; the labels previously said "test"
print(f'Total train samples: {total_train_samples}')
print(f'Total unique train samples: {total_unique_train_samples}')

Train data
   user_id      aov    category
0    37327  29128.0      Phones
1    37327    354.0     Fashion
2    37327    460.0  Home Decor
3    37327    761.0     Fashion
4    41260  16658.0      Phones
5    41260    263.0  Home Decor
6    41260    988.0     Fashion
7    19360  10214.0      Phones
8    19360  16761.0      Phones
9    19360   6382.0      Phones
Total test samples: 257407
Total unqiue test samples: 29972


In [None]:
# Distinct product categories that occur in the training purchases
unique_categories_features = train_df['category'].unique()
print(unique_categories_features)
print('Total unique train categories: {}'.format(len(unique_categories_features)))

['Phones' 'Fashion' 'Home Decor' 'Consumer Durables' 'Back to School'
 'Toys' 'Books' 'Groceries' 'Fitness' 'Board Games' 'Painiting Supplies'
 'Gaming' 'TVs' 'Laptops' 'Beauty Products' 'Pet Supplies' 'Ereaders'
 'Kitchen cleaning Supplies']
Total unique train categories: 18


In [None]:
# visualizing test dataframe
print(test_df.head(10))

total_test_samples = len(test_df['user_id'])
total_unique_test_samples = len(test_df['user_id'].unique())

print(f'Total test samples: {total_test_samples}')
print(f'Total unqiue test samples: {total_unique_test_samples}')

   Unnamed: 0  user_id      aov           category
0           0    43323  19115.0             Phones
1           1    43323  29309.0             Phones
2           2    43323  15293.0             Phones
3           3    43323  23548.0             Phones
4           4    43323    751.0            Fashion
5           5    43323   1017.0            Fashion
6           6    43323  41480.0  Consumer Durables
7           7    43323    419.0     Back to School
8           8    43323    610.0            Fashion
9           9    43323    642.0              Books
Total test samples: 172554
Total unqiue test samples: 19981


In [None]:
# Distinct product categories that occur in the test purchases
test_data_categories_features = test_df['category'].unique()
print(test_data_categories_features)
print('Total unique test data categories: {}'.format(len(test_data_categories_features)))

['Phones' 'Fashion' 'Consumer Durables' 'Back to School' 'Books'
 'Ereaders' 'Home Decor' 'Toys' 'Groceries' 'Board Games' 'Gaming' 'TVs'
 'Beauty Products' 'Kitchen cleaning Supplies' 'Laptops' 'Pet Supplies'
 'Fitness' 'Painiting Supplies']
Total unique test data categories: 18


In [None]:
# Number of distinct user ids in the training data
print('Number of unique samples in train dataset', len(pd.unique(train_df['user_id'])))

Number of unique samples in train dataset 29972


In [None]:
# Per-user purchase counts for the training data:
#   train_user2CatsP[user_id][category] -> number of rows with that (user, category)
# The columns are walked once in row order. The previous `.loc[i][...]` lookup
# built a full row object for every access and relied on a 0..n-1 index.
train_user2CatsP = {}
for user_id, cat in zip(train_df['user_id'], train_df['category']):
  cat_counts = train_user2CatsP.setdefault(user_id, {})
  cat_counts[cat] = cat_counts.get(cat, 0) + 1

In [None]:
# Per-user purchase counts for the test data:
#   test_user2CatsP[user_id][category] -> number of rows with that (user, category)
# Both the user id and the category must come from test_df. The category used
# to be read from train_df, which paired test users with unrelated rows.
test_user2CatsP = {}
for user_id, cat in zip(test_df['user_id'], test_df['category']):
  cat_counts = test_user2CatsP.setdefault(user_id, {})
  cat_counts[cat] = cat_counts.get(cat, 0) + 1

In [None]:
# Number of distinct users in the train and test dictionaries, one per line
print(len(train_user2CatsP), len(test_user2CatsP), sep='\n')

29972
19981


In [None]:
# Show only the first few users; printing the whole dictionary floods the output
print(dict(list(train_user2CatsP.items())[:5]))

{37327: {'Phones': 1, 'Fashion': 2, 'Home Decor': 1}, 41260: {'Phones': 1, 'Home Decor': 1, 'Fashion': 1}, 19360: {'Phones': 4, 'Consumer Durables': 1, 'Back to School': 1, 'Toys': 2, 'Fashion': 2, 'Home Decor': 1, 'Books': 1, 'Groceries': 1}, 39634: {'Home Decor': 3, 'Fitness': 1, 'Board Games': 1, 'Back to School': 1, 'Phones': 1, 'Fashion': 2, 'Toys': 1, 'Consumer Durables': 1, 'Painiting Supplies': 1}, 25977: {'Gaming': 1, 'Home Decor': 1, 'Fashion': 4, 'Consumer Durables': 1, 'Phones': 2, 'Groceries': 1}, 9818: {'Phones': 5, 'TVs': 1, 'Groceries': 1, 'Gaming': 3, 'Fashion': 4, 'Board Games': 1, 'Toys': 1, 'Fitness': 1, 'Back to School': 1}, 30989: {'Fashion': 2, 'Home Decor': 2, 'Toys': 2, 'Phones': 2, 'Back to School': 1, 'Consumer Durables': 1, 'Laptops': 1}, 13086: {'Fashion': 2, 'Phones': 1, 'Back to School': 2, 'Toys': 1, 'Home Decor': 2, 'Groceries': 3, 'Gaming': 1}, 28938: {'Phones': 3, 'Back to School': 2, 'Gaming': 1, 'Beauty Products': 1, 'Fashion': 3, 'Pet Supplies': 1}

In [None]:
# Show only the first few users; printing the whole dictionary floods the output
print(dict(list(test_user2CatsP.items())[:5]))

{43323: {'Phones': 6, 'Fashion': 3, 'Home Decor': 2, 'Consumer Durables': 1, 'Back to School': 1}, 4269: {'Toys': 2, 'Fashion': 2, 'Home Decor': 1, 'Books': 1, 'Groceries': 1}, 21731: {'Home Decor': 4, 'Fitness': 1, 'Board Games': 1, 'Back to School': 1, 'Phones': 1, 'Fashion': 2, 'Toys': 1, 'Consumer Durables': 1, 'Painiting Supplies': 1, 'Gaming': 1}, 14422: {'Fashion': 4, 'Consumer Durables': 1, 'Phones': 5, 'Groceries': 1}, 12862: {'Phones': 3, 'TVs': 1, 'Groceries': 1, 'Gaming': 3, 'Fashion': 4, 'Board Games': 1, 'Toys': 1, 'Fitness': 1, 'Back to School': 1}, 49017: {'Fashion': 2, 'Home Decor': 2, 'Toys': 2, 'Phones': 3, 'Back to School': 1, 'Consumer Durables': 1}, 44259: {'Laptops': 1, 'Fashion': 2, 'Phones': 1, 'Back to School': 1, 'Toys': 1, 'Home Decor': 2}, 19672: {'Groceries': 2, 'Back to School': 2, 'Gaming': 2, 'Phones': 3, 'Beauty Products': 1}, 7322: {'Fashion': 3, 'Back to School': 1, 'Pet Supplies': 1, 'Phones': 3, 'Ereaders': 1, 'Kitchen cleaning Supplies': 1, 'Paini

## DGL Graph Construction

In [None]:
# Cap on how many training users receive a node in the graph.
# NOTE: this rebinds total_train_samples, which earlier held the number of
# rows in train_df.
total_train_samples = 2000

In [None]:
# User ids <-> graph node ids
user2node, node2user = {}, {}

# Training users are numbered first; stop once total_train_samples have a node
idx = 0
for user in train_user2CatsP:
  user2node[user], node2user[idx] = idx, user
  idx += 1
  if idx == total_train_samples:
    break

# Test users continue the numbering right after the training users
for user in test_user2CatsP:
  user2node[user], node2user[idx] = idx, user
  idx += 1


# Category names <-> item node ids, numbered in sorted order of the names
unique_categories_features_sorted = sorted(unique_categories_features)
node2item = dict(enumerate(unique_categories_features_sorted))
item2node = {item: node_id for node_id, item in node2item.items()}
idx = len(node2item)

In [None]:
# Size of the two lookup tables
print('Number of user nodes: {}'.format(len(user2node)))
print('Number of item nodes: {}'.format(len(item2node)))

Number of user nodes: 21981
Number of item nodes: 18


In [None]:
# Test users were numbered directly after the training users, so the first
# test node id equals the number of training users that received a node.
# It is derived from the mapping itself, because total_train_samples only
# matches that number when the cap lies between 1 and the number of
# training users.
first_test_node = len(node2user) - len(test_user2CatsP)
print(f'First test node: {first_test_node}')
print(f'First test node\'s user id: {node2user[first_test_node]}' )


First test node: 2000
First test node's user id: 43323


In [None]:
# Parallel edge lists for the user -> item "purchase" relation
purchase_src = []
purchase_dst = []

n_features = len(item2node)

# The count matrix is filled on the CPU and moved to `device` once at the end;
# writing single elements into a GPU tensor is needlessly slow.
user_features = torch.zeros(len(user2node), n_features)
# Each item is described by a one-hot vector over the categories
item_features = torch.eye(n_features).to(device)


def _add_purchases(user_id, cat_counts):
  """Append one edge per (user, category) pair and store the purchase count as a user feature."""
  node_user_id = user2node[user_id]
  for item, cnt in cat_counts.items():
    node_item_id = item2node[item]
    purchase_src.append(node_user_id)
    purchase_dst.append(node_item_id)
    user_features[node_user_id, node_item_id] = int(cnt)


# Training users: same iteration order and cut-off as when user2node was built,
# so only users that actually have a node are visited.
for n_seen, (user_id, cat_counts) in enumerate(train_user2CatsP.items(), start=1):
  _add_purchases(user_id, cat_counts)
  if n_seen == total_train_samples:
    break

# Edges [0, n_train_pur) belong to training users, the rest to test users
n_train_pur = len(purchase_src)
print(f'Total number of edges in train samples: {len(purchase_src)}')

for user_id, cat_counts in test_user2CatsP.items():
  _add_purchases(user_id, cat_counts)

print(f'Total number of edges in test samples: {len(purchase_src) - n_train_pur}')

user_features = user_features.to(device)

# An explicit int64 dtype keeps these valid node-id tensors even when no edge
# was collected (an empty list would otherwise become a float array).
purchase_src = torch.tensor(purchase_src, dtype=torch.long, device=device)
purchase_dst = torch.tensor(purchase_dst, dtype=torch.long, device=device)

Total number of edges in train samples: 11487
Total number of edges in test samples: 104096


In [None]:
# Construct heterogenous graph
hetero_graph = dgl.heterograph({
    ('user', 'purchase', 'item') : (purchase_src, purchase_dst),
    ('item', 'purchased_by', 'user'): (purchase_dst, purchase_src)
}, ).to(device)

In [None]:
# Totals over all node types and all edge types
print('Total number of nodes: {}'.format(hetero_graph.num_nodes()))
print('Total number of edges: {}'.format(hetero_graph.num_edges()))

Total number of nodes: 21999
Total number of edges: 231166


In [None]:
# One 'purchase' edge per entry of the source list, so its length is the purchase count
n_purchases = purchase_src.shape[0]
print(n_purchases)

115583


In [None]:
# Display the graph summary (node counts, edge counts and metagraph)
hetero_graph

Graph(num_nodes={'item': 18, 'user': 21981},
      num_edges={('item', 'purchased_by', 'user'): 115583, ('user', 'purchase', 'item'): 115583},
      metagraph=[('item', 'user', 'purchased_by'), ('user', 'item', 'purchase')])

In [None]:
# Node features
hetero_graph.nodes['user'].data['features'] = user_features

# item features
hetero_graph.nodes['item'].data['features'] = item_features

# Random ordering of the training-edge ids. Edge ids [0, n_train_pur) come from
# training users, the remaining ids from test users. A seeded generator keeps
# the split identical between runs.
split_rng = torch.Generator().manual_seed(0)
total_train_purchases = torch.randperm(n_train_pur, generator=split_rng).to(device)

# 80/20 train/validation split of the training edges. The size is derived from
# n_train_pur; it used to be a hard-coded int(5000 * 0.8), which did not match
# the actual number of training edges.
train_size = int(n_train_pur * 0.8)

train_purchases = torch.zeros(n_purchases, dtype=torch.bool, device=device)
train_purchases[total_train_purchases[:train_size]] = True

val_purchases = torch.zeros(n_purchases, dtype=torch.bool, device=device)
val_purchases[total_train_purchases[train_size:]] = True

test_purchases = torch.zeros(n_purchases, dtype=torch.bool, device=device)
test_purchases[n_train_pur:] = True

# Train mask
hetero_graph.edges['purchase'].data['train_mask'] = train_purchases

# Val mask
hetero_graph.edges['purchase'].data['val_mask'] = val_purchases

# Test mask
hetero_graph.edges['purchase'].data['test_mask'] = test_purchases

In [None]:
# Sanity check: the sizes of the three masks must add up to the number of purchase edges
x, y, z = (
  hetero_graph.edges['purchase'].data[mask_name].sum()
  for mask_name in ('train_mask', 'val_mask', 'test_mask')
)

assert n_purchases == (x + y + z)

## Utilities

In [None]:
class HeteroDotProductPredictor(nn.Module):
  """Scores the edges of one edge type by the dot product of their endpoint embeddings."""

  def forward(self, graph, h, etype):
    """Return the 'score' edge data of `etype` computed from the node embeddings `h`.

    `h` is assigned to `graph.ndata['h']` inside a local scope, so the
    temporary 'h' and 'score' fields are not left on the caller's graph.
    """
    dot_endpoints = fn.u_dot_v('h', 'h', 'score')
    with graph.local_scope():
      graph.ndata['h'] = h
      graph.apply_edges(dot_endpoints, etype=etype)
      edge_scores = graph.edges[etype].data['score']
      return edge_scores

In [None]:
def construct_negative_graph(graph, k, mask, etype):
  """Build a graph of randomly sampled negative edges for the masked edges of `etype`.

  Every selected positive edge keeps its source node and is paired with `k`
  destination nodes drawn uniformly at random from all nodes of the
  destination type.

  Args:
    graph: heterogeneous graph that holds the positive edges.
    k: number of negative destinations sampled per selected positive edge.
    mask: index or boolean mask over the edges of `etype` choosing the positives.
    etype: canonical edge type as a (source type, relation, destination type) triple.

  Returns:
    A heterograph with the same number of nodes per node type as `graph` and
    `k` negative edges per selected positive edge, grouped by positive edge.
  """
  _, _, vtype = etype
  src, _ = graph.edges(etype=etype)
  pos_src = src[mask]
  neg_src = pos_src.repeat_interleave(k)
  # Sample directly on the device of the graph's edges rather than on the CPU
  # followed by a copy to a notebook-global device.
  neg_dst = torch.randint(
    0, graph.num_nodes(vtype), (len(pos_src) * k,), device=pos_src.device)
  return dgl.heterograph(
    {etype: (neg_src, neg_dst)},
    num_nodes_dict={ntype: graph.num_nodes(ntype) for ntype in graph.ntypes})

## GNN Architecture

In [None]:
class RGCN(nn.Module):
  """Two heterogeneous graph-convolution layers with a ReLU between them."""

  def __init__(self, in_feats, hid_feats, out_feats, edge_types):
    """Create one GraphConv per edge type for each of the two layers."""
    super().__init__()

    def hetero_layer(n_in, n_out):
      # One GraphConv per edge type; results per node type are combined by summation
      per_etype = {}
      for edge_type in edge_types:
        per_etype[edge_type] = dglnn.GraphConv(n_in, n_out)
      return dglnn.HeteroGraphConv(per_etype, aggregate='sum')

    self.conv1 = hetero_layer(in_feats, hid_feats)
    self.conv2 = hetero_layer(hid_feats, out_feats)

  def forward(self, graph, inputs):
    """Return the second-layer output, a dict keyed by node type.

    `inputs` maps each node type ('user', 'item') to its feature matrix of
    shape (number of nodes of that type, feature_size).
    """
    hidden = self.conv1(graph, inputs)
    activated = {}
    for ntype, feats in hidden.items():
      activated[ntype] = F.relu(feats)
    return self.conv2(graph, activated)

In [None]:
class Model(nn.Module):
  """RGCN encoder followed by a dot-product edge scorer."""

  def __init__(self, in_features, hidden_features, out_features, edge_types):
    super().__init__()
    # The attribute is called `sage`, but it holds the RGCN encoder
    self.sage = RGCN(in_features, hidden_features, out_features, edge_types)
    self.pred = HeteroDotProductPredictor()

  def forward(self, g, neg_g, x, etype):
    """Encode the nodes of `g`, then score the `etype` edges of `g` and `neg_g`.

    Returns a triple: (node embeddings, scores of the edges in `g`,
    scores of the edges in `neg_g`).
    """
    embeddings = self.sage(g, x)
    pos_score = self.pred(g, embeddings, etype)
    neg_score = self.pred(neg_g, embeddings, etype)
    return embeddings, pos_score, neg_score

## Loss function

In [None]:
# Max Margin Loss
def compute_loss(pos_score, neg_score):
  """Hinge loss with margin 1 between each positive edge and its own negatives.

  Args:
    pos_score: scores of the positive edges, shape (n_edges,) or (n_edges, 1).
    neg_score: scores of the negative edges, k per positive edge, stored
      consecutively per positive edge (n_edges * k values in total).

  Returns:
    Scalar tensor: mean over all (positive, negative) pairs of
    max(0, 1 - pos + neg).
  """
  n_edges = pos_score.shape[0]
  # Force the positives into a single column. The previous unsqueeze(1) turned
  # (n_edges, 1) scores into (n_edges, 1, 1), which broadcast against the
  # (n_edges, k) negatives to (n_edges, n_edges, k) and compared every
  # positive with the negatives of every other edge.
  pos = pos_score.reshape(n_edges, 1)
  neg = neg_score.reshape(n_edges, -1)
  return (1 - pos + neg).clamp(min=0).mean()

## Training

In [None]:
k = 2                  # negative edges sampled per positive edge
hidden_features = 10   # width of the hidden RGCN layer
out_features = 18      # width of the output embeddings

hetero_graph = hetero_graph.to(device)

user_feats = hetero_graph.nodes['user'].data['features']
item_feats = hetero_graph.nodes['item'].data['features']
node_features = {'user': user_feats, 'item': item_feats}

# The input width is read from the feature matrices instead of being
# hard-coded to 18, so it follows the number of categories automatically.
# Both node types go through convolutions with the same input size, so their
# widths must agree.
in_features = user_feats.shape[1]
assert item_feats.shape[1] == in_features, 'user and item features must have the same width'

model = Model(in_features, hidden_features, out_features, hetero_graph.etypes).to(device)

opt = torch.optim.Adam(model.parameters())


def train():
  """Train `model` for `epochs` full-graph steps and return the best embeddings.

  Reads the notebook-level names `epochs`, `model`, `opt`, `hetero_graph`,
  `node_features` and `k`. Each epoch takes one optimisation step on the
  training edges and then evaluates the validation edges, with fresh negative
  samples for both.

  Returns:
    The node-embedding dict (keyed by node type) from the validation pass of
    the epoch with the lowest validation loss, or None if no epoch improved on
    the initial value of infinity.
  """
  etype = ('user', 'purchase', 'item')
  train_mask = hetero_graph.edges['purchase'].data['train_mask']
  val_mask = hetero_graph.edges['purchase'].data['val_mask']

  best_val_loss = np.inf
  best_h = None
  for epoch in range(epochs):
    # train
    model.train()
    negative_graph = construct_negative_graph(hetero_graph, k, train_mask, etype)
    _, pos_score, neg_score = model(hetero_graph, negative_graph, node_features, etype)

    train_loss = compute_loss(pos_score[train_mask], neg_score)
    opt.zero_grad()
    train_loss.backward()
    opt.step()

    # validate: no gradients are needed, and the embeddings computed here
    # belong to the weights that produced the validation loss
    model.eval()
    with torch.no_grad():
      negative_graph = construct_negative_graph(hetero_graph, k, val_mask, etype)
      val_h, pos_score, neg_score = model(hetero_graph, negative_graph, node_features, etype)
      val_loss = compute_loss(pos_score[val_mask], neg_score).item()

    print(f'Epoch: [{epoch+1}/{epochs}], Train Loss: {train_loss.item()}, Val Loss: {val_loss}')

    if (val_loss < best_val_loss):
      print('=>Validation loss decreased. Saving model...')
      best_val_loss = val_loss
      # Keep the post-step validation embeddings. The pre-step training-pass
      # embeddings stored before did not match the selected validation loss
      # and kept the autograd graph alive.
      best_h = val_h
  return best_h

In [None]:
# train() reads `epochs` from the notebook namespace, so set it before the call
epochs = 10
# h: the embedding dict returned by train(), indexed below as h['user'] / h['item']
h = train()

Epoch: [1/10], Train Loss: 0.9797840714454651, Val Loss: 0.9564928412437439
=>Validation loss decreased. Saving model...
Epoch: [2/10], Train Loss: 0.9572234153747559, Val Loss: 0.932738184928894
=>Validation loss decreased. Saving model...
Epoch: [3/10], Train Loss: 0.9334548711776733, Val Loss: 0.9058462381362915
=>Validation loss decreased. Saving model...
Epoch: [4/10], Train Loss: 0.9099428653717041, Val Loss: 0.8787946105003357
=>Validation loss decreased. Saving model...
Epoch: [5/10], Train Loss: 0.8786882758140564, Val Loss: 0.8461350202560425
=>Validation loss decreased. Saving model...
Epoch: [6/10], Train Loss: 0.8487399816513062, Val Loss: 0.8120384216308594
=>Validation loss decreased. Saving model...
Epoch: [7/10], Train Loss: 0.8149893283843994, Val Loss: 0.7735045552253723
=>Validation loss decreased. Saving model...
Epoch: [8/10], Train Loss: 0.7775235176086426, Val Loss: 0.7391965389251709
=>Validation loss decreased. Saving model...
Epoch: [9/10], Train Loss: 0.7402

## Inference

In [None]:
# Test users occupy the user node ids st .. en (inclusive)
st = first_test_node
# Ask the graph for the number of user nodes instead of subtracting a
# hard-coded count of 18 item nodes from the total.
en = hetero_graph.num_nodes('user') - 1

# Cosine similarity between every test user and every item in one batched
# computation, replacing a Python double loop with a scalar transfer per pair.
with torch.no_grad():
  user_h = h['user'][st:en+1]
  item_h = h['item']

  num = user_h @ item_h.T
  den = torch.linalg.norm(user_h, dim=1, keepdim=True) * torch.linalg.norm(item_h, dim=1)
  # Same guard as before: never divide by less than 1e-8
  cos_sim = num / den.clamp(min=1e-8)

  # Item node ids of the three most similar items for each test user
  top3 = cos_sim.topk(3, dim=1).indices.tolist()

u_id = [node2user[i] for i in range(st, en+1)]
# Comma-separated category names, most similar first
items = [",".join(node2item[j] for j in row) for row in top3]

In [None]:
# Submission table: one row per test user with its comma-separated predictions
sub_df = pd.DataFrame({'user_id': u_id, 'pred3': items}, columns=['user_id', 'pred3'])

In [None]:
# Always write the submission file to the working directory
sub_df.to_csv('sub.csv', index=False)

# google.colab is only importable inside Colab. Elsewhere the CSV is still
# written above and the browser download is skipped.
try:
  from google.colab import files
except ImportError:
  files = None

if files is not None:
  files.download('sub.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>