# 1. Imports

#### Drive

In [None]:
# Mount Google Drive at /content/drive (Colab only); force_remount=True is passed on every run.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Work from the Drive root so that the relative 'datasets/...' paths used by later cells resolve.
import os
os.chdir('/content/drive/MyDrive')  # change current dir to folder with datasets
!ls                                 # datasets should be listed

datasets  Storage


#### General

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor

In [None]:
import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, f1_score

#### PyG

In [None]:
# Export the running torch version so the shell commands below can expand ${TORCH}
# when building the PyG wheel-index URL.
os.environ['TORCH'] = torch.__version__
print('torch version:', torch.__version__)

# torch-scatter and torch-sparse are looked up via the data.pyg.org wheel index for this
# torch version; torch_geometric is installed from the GitHub repository.
# NOTE(review): no versions are pinned, so a fresh run may install different releases.
!pip3 install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip3 install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip3 install -q git+https://github.com/pyg-team/pytorch_geometric.git

torch version: 2.0.0+cu118
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for torch_geometric (pyproject.toml) ... [?25l[?25hdone


In [None]:
from torch_geometric.nn import GCNConv, GATConv, SAGEConv, to_hetero
from torch_geometric.data import Data, HeteroData
from torch_geometric.utils import negative_sampling
from torch_geometric.loader import LinkNeighborLoader
import torch_geometric.transforms as T

#### Transformers

In [None]:
# Install the transformers package quietly (-q); no version is pinned.
!pip3 install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m59.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from transformers import BertTokenizer, BertModel, AutoModel, AutoTokenizer

#### Device

In [None]:
# Use the first CUDA device when PyTorch reports one is available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print('device:', device)

device: cpu


# 2. Dataframes

In [None]:
def get_users_and_items(users_csv, items_csv, user_naming, item_naming):
    """Read the interaction CSV and the article CSV, print a summary, and validate them.

    Args:
        users_csv: path of the CSV holding user/article interaction rows.
        items_csv: path of the CSV holding one row per article.
        user_naming: name of the user-id column in ``users_csv``.
        item_naming: name of the article-id column present in both files.

    Returns:
        ``(df_users, df_items)`` — the two DataFrames exactly as read by pandas.

    Raises:
        AssertionError: if the two files do not contain the same set of article
            ids, or if ``items_csv`` has repeated article ids.
    """
    # Interactions table: report its size and how many distinct users/articles it has.
    df_users = pd.read_csv(users_csv)
    print('users')
    print('===========')
    print(f'users shape: {df_users.shape}')
    print(f'unique users: {len(df_users[user_naming].unique())}')
    print(f'unique articles: {len(df_users[item_naming].unique())}')
    print(df_users.head())
    print()

    # Article table: same kind of summary.
    df_items = pd.read_csv(items_csv)
    print('items')
    print('===========')
    print(f'items shape: {df_items.shape}')
    print(f'unique articles: {len(df_items[item_naming].unique())}')
    print(df_items.head())
    print()

    # Both files must reference exactly the same article ids.
    articles_in_users = set(df_users[item_naming].unique())
    articles_in_items = set(df_items[item_naming].unique())
    same_article_set = articles_in_users == articles_in_items
    print(f'article set is same in two files: {same_article_set}')
    assert same_article_set

    # Each article may appear only once in the article table.
    assert len(df_items) == df_items[item_naming].nunique()

    return df_users, df_items

Select one of the datasets below and run only its cell.

## Adressa

### Adressa Norwegian nbBert

In [None]:
# Adressa: interaction CSV and article CSV (paths are relative to the current working directory).
fusers = 'datasets/adressa/impressions/common_user_item.csv'
fitems = 'datasets/adressa/content/read_articles.csv'
user_naming, item_naming = 'userId', 'id'  # user-id / article-id column names
df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

# Article feature tensor, loaded onto `device`.
# presumably nbBert embeddings of the Norwegian titles, judging by the file name — confirm
ffeat_items = 'datasets/adressa/content/feat_tensor_no_nbbert.pt'
feature_tensor = torch.load(ffeat_items, map_location=device)
print('feature tensor:', feature_tensor.shape)
# One feature row is expected per distinct article id in the interaction file.
assert feature_tensor.shape[0] == df_users[item_naming].nunique()

users
users shape: (1857389, 2)
unique users: 535259
unique articles: 15721
                                              userId  \
0  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
1  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
2  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
3  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
4  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   

                                         id  
0  01c74ff7ec02862bed5f861cba7e7226dfd31beb  
1  2228c8e09bd615b509f87347675f4f0f1b74439e  
2  265d0e347e08d19ab62ffe4eadd0333eaad6e57c  
3  3011c1cce4999e3c4e02bd887468d1d1d90b7807  
4  44d3e6498035638f2e1c0580332b67e0d71af45c  

items
items shape: (15721, 2)
unique articles: 15721
                                         id  \
0  6a0612e60690288a776834811004ce133f326cee   
1  b40a30877124510cf65683b6c9391d927e20f89d   
2  29b7afc1d16d34b639597cab200c9c4c96e2b69e   
3  e6877439db8ca3bde8be8f6edb741fcc22c2772c   
4  a30fec2f3e9f2e408a2

### Adressa Norwegian mBert

In [None]:
# Adressa: interaction CSV and article CSV (paths are relative to the current working directory).
fusers = 'datasets/adressa/impressions/common_user_item.csv'
fitems = 'datasets/adressa/content/read_articles.csv'
user_naming, item_naming = 'userId', 'id'  # user-id / article-id column names
df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

# Article feature tensor, loaded onto `device`.
# presumably multilingual-BERT embeddings of the Norwegian titles, judging by the file name — confirm
ffeat_items = 'datasets/adressa/content/feat_tensor_no_mbert.pt'
feature_tensor = torch.load(ffeat_items, map_location=device)
print('feature tensor:', feature_tensor.shape)
# One feature row is expected per distinct article id in the interaction file.
assert feature_tensor.shape[0] == df_users[item_naming].nunique()

users
users shape: (1857389, 2)
unique users: 535259
unique articles: 15721
                                              userId  \
0  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
1  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
2  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
3  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
4  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   

                                         id  
0  01c74ff7ec02862bed5f861cba7e7226dfd31beb  
1  2228c8e09bd615b509f87347675f4f0f1b74439e  
2  265d0e347e08d19ab62ffe4eadd0333eaad6e57c  
3  3011c1cce4999e3c4e02bd887468d1d1d90b7807  
4  44d3e6498035638f2e1c0580332b67e0d71af45c  

items
items shape: (15721, 2)
unique articles: 15721
                                         id  \
0  6a0612e60690288a776834811004ce133f326cee   
1  b40a30877124510cf65683b6c9391d927e20f89d   
2  29b7afc1d16d34b639597cab200c9c4c96e2b69e   
3  e6877439db8ca3bde8be8f6edb741fcc22c2772c   
4  a30fec2f3e9f2e408a2

### Adressa Norwegian xlm

In [None]:
# Adressa: interaction CSV and article CSV (paths are relative to the current working directory).
fusers = 'datasets/adressa/impressions/common_user_item.csv'
fitems = 'datasets/adressa/content/read_articles.csv'
user_naming, item_naming = 'userId', 'id'  # user-id / article-id column names
df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

# Article feature tensor, loaded onto `device`.
# presumably XLM embeddings of the Norwegian titles, judging by the file name — confirm
ffeat_items = 'datasets/adressa/content/feat_tensor_no_xlm.pt'
feature_tensor = torch.load(ffeat_items, map_location=device)
print('feature tensor:', feature_tensor.shape)
# One feature row is expected per distinct article id in the interaction file.
assert feature_tensor.shape[0] == df_users[item_naming].nunique()

users
users shape: (1857389, 2)
unique users: 535259
unique articles: 15721
                                              userId  \
0  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
1  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
2  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
3  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
4  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   

                                         id  
0  01c74ff7ec02862bed5f861cba7e7226dfd31beb  
1  2228c8e09bd615b509f87347675f4f0f1b74439e  
2  265d0e347e08d19ab62ffe4eadd0333eaad6e57c  
3  3011c1cce4999e3c4e02bd887468d1d1d90b7807  
4  44d3e6498035638f2e1c0580332b67e0d71af45c  

items
items shape: (15721, 2)
unique articles: 15721
                                         id  \
0  6a0612e60690288a776834811004ce133f326cee   
1  b40a30877124510cf65683b6c9391d927e20f89d   
2  29b7afc1d16d34b639597cab200c9c4c96e2b69e   
3  e6877439db8ca3bde8be8f6edb741fcc22c2772c   
4  a30fec2f3e9f2e408a2

### Adressa Norwegian translated to English enBert

In [None]:
# Adressa (translated to English): interaction CSV and translated article CSV.
fusers = 'datasets/adressa/impressions/common_user_item.csv'
# NOTE(review): 'read_artciles_tr.csv' looks like a misspelling of 'read_articles_tr.csv';
# left as is because the recorded output shows the file loading — confirm the on-disk name.
fitems = 'datasets/adressa/content/read_artciles_tr.csv'
user_naming, item_naming = 'userId', 'id'  # user-id / article-id column names
df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

# Article feature tensor, loaded onto `device`.
# presumably English-BERT embeddings of the translated titles, judging by the file name — confirm
ffeat_items = 'datasets/adressa/content/feat_tensor_tr_enbert.pt'
feature_tensor = torch.load(ffeat_items, map_location=device)
print('feature tensor:', feature_tensor.shape)
# One feature row is expected per distinct article id in the interaction file.
assert feature_tensor.shape[0] == df_users[item_naming].nunique()

users
users shape: (1857389, 2)
unique users: 535259
unique articles: 15721
                                              userId  \
0  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
1  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
2  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
3  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
4  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   

                                         id  
0  01c74ff7ec02862bed5f861cba7e7226dfd31beb  
1  2228c8e09bd615b509f87347675f4f0f1b74439e  
2  265d0e347e08d19ab62ffe4eadd0333eaad6e57c  
3  3011c1cce4999e3c4e02bd887468d1d1d90b7807  
4  44d3e6498035638f2e1c0580332b67e0d71af45c  

items
items shape: (15721, 2)
unique articles: 15721
                                         id  \
0  6a0612e60690288a776834811004ce133f326cee   
1  b40a30877124510cf65683b6c9391d927e20f89d   
2  29b7afc1d16d34b639597cab200c9c4c96e2b69e   
3  e6877439db8ca3bde8be8f6edb741fcc22c2772c   
4  a30fec2f3e9f2e408a2

## CIT

### CIT Portuguese ptBert

In [None]:
# CIT (Portuguese subset): interaction CSV and article CSV.
fusers = 'datasets/cit/common_user_item_pt.csv'
fitems = 'datasets/cit/read_articles_pt.csv'
user_naming, item_naming = 'personId', 'contentId'  # user-id / article-id column names
df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

# Article feature tensor, loaded onto `device`.
# presumably Portuguese-BERT embeddings of the titles, judging by the file name — confirm
ffeat_items = 'datasets/cit/feat_tensor_pt_ptbert.pt'
feature_tensor = torch.load(ffeat_items, map_location=device)
print('feature tensor:', feature_tensor.shape)
# One feature row is expected per distinct article id in the interaction file.
assert feature_tensor.shape[0] == df_users[item_naming].nunique()

users
users shape: (14417, 4)
unique users: 1619
unique articles: 821
             contentId             personId  \
0   310515487419366995 -1130272294246983140   
1   310515487419366995   344280948527967603   
2   310515487419366995 -8763398617720485024   
3 -8864073373672512525  3609194402293569455   
4   310515487419366995  3609194402293569455   

                                                 url lang  
0  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
1  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
2  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
3  http://www.startupsstars.com/2016/06/video-int...   pt  
4  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  

items
items shape: (821, 4)
unique articles: 821
             contentId                                              title  \
0 -9157338616628196758  Situação financeira ruim de varejistas pressio...   
1 -5917314377186856799  Artigos e Palestras - Programa Agricultura de ...   
2  615

### CIT Portuguese mBert

In [None]:
# CIT (Portuguese subset): interaction CSV and article CSV.
fusers = 'datasets/cit/common_user_item_pt.csv'
fitems = 'datasets/cit/read_articles_pt.csv'
user_naming, item_naming = 'personId', 'contentId'  # user-id / article-id column names
df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

# Article feature tensor, loaded onto `device`.
# presumably multilingual-BERT embeddings of the titles, judging by the file name — confirm
ffeat_items = 'datasets/cit/feat_tensor_pt_mbert.pt'
feature_tensor = torch.load(ffeat_items, map_location=device)
print('feature tensor:', feature_tensor.shape)
# One feature row is expected per distinct article id in the interaction file.
assert feature_tensor.shape[0] == df_users[item_naming].nunique()

users
users shape: (14417, 4)
unique users: 1619
unique articles: 821
             contentId             personId  \
0   310515487419366995 -1130272294246983140   
1   310515487419366995   344280948527967603   
2   310515487419366995 -8763398617720485024   
3 -8864073373672512525  3609194402293569455   
4   310515487419366995  3609194402293569455   

                                                 url lang  
0  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
1  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
2  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
3  http://www.startupsstars.com/2016/06/video-int...   pt  
4  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  

items
items shape: (821, 4)
unique articles: 821
             contentId                                              title  \
0 -9157338616628196758  Situação financeira ruim de varejistas pressio...   
1 -5917314377186856799  Artigos e Palestras - Programa Agricultura de ...   
2  615

### CIT Portuguese xlm

In [None]:
# CIT (Portuguese subset): interaction CSV and article CSV.
fusers = 'datasets/cit/common_user_item_pt.csv'
fitems = 'datasets/cit/read_articles_pt.csv'
user_naming, item_naming = 'personId', 'contentId'  # user-id / article-id column names
df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

# Article feature tensor, loaded onto `device`.
# presumably XLM embeddings of the titles, judging by the file name — confirm
ffeat_items = 'datasets/cit/feat_tensor_pt_xlm.pt'
feature_tensor = torch.load(ffeat_items, map_location=device)
print('feature tensor:', feature_tensor.shape)
# One feature row is expected per distinct article id in the interaction file.
assert feature_tensor.shape[0] == df_users[item_naming].nunique()

users
users shape: (14417, 4)
unique users: 1619
unique articles: 821
             contentId             personId  \
0   310515487419366995 -1130272294246983140   
1   310515487419366995   344280948527967603   
2   310515487419366995 -8763398617720485024   
3 -8864073373672512525  3609194402293569455   
4   310515487419366995  3609194402293569455   

                                                 url lang  
0  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
1  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
2  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
3  http://www.startupsstars.com/2016/06/video-int...   pt  
4  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  

items
items shape: (821, 4)
unique articles: 821
             contentId                                              title  \
0 -9157338616628196758  Situação financeira ruim de varejistas pressio...   
1 -5917314377186856799  Artigos e Palestras - Programa Agricultura de ...   
2  615

### CIT Portuguese Translated to English enBert

In [None]:
# CIT (Portuguese translated to English): Portuguese interaction CSV with the translated article CSV.
fusers = 'datasets/cit/common_user_item_pt.csv'
fitems = 'datasets/cit/read_articles_tr.csv'
# NOTE(review): articles are keyed by 'url' here rather than 'contentId'. The recorded output
# shows contentId in this items file as floats (e.g. -9.16e+18), which is presumably why —
# confirm.
user_naming, item_naming = 'personId', 'url'
df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

# Article feature tensor, loaded onto `device`.
# presumably English-BERT embeddings of the translated titles, judging by the file name — confirm
ffeat_items = 'datasets/cit/feat_tensor_tr_enbert.pt'
feature_tensor = torch.load(ffeat_items, map_location=device)
print('feature tensor:', feature_tensor.shape)
# One feature row is expected per distinct article key in the interaction file.
assert feature_tensor.shape[0] == df_users[item_naming].nunique()

users
users shape: (14417, 4)
unique users: 1619
unique articles: 821
             contentId             personId  \
0   310515487419366995 -1130272294246983140   
1   310515487419366995   344280948527967603   
2   310515487419366995 -8763398617720485024   
3 -8864073373672512525  3609194402293569455   
4   310515487419366995  3609194402293569455   

                                                 url lang  
0  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
1  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
2  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
3  http://www.startupsstars.com/2016/06/video-int...   pt  
4  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  

items
items shape: (821, 3)
unique articles: 821
      contentId                                              title  \
0 -9.160000e+18  Bad financial situation of retailers puts pres...   
1 -5.920000e+18  Articles and Lectures - SENAR Precision Agricu...   
2  6.160000e+18            

### CIT English enBert

In [None]:
# CIT (English subset): interaction CSV and article CSV.
fusers = 'datasets/cit/common_user_item_en.csv'
fitems = 'datasets/cit/read_articles_en.csv'
user_naming, item_naming = 'personId', 'contentId'  # user-id / article-id column names
df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

# Article feature tensor, loaded onto `device`.
# presumably English-BERT embeddings of the titles, judging by the file name — confirm
ffeat_items = 'datasets/cit/feat_tensor_en_enbert.pt'
feature_tensor = torch.load(ffeat_items, map_location=device)
print('feature tensor:', feature_tensor.shape)
# One feature row is expected per distinct article id in the interaction file.
assert feature_tensor.shape[0] == df_users[item_naming].nunique()

users
users shape: (26206, 4)
unique users: 1644
unique articles: 2154
             contentId             personId  \
0 -3499919498720038879 -8845298781299428018   
1  8890720798209849691 -1032019229384696495   
2 -7820640624231356730  -445337111692715325   
3 -1492913151930215984  4254153380739593270   
4  3064370296170038610  3609194402293569455   

                                                 url lang  
0             http://techcrunch.com/2016/06/07/hiri/   en  
1  https://www.nngroup.com/articles/top-intranet-...   en  
2  http://www.fastcompany.com/3058876/your-most-p...   en  
3  https://developer.chrome.com/devtools/docs/con...   en  
4  https://www.linkedin.com/pulse/google-amazon-u...   en  

items
items shape: (2154, 4)
unique articles: 2154
             contentId                                              title  \
0 -6451309518266745024  Ethereum, a Virtual Currency, Enables Transact...   
1 -4110354420726924665  Ethereum, a Virtual Currency, Enables Transact...   
2 -

### CIT Mixed Portuguese+English mBert

In [None]:
# CIT (mixed Portuguese + English): interaction CSV and full article CSV.
fusers = 'datasets/cit/common_user_item_pten.csv'
fitems = 'datasets/cit/read_articles.csv'
user_naming, item_naming = 'personId', 'contentId'  # user-id / article-id column names
df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

# Article feature tensor, loaded onto `device`.
# presumably multilingual-BERT embeddings of the titles, judging by the file name — confirm
ffeat_items = 'datasets/cit/feat_tensor_pten_mbert.pt'
feature_tensor = torch.load(ffeat_items, map_location=device)
print('feature tensor:', feature_tensor.shape)
# One feature row is expected per distinct article id in the interaction file.
assert feature_tensor.shape[0] == df_users[item_naming].nunique()

users
users shape: (40623, 4)
unique users: 1894
unique articles: 2975
             contentId             personId  \
0 -3499919498720038879 -8845298781299428018   
1  8890720798209849691 -1032019229384696495   
2 -7820640624231356730  -445337111692715325   
3 -1492913151930215984  4254153380739593270   
4  3064370296170038610  3609194402293569455   

                                                 url lang  
0             http://techcrunch.com/2016/06/07/hiri/   en  
1  https://www.nngroup.com/articles/top-intranet-...   en  
2  http://www.fastcompany.com/3058876/your-most-p...   en  
3  https://developer.chrome.com/devtools/docs/con...   en  
4  https://www.linkedin.com/pulse/google-amazon-u...   en  

items
items shape: (2975, 4)
unique articles: 2975
             contentId                                              title  \
0 -6451309518266745024  Ethereum, a Virtual Currency, Enables Transact...   
1 -4110354420726924665  Ethereum, a Virtual Currency, Enables Transact...   
2 -

### CIT Mixed Portuguese+English xlm

In [None]:
# CIT (mixed Portuguese + English): interaction CSV and full article CSV.
fusers = 'datasets/cit/common_user_item_pten.csv'
fitems = 'datasets/cit/read_articles.csv'
user_naming, item_naming = 'personId', 'contentId'  # user-id / article-id column names
df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

# Article feature tensor, loaded onto `device`.
# presumably XLM embeddings of the titles, judging by the file name — confirm
ffeat_items = 'datasets/cit/feat_tensor_pten_xlm.pt'
feature_tensor = torch.load(ffeat_items, map_location=device)
print('feature tensor:', feature_tensor.shape)
# One feature row is expected per distinct article id in the interaction file.
assert feature_tensor.shape[0] == df_users[item_naming].nunique()

users
users shape: (40623, 4)
unique users: 1894
unique articles: 2975
             contentId             personId  \
0 -3499919498720038879 -8845298781299428018   
1  8890720798209849691 -1032019229384696495   
2 -7820640624231356730  -445337111692715325   
3 -1492913151930215984  4254153380739593270   
4  3064370296170038610  3609194402293569455   

                                                 url lang  
0             http://techcrunch.com/2016/06/07/hiri/   en  
1  https://www.nngroup.com/articles/top-intranet-...   en  
2  http://www.fastcompany.com/3058876/your-most-p...   en  
3  https://developer.chrome.com/devtools/docs/con...   en  
4  https://www.linkedin.com/pulse/google-amazon-u...   en  

items
items shape: (2975, 4)
unique articles: 2975
             contentId                                              title  \
0 -6451309518266745024  Ethereum, a Virtual Currency, Enables Transact...   
1 -4110354420726924665  Ethereum, a Virtual Currency, Enables Transact...   
2 -

### CIT Train on English only and Test on Translated + English enBert

First train on the original English data by running the 'CIT English enBert' cell.

Then test on the following mix of translated and English articles.

In [None]:
# CIT (translated + English mix): interaction CSV and article CSV.
fusers = 'datasets/cit/common_user_item_tren.csv'
fitems = 'datasets/cit/read_articles_tren.csv'
# NOTE(review): articles are keyed by 'url' here rather than 'contentId'. The recorded output
# shows contentId in this items file as floats (e.g. -6.45e+18), which is presumably why —
# confirm.
user_naming, item_naming = 'personId', 'url'
df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

# Article feature tensor, loaded onto `device`.
# presumably English-BERT embeddings of the titles, judging by the file name — confirm
ffeat_items = 'datasets/cit/feat_tensor_tren_enbert.pt'
feature_tensor = torch.load(ffeat_items, map_location=device)
print('feature tensor:', feature_tensor.shape)
# One feature row is expected per distinct article key in the interaction file.
assert feature_tensor.shape[0] == df_users[item_naming].nunique()

users
users shape: (2956, 4)
unique users: 653
unique articles: 2956
             contentId             personId  \
0 -3499919498720038879 -8845298781299428018   
1  8890720798209849691 -1032019229384696495   
2 -7820640624231356730  -445337111692715325   
3 -1492913151930215984  4254153380739593270   
4  3064370296170038610  3609194402293569455   

                                                 url lang  
0             http://techcrunch.com/2016/06/07/hiri/   en  
1  https://www.nngroup.com/articles/top-intranet-...   en  
2  http://www.fastcompany.com/3058876/your-most-p...   en  
3  https://developer.chrome.com/devtools/docs/con...   en  
4  https://www.linkedin.com/pulse/google-amazon-u...   en  

items
items shape: (2956, 4)
unique articles: 2956
      contentId                                              title  \
0 -6.451310e+18  Ethereum, a Virtual Currency, Enables Transact...   
1 -7.292285e+18  Bitcoin Future: When GBPcoin of Branson Wins O...   
2 -6.151852e+18           

### CIT Train with Translated + English and Test on Translated + English enBert

In [None]:
# CIT (translated + English mix): interaction CSV and article CSV, used here for both
# training and testing according to the section title.
fusers = 'datasets/cit/common_user_item_tren.csv'
fitems = 'datasets/cit/read_articles_tren.csv'
# NOTE(review): articles are keyed by 'url' here rather than 'contentId'. The recorded output
# shows contentId in this items file as floats (e.g. -6.45e+18), which is presumably why —
# confirm.
user_naming, item_naming = 'personId', 'url'
df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

# Article feature tensor, loaded onto `device`.
# presumably English-BERT embeddings of the titles, judging by the file name — confirm
ffeat_items = 'datasets/cit/feat_tensor_tren_enbert.pt'
feature_tensor = torch.load(ffeat_items, map_location=device)
print('feature tensor:', feature_tensor.shape)
# One feature row is expected per distinct article key in the interaction file.
assert feature_tensor.shape[0] == df_users[item_naming].nunique()

users
users shape: (2956, 4)
unique users: 653
unique articles: 2956
             contentId             personId  \
0 -3499919498720038879 -8845298781299428018   
1  8890720798209849691 -1032019229384696495   
2 -7820640624231356730  -445337111692715325   
3 -1492913151930215984  4254153380739593270   
4  3064370296170038610  3609194402293569455   

                                                 url lang  
0             http://techcrunch.com/2016/06/07/hiri/   en  
1  https://www.nngroup.com/articles/top-intranet-...   en  
2  http://www.fastcompany.com/3058876/your-most-p...   en  
3  https://developer.chrome.com/devtools/docs/con...   en  
4  https://www.linkedin.com/pulse/google-amazon-u...   en  

items
items shape: (2956, 4)
unique articles: 2956
      contentId                                              title  \
0 -6.451310e+18  Ethereum, a Virtual Currency, Enables Transact...   
1 -7.292285e+18  Bitcoin Future: When GBPcoin of Branson Wins O...   
2 -6.151852e+18           

## MIND

In [None]:
# Index substituted into the three MINDlarge_train file names below.
# presumably selects one split/part of the dataset — confirm
i = 0
fusers = f'datasets/mind/MINDlarge_train/common_user_item_{i}.csv'
fitems = f'datasets/mind/MINDlarge_train/read_article_titles_{i}.csv'
user_naming, item_naming = 'User_ID', 'News_ID'  # user-id / article-id column names
df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

# Article feature tensor for the same index, loaded onto `device`.
ffeat_items = f'datasets/mind/MINDlarge_train/feat_tensor_{i}.pt'
feature_tensor = torch.load(ffeat_items, map_location=device)
print('feature tensor:', feature_tensor.shape)
# One feature row is expected per distinct article id in the interaction file.
assert feature_tensor.shape[0] == df_users[item_naming].nunique()

users
users shape: (4658157, 2)
unique users: 155855
unique articles: 67147
   User_ID  News_ID
0  U321454  N128643
1  U578952  N122359
2  U578952  N110096
3  U578952   N20583
4  U578952  N128736

items
items shape: (67147, 2)
unique articles: 67147
  News_ID                                              title
0  N88753  The Brands Queen Elizabeth, Prince Charles, an...
1  N45436    Walmart Slashes Prices on Last-Generation iPads
2  N23144                      50 Worst Habits For Belly Fat
3  N86255  Dispose of unwanted prescription drugs during ...
4  N93187  The Cost of Trump's Aid Freeze in the Trenches...

article set is same in two files: True
feature tensor: torch.Size([67147, 768])


# Run PLM

In [None]:
# Short model names used in this notebook -> identifier string that load_plm_model
# passes to AutoModel.from_pretrained / AutoTokenizer.from_pretrained.
plm_models = {
    'mbert':  'bert-base-multilingual-cased',
    'enbert': 'bert-base-cased',
    'nbbert': 'NbAiLab/nb-bert-base',
    'ptbert': 'neuralmind/bert-base-portuguese-cased',
    'xlm':    'microsoft/infoxlm-base',
    'gpt2':   'gpt2'
}

In [None]:
def load_plm_model(plm_model_name, device):
    """Load a pretrained language model and its tokenizer by short name.

    Args:
        plm_model_name: key of the module-level ``plm_models`` dict.
        device: device the loaded model is moved to.

    Returns:
        ``(plm_model, plm_tokenizer)``.

    Raises:
        KeyError: if ``plm_model_name`` is not a key of ``plm_models``.
    """
    checkpoint_id = plm_models[plm_model_name]
    print('Loading PLM: ', checkpoint_id)

    # Model first, then tokenizer, both from the same checkpoint identifier.
    model = AutoModel.from_pretrained(checkpoint_id)
    model = model.to(device)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_id)
    return model, tokenizer


def get_plm_embeddings(plm_model, plm_tokenizer, df_items, title_naming, device, max_encoding_length=50, batch_size=1024):
    """Embed every article title of ``df_items`` with a pre-trained language model.

    Titles are tokenized in one call (padded, truncated to
    ``max_encoding_length``) and pushed through the model in chunks of
    ``batch_size`` rows without tracking gradients. Element ``[1]`` of the
    model output is collected for each chunk.

    Args:
        plm_model: callable model; called with the token ids (and the attention
            mask when the tokenizer provides one) and indexed with ``[1]``.
        plm_tokenizer: callable tokenizer returning a mapping with
            ``'input_ids'`` and, optionally, ``'attention_mask'``.
        df_items: DataFrame that holds the titles.
        title_naming: name of the title column in ``df_items``.
        device: device the token tensors are moved to.
        max_encoding_length: maximum number of tokens kept per title.
        batch_size: number of titles per forward pass.

    Returns:
        Tensor with the per-chunk outputs concatenated along dimension 0,
        in the row order of ``df_items``.
    """
    print('Getting embeddings for article titles using PLM')
    print('df_items shape:', df_items.shape)

    encoded_inputs = plm_tokenizer(list(df_items[title_naming]), max_length=max_encoding_length, padding=True, truncation=True, return_tensors="pt")
    input_ids = encoded_inputs['input_ids'].to(device)
    print('input_ids shape:', input_ids.shape)

    # Titles are padded to a common length, so the model has to be told which
    # positions are padding; otherwise pad tokens take part in self-attention.
    attention_mask = None
    if 'attention_mask' in encoded_inputs:
        attention_mask = encoded_inputs['attention_mask'].to(device)

    num_items = len(input_ids)
    num_batches = (num_items + batch_size - 1) // batch_size  # ceil division

    feat_vectors = []
    for step, start in enumerate(range(0, num_items, batch_size), start=1):
        print(f'Progress step: {step} / {num_batches}')
        stop = start + batch_size  # slicing past the end is safe for the last chunk
        model_kwargs = {}
        if attention_mask is not None:
            model_kwargs['attention_mask'] = attention_mask[start:stop]
        with torch.no_grad():
            plm_output = plm_model(input_ids[start:stop, :], **model_kwargs)[1]
        feat_vectors.append(plm_output)

    feat_tensor = torch.cat(feat_vectors, 0)
    return feat_tensor

In [None]:
# Pick the PLM by its alias in `plm_models` and load model + tokenizer onto `device`.
plm_model_name = 'ptbert'
plm_model, plm_tokenizer = load_plm_model(plm_model_name, device)

Loading PLM:  neuralmind/bert-base-portuguese-cased


Downloading (…)lve/main/config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Embed every article title of `df_items` with the PLM loaded above.
# NOTE(review): the result is stored in `feat_tensor`, whereas the graph
# construction cell reads `feature_tensor` — confirm where that name is set.
title_naming = 'title'
feat_tensor = get_plm_embeddings(plm_model, plm_tokenizer, df_items, title_naming, device)
print('Output feat_tensor shape:', feat_tensor.shape)

Getting embeddings for article titles using PLM
df_items shape: (821, 4)
input_ids shape: torch.Size([821, 50])
Progress step: 1 / 1
Output feat_tensor shape: torch.Size([821, 768])


# 3. Graph Construction

We construct a graph from the user-item interactions: users and items are the nodes, and every interaction becomes an edge from a user to an item.

In [None]:
# Graph construction settings.
# user_has_features: give user nodes (randomly initialised) input features; it is
#                    also reused later as `remove_embedding_layer` for the model.
# user_feature_size: width of those user feature vectors.
# item_feature_init: how item node features are initialised.
# item_feature_size: width of item features for the 'random' / 'zero' modes and
#                    the item input size handed to the model.
# random_edges:      replace the real interactions by random ones (sanity check).
user_has_features = True
user_feature_size = 16          # 32
item_feature_init = 'plm'       # 'plm', 'zero', 'random'
item_feature_size = 768         # feature_tensor.size()[1]
random_edges = False

#### Reload

In [None]:
# Give every distinct user a consecutive node index in [0, num_user_nodes),
# in order of first appearance in `df_users`.
raw_user_ids = df_users[user_naming].unique()
unique_user_id = pd.DataFrame({
    'userId': raw_user_ids,
    'mappedID': pd.RangeIndex(len(raw_user_ids)),
})
print(
    "Mapping of user IDs to consecutive values:",
    "==========================================",
    unique_user_id.head(),
    sep="\n",
)
print()

Mapping of user IDs to consecutive values:
                userId  mappedID
0 -1130272294246983140         0
1   344280948527967603         1
2 -8763398617720485024         2
3  3609194402293569455         3
4  1908339160857512799         4



In [None]:
# Create a mapping from item IDs to node indices in [0, num_item_nodes).
# The row order of `df_items` is kept, so node index k refers to row k.
# Every item may appear only once. `Series.is_unique` is used because comparing
# `.values` with `.unique()` element-wise breaks as soon as duplicates make the
# two arrays differ in length.
assert df_items[item_naming].is_unique, f'duplicate values in df_items[{item_naming!r}]'
unique_item_id = df_items[item_naming]
unique_item_id = pd.DataFrame(data={
    'itemId': unique_item_id,
    'mappedID': pd.RangeIndex(len(unique_item_id)),
})
print("Mapping of item IDs to consecutive values:")
print("===========================================")
print(unique_item_id.head())

Mapping of item IDs to consecutive values:
                itemId  mappedID
0 -9157338616628196758         0
1 -5917314377186856799         1
2  6157037646878010131         2
3 -1672166631728511207         3
4  5714314286511882372         4


In [None]:
# Translate the raw user / item IDs of every interaction row into node indices.
# A left merge keeps one output row per interaction, in the original order.
# An ID without a mapping would silently become NaN (and turn the index column
# into floats), so that case is rejected explicitly before building the tensors.
ratings_user_id = pd.merge(df_users[user_naming], unique_user_id,
                            left_on=user_naming, right_on='userId', how='left')
assert ratings_user_id['mappedID'].notna().all(), 'interaction references a user without a mapped node index'
ratings_user_id = torch.from_numpy(ratings_user_id['mappedID'].to_numpy(dtype=np.int64))
ratings_item_id = pd.merge(df_users[item_naming], unique_item_id,
                            left_on=item_naming, right_on='itemId', how='left')
assert ratings_item_id['mappedID'].notna().all(), 'interaction references an item without a mapped node index'
ratings_item_id = torch.from_numpy(ratings_item_id['mappedID'].to_numpy(dtype=np.int64))

Sanity check: replacing the real interactions with random edges results in 0.5 AUC and 0.0 F1 score.

In [None]:
# Random edges: replace the real interactions by uniformly sampled user/item pairs.
if random_edges:
  # `np.random.randint` samples from [low, high), so `high` must be the node
  # count itself; with `count - 1` the last node could never be drawn.
  # NOTE(review): the number of random edges equals the number of users, not the
  # number of real interactions — confirm this is intended.
  ratings_user_id = torch.from_numpy(np.random.randint(low=0, high=len(unique_user_id), size=len(unique_user_id)))
  ratings_item_id = torch.from_numpy(np.random.randint(low=0, high=len(unique_item_id), size=len(unique_user_id)))

In [None]:
# Build `edge_index` in COO format as PyG expects it:
# row 0 holds the source (user) indices, row 1 the destination (item) indices.
edge_endpoints = (ratings_user_id, ratings_item_id)
edge_index_coo_user_to_item = torch.stack(edge_endpoints)

print()
print(
    "Final edge indices pointing from users to items:",
    "=================================================",
    edge_index_coo_user_to_item,
    sep="\n",
)


Final edge indices pointing from users to items:
tensor([[   0,    1,    2,  ...,   46, 1405, 1468],
        [ 260,  260,  260,  ...,  783,  618,  782]])


In [None]:
data = HeteroData()

# Save node indices:
data["user"].node_id = torch.arange(len(unique_user_id))
data["item"].node_id = torch.arange(len(unique_item_id))

# User features are always randomly initialised. They are created for every
# `item_feature_init` mode, because the model reads data["user"].x whenever
# `user_has_features` is set.
if user_has_features:
  data["user"].x = torch.Tensor(np.random.random((len(unique_user_id), user_feature_size)))
  print('user features shape:', data["user"].x.shape)

# Item features: PLM title embeddings, random values, or all zeros.
if item_feature_init == 'plm':
  # One feature row per item node is required.
  assert feature_tensor.shape[0] == len(unique_item_id), 'feature_tensor rows do not match the number of items'
  data["item"].x = feature_tensor
elif item_feature_init == 'random':
  data["item"].x = torch.Tensor(np.random.random((len(unique_item_id), item_feature_size)))
elif item_feature_init == 'zero':
  data["item"].x = torch.zeros(len(unique_item_id), item_feature_size)
else:
  # Raising a plain string is itself a TypeError in Python 3.
  raise ValueError(f'Need item feature init type: plm, random, zero (got {item_feature_init!r})')

print('item features shape:', data["item"].x.shape)
data["user", "rates", "item"].edge_index = edge_index_coo_user_to_item

# We also need to make sure to add the reverse edges from items to users
# in order to let a GNN be able to pass messages in both directions.
# We can leverage the `T.ToUndirected()` transform for this from PyG:
data = T.ToUndirected()(data)
del data['item', 'rev_rates', 'user'].edge_label  # Remove "reverse" label.

print(data)
print("=============================")
print('user num_nodes:', data["user"].num_nodes)
print('user num_features:', data["user"].num_features)
print('item num_nodes:', data["item"].num_nodes)
print('item num_features:', data["item"].num_features)
print('num_edges user->item:', data["user", "rates", "item"].num_edges)
print('num_edges item->user:', data["item", "rev_rates", "user"].num_edges)

user features shape: torch.Size([1619, 16])
item features shape: torch.Size([821, 768])
HeteroData(
  [1muser[0m={
    node_id=[1619],
    x=[1619, 16]
  },
  [1mitem[0m={
    node_id=[821],
    x=[821, 768]
  },
  [1m(user, rates, item)[0m={ edge_index=[2, 14417] },
  [1m(item, rev_rates, user)[0m={ edge_index=[2, 14417] }
)
user num_nodes: 1619
user num_features: 16
item num_nodes: 821
item num_features: 768
num_edges user->item: 14417
num_edges item->user: 14417


# 4. Data Loaders

In [None]:
# Data-loader settings.
# neg_sampling_ratio: negative edges per positive edge (link split and train loader).
# batch_size:         number of seed edges per training mini-batch.
# num_neighbors:      neighbors sampled per hop by the LinkNeighborLoader.
neg_sampling_ratio = 1.0        # hyper: 1, 2
batch_size = 128
num_neighbors = [10, 5]         # hyper: [10, 5], [10, 10], [20, 10], [20, 20]

#### Reload

In [None]:
# Split the edges into training (80%), validation (10%) and test (10%) sets.
# Of the training edges, 70% are kept for message passing and 30% are used
# for supervision (`disjoint_train_ratio`).
# Validation and test get fixed negative edges, `neg_sampling_ratio` per
# positive edge; training negatives are generated on-the-fly by the loader.
# PyG's `RandomLinkSplit()` transform does all of this:
split_options = dict(
    edge_types=("user", "rates", "item"),
    rev_edge_types=("item", "rev_rates", "user"),
    num_val=0.1,
    num_test=0.1,
    disjoint_train_ratio=0.3,
    neg_sampling_ratio=neg_sampling_ratio,
    add_negative_train_samples=False,       # negative samples generated on-the-fly
)
transform = T.RandomLinkSplit(**split_options)


train_data, val_data, test_data = transform(data)
print("Training data:", "==============", train_data, sep="\n")
print()
print("Validation data:", "================", val_data, sep="\n")
print()
print("Test data:", "================", test_data, sep="\n")

Training data:
HeteroData(
  [1muser[0m={
    node_id=[1619],
    x=[1619, 16]
  },
  [1mitem[0m={
    node_id=[821],
    x=[821, 768]
  },
  [1m(user, rates, item)[0m={
    edge_index=[2, 8075],
    edge_label=[3460],
    edge_label_index=[2, 3460]
  },
  [1m(item, rev_rates, user)[0m={ edge_index=[2, 8075] }
)

Validation data:
HeteroData(
  [1muser[0m={
    node_id=[1619],
    x=[1619, 16]
  },
  [1mitem[0m={
    node_id=[821],
    x=[821, 768]
  },
  [1m(user, rates, item)[0m={
    edge_index=[2, 11535],
    edge_label=[2882],
    edge_label_index=[2, 2882]
  },
  [1m(item, rev_rates, user)[0m={ edge_index=[2, 11535] }
)

Test data:
HeteroData(
  [1muser[0m={
    node_id=[1619],
    x=[1619, 16]
  },
  [1mitem[0m={
    node_id=[821],
    x=[821, 768]
  },
  [1m(user, rates, item)[0m={
    edge_index=[2, 12976],
    edge_label=[2882],
    edge_label_index=[2, 2882]
  },
  [1m(item, rev_rates, user)[0m={ edge_index=[2, 12976] }
)


In [None]:
# Summarise the training split: message-passing edges vs. supervision edges.
train_rates = train_data["user", "rates", "item"]
print('Train data:', '=============', sep='\n')
print('present num_edges user->item:', train_rates.num_edges)
print('to be predicted positive num_edges user->item:', train_rates.edge_label_index.shape[1])
print('to be predicted edge classes:', torch.unique(train_rates.edge_label))
print('negative edge classes [0.] would be generated during training on-the-fly')

Train data:
present num_edges user->item: 8075
to be predicted positive num_edges user->item: 3460
to be predicted edge classes: tensor([1.])
negative edge classes [0.] would be generated during training on-the-fly


In [None]:
# How to read the training split (the counts are example values and differ
# from the output printed above — TODO confirm which run they come from):
# edge_index=[2, 1040140],      - edges used to construct the graph (the 70% kept for message passing),
#                                 i.e. edges already present in the graph
# edge_label=[445773],          - labels of the supervision edges (the remaining 30%):
#                                 0 for a negative edge, 1 for a positive edge
# edge_label_index=[2, 445773]  - the supervision edges themselves; they are absent from edge_index

So we see that all the nodes are present in the training, validation and test data.

The same holds for all the item node features.

Only `edge_index` changes, i.e. which users are connected to which items: some of the edges are removed in each split.

We are now ready to create a mini-batch loader that will generate subgraphs that can be used as input into our GNN. While this step is not strictly necessary for small-scale graphs, it is absolutely necessary to apply GNNs on larger graphs that do not fit onto GPU memory otherwise. Here, we make use of the loader.LinkNeighborLoader which samples multiple hops from both ends of a link and creates a subgraph from it. Here, edge_label_index serves as the "seed links" to start sampling from.

In [None]:
# `num_neighbors` caps how many neighbors are sampled per hop
# (first entry = first hop, second entry = second hop).
# During training, negative edges are sampled on-the-fly:
# `neg_sampling_ratio` negatives for every positive seed edge.
# PyG's `loader.LinkNeighborLoader` handles both.
# Seed edges = supervision edges of the training split:
train_seed_store = train_data["user", "rates", "item"]
edge_label_index = train_seed_store.edge_label_index
edge_label = train_seed_store.edge_label

train_loader = LinkNeighborLoader(
    data=train_data,
    edge_label_index=(("user", "rates", "item"), edge_label_index),
    edge_label=edge_label,
    num_neighbors=num_neighbors,
    neg_sampling_ratio=neg_sampling_ratio,
    batch_size=batch_size,
    shuffle=True,
)

# Inspect a sample:
sampled_data = next(iter(train_loader))

print("Sampled Train mini-batch:", "===================", sampled_data, sep="\n")

# Each batch holds `batch_size` positives plus the sampled negatives.
sampled_rates = sampled_data["user", "rates", "item"]
expected_num_seed_edges = (neg_sampling_ratio + 1) * batch_size
assert sampled_rates.edge_label_index.size(1) == expected_num_seed_edges
assert sampled_rates.edge_label.min() == 0
assert sampled_rates.edge_label.max() == 1

Sampled Train mini-batch:
HeteroData(
  [1muser[0m={
    node_id=[949],
    x=[949, 16],
    n_id=[949]
  },
  [1mitem[0m={
    node_id=[682],
    x=[682, 768],
    n_id=[682]
  },
  [1m(user, rates, item)[0m={
    edge_index=[2, 2860],
    edge_label=[256],
    edge_label_index=[2, 256],
    e_id=[2860],
    input_id=[128]
  },
  [1m(item, rev_rates, user)[0m={
    edge_index=[2, 3116],
    e_id=[3116]
  }
)




In [None]:
# Validation seed edges: the fixed positive and negative edges of the split.
val_seed_store = val_data["user", "rates", "item"]
edge_label_index = val_seed_store.edge_label_index
edge_label = val_seed_store.edge_label

# No on-the-fly negative sampling here; the batch size is scaled so that a
# validation batch has as many seed edges as a training batch.
eval_batch_size = int((neg_sampling_ratio + 1) * batch_size)
val_loader = LinkNeighborLoader(
    data=val_data,
    edge_label_index=(("user", "rates", "item"), edge_label_index),
    edge_label=edge_label,
    num_neighbors=num_neighbors,
    batch_size=eval_batch_size,
    shuffle=False,
)

sampled_data = next(iter(val_loader))

print("Sampled Validation mini-batch:", "===================", sampled_data, sep="\n")

sampled_rates = sampled_data["user", "rates", "item"]
assert sampled_rates.edge_label_index.size(1) == (neg_sampling_ratio + 1) * batch_size
assert sampled_rates.edge_label.min() >= 0
assert sampled_rates.edge_label.max() <= 1

Sampled Validation mini-batch:
HeteroData(
  [1muser[0m={
    node_id=[996],
    x=[996, 16],
    n_id=[996]
  },
  [1mitem[0m={
    node_id=[691],
    x=[691, 768],
    n_id=[691]
  },
  [1m(user, rates, item)[0m={
    edge_index=[2, 3532],
    edge_label=[256],
    edge_label_index=[2, 256],
    e_id=[3532],
    input_id=[256]
  },
  [1m(item, rev_rates, user)[0m={
    edge_index=[2, 3873],
    e_id=[3873]
  }
)


In [None]:
# Test seed edges: the fixed positive and negative edges of the split.
test_seed_store = test_data["user", "rates", "item"]
edge_label_index = test_seed_store.edge_label_index
edge_label = test_seed_store.edge_label

# Same setup as the validation loader: no on-the-fly negatives, scaled batch size.
eval_batch_size = int((neg_sampling_ratio + 1) * batch_size)
test_loader = LinkNeighborLoader(
    data=test_data,
    edge_label_index=(("user", "rates", "item"), edge_label_index),
    edge_label=edge_label,
    num_neighbors=num_neighbors,
    batch_size=eval_batch_size,
    shuffle=False,
)

sampled_data = next(iter(test_loader))

print("Sampled Test mini-batch:", "===================", sampled_data, sep="\n")

sampled_rates = sampled_data["user", "rates", "item"]
assert sampled_rates.edge_label_index.size(1) == (neg_sampling_ratio + 1) * batch_size
assert sampled_rates.edge_label.min() >= 0
assert sampled_rates.edge_label.max() <= 1

Sampled Test mini-batch:
HeteroData(
  [1muser[0m={
    node_id=[998],
    x=[998, 16],
    n_id=[998]
  },
  [1mitem[0m={
    node_id=[683],
    x=[683, 768],
    n_id=[683]
  },
  [1m(user, rates, item)[0m={
    edge_index=[2, 3511],
    edge_label=[256],
    edge_label_index=[2, 256],
    e_id=[3511],
    input_id=[256]
  },
  [1m(item, rev_rates, user)[0m={
    edge_index=[2, 3980],
    e_id=[3980]
  }
)


# 5. Model

In [None]:
# Model settings.
# hidden_channels:        width of all hidden node representations.
# GNN_Conv_Layer:         convolution class used for both GNN layers.
# remove_embedding_layer: if True the model projects the user input features
#                         with a linear layer instead of learning embeddings.
hidden_channels = 16                          # hyper: 16, 32, 64
GNN_Conv_Layer = SAGEConv                     # hyper: SAGEConv, GATConv
remove_embedding_layer = user_has_features    # False

#### Reload

We are now ready to create our heterogeneous GNN. The GNN is responsible for learning enriched node representations from the surrounding subgraphs, which can be then used to derive edge-level predictions. For defining our heterogeneous GNN, we make use of nn.SAGEConv (or nn.GATConv) and the nn.to_hetero() function, which transforms a GNN defined on homogeneous graphs to be applied on heterogeneous ones.
In addition, we define a final link-level classifier, which simply takes both node embeddings of the link we are trying to predict, and applies a dot-product on them.
When users do not have any node-level information (`remove_embedding_layer = False`), we choose to learn their features jointly via a torch.nn.Embedding layer. In order to improve the expressiveness of item features, we do the same for item nodes, and simply add their shallow embeddings to the projected pre-computed item features. When users do have input features (`remove_embedding_layer = True`), both user and item features are instead projected by linear layers and no embedding layers are used.

In [None]:
class GNN(torch.nn.Module):
    """Two-layer graph encoder built from the notebook-level ``GNN_Conv_Layer``.

    Both layers map ``hidden_channels`` -> ``hidden_channels``. When the layer
    class is ``GATConv`` it is instantiated with ``add_self_loops=False``;
    any other class is instantiated with its default options.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        conv_options = {'add_self_loops': False} if GNN_Conv_Layer == GATConv else {}
        self.conv1 = GNN_Conv_Layer(hidden_channels, hidden_channels, **conv_options)
        self.conv2 = GNN_Conv_Layer(hidden_channels, hidden_channels, **conv_options)

    def forward(self, x: Tensor, edge_index: Tensor) -> Tensor:
        """Apply conv1, a single ReLU, then conv2 (no activation on the output)."""
        hidden = self.conv1(x, edge_index)
        hidden = hidden.relu()
        return self.conv2(hidden, edge_index)

# Link-level decoder: scores a (user, item) pair by the dot-product of the two
# node embeddings.
class Classifier(torch.nn.Module):
    def forward(self, x_user: Tensor, x_item: Tensor, edge_label_index: Tensor) -> Tensor:
        """Return one score per column of ``edge_label_index``.

        Row 0 of ``edge_label_index`` indexes into ``x_user``, row 1 into
        ``x_item``; the score is the sum over the last dimension of the
        element-wise product of the two selected embeddings.
        """
        user_rows, item_rows = edge_label_index[0], edge_label_index[1]
        pairwise_product = torch.mul(x_user[user_rows], x_item[item_rows])
        return pairwise_product.sum(dim=-1)


class Model(torch.nn.Module):
    """Heterogeneous GNN encoder plus dot-product decoder for user->item links.

    The notebook-level flag ``remove_embedding_layer`` is read once, at
    construction time:

    * True  - user and item input features are projected by linear layers.
    * False - users get a learned embedding; items get a linear projection of
      their features plus a learned embedding.

    Args:
        user_fsize: size of the user input features (used only with the flag on).
        item_fsize: size of the item input features.
        num_user_nodes: number of user nodes (embedding table size).
        num_item_nodes: number of item nodes (embedding table size).
        hidden_channels: width of all hidden representations.
        graph_metadata: metadata handed to ``to_hetero``.
    """

    def __init__(self, user_fsize, item_fsize, num_user_nodes, num_item_nodes, hidden_channels, graph_metadata):
        super().__init__()
        # Freeze the switch so that forward() always matches the layers created
        # here, even if the config cell is re-run with another value later.
        self.use_input_user_features = bool(remove_embedding_layer)

        if self.use_input_user_features:
            self.user_lin = torch.nn.Linear(user_fsize, hidden_channels)
        else:
            # learn embedding matrices for users and items:
            self.user_emb = torch.nn.Embedding(num_user_nodes, hidden_channels)
            self.item_emb = torch.nn.Embedding(num_item_nodes, hidden_channels)

        self.item_lin = torch.nn.Linear(item_fsize, hidden_channels)

        # Instantiate homogeneous GNN:
        self.gnn = GNN(hidden_channels)     # Encoder
        # Convert GNN model into a heterogeneous variant:
        self.gnn = to_hetero(self.gnn, metadata=graph_metadata, aggr='mean')
        self.classifier = Classifier()      # Decoder

    def forward(self, data: HeteroData) -> Tensor:
        """Return one prediction per entry of the ("user", "rates", "item") ``edge_label_index`` of ``data``."""
        if self.use_input_user_features:
            x_dict = {
                "user": self.user_lin(data["user"].x),
                "item": self.item_lin(data["item"].x),
            }
        else:
            x_dict = {
                "user": self.user_emb(data["user"].node_id),
                "item": self.item_lin(data["item"].x) + self.item_emb(data["item"].node_id),
            }

        # `x_dict` holds feature matrices of all node types
        # `edge_index_dict` holds all edge indices of all edge types
        # Encode
        x_dict = self.gnn(x_dict, data.edge_index_dict)
        # Decode
        pred = self.classifier(
            x_dict["user"],
            x_dict["item"],
            data["user", "rates", "item"].edge_label_index,
        )

        return pred

In [None]:
# Node counts and the graph schema are read from `data`, the whole (unsplit) graph.
num_user_nodes = data["user"].num_nodes
num_item_nodes = data["item"].num_nodes
graph_metadata = data.metadata()

model = Model(
    user_feature_size,
    item_feature_size,
    num_user_nodes,
    num_item_nodes,
    hidden_channels,
    graph_metadata,
).to(device)
print(f"Device: '{device}'")
print(model)

# The model outputs raw scores, hence the loss that operates on logits.
criterion = F.binary_cross_entropy_with_logits
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

Device: 'cpu'
Model(
  (user_lin): Linear(in_features=16, out_features=16, bias=True)
  (item_lin): Linear(in_features=768, out_features=16, bias=True)
  (gnn): GraphModule(
    (conv1): ModuleDict(
      (user__rates__item): SAGEConv(16, 16, aggr=mean)
      (item__rev_rates__user): SAGEConv(16, 16, aggr=mean)
    )
    (conv2): ModuleDict(
      (user__rates__item): SAGEConv(16, 16, aggr=mean)
      (item__rev_rates__user): SAGEConv(16, 16, aggr=mean)
    )
  )
  (classifier): Classifier()
)


# 6. Experiments

The question is: is it enough to just translate the news to English and use an English recommender model?
Or can we use a single multilingual model for multiple languages?
Or should we use a separate model for each language?

Hypothesis: the multilingual model is the best.

In [None]:
# Number of passes over the training loader.
epochs = 10      # per-dataset values noted by the author: mind=1, adressa=2, cit=50

#### Reload

In [None]:
# training: one pass over `train_loader` per epoch; the epoch loss is the
# mean per-example loss, weighted by the number of predictions in each batch.
train_loss = []
for epoch in range(1, epochs + 1):
    model.train()
    total_loss, total_examples = 0, 0
    for batch_data in tqdm.tqdm(train_loader):
        batch_data = batch_data.to(device)
        optimizer.zero_grad()

        pred = model(batch_data)  # `forward` pass of the model
        ground_truth = batch_data['user', 'item'].edge_label
        loss = criterion(pred, ground_truth)

        loss.backward()
        optimizer.step()

        num_predictions = pred.numel()
        total_loss += loss.item() * num_predictions
        total_examples += num_predictions

    epoch_loss = total_loss / total_examples
    train_loss.append(epoch_loss)
    print(f"\nEpoch: {epoch:03d}, Loss: {epoch_loss:.4f}")

100%|██████████| 28/28 [00:01<00:00, 17.75it/s]



Epoch: 001, Loss: 0.6875


100%|██████████| 28/28 [00:01<00:00, 17.74it/s]



Epoch: 002, Loss: 0.6642


100%|██████████| 28/28 [00:01<00:00, 16.65it/s]



Epoch: 003, Loss: 0.6514


100%|██████████| 28/28 [00:01<00:00, 16.95it/s]



Epoch: 004, Loss: 0.6413


100%|██████████| 28/28 [00:00<00:00, 28.78it/s]



Epoch: 005, Loss: 0.6245


100%|██████████| 28/28 [00:01<00:00, 20.47it/s]



Epoch: 006, Loss: 0.6087


100%|██████████| 28/28 [00:00<00:00, 57.32it/s]



Epoch: 007, Loss: 0.6102


100%|██████████| 28/28 [00:00<00:00, 57.95it/s]



Epoch: 008, Loss: 0.5969


100%|██████████| 28/28 [00:00<00:00, 60.78it/s]



Epoch: 009, Loss: 0.6060


100%|██████████| 28/28 [00:00<00:00, 56.87it/s]


Epoch: 010, Loss: 0.5995





In [None]:
# validation: collect raw scores and labels over all batches, then compute AUC.
preds, ground_truths = [], []
model.eval()
with torch.no_grad():
    for batch_data in tqdm.tqdm(val_loader):
        batch_data = batch_data.to(device)
        preds.append(model(batch_data))
        ground_truths.append(batch_data['user', 'item'].edge_label)

pred = torch.cat(preds).cpu().numpy()
ground_truth = torch.cat(ground_truths).cpu().numpy()
auc = roc_auc_score(y_true=ground_truth, y_score=pred)
print()
print(f"Validation AUC: {auc:.4f}")
# Echo the hyper-parameters next to the score.
print('Neg sampling ratio:', neg_sampling_ratio)
print('Num neighbors:', num_neighbors)
print('Hidden channels:', hidden_channels)
print('GNN_Conv_Layer:', GNN_Conv_Layer)


100%|██████████| 12/12 [00:00<00:00, 63.14it/s]



Validation AUC: 0.7312
Neg sampling ratio: 1.0
Num neighbors: [10, 5]
Hidden channels: 16
GNN_Conv_Layer: <class 'torch_geometric.nn.conv.sage_conv.SAGEConv'>


In [None]:
# test: collect raw scores and labels over all batches, then compute AUC.
preds, ground_truths = [], []
model.eval()
with torch.no_grad():
    for batch_data in tqdm.tqdm(test_loader):
        batch_data = batch_data.to(device)
        preds.append(model(batch_data))
        ground_truths.append(batch_data['user', 'item'].edge_label)

pred = torch.cat(preds).cpu().numpy()
ground_truth = torch.cat(ground_truths).cpu().numpy()
auc = roc_auc_score(y_true=ground_truth, y_score=pred)
print()
print(f"Test AUC: {auc:.4f}")

100%|██████████| 12/12 [00:00<00:00, 40.80it/s]


Test AUC: 0.7146





# Main.py

The purpose of this part is to run the latest version of `main.py` from git on the free Colab GPU.

To run the following cells, you need to clone the git repo into MyDrive.

In [None]:
# Clone the project repository into the current working directory.
!git clone https://github.com/mbekmyrz/newsrec.git

Cloning into 'newsrec'...
remote: Enumerating objects: 84, done.[K
remote: Counting objects: 100% (84/84), done.[K
remote: Compressing objects: 100% (60/60), done.[K
remote: Total 84 (delta 33), reused 68 (delta 20), pack-reused 0[K
Unpacking objects: 100% (84/84), 7.46 MiB | 2.48 MiB/s, done.


In [None]:
# Copy the `datasets` folder (recursively) into the cloned `newsrec` folder.
!cp -r datasets newsrec/

In [None]:
# Switch into the cloned repo and list its contents.
# NOTE(review): the path is relative, so the result depends on the current
# working directory when this cell runs.
os.chdir('./newsrec')
!ls

In [None]:
# Run main.py with dataset "cit_pt" and PLM "ptbert" for 20 epochs; stdout is
# redirected to a log file. NOTE(review): the flag spelling
# `--use_seperate_test_data` presumably matches main.py — confirm.
!python3 main.py --data "cit_pt" --plm "ptbert" --use_seperate_test_data --epochs 20 > logs_cit_pt_ptbert_seperate.txt