# 1. Imports

#### General

In [7]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor

In [4]:
import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, f1_score

#### PyG

In [None]:
# Export the installed torch version as an environment variable so the shell
# commands below can expand ${TORCH} when building the PyG wheel-index URL.
os.environ['TORCH'] = torch.__version__
print('torch version:', torch.__version__)

# Install the PyG companion packages from the wheel index matching this torch
# build, then PyG itself straight from its GitHub repository (no tag or commit
# is specified).
# NOTE(review): none of these installs is version-pinned, so the resolved
# versions can change between runs.
!pip3 install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip3 install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip3 install -q git+https://github.com/pyg-team/pytorch_geometric.git

In [9]:
from torch_geometric.nn import GCNConv, GATConv, SAGEConv, to_hetero
from torch_geometric.data import Data, HeteroData
from torch_geometric.utils import negative_sampling
from torch_geometric.loader import LinkNeighborLoader
import torch_geometric.transforms as T

#### Transformers

In [5]:
# Install Hugging Face Transformers, which provides the tokenizer/model classes
# imported in the next cell.
# NOTE(review): unpinned install; the resolved version can change between runs.
!pip3 install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m67.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m90.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [11]:
from transformers import BertTokenizer, BertModel, AutoModel, AutoTokenizer

### Device

In [12]:
# Use the first CUDA GPU when one is visible to torch; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print('device:', device)

device: cpu


### Drive

In [8]:
# Mount Google Drive inside the Colab runtime; the dataset paths used in the
# later cells all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 2. Dataframes

In [14]:
def get_users_and_items(users_csv, items_csv, user_naming, item_naming):
    """Read the interaction and item CSVs, print a summary, and sanity-check them.

    Args:
        users_csv: Path of the CSV with the user/item interaction rows.
        items_csv: Path of the CSV with the item rows.
        user_naming: Name of the user-id column in ``users_csv``.
        item_naming: Name of the item-id column, present in both files.

    Returns:
        Tuple ``(df_users, df_items)`` with the two frames exactly as returned
        by ``pd.read_csv``.

    Raises:
        AssertionError: If the two files do not contain the same set of item
            ids, or if the number of rows in ``items_csv`` differs from its
            number of distinct item ids.
    """
    interactions = pd.read_csv(users_csv)
    print('users\n===========')
    print('users shape:', interactions.shape)
    print('unique users:', len(interactions[user_naming].unique()))
    interacted_ids = interactions[item_naming].unique()
    print('unique articles:', len(interacted_ids))
    print(interactions.head())
    print()

    catalog = pd.read_csv(items_csv)
    print('items\n===========')
    print('items shape:', catalog.shape)
    catalog_ids = catalog[item_naming].unique()
    print('unique articles:', len(catalog_ids))
    print(catalog.head())
    print()

    # Both files must describe exactly the same articles.
    same_articles = set(interacted_ids) == set(catalog_ids)
    print('article set is same in two files:', same_articles)
    assert same_articles
    # The item file must not list any article id more than once.
    assert catalog.shape[0] == catalog[item_naming].nunique()

    return interactions, catalog

## Adressa

### Adressa Original: NB-Bert

In [None]:
# Adressa, original-language articles (NB-Bert features per the section heading).
# Like every dataset cell in this section, this overwrites the shared globals
# df_users / df_items / feature_tensor, so run only the configuration you need.
fusers = '/content/drive/Shareddrives/NewsRec/data/adressa/impressions/common_user_item.csv'
fitems = '/content/drive/Shareddrives/NewsRec/data/adressa/content/read_articles.csv'
user_naming, item_naming = 'userId', 'id'
df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

# Load the precomputed article feature tensor onto `device`; its first dimension
# must equal the number of distinct articles in the interaction file.
# presumably row k of the tensor corresponds to row k of df_items; TODO confirm
ffeat_items = '/content/drive/Shareddrives/NewsRec/data/adressa/content/feat_tensor_original.pt'
feature_tensor = torch.load(ffeat_items, map_location=device)
print('feature tensor:', feature_tensor.shape)
assert feature_tensor.shape[0] == df_users[item_naming].nunique()

users
users shape: (1857389, 2)
unique users: 535259
unique articles: 15721
                                              userId  \
0  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
1  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
2  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
3  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
4  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   

                                         id  
0  01c74ff7ec02862bed5f861cba7e7226dfd31beb  
1  2228c8e09bd615b509f87347675f4f0f1b74439e  
2  265d0e347e08d19ab62ffe4eadd0333eaad6e57c  
3  3011c1cce4999e3c4e02bd887468d1d1d90b7807  
4  44d3e6498035638f2e1c0580332b67e0d71af45c  

items
items shape: (15721, 2)
unique articles: 15721
                                         id  \
0  6a0612e60690288a776834811004ce133f326cee   
1  b40a30877124510cf65683b6c9391d927e20f89d   
2  29b7afc1d16d34b639597cab200c9c4c96e2b69e   
3  e6877439db8ca3bde8be8f6edb741fcc22c2772c   
4  a30fec2f3e9f2e408a2

### Adressa Multiling: mBert

In [None]:
# Adressa with multilingual features (mBert per the section heading): same CSVs
# as the other Adressa cells, different feature tensor file.
# Overwrites the shared globals df_users / df_items / feature_tensor.
fusers = '/content/drive/Shareddrives/NewsRec/data/adressa/impressions/common_user_item.csv'
fitems = '/content/drive/Shareddrives/NewsRec/data/adressa/content/read_articles.csv'
user_naming, item_naming = 'userId', 'id'
df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

# Load the precomputed article feature tensor onto `device`; its first dimension
# must equal the number of distinct articles in the interaction file.
ffeat_items = '/content/drive/Shareddrives/NewsRec/data/adressa/content/feat_tensor_multiling.pt'
feature_tensor = torch.load(ffeat_items, map_location=device)
print('feature tensor:', feature_tensor.shape)
assert feature_tensor.shape[0] == df_users[item_naming].nunique()

users
users shape: (1857389, 2)
unique users: 535259
unique articles: 15721
                                              userId  \
0  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
1  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
2  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
3  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
4  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   

                                         id  
0  01c74ff7ec02862bed5f861cba7e7226dfd31beb  
1  2228c8e09bd615b509f87347675f4f0f1b74439e  
2  265d0e347e08d19ab62ffe4eadd0333eaad6e57c  
3  3011c1cce4999e3c4e02bd887468d1d1d90b7807  
4  44d3e6498035638f2e1c0580332b67e0d71af45c  

items
items shape: (15721, 2)
unique articles: 15721
                                         id  \
0  6a0612e60690288a776834811004ce133f326cee   
1  b40a30877124510cf65683b6c9391d927e20f89d   
2  29b7afc1d16d34b639597cab200c9c4c96e2b69e   
3  e6877439db8ca3bde8be8f6edb741fcc22c2772c   
4  a30fec2f3e9f2e408a2

### Adressa Multiling: XLM

In [None]:
# Adressa with XLM features (per the section heading): same CSVs as the other
# Adressa cells, different feature tensor file.
# Overwrites the shared globals df_users / df_items / feature_tensor.
fusers = '/content/drive/Shareddrives/NewsRec/data/adressa/impressions/common_user_item.csv'
fitems = '/content/drive/Shareddrives/NewsRec/data/adressa/content/read_articles.csv'
user_naming, item_naming = 'userId', 'id'
df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

# Load the precomputed article feature tensor onto `device`; its first dimension
# must equal the number of distinct articles in the interaction file.
ffeat_items = '/content/drive/Shareddrives/NewsRec/data/adressa/content/feat_tensor_xlm.pt'
feature_tensor = torch.load(ffeat_items, map_location=device)
print('feature tensor:', feature_tensor.shape)
assert feature_tensor.shape[0] == df_users[item_naming].nunique()

users
users shape: (1857389, 2)
unique users: 535259
unique articles: 15721
                                              userId  \
0  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
1  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
2  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
3  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
4  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   

                                         id  
0  01c74ff7ec02862bed5f861cba7e7226dfd31beb  
1  2228c8e09bd615b509f87347675f4f0f1b74439e  
2  265d0e347e08d19ab62ffe4eadd0333eaad6e57c  
3  3011c1cce4999e3c4e02bd887468d1d1d90b7807  
4  44d3e6498035638f2e1c0580332b67e0d71af45c  

items
items shape: (15721, 2)
unique articles: 15721
                                         id  \
0  6a0612e60690288a776834811004ce133f326cee   
1  b40a30877124510cf65683b6c9391d927e20f89d   
2  29b7afc1d16d34b639597cab200c9c4c96e2b69e   
3  e6877439db8ca3bde8be8f6edb741fcc22c2772c   
4  a30fec2f3e9f2e408a2

### Adressa Translated to English: Bert

In [None]:
# Adressa with articles translated to English (Bert features per the section
# heading). Overwrites the shared globals df_users / df_items / feature_tensor.
# NOTE(review): the item file name is spelled 'read_artciles_translated.csv';
# presumably that matches the file on Drive, so verify before "fixing" it.
fusers = '/content/drive/Shareddrives/NewsRec/data/adressa/impressions/common_user_item.csv'
fitems = '/content/drive/Shareddrives/NewsRec/data/adressa/content/read_artciles_translated.csv'
user_naming, item_naming = 'userId', 'id'
df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

# Load the precomputed article feature tensor onto `device`; its first dimension
# must equal the number of distinct articles in the interaction file.
ffeat_items = '/content/drive/Shareddrives/NewsRec/data/adressa/content/feat_tensor_translated.pt'
feature_tensor = torch.load(ffeat_items, map_location=device)
print('feature tensor:', feature_tensor.shape)
assert feature_tensor.shape[0] == df_users[item_naming].nunique()

users
users shape: (1857389, 2)
unique users: 535259
unique articles: 15721
                                              userId  \
0  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
1  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
2  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
3  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
4  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   

                                         id  
0  01c74ff7ec02862bed5f861cba7e7226dfd31beb  
1  2228c8e09bd615b509f87347675f4f0f1b74439e  
2  265d0e347e08d19ab62ffe4eadd0333eaad6e57c  
3  3011c1cce4999e3c4e02bd887468d1d1d90b7807  
4  44d3e6498035638f2e1c0580332b67e0d71af45c  

items
items shape: (15721, 2)
unique articles: 15721
                                         id  \
0  6a0612e60690288a776834811004ce133f326cee   
1  b40a30877124510cf65683b6c9391d927e20f89d   
2  29b7afc1d16d34b639597cab200c9c4c96e2b69e   
3  e6877439db8ca3bde8be8f6edb741fcc22c2772c   
4  a30fec2f3e9f2e408a2

## CIT

### CIT Original PT: PT-Bert

In [None]:
# CIT, Portuguese subset in the original language (PT-Bert features per the
# section heading). Overwrites the shared globals df_users / df_items /
# feature_tensor, so run only the configuration you need.
fusers = '/content/drive/Shareddrives/NewsRec/data/cit/common_user_item_PT.csv'
fitems = '/content/drive/Shareddrives/NewsRec/data/cit/read_articles_PT.csv'
user_naming, item_naming = 'personId', 'contentId'
df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

# Load the precomputed article feature tensor onto `device`; its first dimension
# must equal the number of distinct articles in the interaction file.
ffeat_items = '/content/drive/Shareddrives/NewsRec/data/cit/feat_tensor_PT_original.pt'
feature_tensor = torch.load(ffeat_items, map_location=device)
print('feature tensor:', feature_tensor.shape)
assert feature_tensor.shape[0] == df_users[item_naming].nunique()

users
users shape: (14417, 4)
unique users: 1619
unique articles: 821
             contentId             personId  \
0   310515487419366995 -1130272294246983140   
1   310515487419366995   344280948527967603   
2   310515487419366995 -8763398617720485024   
3 -8864073373672512525  3609194402293569455   
4   310515487419366995  3609194402293569455   

                                                 url lang  
0  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
1  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
2  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
3  http://www.startupsstars.com/2016/06/video-int...   pt  
4  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  

items
items shape: (821, 4)
unique articles: 821
             contentId                                              title  \
0 -9157338616628196758  Situação financeira ruim de varejistas pressio...   
1 -5917314377186856799  Artigos e Palestras - Programa Agricultura de ...   
2  615

### CIT Multiling: mBert

In [None]:
# CIT, Portuguese subset with multilingual features (mBert per the section
# heading): same CSVs as the PT-original cell, different feature tensor file.
# Overwrites the shared globals df_users / df_items / feature_tensor.
fusers = '/content/drive/Shareddrives/NewsRec/data/cit/common_user_item_PT.csv'
fitems = '/content/drive/Shareddrives/NewsRec/data/cit/read_articles_PT.csv'
user_naming, item_naming = 'personId', 'contentId'
df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

# Load the precomputed article feature tensor onto `device`; its first dimension
# must equal the number of distinct articles in the interaction file.
ffeat_items = '/content/drive/Shareddrives/NewsRec/data/cit/feat_tensor_PT_multiling.pt'
feature_tensor = torch.load(ffeat_items, map_location=device)
print('feature tensor:', feature_tensor.shape)
assert feature_tensor.shape[0] == df_users[item_naming].nunique()

users
users shape: (14417, 4)
unique users: 1619
unique articles: 821
             contentId             personId  \
0   310515487419366995 -1130272294246983140   
1   310515487419366995   344280948527967603   
2   310515487419366995 -8763398617720485024   
3 -8864073373672512525  3609194402293569455   
4   310515487419366995  3609194402293569455   

                                                 url lang  
0  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
1  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
2  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
3  http://www.startupsstars.com/2016/06/video-int...   pt  
4  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  

items
items shape: (821, 4)
unique articles: 821
             contentId                                              title  \
0 -9157338616628196758  Situação financeira ruim de varejistas pressio...   
1 -5917314377186856799  Artigos e Palestras - Programa Agricultura de ...   
2  615

### CIT Multiling: XLM

In [None]:
# CIT, Portuguese subset with XLM features (per the section heading): same CSVs
# as the PT-original cell, different feature tensor file.
# Overwrites the shared globals df_users / df_items / feature_tensor.
fusers = '/content/drive/Shareddrives/NewsRec/data/cit/common_user_item_PT.csv'
fitems = '/content/drive/Shareddrives/NewsRec/data/cit/read_articles_PT.csv'
user_naming, item_naming = 'personId', 'contentId'
df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

# Load the precomputed article feature tensor onto `device`; its first dimension
# must equal the number of distinct articles in the interaction file.
ffeat_items = '/content/drive/Shareddrives/NewsRec/data/cit/feat_tensor_PT_xlm.pt'
feature_tensor = torch.load(ffeat_items, map_location=device)
print('feature tensor:', feature_tensor.shape)
assert feature_tensor.shape[0] == df_users[item_naming].nunique()

users
users shape: (14417, 4)
unique users: 1619
unique articles: 821
             contentId             personId  \
0   310515487419366995 -1130272294246983140   
1   310515487419366995   344280948527967603   
2   310515487419366995 -8763398617720485024   
3 -8864073373672512525  3609194402293569455   
4   310515487419366995  3609194402293569455   

                                                 url lang  
0  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
1  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
2  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
3  http://www.startupsstars.com/2016/06/video-int...   pt  
4  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  

items
items shape: (821, 4)
unique articles: 821
             contentId                                              title  \
0 -9157338616628196758  Situação financeira ruim de varejistas pressio...   
1 -5917314377186856799  Artigos e Palestras - Programa Agricultura de ...   
2  615

### CIT Translated to English: Bert

In [None]:
# CIT, Portuguese subset translated to English (Bert features per the section
# heading). Overwrites the shared globals df_users / df_items / feature_tensor.
# NOTE(review): unlike the other CIT cells, items are keyed by 'url' here. The
# recorded output shows contentId in the translated file as rounded floats
# (e.g. -9.160000e+18), which is presumably why contentId cannot be used as the
# key; confirm.
# NOTE(review): the item file name is spelled 'read_artciles_PT_translated.csv';
# presumably that matches the file on Drive, so verify before "fixing" it.
fusers = '/content/drive/Shareddrives/NewsRec/data/cit/common_user_item_PT.csv'
fitems = '/content/drive/Shareddrives/NewsRec/data/cit/read_artciles_PT_translated.csv'
user_naming, item_naming = 'personId', 'url'
df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

# Load the precomputed article feature tensor onto `device`; its first dimension
# must equal the number of distinct article URLs in the interaction file.
ffeat_items = '/content/drive/Shareddrives/NewsRec/data/cit/feat_tensor_PT_translated.pt'
feature_tensor = torch.load(ffeat_items, map_location=device)
print('feature tensor:', feature_tensor.shape)
assert feature_tensor.shape[0] == df_users[item_naming].nunique()

users
users shape: (14417, 4)
unique users: 1619
unique articles: 821
             contentId             personId  \
0   310515487419366995 -1130272294246983140   
1   310515487419366995   344280948527967603   
2   310515487419366995 -8763398617720485024   
3 -8864073373672512525  3609194402293569455   
4   310515487419366995  3609194402293569455   

                                                 url lang  
0  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
1  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
2  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
3  http://www.startupsstars.com/2016/06/video-int...   pt  
4  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  

items
items shape: (821, 3)
unique articles: 821
      contentId                                              title  \
0 -9.160000e+18  Bad financial situation of retailers puts pres...   
1 -5.920000e+18  Articles and Lectures - SENAR Precision Agricu...   
2  6.160000e+18            

### CIT Original EN: Bert

In [None]:
# CIT, English subset in the original language (Bert features per the section
# heading). Overwrites the shared globals df_users / df_items / feature_tensor.
fusers = '/content/drive/Shareddrives/NewsRec/data/cit/common_user_item_EN.csv'
fitems = '/content/drive/Shareddrives/NewsRec/data/cit/read_articles_EN.csv'
user_naming, item_naming = 'personId', 'contentId'
df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

# Load the precomputed article feature tensor onto `device`; its first dimension
# must equal the number of distinct articles in the interaction file.
ffeat_items = '/content/drive/Shareddrives/NewsRec/data/cit/feat_tensor_EN.pt'
feature_tensor = torch.load(ffeat_items, map_location=device)
print('feature tensor:', feature_tensor.shape)
assert feature_tensor.shape[0] == df_users[item_naming].nunique()

users
users shape: (26206, 4)
unique users: 1644
unique articles: 2154
             contentId             personId  \
0 -3499919498720038879 -8845298781299428018   
1  8890720798209849691 -1032019229384696495   
2 -7820640624231356730  -445337111692715325   
3 -1492913151930215984  4254153380739593270   
4  3064370296170038610  3609194402293569455   

                                                 url lang  
0             http://techcrunch.com/2016/06/07/hiri/   en  
1  https://www.nngroup.com/articles/top-intranet-...   en  
2  http://www.fastcompany.com/3058876/your-most-p...   en  
3  https://developer.chrome.com/devtools/docs/con...   en  
4  https://www.linkedin.com/pulse/google-amazon-u...   en  

items
items shape: (2154, 4)
unique articles: 2154
             contentId                                              title  \
0 -6451309518266745024  Ethereum, a Virtual Currency, Enables Transact...   
1 -4110354420726924665  Ethereum, a Virtual Currency, Enables Transact...   
2 -

### CIT Mixed: mBert

In [None]:
# CIT, full mixed-language set with mBert features (per the section heading).
# Overwrites the shared globals df_users / df_items / feature_tensor.
fusers = '/content/drive/Shareddrives/NewsRec/data/cit/common_user_item.csv'
fitems = '/content/drive/Shareddrives/NewsRec/data/cit/read_articles.csv'
user_naming, item_naming = 'personId', 'contentId'
df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

# Load the precomputed article feature tensor onto `device`; its first dimension
# must equal the number of distinct articles in the interaction file.
ffeat_items = '/content/drive/Shareddrives/NewsRec/data/cit/feat_tensor_mbert.pt'
feature_tensor = torch.load(ffeat_items, map_location=device)
print('feature tensor:', feature_tensor.shape)
assert feature_tensor.shape[0] == df_users[item_naming].nunique()

users
users shape: (40623, 4)
unique users: 1894
unique articles: 2975
             contentId             personId  \
0 -3499919498720038879 -8845298781299428018   
1  8890720798209849691 -1032019229384696495   
2 -7820640624231356730  -445337111692715325   
3 -1492913151930215984  4254153380739593270   
4  3064370296170038610  3609194402293569455   

                                                 url lang  
0             http://techcrunch.com/2016/06/07/hiri/   en  
1  https://www.nngroup.com/articles/top-intranet-...   en  
2  http://www.fastcompany.com/3058876/your-most-p...   en  
3  https://developer.chrome.com/devtools/docs/con...   en  
4  https://www.linkedin.com/pulse/google-amazon-u...   en  

items
items shape: (2975, 4)
unique articles: 2975
             contentId                                              title  \
0 -6451309518266745024  Ethereum, a Virtual Currency, Enables Transact...   
1 -4110354420726924665  Ethereum, a Virtual Currency, Enables Transact...   
2 -

### CIT Mixed: XLM

In [None]:
# CIT, full mixed-language set with XLM features (per the section heading): same
# CSVs as the mixed mBert cell, different feature tensor file.
# Overwrites the shared globals df_users / df_items / feature_tensor.
fusers = '/content/drive/Shareddrives/NewsRec/data/cit/common_user_item.csv'
fitems = '/content/drive/Shareddrives/NewsRec/data/cit/read_articles.csv'
user_naming, item_naming = 'personId', 'contentId'
df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

# Load the precomputed article feature tensor onto `device`; its first dimension
# must equal the number of distinct articles in the interaction file.
ffeat_items = '/content/drive/Shareddrives/NewsRec/data/cit/feat_tensor_xlm.pt'
feature_tensor = torch.load(ffeat_items, map_location=device)
print('feature tensor:', feature_tensor.shape)
assert feature_tensor.shape[0] == df_users[item_naming].nunique()

users
users shape: (40623, 4)
unique users: 1894
unique articles: 2975
             contentId             personId  \
0 -3499919498720038879 -8845298781299428018   
1  8890720798209849691 -1032019229384696495   
2 -7820640624231356730  -445337111692715325   
3 -1492913151930215984  4254153380739593270   
4  3064370296170038610  3609194402293569455   

                                                 url lang  
0             http://techcrunch.com/2016/06/07/hiri/   en  
1  https://www.nngroup.com/articles/top-intranet-...   en  
2  http://www.fastcompany.com/3058876/your-most-p...   en  
3  https://developer.chrome.com/devtools/docs/con...   en  
4  https://www.linkedin.com/pulse/google-amazon-u...   en  

items
items shape: (2975, 4)
unique articles: 2975
             contentId                                              title  \
0 -6451309518266745024  Ethereum, a Virtual Currency, Enables Transact...   
1 -4110354420726924665  Ethereum, a Virtual Currency, Enables Transact...   
2 -

### CIT Train on Orig EN only and Test on Mixed

Train on the original English (EN) articles first.

Then test on the mixed set loaded in the following cell.

In [None]:
# Mixed CIT set (file names carry the MIX / MIX_EN_TR suffixes) used as the test
# data for the "train on original EN, test on mixed" experiment. This cell only
# loads the mixed data; it overwrites the shared globals df_users / df_items /
# feature_tensor.
# Items are keyed by 'url' rather than 'contentId' in this configuration.
# NOTE(review): the item file name is spelled 'read_artciles_MIX_EN_TR.csv';
# presumably that matches the file on Drive, so verify before "fixing" it.
fusers = '/content/drive/Shareddrives/NewsRec/data/cit/common_user_item_MIX.csv'
fitems = '/content/drive/Shareddrives/NewsRec/data/cit/read_artciles_MIX_EN_TR.csv'
user_naming, item_naming = 'personId', 'url'
df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

# Load the precomputed article feature tensor onto `device`; its first dimension
# must equal the number of distinct article URLs in the interaction file.
ffeat_items = '/content/drive/Shareddrives/NewsRec/data/cit/feat_tensor_MIX.pt'
feature_tensor = torch.load(ffeat_items, map_location=device)
print('feature tensor:', feature_tensor.shape)
assert feature_tensor.shape[0] == df_users[item_naming].nunique()

### CIT Train with Mixed (Orig EN + Translated PT) and Test on Mixed

In [None]:
# Mixed CIT set (original EN + translated PT per the section heading) used for
# both training and testing. Overwrites the shared globals df_users / df_items /
# feature_tensor.
# Items are keyed by 'url' rather than 'contentId' in this configuration.
# NOTE(review): the item file name is spelled 'read_artciles_MIX_EN_TR.csv';
# presumably that matches the file on Drive, so verify before "fixing" it.
fusers = '/content/drive/Shareddrives/NewsRec/data/cit/common_user_item_MIX.csv'
fitems = '/content/drive/Shareddrives/NewsRec/data/cit/read_artciles_MIX_EN_TR.csv'
user_naming, item_naming = 'personId', 'url'
df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

# Load the precomputed article feature tensor onto `device`; its first dimension
# must equal the number of distinct article URLs in the interaction file.
ffeat_items = '/content/drive/Shareddrives/NewsRec/data/cit/feat_tensor_MIX.pt'
feature_tensor = torch.load(ffeat_items, map_location=device)
print('feature tensor:', feature_tensor.shape)
assert feature_tensor.shape[0] == df_users[item_naming].nunique()

## MIND

In [None]:
# MIND (MINDlarge_train). `i` selects which numbered file set is loaded: it is
# interpolated into the interaction CSV, the title CSV and the feature tensor
# file names. Overwrites the shared globals df_users / df_items / feature_tensor.
i = 0
fusers = f'/content/drive/Shareddrives/NewsRec/data/mind/MINDlarge_train/common_user_item_{i}.csv'
fitems = f'/content/drive/Shareddrives/NewsRec/data/mind/MINDlarge_train/read_article_titles_{i}.csv'
user_naming, item_naming = 'User_ID', 'News_ID'
df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

# Load the precomputed article feature tensor onto `device`; its first dimension
# must equal the number of distinct articles in the interaction file.
ffeat_items = f'/content/drive/Shareddrives/NewsRec/data/mind/MINDlarge_train/feat_tensor_{i}.pt'
feature_tensor = torch.load(ffeat_items, map_location=device)
print('feature tensor:', feature_tensor.shape)
assert feature_tensor.shape[0] == df_users[item_naming].nunique()

users
users shape: (4658157, 2)
unique users: 155855
unique articles: 67147
   User_ID  News_ID
0  U321454  N128643
1  U578952  N122359
2  U578952  N110096
3  U578952   N20583
4  U578952  N128736

items
items shape: (67147, 2)
unique articles: 67147
  News_ID                                              title
0  N88753  The Brands Queen Elizabeth, Prince Charles, an...
1  N45436    Walmart Slashes Prices on Last-Generation iPads
2  N23144                      50 Worst Habits For Belly Fat
3  N86255  Dispose of unwanted prescription drugs during ...
4  N93187  The Cost of Trump's Aid Freeze in the Trenches...

article set is same: True
feature tensor: torch.Size([67147, 768])


# 3. Run PLM

In [None]:
def load_plm_model(plm_model_name):
    """Load a pre-trained model and its tokenizer by checkpoint name.

    Args:
        plm_model_name: Checkpoint identifier handed to ``from_pretrained``
            of both ``AutoModel`` and ``AutoTokenizer``.

    Returns:
        Tuple ``(model, tokenizer)``; the model has been moved to the
        notebook-level ``device``.
    """
    print('Hi, loading:', plm_model_name)
    # Model first, then tokenizer, so downloads/log lines keep their order.
    model = AutoModel.from_pretrained(plm_model_name)
    model = model.to(device)
    tokenizer = AutoTokenizer.from_pretrained(plm_model_name)
    return model, tokenizer

def get_plm_embeddings(plm_model, plm_tokenizer, df_items, title_naming, feat_tensor_file,
                       batch_size=512, max_length=50):
    """Encode item titles with a pretrained language model and save the result.

    All titles are tokenized in one call (padded to the longest title and
    truncated to `max_length` tokens), then pushed through the model in
    mini-batches without gradient tracking. Element `[1]` of each model output
    is kept as the feature vector of a title.

    Args:
        plm_model: Model called as `plm_model(input_ids, attention_mask=...)`.
            NOTE(review): element `[1]` is presumably the pooled output of
            BERT-style models; confirm for models without a pooler (e.g. gpt2).
        plm_tokenizer: Tokenizer callable returning a mapping with `input_ids`
            and, normally, `attention_mask`.
        df_items: DataFrame holding one item per row.
        title_naming: Name of the column in `df_items` containing the titles.
        feat_tensor_file: Path the resulting tensor is written to with `torch.save`.
        batch_size: Number of titles per forward pass.
        max_length: Maximum number of tokens kept per title.

    Returns:
        Tensor with one feature row per row of `df_items`, in the same order.
    """
    print('df_items shape:', df_items.shape)

    encoded_inputs = plm_tokenizer(list(df_items[title_naming]), max_length=max_length,
                                   padding=True, truncation=True, return_tensors="pt")
    input_ids = encoded_inputs['input_ids'].to(device)
    print('input_ids shape:', input_ids.shape)

    # Titles are padded to a common length; without the attention mask the
    # model attends to the padding tokens and the features of short titles
    # depend on how much padding the batch happened to need.
    attention_mask = encoded_inputs.get('attention_mask')
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)

    feat_vectors = []
    num_samples = len(input_ids)
    for start in range(0, num_samples, batch_size):
        stop = start + batch_size  # slicing clamps the final, shorter batch
        model_kwargs = {}
        if attention_mask is not None:
            model_kwargs['attention_mask'] = attention_mask[start:stop]
        with torch.no_grad():
            out = plm_model(input_ids[start:stop], **model_kwargs)[1]
        feat_vectors.append(out)
        print(f'samples done: {start} / {num_samples}')

    feat_tensor = torch.cat(feat_vectors, 0)
    print('feat_tensor shape:', feat_tensor.shape)

    torch.save(feat_tensor, feat_tensor_file)

    return feat_tensor

In [None]:
# Short alias -> pretrained model identifier accepted by `load_plm_model`.
plm_models = dict([
    ('mbert', 'bert-base-multilingual-cased'),
    ('bert', 'bert-base-cased'),
    ('nb-bert', 'NbAiLab/nb-bert-base'),
    ('pt-bert', 'neuralmind/bert-base-portuguese-cased'),
    ('xlm', 'microsoft/infoxlm-base'),
    ('gpt2', 'gpt2'),
])

In [None]:
# Pick the language model by its short alias and load the model/tokenizer pair.
plm_model_name = plm_models['xlm']
plm_model, plm_tokenizer = load_plm_model(plm_model_name)

Hi, loading: microsoft/infoxlm-base


Downloading:   0%|          | 0.00/512 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/942M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/infoxlm-base were not used when initializing XLMRobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [None]:
# Embed the titles in `df_items` and write the feature tensor to `feat_tensor_file`.
title_naming = 'title'
feat_tensor_file = '/content/drive/Shareddrives/NewsRec/data/cit/feat_tensor_xlm.pt'
ften = get_plm_embeddings(plm_model, plm_tokenizer, df_items, title_naming, feat_tensor_file)

df_items shape: (2975, 4)
input_ids shape: torch.Size([2975, 47])
samples done: 0 / 2975
samples done: 512 / 2975
samples done: 1024 / 2975
samples done: 1536 / 2975
samples done: 2048 / 2975
samples done: 2560 / 2975
feat_tensor shape: torch.Size([2975, 768])


# 3. Graph Construction

We construct the graph from the user-item interactions.

In [None]:
# Graph-construction options read by the cells below.
user_has_features = True        # give user nodes a randomly initialized feature matrix
user_feature_size = 16          # width of the user feature matrix (other value noted: 32)
item_feature_init = 'plm'       # item feature source: 'plm', 'zero', 'random'
item_feature_size = 768         # item feature width for 'zero'/'random'; cf. feature_tensor.size()[1]
random_edges = False            # if True, the real edges are replaced by randomly sampled ones

#### Reload

In [None]:
# Give every distinct user a consecutive node index in [0, num_user_nodes).
# The resulting table has the raw ID in 'userId' and the index in 'mappedID'.
user_ids = df_users[user_naming].unique()
unique_user_id = pd.DataFrame(data={
    'userId': user_ids,
    'mappedID': pd.RangeIndex(len(user_ids)),
})
print(
    "Mapping of user IDs to consecutive values:",
    "==========================================",
    unique_user_id.head(),
    "",
    sep="\n",
)

Mapping of user IDs to consecutive values:
                userId  mappedID
0 -1130272294246983140         0
1   344280948527967603         1
2 -8763398617720485024         2
3  3609194402293569455         3
4  1908339160857512799         4



In [None]:
# Create a mapping from unique item indices to range [0, num_item_nodes).
# The row order of `df_items` defines the item node index, so item IDs must
# not repeat. Lengths are compared instead of the arrays themselves: an
# elementwise `!=` between the column and its shorter `unique()` array fails
# with a shape/type error rather than a clean AssertionError when duplicates
# are present.
assert len(df_items[item_naming].unique()) == len(df_items[item_naming]), \
    'df_items contains duplicate item IDs'
unique_item_id = df_items[item_naming]
unique_item_id = pd.DataFrame(data={
    'itemId': unique_item_id,
    'mappedID': pd.RangeIndex(len(unique_item_id)),
})
print("Mapping of item IDs to consecutive values:")
print("===========================================")
print(unique_item_id.head())

Mapping of item IDs to consecutive values:
                itemId  mappedID
0 -9157338616628196758         0
1 -5917314377186856799         1
2  6157037646878010131         2
3 -1672166631728511207         3
4  5714314286511882372         4


In [None]:
def _lookup_mapped_ids(raw_ids, id_table, table_key):
    """Translate a Series of raw IDs into node indices via a left merge on `id_table`."""
    merged = pd.merge(raw_ids, id_table,
                      left_on=raw_ids.name, right_on=table_key, how='left')
    return torch.from_numpy(merged['mappedID'].values)


# One entry per interaction row of `df_users`: the node index of its user and
# of its item. Together they form the user->item edges.
ratings_user_id = _lookup_mapped_ids(df_users[user_naming], unique_user_id, 'userId')
ratings_item_id = _lookup_mapped_ids(df_users[item_naming], unique_item_id, 'itemId')

Random edges result in 0.5 AUC and 0.0 F1 score.

In [None]:
# Random edges (sanity-check baseline): replace the real interactions with
# uniformly sampled user/item index pairs.
if random_edges:
  num_random_edges = len(unique_user_id)
  # `high` is exclusive in np.random.randint, so the node count itself is the
  # correct bound; subtracting 1 would make the last user/item node unreachable.
  ratings_user_id = torch.from_numpy(np.random.randint(low=0, high=len(unique_user_id), size=num_random_edges))
  ratings_item_id = torch.from_numpy(np.random.randint(low=0, high=len(unique_item_id), size=num_random_edges))

In [None]:
# Stack the mapped user and item indices into a 2-row `edge_index` in COO
# format, following PyG semantics (row 0: user index, row 1: item index):
edge_index_coo_user_to_item = torch.stack([ratings_user_id, ratings_item_id], dim=0)

print()
print("Final edge indices pointing from users to items:")
print("=================================================")
print(edge_index_coo_user_to_item)


Final edge indices pointing from users to items:
tensor([[   0,    1,    2,  ...,   46, 1405, 1468],
        [ 260,  260,  260,  ...,  783,  618,  782]])


In [None]:
data = HeteroData()

# Save node indices:
data["user"].node_id = torch.arange(len(unique_user_id))
data["item"].node_id = torch.arange(len(unique_item_id))

# Item node features, selected by `item_feature_init`:
if item_feature_init == 'plm':
  data["item"].x = feature_tensor
elif item_feature_init == 'random':
  data["item"].x = torch.Tensor(np.random.random((len(unique_item_id), item_feature_size)))
elif item_feature_init == 'zero':
  data["item"].x = torch.Tensor(np.zeros((len(unique_item_id), item_feature_size)))
else:
  # Raising a plain string is itself a TypeError in Python 3; raise a real exception.
  raise ValueError('Need item feature init type: plm, random, zero')

# User node features (random init only). This is independent of how the item
# features were initialized: the model reads data["user"].x whenever
# `user_has_features` is set, so it must also exist for 'random'/'zero' items.
if user_has_features:
  data["user"].x = torch.Tensor(np.random.random((len(unique_user_id), user_feature_size)))
  print('user features shape:', data["user"].x.shape)

print('item features shape:', data["item"].x.shape)
data["user", "rates", "item"].edge_index = edge_index_coo_user_to_item

# We also need to make sure to add the reverse edges from items to users
# in order to let a GNN be able to pass messages in both directions.
# We can leverage the `T.ToUndirected()` transform for this from PyG:
data = T.ToUndirected()(data)
del data['item', 'rev_rates', 'user'].edge_label  # Remove "reverse" label.

print(data)
print("=============================")
print('user num_nodes:', data["user"].num_nodes)
print('user num_features:', data["user"].num_features)
print('item num_nodes:', data["item"].num_nodes)
print('item num_features:', data["item"].num_features)
print('num_edges user->item:', data["user", "rates", "item"].num_edges)
print('num_edges item->user:', data["item", "rev_rates", "user"].num_edges)

user features shape: torch.Size([1619, 16])
item features shape: torch.Size([821, 768])
HeteroData(
  [1muser[0m={
    node_id=[1619],
    x=[1619, 16]
  },
  [1mitem[0m={
    node_id=[821],
    x=[821, 768]
  },
  [1m(user, rates, item)[0m={ edge_index=[2, 14417] },
  [1m(item, rev_rates, user)[0m={ edge_index=[2, 14417] }
)
user num_nodes: 1619
user num_features: 16
item num_nodes: 821
item num_features: 768
num_edges user->item: 14417
num_edges item->user: 14417


user num_features: 0
Shouldn't these be randomly initialized node features?

# 4. Data Loaders

In [None]:
# Sampling options shared by the link split and the mini-batch loaders.
neg_sampling_ratio = 1.0        # negative edges per positive edge; hyper: 1, 2
batch_size = 128                # positive seed edges per training mini-batch
num_neighbors = [10, 5]         # neighbors sampled per hop; hyper: [10, 5], [10, 10], [20, 10], [20, 20]

#### Reload

In [None]:
# Split the user->item edges into training (80%), validation (10%) and
# test (10%) edges with PyG's `RandomLinkSplit()` transform.
# Of the training edges, 70% stay in the graph for message passing and 30%
# are held out for supervision (`disjoint_train_ratio=0.3`).
# Validation and test receive fixed negative edges, `neg_sampling_ratio`
# negatives per positive; training negatives are generated on-the-fly later.
link_split_options = dict(
    num_val=0.1,
    num_test=0.1,
    disjoint_train_ratio=0.3,
    neg_sampling_ratio=neg_sampling_ratio,
    add_negative_train_samples=False,       # negative samples generated on-the-fly
    edge_types=("user", "rates", "item"),
    rev_edge_types=("item", "rev_rates", "user"),
)
transform = T.RandomLinkSplit(**link_split_options)

train_data, val_data, test_data = transform(data)

print("Training data:", "==============", train_data, "", sep="\n")
print("Validation data:", "================", val_data, "", sep="\n")
print("Test data:", "================", test_data, sep="\n")

Training data:
HeteroData(
  [1muser[0m={
    node_id=[1619],
    x=[1619, 16]
  },
  [1mitem[0m={
    node_id=[821],
    x=[821, 768]
  },
  [1m(user, rates, item)[0m={
    edge_index=[2, 8075],
    edge_label=[3460],
    edge_label_index=[2, 3460]
  },
  [1m(item, rev_rates, user)[0m={ edge_index=[2, 8075] }
)

Validation data:
HeteroData(
  [1muser[0m={
    node_id=[1619],
    x=[1619, 16]
  },
  [1mitem[0m={
    node_id=[821],
    x=[821, 768]
  },
  [1m(user, rates, item)[0m={
    edge_index=[2, 11535],
    edge_label=[2882],
    edge_label_index=[2, 2882]
  },
  [1m(item, rev_rates, user)[0m={ edge_index=[2, 11535] }
)

Test data:
HeteroData(
  [1muser[0m={
    node_id=[1619],
    x=[1619, 16]
  },
  [1mitem[0m={
    node_id=[821],
    x=[821, 768]
  },
  [1m(user, rates, item)[0m={
    edge_index=[2, 12976],
    edge_label=[2882],
    edge_label_index=[2, 2882]
  },
  [1m(item, rev_rates, user)[0m={ edge_index=[2, 12976] }
)


In [None]:
# Summarize the user->item edges of the training split.
train_rates = train_data["user", "rates", "item"]
print('Train data:', '=============', sep='\n')
print('present num_edges user->item:', train_rates.num_edges)
print('to be predicted positive num_edges user->item:', train_rates.edge_label_index.shape[1])
print('to be predicted edge classes:', torch.unique(train_rates.edge_label))
print('negative edge classes [0.] would be generated during training on-the-fly')

Train data:
present num_edges user->item: 8075
to be predicted positive num_edges user->item: 3460
to be predicted edge classes: tensor([1.])
negative edge classes [0.] would be generated during training on-the-fly


In [None]:
# Training data:
# edge_index=[2, 1040140],      - number of edges to construct graph, 70% of edges for message passing,
#                               - edges already present in the graph
# edge_label=[445773],          - number of edges for training, 30% of edges for supervision.
#                               - labels of missing edges, [0, 1] - 0 for negative edge, 1 for positive
# edge_label_index=[2, 445773]  - edges that are absent in the graph for training 

So, we see that all the nodes are present both in training and validation data!

As well as all the item_node features.

Only the edge_index changes: which users connected to which items. Some of them are removed for train and val. 

We are now ready to create a mini-batch loader that will generate subgraphs that can be used as input into our GNN. While this step is not strictly necessary for small-scale graphs, it is absolutely necessary to apply GNNs on larger graphs that do not fit onto GPU memory otherwise. Here, we make use of the loader.LinkNeighborLoader which samples multiple hops from both ends of a link and creates a subgraph from it. Here, edge_label_index serves as the "seed links" to start sampling from.

In [None]:
# Mini-batch loader for training, built with PyG's `loader.LinkNeighborLoader`.
# `num_neighbors` caps how many neighbors are sampled in each hop, and
# `neg_sampling_ratio` negatives per positive seed edge are added on-the-fly.
# Seed edges are the supervision edges of the training split:
train_rates = train_data["user", "rates", "item"]
edge_label_index = train_rates.edge_label_index
edge_label = train_rates.edge_label

train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=num_neighbors,
    neg_sampling_ratio=neg_sampling_ratio,
    edge_label_index=(("user", "rates", "item"), edge_label_index),
    edge_label=edge_label,
    batch_size=batch_size,
    shuffle=True,
)

# Inspect a sample:
sampled_data = next(iter(train_loader))
print("Sampled Train mini-batch:", "===================", sampled_data, sep="\n")

# Each batch holds `batch_size` positives plus the sampled negatives, so both
# label classes must be present.
sampled_rates = sampled_data["user", "rates", "item"]
assert sampled_rates.edge_label_index.size(1) == (neg_sampling_ratio + 1) * batch_size
assert sampled_rates.edge_label.min() == 0
assert sampled_rates.edge_label.max() == 1

Sampled Train mini-batch:
HeteroData(
  [1muser[0m={
    node_id=[961],
    x=[961, 16]
  },
  [1mitem[0m={
    node_id=[675],
    x=[675, 768]
  },
  [1m(user, rates, item)[0m={
    edge_index=[2, 3072],
    edge_label=[256],
    edge_label_index=[2, 256],
    input_id=[128]
  },
  [1m(item, rev_rates, user)[0m={ edge_index=[2, 3146] }
)


In [None]:
# Validation seed edges and their labels, taken from the validation split:
val_rates = val_data["user", "rates", "item"]
edge_label_index = val_rates.edge_label_index
edge_label = val_rates.edge_label

# No negatives are sampled here; the batch size is scaled by
# (neg_sampling_ratio + 1) so a batch holds as many labelled edges as a
# training batch does after negative sampling.
val_loader = LinkNeighborLoader(
    data=val_data,
    num_neighbors=num_neighbors,
    edge_label_index=(("user", "rates", "item"), edge_label_index),
    edge_label=edge_label,
    batch_size=int((neg_sampling_ratio + 1) * batch_size),
    shuffle=False,
)

sampled_data = next(iter(val_loader))
print("Sampled Validation mini-batch:", "===================", sampled_data, sep="\n")

sampled_rates = sampled_data["user", "rates", "item"]
assert sampled_rates.edge_label_index.size(1) == (neg_sampling_ratio + 1) * batch_size
assert sampled_rates.edge_label.min() >= 0
assert sampled_rates.edge_label.max() <= 1

Sampled Validation mini-batch:
HeteroData(
  [1muser[0m={
    node_id=[990],
    x=[990, 16]
  },
  [1mitem[0m={
    node_id=[679],
    x=[679, 768]
  },
  [1m(user, rates, item)[0m={
    edge_index=[2, 3520],
    edge_label=[256],
    edge_label_index=[2, 256],
    input_id=[256]
  },
  [1m(item, rev_rates, user)[0m={ edge_index=[2, 3751] }
)


In [None]:
# Test seed edges and their labels, taken from the test split:
test_rates = test_data["user", "rates", "item"]
edge_label_index = test_rates.edge_label_index
edge_label = test_rates.edge_label

# Same setup as the validation loader: no negative sampling, and a batch size
# scaled by (neg_sampling_ratio + 1).
test_loader = LinkNeighborLoader(
    data=test_data,
    num_neighbors=num_neighbors,
    edge_label_index=(("user", "rates", "item"), edge_label_index),
    edge_label=edge_label,
    batch_size=int((neg_sampling_ratio + 1) * batch_size),
    shuffle=False,
)

sampled_data = next(iter(test_loader))
print("Sampled Test mini-batch:", "===================", sampled_data, sep="\n")

sampled_rates = sampled_data["user", "rates", "item"]
assert sampled_rates.edge_label_index.size(1) == (neg_sampling_ratio + 1) * batch_size
assert sampled_rates.edge_label.min() >= 0
assert sampled_rates.edge_label.max() <= 1

Sampled Test mini-batch:
HeteroData(
  [1muser[0m={
    node_id=[979],
    x=[979, 16]
  },
  [1mitem[0m={
    node_id=[681],
    x=[681, 768]
  },
  [1m(user, rates, item)[0m={
    edge_index=[2, 3475],
    edge_label=[256],
    edge_label_index=[2, 256],
    input_id=[256]
  },
  [1m(item, rev_rates, user)[0m={ edge_index=[2, 3866] }
)


# 5. Model

In [None]:
# Model options read by the GNN / Model classes below.
hidden_channels = 16                          # width of all hidden node representations; hyper: 16, 32, 64
GNN_Conv_Layer = SAGEConv                     # convolution class used for both GNN layers; hyper: SAGEConv, GATConv
remove_embedding_layer = user_has_features    # True: project node features linearly instead of learning embeddings (alternative: False)

#### Reload

We are now ready to create our heterogeneous GNN. The GNN is responsible for learning enriched node representations from the surrounding subgraphs, which can be then used to derive edge-level predictions. For defining our heterogenous GNN, we make use of nn.SAGEConv and the nn.to_hetero() function, which transforms a GNN defined on homogeneous graphs to be applied on heterogeneous ones.
In addition, we define a final link-level classifier, which simply takes both node embeddings of the link we are trying to predict, and applies a dot-product on them.
As users do not have any node-level information, we either learn their features jointly via a torch.nn.Embedding layer or, when `remove_embedding_layer` is set, project randomly initialized user features with a linear layer. In order to improve the expressiveness of item features, the embedding variant does the same for item nodes, and simply adds their shallow embeddings to the linearly projected item features.

In [None]:
class GNN(torch.nn.Module):
    """Two-layer homogeneous GNN encoder built from `GNN_Conv_Layer`.

    Both layers map `hidden_channels` to `hidden_channels`. When the
    configured layer is `GATConv`, it is created with `add_self_loops=False`.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        conv_options = {'add_self_loops': False} if GNN_Conv_Layer == GATConv else {}
        self.conv1 = GNN_Conv_Layer(hidden_channels, hidden_channels, **conv_options)
        self.conv2 = GNN_Conv_Layer(hidden_channels, hidden_channels, **conv_options)

    def forward(self, x: Tensor, edge_index: Tensor) -> Tensor:
        """Apply conv1, a single ReLU, then conv2 (no activation on the output)."""
        hidden = self.conv1(x, edge_index).relu()
        return self.conv2(hidden, edge_index)

# Link-level decoder: scores an edge by the dot-product of the embeddings of
# its source (user) and destination (item) node.
class Classifier(torch.nn.Module):
    def forward(self, x_user: Tensor, x_item: Tensor, edge_label_index: Tensor) -> Tensor:
        """Return one raw score per column of `edge_label_index`.

        Row 0 of `edge_label_index` indexes into `x_user`, row 1 into `x_item`.
        """
        user_idx, item_idx = edge_label_index[0], edge_label_index[1]
        # Gather the endpoint embeddings of every supervision edge ...
        user_side = x_user[user_idx]
        item_side = x_item[item_idx]
        # ... and reduce their elementwise product over the feature dimension.
        return torch.sum(user_side * item_side, dim=-1)


class Model(torch.nn.Module):
    """Link-prediction model: input layers, heterogeneous GNN encoder, dot-product decoder.

    The notebook-level flag `remove_embedding_layer` selects the input layers:
    when true, user features go through a linear projection; otherwise users
    get a learned embedding and items get a learned embedding added to their
    linearly projected features. Item features are always projected linearly.
    """

    def __init__(self, user_fsize, item_fsize, num_user_nodes, num_item_nodes, hidden_channels, graph_metadata):
        super().__init__()
        if not remove_embedding_layer:
            # Learn embedding matrices for users and items.
            self.user_emb = torch.nn.Embedding(num_user_nodes, hidden_channels)
            self.item_emb = torch.nn.Embedding(num_item_nodes, hidden_channels)
        else:
            self.user_lin = torch.nn.Linear(user_fsize, hidden_channels)

        self.item_lin = torch.nn.Linear(item_fsize, hidden_channels)

        # Encoder: homogeneous GNN converted into its heterogeneous variant.
        encoder = GNN(hidden_channels)
        self.gnn = to_hetero(encoder, metadata=graph_metadata, aggr='mean')
        # Decoder.
        self.classifier = Classifier()

    def forward(self, data: HeteroData) -> Tensor:
        """Return one raw score per edge in the batch's user->item `edge_label_index`."""
        users, items = data["user"], data["item"]
        if remove_embedding_layer:
            user_x = self.user_lin(users.x)
            item_x = self.item_lin(items.x)
        else:
            user_x = self.user_emb(users.node_id)
            item_x = self.item_lin(items.x) + self.item_emb(items.node_id)

        # Encode: feature matrices of all node types plus the edge indices of
        # all edge types go through the heterogeneous GNN.
        node_emb = self.gnn({"user": user_x, "item": item_x}, data.edge_index_dict)

        # Decode: score the supervision edges.
        seed_edges = data["user", "rates", "item"].edge_label_index
        return self.classifier(node_emb["user"], node_emb["item"], seed_edges)

In [None]:
# `data` contains the whole graph, so node counts and metadata are taken from
# it rather than from one of the splits.
num_user_nodes = data["user"].num_nodes
num_item_nodes = data["item"].num_nodes
graph_metadata = data.metadata()

model = Model(user_feature_size, item_feature_size, num_user_nodes, num_item_nodes, hidden_channels, graph_metadata)
model = model.to(device)
print(f"Device: '{device}'")
print(model)

# The model returns raw dot-product scores, hence the "with logits" variant of
# binary cross-entropy as the training criterion.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = F.binary_cross_entropy_with_logits

Device: 'cpu'
Model(
  (user_lin): Linear(in_features=32, out_features=16, bias=True)
  (item_lin): Linear(in_features=768, out_features=16, bias=True)
  (gnn): GraphModule(
    (conv1): ModuleDict(
      (user__rates__item): SAGEConv(16, 16, aggr=mean)
      (item__rev_rates__user): SAGEConv(16, 16, aggr=mean)
    )
    (conv2): ModuleDict(
      (user__rates__item): SAGEConv(16, 16, aggr=mean)
      (item__rev_rates__user): SAGEConv(16, 16, aggr=mean)
    )
  )
  (classifier): Classifier()
)


# 6. Experiments

Research question: is it enough to translate the news into English and use an English recommender model?
Or can we use a single multilingual model for multiple languages?
Or should we use a separate model for each language?

Hypothesis: the multilingual model performs best.

In [None]:
epochs = 1      # number of training epochs; values noted per dataset: mind=1, adressa=2, cit=50

#### Reload

In [None]:
# Training: one optimizer step per mini-batch. `train_loss` collects, per
# epoch, the mean loss weighted by the number of scored edges in each batch.
train_loss = []
for epoch in range(1, epochs+1):
    model.train()
    total_loss, total_examples = 0, 0
    for batch_data in tqdm.tqdm(train_loader):
        batch_data = batch_data.to(device)
        optimizer.zero_grad()

        pred = model(batch_data)  # raw scores for the batch's seed edges
        ground_truth = batch_data['user', 'item'].edge_label
        loss = criterion(pred, ground_truth)

        loss.backward()
        optimizer.step()

        num_scored = pred.numel()
        total_loss += loss.item() * num_scored
        total_examples += num_scored

    epoch_loss = total_loss / total_examples
    train_loss.append(epoch_loss)
    print(f"\nEpoch: {epoch:03d}, Loss: {epoch_loss:.4f}")

In [None]:
# Validation: score every validation seed edge without gradient tracking and
# report the ROC AUC of the raw scores together with the active hyperparameters.
preds, ground_truths = [], []
model.eval()
with torch.no_grad():
    for batch_data in tqdm.tqdm(val_loader):
        batch_data = batch_data.to(device)
        preds.append(model(batch_data))
        ground_truths.append(batch_data['user', 'item'].edge_label)

pred = torch.cat(preds, dim=0).cpu().numpy()
ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
auc = roc_auc_score(ground_truth, pred)

print()
print(f"Validation AUC: {auc:.4f}")
for label, value in (
    (f'Neg sampling ratio:', neg_sampling_ratio),
    (f'Num neighbors:', num_neighbors),
    (f'Hidden channels:', hidden_channels),
    (f'GNN_Conv_Layer:', GNN_Conv_Layer),
):
    print(label, value)


100%|██████████| 12/12 [00:00<00:00, 29.71it/s]


Validation AUC: 0.7862
Neg sampling ratio: 1.0
Num neighbors: [10, 5]
Hidden channels: 16
GNN_Conv_Layer: <class 'torch_geometric.nn.conv.sage_conv.SAGEConv'>





In [None]:
# Test: score every test seed edge without gradient tracking and report the
# ROC AUC of the raw scores.
preds, ground_truths = [], []
model.eval()
with torch.no_grad():
    for batch_data in tqdm.tqdm(test_loader):
        batch_data = batch_data.to(device)
        preds.append(model(batch_data))
        ground_truths.append(batch_data['user', 'item'].edge_label)

pred = torch.cat(preds, dim=0).cpu().numpy()
ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
auc = roc_auc_score(ground_truth, pred)

print()
print(f"Test AUC: {auc:.4f}")

100%|██████████| 12/12 [00:00<00:00, 22.04it/s]


Test AUC: 0.7870





#### Results

In [None]:
# Report sigmoid-based AUC and F1 for the validation and the test split.
#
# Each split is evaluated from its own loader here. Reusing the notebook-level
# `pred` / `ground_truth` for both halves would report whatever split was
# evaluated last under both the "Val" and the "Test" label (the test split,
# when the notebook is run top to bottom).

def _predict_logits(loader):
    """Run the model over `loader`; return (raw scores, labels) as NumPy arrays."""
    model.eval()
    batch_scores, batch_labels = [], []
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            batch_scores.append(model(batch))
            batch_labels.append(batch['user', 'item'].edge_label)
    return (torch.cat(batch_scores, dim=0).cpu().numpy(),
            torch.cat(batch_labels, dim=0).cpu().numpy())


def _sigmoid_metrics(scores, labels):
    """Return (probabilities, rounded predictions, AUC, F1) for raw `scores`."""
    probabilities = torch.Tensor(scores).sigmoid().cpu().numpy()
    rounded = np.rint(probabilities)  # hard 0/1 predictions by rounding
    return (probabilities, rounded,
            roc_auc_score(labels, probabilities), f1_score(labels, rounded))


val_pred, val_ground_truth = _predict_logits(val_loader)
_, _, val_auc, val_f1_s = _sigmoid_metrics(val_pred, val_ground_truth)
print(f'Val AUC with sigmoid:', val_auc)
print(f'Val F1 with sigmoid:', val_f1_s)
print(f'Neg sampling ratio:', neg_sampling_ratio)
print(f'Num neighbors:', num_neighbors)
print(f'Hidden channels:',hidden_channels)
print(f'GNN_Conv_Layer:', GNN_Conv_Layer)

# The notebook-level names keep holding the test-split values afterwards.
pred, ground_truth = _predict_logits(test_loader)
out, probs, auc, f1_s = _sigmoid_metrics(pred, ground_truth)
print(f'Test AUC with sigmoid:', auc)
print(f'Test F1 with sigmoid:', f1_s)

Results for CIT Mixed Bert GATConv:
- 
Val AUC with sigmoid: 0.867049439982584
Val F1 with sigmoid: 0.6727344677483182
Validation AUC: 0.8670
Neg sampling ratio: 2.0
Num neighbors: [20, 20]
Hidden channels: 32
GNN_Conv_Layer: <class 'torch_geometric.nn.conv.gat_conv.GATConv'>
- 
Test AUC with sigmoid: 0.8572776203217435
Test F1 with sigmoid: 0.646524064171123

Results for CIT Mixed Bert SAGEConv:
- 
Val AUC with sigmoid: 0.8697367381170392
Val F1 with sigmoid: 0.6960985626283367
Validation AUC: 0.8697
Neg sampling ratio: 2.0
Num neighbors: [20, 20]
Hidden channels: 32
GNN_Conv_Layer: <class 'torch_geometric.nn.conv.sage_conv.SAGEConv'>
- 
Test AUC with sigmoid: 0.8618162086865792
Test F1 with sigmoid: 0.6903941829314963

Results for CIT Mixed XLM SAGEConv:
- 
Val AUC with sigmoid: 0.8700091346318184
Val F1 with sigmoid: 0.7060878243512974
Validation AUC: 0.8700
Neg sampling ratio: 2.0
Num neighbors: [20, 20]
Hidden channels: 16
GNN_Conv_Layer: <class 'torch_geometric.nn.conv.sage_conv.SAGEConv'>
- 
Test AUC with sigmoid: 0.8699248611077778
Test F1 with sigmoid: 0.7104747919725893

- Val AUC with sigmoid: 0.8670928040289351
Val F1 with sigmoid: 0.6989166036785085
Validation AUC: 0.8671
Neg sampling ratio: 2.0
Num neighbors: [10, 5]
Hidden channels: 16
GNN_Conv_Layer: <class 'torch_geometric.nn.conv.sage_conv.SAGEConv'>
- 
Test AUC with sigmoid: 0.8667656645723439
Test F1 with sigmoid: 0.696376720111097

Test with feature_tensor = 0 for adressa:
- Test AUC with sigmoid: 0.9906381031759737
- Test F1 with sigmoid: 0.9496406713155481

Test with feature_tensor = 0 for cit pt:
- Test AUC with sigmoid: 0.811534394006013
- Test F1 with sigmoid: 0.6191342306293198

Test with feature_tensor = random for cit pt:
- Test AUC with sigmoid: 0.8127672490140773
- Test F1 with sigmoid: 0.5971082454083627

Test with removing embed layer and adding random init user features for CIT PT XLM:
- Test AUC with sigmoid: 0.7870459686363612
- Test F1 with sigmoid: 0.7361064891846923

# Loop

##### Dict

In [None]:
# Per-experiment dataset configuration, keyed by "<dataset>_<feature variant>".
# Each value is a list:
#   [user-item interactions CSV, item (article) CSV, item feature tensor file,
#    user ID column name, item ID column name, <integer>]
# NOTE(review): the trailing integer is presumably a per-dataset epoch count;
# confirm against the loop that consumes this dict.
datas_dict  = {'adressa_nb':  ['/content/drive/Shareddrives/NewsRec/data/adressa/impressions/common_user_item.csv',
                              '/content/drive/Shareddrives/NewsRec/data/adressa/content/read_articles.csv',
                              '/content/drive/Shareddrives/NewsRec/data/adressa/content/feat_tensor_original.pt',
                              'userId', 'id', 5],
               'adressa_mb':  ['/content/drive/Shareddrives/NewsRec/data/adressa/impressions/common_user_item.csv',
                              '/content/drive/Shareddrives/NewsRec/data/adressa/content/read_articles.csv',
                              '/content/drive/Shareddrives/NewsRec/data/adressa/content/feat_tensor_multiling.pt',
                              'userId', 'id', 5],
               'adressa_xlm': ['/content/drive/Shareddrives/NewsRec/data/adressa/impressions/common_user_item.csv',
                              '/content/drive/Shareddrives/NewsRec/data/adressa/content/read_articles.csv',
                              '/content/drive/Shareddrives/NewsRec/data/adressa/content/feat_tensor_xlm.pt',
                              'userId', 'id', 5],
               'adressa_tr':  ['/content/drive/Shareddrives/NewsRec/data/adressa/impressions/common_user_item.csv',
                              '/content/drive/Shareddrives/NewsRec/data/adressa/content/read_artciles_translated.csv',
                              '/content/drive/Shareddrives/NewsRec/data/adressa/content/feat_tensor_translated.pt',
                              'userId', 'id', 5],
               
               'cit_pt':      ['/content/drive/Shareddrives/NewsRec/data/cit/common_user_item_PT.csv',
                              '/content/drive/Shareddrives/NewsRec/data/cit/read_articles_PT.csv',
                              '/content/drive/Shareddrives/NewsRec/data/cit/feat_tensor_PT_original.pt',
                              'personId', 'contentId', 100],
               'cit_mb':      ['/content/drive/Shareddrives/NewsRec/data/cit/common_user_item_PT.csv',
                              '/content/drive/Shareddrives/NewsRec/data/cit/read_articles_PT.csv',
                              '/content/drive/Shareddrives/NewsRec/data/cit/feat_tensor_PT_multiling.pt',
                              'personId', 'contentId', 100],
               'cit_xlm':     ['/content/drive/Shareddrives/NewsRec/data/cit/common_user_item_PT.csv',
                              '/content/drive/Shareddrives/NewsRec/data/cit/read_articles_PT.csv',
                              '/content/drive/Shareddrives/NewsRec/data/cit/feat_tensor_PT_xlm.pt',
                              'personId', 'contentId', 100],
               'cit_tr':      ['/content/drive/Shareddrives/NewsRec/data/cit/common_user_item_PT.csv',
                              '/content/drive/Shareddrives/NewsRec/data/cit/read_artciles_PT_translated.csv',
                              '/content/drive/Shareddrives/NewsRec/data/cit/feat_tensor_PT_translated.pt',
                              'personId', 'url', 100],
               'cit_en_b':    ['/content/drive/Shareddrives/NewsRec/data/cit/common_user_item_EN.csv',
                              '/content/drive/Shareddrives/NewsRec/data/cit/read_articles_EN.csv',
                              '/content/drive/Shareddrives/NewsRec/data/cit/feat_tensor_EN.pt',
                              'personId', 'contentId', 100],
               'cit_mix_mb':  ['/content/drive/Shareddrives/NewsRec/data/cit/common_user_item.csv',
                              '/content/drive/Shareddrives/NewsRec/data/cit/read_articles.csv',
                              '/content/drive/Shareddrives/NewsRec/data/cit/feat_tensor_mbert.pt',
                              'personId', 'contentId', 100],
               'cit_mix_xlm': ['/content/drive/Shareddrives/NewsRec/data/cit/common_user_item.csv',
                              '/content/drive/Shareddrives/NewsRec/data/cit/read_articles.csv',
                              '/content/drive/Shareddrives/NewsRec/data/cit/feat_tensor_xlm.pt',
                              'personId', 'contentId', 100],


               'mind':        ['/content/drive/Shareddrives/NewsRec/data/mind/MINDlarge_train/common_user_item_0.csv',
                               '/content/drive/Shareddrives/NewsRec/data/mind/MINDlarge_train/read_article_titles_0.csv',
                               '/content/drive/Shareddrives/NewsRec/data/mind/MINDlarge_train/feat_tensor_0.pt',
                               'User_ID', 'News_ID', 3]
               }

##### Funs

In [None]:
def graph_con(df_users, df_items, user_naming, item_naming, feature_tensor):
    """Build the bipartite user-item `HeteroData` graph used for link prediction.

    Besides its arguments, this function reads the notebook-level settings
    `random_edges`, `item_feature_init`, `item_feature_size`,
    `user_has_features` and `user_feature_size`.

    Args:
        df_users: Interaction table; every row is one (user, item) edge and
            must contain the columns `user_naming` and `item_naming`.
        df_items: Item table whose `item_naming` column holds unique ids.
            Its row order defines the item node indices.
        user_naming: Name of the user-id column in `df_users`.
        item_naming: Name of the item-id column in both frames.
        feature_tensor: Item feature matrix, stored as `data["item"].x` when
            `item_feature_init == 'plm'`.
            (assumes row i describes row i of `df_items` - TODO confirm)

    Returns:
        A `HeteroData` object with "user" and "item" node stores, the
        ("user", "rates", "item") edges and the reverse edge type added by
        `T.ToUndirected()`.

    Raises:
        AssertionError: If `df_items[item_naming]` contains duplicate ids.
        ValueError: If an interaction references an id that has no node
            mapping, or if `item_feature_init` is not one of
            'plm', 'random', 'zero'.
    """
    # Create a mapping from unique user indices to range [0, num_user_nodes):
    unique_user_id = df_users[user_naming].unique()
    unique_user_id = pd.DataFrame(data={
        'userId': unique_user_id,
        'mappedID': pd.RangeIndex(len(unique_user_id)),
    })
    print("Mapping of user IDs to consecutive values:")
    print("==========================================")
    print(unique_user_id.head())
    print()
    # Create a mapping from unique item indices to range [0, num_item_nodes).
    # `is_unique` reports duplicates directly; comparing `.values` against
    # `.unique()` breaks on the length mismatch instead of asserting.
    assert df_items[item_naming].is_unique, \
        f"duplicate item ids in column '{item_naming}'"
    unique_item_id = df_items[item_naming]
    unique_item_id = pd.DataFrame(data={
        'itemId': unique_item_id,
        'mappedID': pd.RangeIndex(len(unique_item_id)),
    })
    print("Mapping of item IDs to consecutive values:")
    print("===========================================")
    print(unique_item_id.head())
    # Perform merge to obtain the edges from users and items. A left merge
    # keeps one output row per interaction, in the original order.
    ratings_user_id = pd.merge(df_users[user_naming], unique_user_id,
                                left_on=user_naming, right_on='userId', how='left')
    ratings_item_id = pd.merge(df_users[item_naming], unique_item_id,
                                left_on=item_naming, right_on='itemId', how='left')
    # An id missing from the mapping shows up as NaN after the left merge and
    # would silently turn the index column into floats; fail loudly instead.
    if ratings_user_id['mappedID'].isna().any():
        raise ValueError(f"some '{user_naming}' values in df_users have no user node")
    if ratings_item_id['mappedID'].isna().any():
        raise ValueError(f"some '{item_naming}' values in df_users are missing from df_items")
    ratings_user_id = torch.from_numpy(ratings_user_id['mappedID'].values)
    ratings_item_id = torch.from_numpy(ratings_item_id['mappedID'].values)
    # Random edges (ablation): replace the observed edges by random pairs.
    if random_edges:
      # `high` is exclusive in np.random.randint, so passing the node count
      # itself lets every node (including the last one) be drawn.
      # NOTE(review): the number of random edges equals the number of users,
      # not the number of observed interactions - confirm this is intended.
      ratings_user_id = torch.from_numpy(np.random.randint(low=0, high=len(unique_user_id), size=len(unique_user_id)))
      ratings_item_id = torch.from_numpy(np.random.randint(low=0, high=len(unique_item_id), size=len(unique_user_id)))
    # With this, we are ready to construct our `edge_index` in COO format
    # following PyG semantics:
    edge_index_coo_user_to_item = torch.stack([ratings_user_id, ratings_item_id], dim=0)

    print()
    print("Final edge indices pointing from users to items:")
    print("=================================================")
    print(edge_index_coo_user_to_item)
    data = HeteroData()

    # Save node indices:
    data["user"].node_id = torch.arange(len(unique_user_id))
    data["item"].node_id = torch.arange(len(unique_item_id))

    # Add the node features and edge indices:
    if item_feature_init == 'plm':
      data["item"].x = feature_tensor
      # NOTE(review): user features are only attached in this branch; with
      # 'random' or 'zero' item features the user store gets no `x` even when
      # `user_has_features` is True - confirm this is intended.
      if user_has_features:
        # only random init for user features
        data["user"].x = torch.Tensor(np.random.random((len(unique_user_id), user_feature_size)))
        print('user features shape:', data["user"].x.shape)
    elif item_feature_init == 'random':
      data["item"].x = torch.Tensor(np.random.random((len(unique_item_id), item_feature_size)))
    elif item_feature_init == 'zero':
      data["item"].x = torch.Tensor(np.zeros((len(unique_item_id), item_feature_size)))
    else:
      # Raising a plain string is itself a TypeError in Python 3.
      raise ValueError(
          f'Need item feature init type: plm, random, zero (got {item_feature_init!r})')

    print('item features shape:', data["item"].x.shape)
    data["user", "rates", "item"].edge_index = edge_index_coo_user_to_item

    # We also need to make sure to add the reverse edges from items to users
    # in order to let a GNN be able to pass messages in both directions.
    # We can leverage the `T.ToUndirected()` transform for this from PyG:
    data = T.ToUndirected()(data)
    # Remove "reverse" label. NOTE(review): no `edge_label` is assigned in
    # this function, so this looks like a leftover safeguard.
    del data['item', 'rev_rates', 'user'].edge_label

    print(data)
    print("=============================")
    print('user num_nodes:', data["user"].num_nodes)
    print('user num_features:', data["user"].num_features)
    print('item num_nodes:', data["item"].num_nodes)
    print('item num_features:', data["item"].num_features)
    print('num_edges user->item:', data["user", "rates", "item"].num_edges)
    print('num_edges item->user:', data["item", "rev_rates", "user"].num_edges)

    return data

In [None]:
def _build_eval_loader(split_data, split_name):
    """Create a non-shuffled `LinkNeighborLoader` over one evaluation split.

    Reads the notebook-level settings `num_neighbors`, `neg_sampling_ratio`
    and `batch_size`. Draws and prints the first mini-batch and sanity-checks
    its size and label range.

    Args:
        split_data: Validation or test graph produced by `RandomLinkSplit`.
        split_name: Label used in the printed header ("Validation", "Test").

    Returns:
        The constructed loader.
    """
    seed_index = split_data["user", "rates", "item"].edge_label_index
    seed_label = split_data["user", "rates", "item"].edge_label
    # Evaluation seeds already contain the fixed negatives, so the batch is
    # enlarged by the same factor the training loader adds on the fly.
    eval_batch_size = int((neg_sampling_ratio + 1) * batch_size)

    loader = LinkNeighborLoader(
        data=split_data,
        num_neighbors=num_neighbors,
        edge_label_index=(("user", "rates", "item"), seed_index),
        edge_label=seed_label,
        batch_size=eval_batch_size,
        shuffle=False,
    )

    sampled_data = next(iter(loader))

    print(f"Sampled {split_name} mini-batch:")
    print("===================")
    print(sampled_data)

    sampled_edges = sampled_data["user", "rates", "item"]
    # The first batch is smaller than `eval_batch_size` when the split holds
    # fewer seed edges than one batch.
    assert sampled_edges.edge_label_index.size(1) == min(eval_batch_size, seed_index.size(1))
    assert sampled_edges.edge_label.min() >= 0
    assert sampled_edges.edge_label.max() <= 1

    return loader


def data_loader(data):
    """Split the graph's edges and wrap each split in a `LinkNeighborLoader`.

    Reads the notebook-level settings `neg_sampling_ratio`, `num_neighbors`
    and `batch_size`. Prints the three splits and one sampled mini-batch per
    loader.

    Args:
        data: Heterogeneous graph with ("user", "rates", "item") edges and the
            matching ("item", "rev_rates", "user") reverse edges.

    Returns:
        Tuple `(train_loader, val_loader, test_loader)`.
    """
    # For this, we first split the set of edges into
    # training (80%), validation (10%), and testing edges (10%).
    # Across the training edges, we use 70% of edges for message passing,
    # and 30% of edges for supervision.
    # Fixed negative edges are generated for evaluation with a
    # negatives:positives ratio of `neg_sampling_ratio`.
    # Negative edges during training will be generated on-the-fly.
    # We can leverage the `RandomLinkSplit()` transform for this from PyG:
    transform = T.RandomLinkSplit(
        num_val=0.1,
        num_test=0.1,
        disjoint_train_ratio=0.3,
        neg_sampling_ratio=neg_sampling_ratio,
        add_negative_train_samples=False,       # negative samples generated on-the-fly
        edge_types=("user", "rates", "item"),
        rev_edge_types=("item", "rev_rates", "user"),
    )

    train_data, val_data, test_data = transform(data)
    print("Training data:")
    print("==============")
    print(train_data)
    print()
    print("Validation data:")
    print("================")
    print(val_data)
    print()
    print("Test data:")
    print("================")
    print(test_data)
    # Neighbours are sampled per hop according to `num_neighbors`
    # (entry i is the maximum number of neighbours in hop i + 1).
    # During training, negative edges are sampled on-the-fly with a
    # negatives:positives ratio of `neg_sampling_ratio`.
    # We can make use of the `loader.LinkNeighborLoader` from PyG:
    # Define seed edges:
    edge_label_index = train_data["user", "rates", "item"].edge_label_index
    edge_label = train_data["user", "rates", "item"].edge_label

    train_loader = LinkNeighborLoader(
        data=train_data,
        num_neighbors=num_neighbors,
        neg_sampling_ratio=neg_sampling_ratio,
        edge_label_index=(("user", "rates", "item"), edge_label_index),
        edge_label=edge_label,
        batch_size=batch_size,
        shuffle=True,
    )

    # Inspect a sample:
    sampled_data = next(iter(train_loader))

    print("Sampled Train mini-batch:")
    print("===================")
    print(sampled_data)

    # Positives in the first batch; fewer than `batch_size` only when the
    # supervision set itself is smaller than one batch.
    num_pos = min(batch_size, edge_label_index.size(1))
    sampled_edges = sampled_data["user", "rates", "item"]
    assert sampled_edges.edge_label_index.size(1) == (neg_sampling_ratio + 1) * num_pos  # positives + sampled negatives
    assert sampled_edges.edge_label.min() == 0
    assert sampled_edges.edge_label.max() == 1

    # Validation and test loaders share the same construction and checks.
    val_loader = _build_eval_loader(val_data, "Validation")
    test_loader = _build_eval_loader(test_data, "Test")

    return train_loader, val_loader, test_loader

In [None]:
def train(model, train_loader, epochs):
    """Fit `model` on `train_loader` for `epochs` full passes.

    Uses the notebook-level `optimizer`, `criterion` and `device`.

    Args:
        model: Module called as `model(batch)`; its output is passed to
            `criterion` together with the batch's edge labels.
        train_loader: Iterable of mini-batches exposing
            `batch['user', 'item'].edge_label`.
        epochs: Number of passes over `train_loader`.

    Returns:
        List with one entry per epoch: the mean loss weighted by the number
        of predictions in each batch.
    """

    def run_epoch():
        # One optimisation pass; returns the prediction-weighted mean loss.
        model.train()
        loss_sum, seen = 0, 0
        for batch in tqdm.tqdm(train_loader):
            optimizer.zero_grad()
            batch = batch.to(device)

            logits = model(batch)  # `forward` pass of the model
            target = batch['user', 'item'].edge_label

            batch_loss = criterion(logits, target)
            batch_loss.backward()
            optimizer.step()

            # Weight by batch size so a short final batch does not skew the mean.
            n_pred = logits.numel()
            loss_sum += float(batch_loss) * n_pred
            seen += n_pred
        return loss_sum / seen

    history = []
    for epoch_idx in range(epochs):
        mean_loss = run_epoch()
        history.append(mean_loss)
        print(f"\nEpoch: {epoch_idx + 1:03d}, Loss: {mean_loss:.4f}")
    return history

In [None]:
def test(model, test_loader):
    """Evaluate `model` on `test_loader` and report ROC-AUC and F1.

    Uses the notebook-level `device`. Model outputs are treated as logits:
    they are passed through a sigmoid for the AUC and rounded to 0/1 labels
    for the F1 score.

    Args:
        model: Module called as `model(batch)`.
        test_loader: Iterable of mini-batches exposing
            `batch['user', 'item'].edge_label`.

    Returns:
        Tuple `(auc, f1_s)`.
    """
    preds, ground_truths = [], []
    model.eval()
    with torch.no_grad():
        for batch_data in tqdm.tqdm(test_loader):
            batch_data = batch_data.to(device)
            preds.append(model(batch_data))
            ground_truths.append(batch_data['user', 'item'].edge_label)

    logits = torch.cat(preds, dim=0).cpu()
    ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
    print()

    # Sigmoid is applied on CPU in float32, matching what a
    # `torch.Tensor(numpy_array)` round trip would produce. The AUC is
    # computed once, on these probabilities.
    out = logits.float().sigmoid().numpy()
    auc = roc_auc_score(ground_truth, out)
    print('Test AUC:', auc)

    # Round probabilities to hard 0/1 predictions for the F1 score.
    pred_labels = np.rint(out)
    f1_s = f1_score(ground_truth, pred_labels)
    print('Test F1:', f1_s)

    return auc, f1_s

##### Loop

In [None]:
# --- Graph construction settings (read by `graph_con`) ---
user_has_features = True        # attach randomly initialised user features (done in the 'plm' branch)
user_feature_size = 32          # width of the random user feature vectors
item_feature_init = 'plm'       # item feature source: 'plm' (loaded feature tensor), 'zero', 'random'
item_feature_size = 768         # item feature width for 'zero'/'random' init, also passed to Model; kept equal to feature_tensor.size()[1]
random_edges = False            # if True, the observed edges are replaced by random user-item pairs

# --- Split / loader settings (read by `data_loader`) ---
neg_sampling_ratio = 1.0        # negatives per positive edge; hyper: 1, 2
batch_size = 256                # seed edges per training mini-batch
num_neighbors = [20, 10]         # neighbours sampled per hop; hyper: [10, 5], [10, 10], [20, 10], [20, 20]

# --- Model settings ---
hidden_channels = 32                          # passed to Model; hyper: 16, 32, 64
GNN_Conv_Layer = SAGEConv                     # hyper: SAGEConv, GATConv (presumably read by the Model definition - verify)
remove_embedding_layer = user_has_features    # tied to user_has_features; presumably read by the Model definition - verify

In [None]:
# Run the full pipeline (load -> build graph -> split -> train -> test -> log)
# once per dataset registered in `datas_dict`.
# Each entry is indexed as:
#   [0] users/interactions csv, [1] items csv, [2] item feature tensor (.pt),
#   [3] user id column, [4] item id column, [5] number of training epochs.
current = list(datas_dict.keys())
for dataset_name in current:
    cur_dataset = datas_dict[dataset_name]
    fusers = cur_dataset[0]
    fitems = cur_dataset[1]
    user_naming, item_naming = cur_dataset[3], cur_dataset[4]
    df_users, df_items = get_users_and_items(fusers, fitems, user_naming, item_naming)

    ffeat_items = cur_dataset[2]
    feature_tensor = torch.load(ffeat_items, map_location=device)
    print('feature tensor:', feature_tensor.shape)
    # One feature row is expected per distinct item in the interaction table.
    assert feature_tensor.shape[0] == df_users[item_naming].nunique()

    data = graph_con(df_users, df_items, user_naming, item_naming, feature_tensor)
    # NOTE: val_loader is currently unused because validation is commented out below.
    train_loader, val_loader, test_loader = data_loader(data)

    # data contains the whole graph
    num_user_nodes = data["user"].num_nodes
    num_item_nodes = data["item"].num_nodes
    graph_metadata = data.metadata()

    # A fresh model is created for every dataset.
    model = Model(user_feature_size, item_feature_size, num_user_nodes, num_item_nodes, hidden_channels, graph_metadata)
    model = model.to(device)
    print(f"Device: '{device}'")
    print(model)

    # `optimizer` and `criterion` are module-level names that `train()` reads
    # directly, so they must be (re)bound here before it is called.
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = F.binary_cross_entropy_with_logits

    epochs = cur_dataset[5]
    train_loss = train(model, train_loader, epochs)
    # val_auc, val_f1 = validate(model, val_loader)
    test_auc, test_f1 = test(model, test_loader)

    # all_train_losses.append(train_loss)
    # all_vals.append([val_auc, val_f1])
    # all_tests.append([test_auc, test_f1])

    # Per-dataset log: a "train" line with the per-epoch losses and a "test"
    # line with the AUC and F1 score. Opening with 'w' overwrites earlier runs.
    fpath = f'/content/drive/Shareddrives/NewsRec/data/k_log_{dataset_name}.txt'

    with open(fpath, 'w') as f:
      f.write('train,')
      f.write(','.join(map(str, train_loss)))
      f.write('\ntest,')
      # NOTE(review): together with the 'test,' prefix above this writes
      # "test,,<auc>,<f1>" (an empty second field) - confirm that any reader
      # of these logs expects it.
      f.write(f',{test_auc},{test_f1}')

users
users shape: (1857389, 2)
unique users: 535259
unique articles: 15721
                                              userId  \
0  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
1  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
2  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
3  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
4  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   

                                         id  
0  01c74ff7ec02862bed5f861cba7e7226dfd31beb  
1  2228c8e09bd615b509f87347675f4f0f1b74439e  
2  265d0e347e08d19ab62ffe4eadd0333eaad6e57c  
3  3011c1cce4999e3c4e02bd887468d1d1d90b7807  
4  44d3e6498035638f2e1c0580332b67e0d71af45c  

items
items shape: (15721, 2)
unique articles: 15721
                                         id  \
0  6a0612e60690288a776834811004ce133f326cee   
1  b40a30877124510cf65683b6c9391d927e20f89d   
2  29b7afc1d16d34b639597cab200c9c4c96e2b69e   
3  e6877439db8ca3bde8be8f6edb741fcc22c2772c   
4  a30fec2f3e9f2e408a2

100%|██████████| 1742/1742 [01:29<00:00, 19.41it/s]



Epoch: 001, Loss: 0.3720


100%|██████████| 1742/1742 [01:25<00:00, 20.40it/s]



Epoch: 002, Loss: 0.2494


100%|██████████| 1742/1742 [01:24<00:00, 20.59it/s]



Epoch: 003, Loss: 0.2227


100%|██████████| 1742/1742 [01:26<00:00, 20.10it/s]



Epoch: 004, Loss: 0.2104


100%|██████████| 1742/1742 [01:23<00:00, 20.99it/s]



Epoch: 005, Loss: 0.2029


100%|██████████| 726/726 [00:29<00:00, 24.36it/s]



Test AUC: 0.9705620433353779
Test F1: 0.9213352007469655
users
users shape: (1857389, 2)
unique users: 535259
unique articles: 15721
                                              userId  \
0  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
1  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
2  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
3  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
4  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   

                                         id  
0  01c74ff7ec02862bed5f861cba7e7226dfd31beb  
1  2228c8e09bd615b509f87347675f4f0f1b74439e  
2  265d0e347e08d19ab62ffe4eadd0333eaad6e57c  
3  3011c1cce4999e3c4e02bd887468d1d1d90b7807  
4  44d3e6498035638f2e1c0580332b67e0d71af45c  

items
items shape: (15721, 2)
unique articles: 15721
                                         id  \
0  6a0612e60690288a776834811004ce133f326cee   
1  b40a30877124510cf65683b6c9391d927e20f89d   
2  29b7afc1d16d34b639597cab200c9c4c96e2b69e   
3  e6877439

100%|██████████| 1742/1742 [01:27<00:00, 19.94it/s]



Epoch: 001, Loss: 0.2556


100%|██████████| 1742/1742 [01:28<00:00, 19.75it/s]



Epoch: 002, Loss: 0.1718


100%|██████████| 1742/1742 [01:27<00:00, 19.99it/s]



Epoch: 003, Loss: 0.1574


100%|██████████| 1742/1742 [01:27<00:00, 19.87it/s]



Epoch: 004, Loss: 0.1510


100%|██████████| 1742/1742 [01:27<00:00, 19.96it/s]



Epoch: 005, Loss: 0.1443


100%|██████████| 726/726 [00:29<00:00, 24.45it/s]



Test AUC: 0.9859093867993718
Test F1: 0.9450776767922745
users
users shape: (1857389, 2)
unique users: 535259
unique articles: 15721
                                              userId  \
0  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
1  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
2  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
3  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
4  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   

                                         id  
0  01c74ff7ec02862bed5f861cba7e7226dfd31beb  
1  2228c8e09bd615b509f87347675f4f0f1b74439e  
2  265d0e347e08d19ab62ffe4eadd0333eaad6e57c  
3  3011c1cce4999e3c4e02bd887468d1d1d90b7807  
4  44d3e6498035638f2e1c0580332b67e0d71af45c  

items
items shape: (15721, 2)
unique articles: 15721
                                         id  \
0  6a0612e60690288a776834811004ce133f326cee   
1  b40a30877124510cf65683b6c9391d927e20f89d   
2  29b7afc1d16d34b639597cab200c9c4c96e2b69e   
3  e6877439

100%|██████████| 1742/1742 [01:26<00:00, 20.18it/s]



Epoch: 001, Loss: 0.2590


100%|██████████| 1742/1742 [01:24<00:00, 20.52it/s]



Epoch: 002, Loss: 0.1907


100%|██████████| 1742/1742 [01:27<00:00, 20.00it/s]



Epoch: 003, Loss: 0.1725


100%|██████████| 1742/1742 [01:23<00:00, 20.86it/s]



Epoch: 004, Loss: 0.1631


100%|██████████| 1742/1742 [01:23<00:00, 20.84it/s]



Epoch: 005, Loss: 0.1577


100%|██████████| 726/726 [00:29<00:00, 24.79it/s]



Test AUC: 0.9836626845834466
Test F1: 0.9441286247487798
users
users shape: (1857389, 2)
unique users: 535259
unique articles: 15721
                                              userId  \
0  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
1  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
2  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
3  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   
4  cx:0049d95b7c3a854f8c1edb0a13197e71:1zqfs7vo2seov   

                                         id  
0  01c74ff7ec02862bed5f861cba7e7226dfd31beb  
1  2228c8e09bd615b509f87347675f4f0f1b74439e  
2  265d0e347e08d19ab62ffe4eadd0333eaad6e57c  
3  3011c1cce4999e3c4e02bd887468d1d1d90b7807  
4  44d3e6498035638f2e1c0580332b67e0d71af45c  

items
items shape: (15721, 2)
unique articles: 15721
                                         id  \
0  6a0612e60690288a776834811004ce133f326cee   
1  b40a30877124510cf65683b6c9391d927e20f89d   
2  29b7afc1d16d34b639597cab200c9c4c96e2b69e   
3  e6877439

100%|██████████| 1742/1742 [01:27<00:00, 19.98it/s]



Epoch: 001, Loss: 0.3251


100%|██████████| 1742/1742 [01:24<00:00, 20.64it/s]



Epoch: 002, Loss: 0.2245


100%|██████████| 1742/1742 [01:23<00:00, 20.76it/s]



Epoch: 003, Loss: 0.2085


100%|██████████| 1742/1742 [01:22<00:00, 21.05it/s]



Epoch: 004, Loss: 0.1971


100%|██████████| 1742/1742 [01:25<00:00, 20.48it/s]



Epoch: 005, Loss: 0.1847


100%|██████████| 726/726 [00:28<00:00, 25.06it/s]



Test AUC: 0.975822608679187
Test F1: 0.9279407123043012
users
users shape: (14417, 4)
unique users: 1619
unique articles: 821
             contentId             personId  \
0   310515487419366995 -1130272294246983140   
1   310515487419366995   344280948527967603   
2   310515487419366995 -8763398617720485024   
3 -8864073373672512525  3609194402293569455   
4   310515487419366995  3609194402293569455   

                                                 url lang  
0  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
1  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
2  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
3  http://www.startupsstars.com/2016/06/video-int...   pt  
4  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  

items
items shape: (821, 4)
unique articles: 821
             contentId                                              title  \
0 -9157338616628196758  Situação financeira ruim de varejistas pressio...   
1 -5917314377186856799  Ar

100%|██████████| 14/14 [00:00<00:00, 22.97it/s]



Epoch: 001, Loss: 0.6803


100%|██████████| 14/14 [00:00<00:00, 34.78it/s]



Epoch: 002, Loss: 0.6604


100%|██████████| 14/14 [00:00<00:00, 39.13it/s]



Epoch: 003, Loss: 0.6472


100%|██████████| 14/14 [00:00<00:00, 36.57it/s]



Epoch: 004, Loss: 0.6274


100%|██████████| 14/14 [00:00<00:00, 40.16it/s]



Epoch: 005, Loss: 0.6098


100%|██████████| 14/14 [00:00<00:00, 38.94it/s]



Epoch: 006, Loss: 0.5964


100%|██████████| 14/14 [00:00<00:00, 38.91it/s]



Epoch: 007, Loss: 0.5804


100%|██████████| 14/14 [00:00<00:00, 40.40it/s]



Epoch: 008, Loss: 0.5824


100%|██████████| 14/14 [00:00<00:00, 38.18it/s]



Epoch: 009, Loss: 0.5830


100%|██████████| 14/14 [00:00<00:00, 39.50it/s]



Epoch: 010, Loss: 0.5711


100%|██████████| 14/14 [00:00<00:00, 27.55it/s]



Epoch: 011, Loss: 0.5655


100%|██████████| 14/14 [00:00<00:00, 28.92it/s]



Epoch: 012, Loss: 0.5666


100%|██████████| 14/14 [00:00<00:00, 22.39it/s]



Epoch: 013, Loss: 0.5579


100%|██████████| 14/14 [00:00<00:00, 28.67it/s]



Epoch: 014, Loss: 0.5746


100%|██████████| 14/14 [00:00<00:00, 32.91it/s]



Epoch: 015, Loss: 0.5593


100%|██████████| 14/14 [00:00<00:00, 30.46it/s]



Epoch: 016, Loss: 0.5488


100%|██████████| 14/14 [00:00<00:00, 41.74it/s]



Epoch: 017, Loss: 0.5496


100%|██████████| 14/14 [00:00<00:00, 42.97it/s]



Epoch: 018, Loss: 0.5502


100%|██████████| 14/14 [00:00<00:00, 46.35it/s]



Epoch: 019, Loss: 0.5427


100%|██████████| 14/14 [00:00<00:00, 44.59it/s]



Epoch: 020, Loss: 0.5453


100%|██████████| 14/14 [00:00<00:00, 42.92it/s]



Epoch: 021, Loss: 0.5469


100%|██████████| 14/14 [00:00<00:00, 44.22it/s]



Epoch: 022, Loss: 0.5431


100%|██████████| 14/14 [00:00<00:00, 43.39it/s]



Epoch: 023, Loss: 0.5450


100%|██████████| 14/14 [00:00<00:00, 43.47it/s]



Epoch: 024, Loss: 0.5344


100%|██████████| 14/14 [00:00<00:00, 46.77it/s]



Epoch: 025, Loss: 0.5368


100%|██████████| 14/14 [00:00<00:00, 44.69it/s]



Epoch: 026, Loss: 0.5386


100%|██████████| 14/14 [00:00<00:00, 44.61it/s]



Epoch: 027, Loss: 0.5292


100%|██████████| 14/14 [00:00<00:00, 45.29it/s]



Epoch: 028, Loss: 0.5279


100%|██████████| 14/14 [00:00<00:00, 43.68it/s]



Epoch: 029, Loss: 0.5307


100%|██████████| 14/14 [00:00<00:00, 44.13it/s]



Epoch: 030, Loss: 0.5286


100%|██████████| 14/14 [00:00<00:00, 44.65it/s]



Epoch: 031, Loss: 0.5251


100%|██████████| 14/14 [00:00<00:00, 44.77it/s]



Epoch: 032, Loss: 0.5203


100%|██████████| 14/14 [00:00<00:00, 43.74it/s]



Epoch: 033, Loss: 0.5196


100%|██████████| 14/14 [00:00<00:00, 45.81it/s]



Epoch: 034, Loss: 0.5192


100%|██████████| 14/14 [00:00<00:00, 42.78it/s]



Epoch: 035, Loss: 0.5104


100%|██████████| 14/14 [00:00<00:00, 42.21it/s]



Epoch: 036, Loss: 0.5171


100%|██████████| 14/14 [00:00<00:00, 43.60it/s]



Epoch: 037, Loss: 0.5190


100%|██████████| 14/14 [00:00<00:00, 42.72it/s]



Epoch: 038, Loss: 0.5219


100%|██████████| 14/14 [00:00<00:00, 43.67it/s]



Epoch: 039, Loss: 0.5137


100%|██████████| 14/14 [00:00<00:00, 43.46it/s]



Epoch: 040, Loss: 0.5185


100%|██████████| 14/14 [00:00<00:00, 43.29it/s]



Epoch: 041, Loss: 0.5118


100%|██████████| 14/14 [00:00<00:00, 43.71it/s]



Epoch: 042, Loss: 0.5059


100%|██████████| 14/14 [00:00<00:00, 44.65it/s]



Epoch: 043, Loss: 0.5081


100%|██████████| 14/14 [00:00<00:00, 44.36it/s]



Epoch: 044, Loss: 0.5005


100%|██████████| 14/14 [00:00<00:00, 43.47it/s]



Epoch: 045, Loss: 0.5060


100%|██████████| 14/14 [00:00<00:00, 42.57it/s]



Epoch: 046, Loss: 0.5075


100%|██████████| 14/14 [00:00<00:00, 43.56it/s]



Epoch: 047, Loss: 0.5031


100%|██████████| 14/14 [00:00<00:00, 43.23it/s]



Epoch: 048, Loss: 0.4975


100%|██████████| 14/14 [00:00<00:00, 44.94it/s]



Epoch: 049, Loss: 0.5040


100%|██████████| 14/14 [00:00<00:00, 43.65it/s]



Epoch: 050, Loss: 0.4920


100%|██████████| 14/14 [00:00<00:00, 42.35it/s]



Epoch: 051, Loss: 0.4975


100%|██████████| 14/14 [00:00<00:00, 42.20it/s]



Epoch: 052, Loss: 0.5025


100%|██████████| 14/14 [00:00<00:00, 42.15it/s]



Epoch: 053, Loss: 0.4905


100%|██████████| 14/14 [00:00<00:00, 42.74it/s]



Epoch: 054, Loss: 0.4900


100%|██████████| 14/14 [00:00<00:00, 45.82it/s]



Epoch: 055, Loss: 0.4966


100%|██████████| 14/14 [00:00<00:00, 44.28it/s]



Epoch: 056, Loss: 0.4950


100%|██████████| 14/14 [00:00<00:00, 44.40it/s]



Epoch: 057, Loss: 0.4934


100%|██████████| 14/14 [00:00<00:00, 44.00it/s]



Epoch: 058, Loss: 0.4989


100%|██████████| 14/14 [00:00<00:00, 42.81it/s]



Epoch: 059, Loss: 0.5087


100%|██████████| 14/14 [00:00<00:00, 42.50it/s]



Epoch: 060, Loss: 0.5034


100%|██████████| 14/14 [00:00<00:00, 40.46it/s]



Epoch: 061, Loss: 0.4949


100%|██████████| 14/14 [00:00<00:00, 42.88it/s]



Epoch: 062, Loss: 0.4840


100%|██████████| 14/14 [00:00<00:00, 39.85it/s]



Epoch: 063, Loss: 0.4865


100%|██████████| 14/14 [00:00<00:00, 42.19it/s]



Epoch: 064, Loss: 0.4890


100%|██████████| 14/14 [00:00<00:00, 41.31it/s]



Epoch: 065, Loss: 0.4924


100%|██████████| 14/14 [00:00<00:00, 42.87it/s]



Epoch: 066, Loss: 0.4947


100%|██████████| 14/14 [00:00<00:00, 43.42it/s]



Epoch: 067, Loss: 0.4850


100%|██████████| 14/14 [00:00<00:00, 43.50it/s]



Epoch: 068, Loss: 0.4805


100%|██████████| 14/14 [00:00<00:00, 41.44it/s]



Epoch: 069, Loss: 0.4751


100%|██████████| 14/14 [00:00<00:00, 43.51it/s]



Epoch: 070, Loss: 0.4793


100%|██████████| 14/14 [00:00<00:00, 41.81it/s]



Epoch: 071, Loss: 0.4793


100%|██████████| 14/14 [00:00<00:00, 42.08it/s]



Epoch: 072, Loss: 0.4840


100%|██████████| 14/14 [00:00<00:00, 44.49it/s]



Epoch: 073, Loss: 0.4680


100%|██████████| 14/14 [00:00<00:00, 43.89it/s]



Epoch: 074, Loss: 0.4787


100%|██████████| 14/14 [00:00<00:00, 42.31it/s]



Epoch: 075, Loss: 0.4645


100%|██████████| 14/14 [00:00<00:00, 42.84it/s]



Epoch: 076, Loss: 0.4757


100%|██████████| 14/14 [00:00<00:00, 43.16it/s]



Epoch: 077, Loss: 0.4767


100%|██████████| 14/14 [00:00<00:00, 43.66it/s]



Epoch: 078, Loss: 0.4695


100%|██████████| 14/14 [00:00<00:00, 46.05it/s]



Epoch: 079, Loss: 0.4718


100%|██████████| 14/14 [00:00<00:00, 43.99it/s]



Epoch: 080, Loss: 0.4719


100%|██████████| 14/14 [00:00<00:00, 44.22it/s]



Epoch: 081, Loss: 0.4817


100%|██████████| 14/14 [00:00<00:00, 43.45it/s]



Epoch: 082, Loss: 0.4843


100%|██████████| 14/14 [00:00<00:00, 42.93it/s]



Epoch: 083, Loss: 0.4828


100%|██████████| 14/14 [00:00<00:00, 42.47it/s]



Epoch: 084, Loss: 0.4713


100%|██████████| 14/14 [00:00<00:00, 41.26it/s]



Epoch: 085, Loss: 0.4797


100%|██████████| 14/14 [00:00<00:00, 42.97it/s]



Epoch: 086, Loss: 0.4671


100%|██████████| 14/14 [00:00<00:00, 41.51it/s]



Epoch: 087, Loss: 0.4724


100%|██████████| 14/14 [00:00<00:00, 44.38it/s]



Epoch: 088, Loss: 0.4788


100%|██████████| 14/14 [00:00<00:00, 42.99it/s]



Epoch: 089, Loss: 0.4650


100%|██████████| 14/14 [00:00<00:00, 45.59it/s]



Epoch: 090, Loss: 0.4649


100%|██████████| 14/14 [00:00<00:00, 46.71it/s]



Epoch: 091, Loss: 0.4710


100%|██████████| 14/14 [00:00<00:00, 43.76it/s]



Epoch: 092, Loss: 0.4752


100%|██████████| 14/14 [00:00<00:00, 42.79it/s]



Epoch: 093, Loss: 0.4753


100%|██████████| 14/14 [00:00<00:00, 43.97it/s]



Epoch: 094, Loss: 0.4633


100%|██████████| 14/14 [00:00<00:00, 44.55it/s]



Epoch: 095, Loss: 0.4647


100%|██████████| 14/14 [00:00<00:00, 45.54it/s]



Epoch: 096, Loss: 0.4612


100%|██████████| 14/14 [00:00<00:00, 45.21it/s]



Epoch: 097, Loss: 0.4742


100%|██████████| 14/14 [00:00<00:00, 42.87it/s]



Epoch: 098, Loss: 0.4532


100%|██████████| 14/14 [00:00<00:00, 43.78it/s]



Epoch: 099, Loss: 0.4573


100%|██████████| 14/14 [00:00<00:00, 42.85it/s]



Epoch: 100, Loss: 0.4655


100%|██████████| 6/6 [00:00<00:00, 45.76it/s]



Test AUC: 0.7980540154232088
Test F1: 0.7352640545144805
users
users shape: (14417, 4)
unique users: 1619
unique articles: 821
             contentId             personId  \
0   310515487419366995 -1130272294246983140   
1   310515487419366995   344280948527967603   
2   310515487419366995 -8763398617720485024   
3 -8864073373672512525  3609194402293569455   
4   310515487419366995  3609194402293569455   

                                                 url lang  
0  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
1  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
2  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
3  http://www.startupsstars.com/2016/06/video-int...   pt  
4  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  

items
items shape: (821, 4)
unique articles: 821
             contentId                                              title  \
0 -9157338616628196758  Situação financeira ruim de varejistas pressio...   
1 -5917314377186856799  A

100%|██████████| 14/14 [00:00<00:00, 43.55it/s]



Epoch: 001, Loss: 0.6771


100%|██████████| 14/14 [00:00<00:00, 42.86it/s]



Epoch: 002, Loss: 0.6460


100%|██████████| 14/14 [00:00<00:00, 41.45it/s]



Epoch: 003, Loss: 0.6338


100%|██████████| 14/14 [00:00<00:00, 44.26it/s]



Epoch: 004, Loss: 0.6189


100%|██████████| 14/14 [00:00<00:00, 41.66it/s]



Epoch: 005, Loss: 0.6081


100%|██████████| 14/14 [00:00<00:00, 41.90it/s]



Epoch: 006, Loss: 0.6036


100%|██████████| 14/14 [00:00<00:00, 43.25it/s]



Epoch: 007, Loss: 0.5992


100%|██████████| 14/14 [00:00<00:00, 43.46it/s]



Epoch: 008, Loss: 0.5913


100%|██████████| 14/14 [00:00<00:00, 41.51it/s]



Epoch: 009, Loss: 0.5886


100%|██████████| 14/14 [00:00<00:00, 43.06it/s]



Epoch: 010, Loss: 0.5768


100%|██████████| 14/14 [00:00<00:00, 44.07it/s]



Epoch: 011, Loss: 0.5697


100%|██████████| 14/14 [00:00<00:00, 44.10it/s]



Epoch: 012, Loss: 0.5657


100%|██████████| 14/14 [00:00<00:00, 43.95it/s]



Epoch: 013, Loss: 0.5540


100%|██████████| 14/14 [00:00<00:00, 43.53it/s]



Epoch: 014, Loss: 0.5652


100%|██████████| 14/14 [00:00<00:00, 42.37it/s]



Epoch: 015, Loss: 0.5603


100%|██████████| 14/14 [00:00<00:00, 43.38it/s]



Epoch: 016, Loss: 0.5553


100%|██████████| 14/14 [00:00<00:00, 43.46it/s]



Epoch: 017, Loss: 0.5494


100%|██████████| 14/14 [00:00<00:00, 43.31it/s]



Epoch: 018, Loss: 0.5445


100%|██████████| 14/14 [00:00<00:00, 43.80it/s]



Epoch: 019, Loss: 0.5503


100%|██████████| 14/14 [00:00<00:00, 44.22it/s]



Epoch: 020, Loss: 0.5497


100%|██████████| 14/14 [00:00<00:00, 43.12it/s]



Epoch: 021, Loss: 0.5414


100%|██████████| 14/14 [00:00<00:00, 43.93it/s]



Epoch: 022, Loss: 0.5468


100%|██████████| 14/14 [00:00<00:00, 42.79it/s]



Epoch: 023, Loss: 0.5419


100%|██████████| 14/14 [00:00<00:00, 43.23it/s]



Epoch: 024, Loss: 0.5376


100%|██████████| 14/14 [00:00<00:00, 43.08it/s]



Epoch: 025, Loss: 0.5332


100%|██████████| 14/14 [00:00<00:00, 44.25it/s]



Epoch: 026, Loss: 0.5329


100%|██████████| 14/14 [00:00<00:00, 43.51it/s]



Epoch: 027, Loss: 0.5278


100%|██████████| 14/14 [00:00<00:00, 43.49it/s]



Epoch: 028, Loss: 0.5294


100%|██████████| 14/14 [00:00<00:00, 43.28it/s]



Epoch: 029, Loss: 0.5356


100%|██████████| 14/14 [00:00<00:00, 43.08it/s]



Epoch: 030, Loss: 0.5272


100%|██████████| 14/14 [00:00<00:00, 43.92it/s]



Epoch: 031, Loss: 0.5208


100%|██████████| 14/14 [00:00<00:00, 42.91it/s]



Epoch: 032, Loss: 0.5209


100%|██████████| 14/14 [00:00<00:00, 44.13it/s]



Epoch: 033, Loss: 0.5151


100%|██████████| 14/14 [00:00<00:00, 44.79it/s]



Epoch: 034, Loss: 0.5214


100%|██████████| 14/14 [00:00<00:00, 43.75it/s]



Epoch: 035, Loss: 0.5166


100%|██████████| 14/14 [00:00<00:00, 43.99it/s]



Epoch: 036, Loss: 0.5107


100%|██████████| 14/14 [00:00<00:00, 43.84it/s]



Epoch: 037, Loss: 0.5127


100%|██████████| 14/14 [00:00<00:00, 44.34it/s]



Epoch: 038, Loss: 0.5101


100%|██████████| 14/14 [00:00<00:00, 44.01it/s]



Epoch: 039, Loss: 0.5044


100%|██████████| 14/14 [00:00<00:00, 42.20it/s]



Epoch: 040, Loss: 0.5025


100%|██████████| 14/14 [00:00<00:00, 44.66it/s]



Epoch: 041, Loss: 0.5159


100%|██████████| 14/14 [00:00<00:00, 43.94it/s]



Epoch: 042, Loss: 0.5081


100%|██████████| 14/14 [00:00<00:00, 45.56it/s]



Epoch: 043, Loss: 0.5024


100%|██████████| 14/14 [00:00<00:00, 42.28it/s]



Epoch: 044, Loss: 0.5064


100%|██████████| 14/14 [00:00<00:00, 43.12it/s]



Epoch: 045, Loss: 0.5101


100%|██████████| 14/14 [00:00<00:00, 45.31it/s]



Epoch: 046, Loss: 0.5107


100%|██████████| 14/14 [00:00<00:00, 43.90it/s]



Epoch: 047, Loss: 0.5053


100%|██████████| 14/14 [00:00<00:00, 44.00it/s]



Epoch: 048, Loss: 0.4982


100%|██████████| 14/14 [00:00<00:00, 42.26it/s]



Epoch: 049, Loss: 0.5050


100%|██████████| 14/14 [00:00<00:00, 41.69it/s]



Epoch: 050, Loss: 0.5040


100%|██████████| 14/14 [00:00<00:00, 43.45it/s]



Epoch: 051, Loss: 0.5025


100%|██████████| 14/14 [00:00<00:00, 40.52it/s]



Epoch: 052, Loss: 0.4897


100%|██████████| 14/14 [00:00<00:00, 41.78it/s]



Epoch: 053, Loss: 0.4944


100%|██████████| 14/14 [00:00<00:00, 41.56it/s]



Epoch: 054, Loss: 0.4948


100%|██████████| 14/14 [00:00<00:00, 41.59it/s]



Epoch: 055, Loss: 0.4877


100%|██████████| 14/14 [00:00<00:00, 43.38it/s]



Epoch: 056, Loss: 0.5009


100%|██████████| 14/14 [00:00<00:00, 44.79it/s]



Epoch: 057, Loss: 0.4867


100%|██████████| 14/14 [00:00<00:00, 44.05it/s]



Epoch: 058, Loss: 0.4844


100%|██████████| 14/14 [00:00<00:00, 43.84it/s]



Epoch: 059, Loss: 0.4804


100%|██████████| 14/14 [00:00<00:00, 44.29it/s]



Epoch: 060, Loss: 0.4919


100%|██████████| 14/14 [00:00<00:00, 41.90it/s]



Epoch: 061, Loss: 0.4868


100%|██████████| 14/14 [00:00<00:00, 43.31it/s]



Epoch: 062, Loss: 0.4954


100%|██████████| 14/14 [00:00<00:00, 44.62it/s]



Epoch: 063, Loss: 0.4869


100%|██████████| 14/14 [00:00<00:00, 42.78it/s]



Epoch: 064, Loss: 0.4783


100%|██████████| 14/14 [00:00<00:00, 43.82it/s]



Epoch: 065, Loss: 0.4878


100%|██████████| 14/14 [00:00<00:00, 44.85it/s]



Epoch: 066, Loss: 0.4907


100%|██████████| 14/14 [00:00<00:00, 42.65it/s]



Epoch: 067, Loss: 0.4846


100%|██████████| 14/14 [00:00<00:00, 44.56it/s]



Epoch: 068, Loss: 0.4800


100%|██████████| 14/14 [00:00<00:00, 43.36it/s]



Epoch: 069, Loss: 0.4727


100%|██████████| 14/14 [00:00<00:00, 43.13it/s]



Epoch: 070, Loss: 0.4693


100%|██████████| 14/14 [00:00<00:00, 44.15it/s]



Epoch: 071, Loss: 0.4751


100%|██████████| 14/14 [00:00<00:00, 43.40it/s]



Epoch: 072, Loss: 0.4855


100%|██████████| 14/14 [00:00<00:00, 43.96it/s]



Epoch: 073, Loss: 0.4783


100%|██████████| 14/14 [00:00<00:00, 43.37it/s]



Epoch: 074, Loss: 0.4773


100%|██████████| 14/14 [00:00<00:00, 43.26it/s]



Epoch: 075, Loss: 0.4768


100%|██████████| 14/14 [00:00<00:00, 43.95it/s]



Epoch: 076, Loss: 0.4728


100%|██████████| 14/14 [00:00<00:00, 46.08it/s]



Epoch: 077, Loss: 0.4755


100%|██████████| 14/14 [00:00<00:00, 44.62it/s]



Epoch: 078, Loss: 0.4616


100%|██████████| 14/14 [00:00<00:00, 44.71it/s]



Epoch: 079, Loss: 0.4627


100%|██████████| 14/14 [00:00<00:00, 43.84it/s]



Epoch: 080, Loss: 0.4743


100%|██████████| 14/14 [00:00<00:00, 43.90it/s]



Epoch: 081, Loss: 0.4669


100%|██████████| 14/14 [00:00<00:00, 42.39it/s]



Epoch: 082, Loss: 0.4763


100%|██████████| 14/14 [00:00<00:00, 42.67it/s]



Epoch: 083, Loss: 0.4762


100%|██████████| 14/14 [00:00<00:00, 44.03it/s]



Epoch: 084, Loss: 0.4725


100%|██████████| 14/14 [00:00<00:00, 42.85it/s]



Epoch: 085, Loss: 0.4727


100%|██████████| 14/14 [00:00<00:00, 45.82it/s]



Epoch: 086, Loss: 0.4724


100%|██████████| 14/14 [00:00<00:00, 44.57it/s]



Epoch: 087, Loss: 0.4734


100%|██████████| 14/14 [00:00<00:00, 45.32it/s]



Epoch: 088, Loss: 0.4684


100%|██████████| 14/14 [00:00<00:00, 42.59it/s]



Epoch: 089, Loss: 0.4757


100%|██████████| 14/14 [00:00<00:00, 42.58it/s]



Epoch: 090, Loss: 0.4772


100%|██████████| 14/14 [00:00<00:00, 43.43it/s]



Epoch: 091, Loss: 0.4521


100%|██████████| 14/14 [00:00<00:00, 43.62it/s]



Epoch: 092, Loss: 0.4664


100%|██████████| 14/14 [00:00<00:00, 44.31it/s]



Epoch: 093, Loss: 0.4572


100%|██████████| 14/14 [00:00<00:00, 41.59it/s]



Epoch: 094, Loss: 0.4561


100%|██████████| 14/14 [00:00<00:00, 42.39it/s]



Epoch: 095, Loss: 0.4588


100%|██████████| 14/14 [00:00<00:00, 43.97it/s]



Epoch: 096, Loss: 0.4621


100%|██████████| 14/14 [00:00<00:00, 43.68it/s]



Epoch: 097, Loss: 0.4620


100%|██████████| 14/14 [00:00<00:00, 43.87it/s]



Epoch: 098, Loss: 0.4581


100%|██████████| 14/14 [00:00<00:00, 43.55it/s]



Epoch: 099, Loss: 0.4648


100%|██████████| 14/14 [00:00<00:00, 43.74it/s]



Epoch: 100, Loss: 0.4498


100%|██████████| 6/6 [00:00<00:00, 42.94it/s]



Test AUC: 0.7990046622145832
Test F1: 0.732947232947233
users
users shape: (14417, 4)
unique users: 1619
unique articles: 821
             contentId             personId  \
0   310515487419366995 -1130272294246983140   
1   310515487419366995   344280948527967603   
2   310515487419366995 -8763398617720485024   
3 -8864073373672512525  3609194402293569455   
4   310515487419366995  3609194402293569455   

                                                 url lang  
0  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
1  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
2  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
3  http://www.startupsstars.com/2016/06/video-int...   pt  
4  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  

items
items shape: (821, 4)
unique articles: 821
             contentId                                              title  \
0 -9157338616628196758  Situação financeira ruim de varejistas pressio...   
1 -5917314377186856799  Ar

100%|██████████| 14/14 [00:00<00:00, 42.57it/s]



Epoch: 001, Loss: 0.6876


100%|██████████| 14/14 [00:00<00:00, 43.45it/s]



Epoch: 002, Loss: 0.6793


100%|██████████| 14/14 [00:00<00:00, 44.63it/s]



Epoch: 003, Loss: 0.6723


100%|██████████| 14/14 [00:00<00:00, 43.44it/s]



Epoch: 004, Loss: 0.6681


100%|██████████| 14/14 [00:00<00:00, 45.49it/s]



Epoch: 005, Loss: 0.6616


100%|██████████| 14/14 [00:00<00:00, 44.87it/s]



Epoch: 006, Loss: 0.6561


100%|██████████| 14/14 [00:00<00:00, 42.40it/s]



Epoch: 007, Loss: 0.6480


100%|██████████| 14/14 [00:00<00:00, 44.84it/s]



Epoch: 008, Loss: 0.6445


100%|██████████| 14/14 [00:00<00:00, 45.36it/s]



Epoch: 009, Loss: 0.6398


100%|██████████| 14/14 [00:00<00:00, 43.84it/s]



Epoch: 010, Loss: 0.6313


100%|██████████| 14/14 [00:00<00:00, 45.07it/s]



Epoch: 011, Loss: 0.6333


100%|██████████| 14/14 [00:00<00:00, 44.87it/s]



Epoch: 012, Loss: 0.6254


100%|██████████| 14/14 [00:00<00:00, 42.68it/s]



Epoch: 013, Loss: 0.6253


100%|██████████| 14/14 [00:00<00:00, 44.02it/s]



Epoch: 014, Loss: 0.6201


100%|██████████| 14/14 [00:00<00:00, 42.89it/s]



Epoch: 015, Loss: 0.6183


100%|██████████| 14/14 [00:00<00:00, 42.09it/s]



Epoch: 016, Loss: 0.6176


100%|██████████| 14/14 [00:00<00:00, 43.01it/s]



Epoch: 017, Loss: 0.6141


100%|██████████| 14/14 [00:00<00:00, 43.95it/s]



Epoch: 018, Loss: 0.6085


100%|██████████| 14/14 [00:00<00:00, 41.36it/s]



Epoch: 019, Loss: 0.6144


100%|██████████| 14/14 [00:00<00:00, 44.32it/s]



Epoch: 020, Loss: 0.6013


100%|██████████| 14/14 [00:00<00:00, 43.05it/s]



Epoch: 021, Loss: 0.5963


100%|██████████| 14/14 [00:00<00:00, 43.50it/s]



Epoch: 022, Loss: 0.5920


100%|██████████| 14/14 [00:00<00:00, 44.46it/s]



Epoch: 023, Loss: 0.5916


100%|██████████| 14/14 [00:00<00:00, 42.76it/s]



Epoch: 024, Loss: 0.5929


100%|██████████| 14/14 [00:00<00:00, 43.70it/s]



Epoch: 025, Loss: 0.5778


100%|██████████| 14/14 [00:00<00:00, 44.76it/s]



Epoch: 026, Loss: 0.5785


100%|██████████| 14/14 [00:00<00:00, 44.10it/s]



Epoch: 027, Loss: 0.5802


100%|██████████| 14/14 [00:00<00:00, 43.93it/s]



Epoch: 028, Loss: 0.5686


100%|██████████| 14/14 [00:00<00:00, 44.15it/s]



Epoch: 029, Loss: 0.5693


100%|██████████| 14/14 [00:00<00:00, 42.17it/s]



Epoch: 030, Loss: 0.5681


100%|██████████| 14/14 [00:00<00:00, 43.90it/s]



Epoch: 031, Loss: 0.5692


100%|██████████| 14/14 [00:00<00:00, 43.98it/s]



Epoch: 032, Loss: 0.5613


100%|██████████| 14/14 [00:00<00:00, 44.17it/s]



Epoch: 033, Loss: 0.5420


100%|██████████| 14/14 [00:00<00:00, 44.61it/s]



Epoch: 034, Loss: 0.5550


100%|██████████| 14/14 [00:00<00:00, 42.26it/s]



Epoch: 035, Loss: 0.5468


100%|██████████| 14/14 [00:00<00:00, 44.40it/s]



Epoch: 036, Loss: 0.5453


100%|██████████| 14/14 [00:00<00:00, 41.55it/s]



Epoch: 037, Loss: 0.5511


100%|██████████| 14/14 [00:00<00:00, 43.22it/s]



Epoch: 038, Loss: 0.5426


100%|██████████| 14/14 [00:00<00:00, 43.27it/s]



Epoch: 039, Loss: 0.5440


100%|██████████| 14/14 [00:00<00:00, 42.21it/s]



Epoch: 040, Loss: 0.5404


100%|██████████| 14/14 [00:00<00:00, 40.92it/s]



Epoch: 041, Loss: 0.5484


100%|██████████| 14/14 [00:00<00:00, 42.00it/s]



Epoch: 042, Loss: 0.5431


100%|██████████| 14/14 [00:00<00:00, 43.23it/s]



Epoch: 043, Loss: 0.5408


100%|██████████| 14/14 [00:00<00:00, 42.00it/s]



Epoch: 044, Loss: 0.5378


100%|██████████| 14/14 [00:00<00:00, 43.36it/s]



Epoch: 045, Loss: 0.5412


100%|██████████| 14/14 [00:00<00:00, 45.38it/s]



Epoch: 046, Loss: 0.5325


100%|██████████| 14/14 [00:00<00:00, 43.81it/s]



Epoch: 047, Loss: 0.5340


100%|██████████| 14/14 [00:00<00:00, 42.66it/s]



Epoch: 048, Loss: 0.5379


100%|██████████| 14/14 [00:00<00:00, 44.22it/s]



Epoch: 049, Loss: 0.5396


100%|██████████| 14/14 [00:00<00:00, 43.40it/s]



Epoch: 050, Loss: 0.5449


100%|██████████| 14/14 [00:00<00:00, 44.10it/s]



Epoch: 051, Loss: 0.5323


100%|██████████| 14/14 [00:00<00:00, 44.72it/s]



Epoch: 052, Loss: 0.5360


100%|██████████| 14/14 [00:00<00:00, 44.23it/s]



Epoch: 053, Loss: 0.5258


100%|██████████| 14/14 [00:00<00:00, 44.54it/s]



Epoch: 054, Loss: 0.5330


100%|██████████| 14/14 [00:00<00:00, 43.79it/s]



Epoch: 055, Loss: 0.5271


100%|██████████| 14/14 [00:00<00:00, 44.87it/s]



Epoch: 056, Loss: 0.5245


100%|██████████| 14/14 [00:00<00:00, 45.05it/s]



Epoch: 057, Loss: 0.5279


100%|██████████| 14/14 [00:00<00:00, 44.38it/s]



Epoch: 058, Loss: 0.5284


100%|██████████| 14/14 [00:00<00:00, 44.45it/s]



Epoch: 059, Loss: 0.5243


100%|██████████| 14/14 [00:00<00:00, 42.47it/s]



Epoch: 060, Loss: 0.5243


100%|██████████| 14/14 [00:00<00:00, 43.82it/s]



Epoch: 061, Loss: 0.5266


100%|██████████| 14/14 [00:00<00:00, 42.34it/s]



Epoch: 062, Loss: 0.5276


100%|██████████| 14/14 [00:00<00:00, 44.04it/s]



Epoch: 063, Loss: 0.5219


100%|██████████| 14/14 [00:00<00:00, 45.71it/s]



Epoch: 064, Loss: 0.5090


100%|██████████| 14/14 [00:00<00:00, 45.39it/s]



Epoch: 065, Loss: 0.5222


100%|██████████| 14/14 [00:00<00:00, 42.99it/s]



Epoch: 066, Loss: 0.5180


100%|██████████| 14/14 [00:00<00:00, 43.86it/s]



Epoch: 067, Loss: 0.5206


100%|██████████| 14/14 [00:00<00:00, 42.32it/s]



Epoch: 068, Loss: 0.5161


100%|██████████| 14/14 [00:00<00:00, 44.25it/s]



Epoch: 069, Loss: 0.5225


100%|██████████| 14/14 [00:00<00:00, 42.71it/s]



Epoch: 070, Loss: 0.5148


100%|██████████| 14/14 [00:00<00:00, 43.81it/s]



Epoch: 071, Loss: 0.5364


100%|██████████| 14/14 [00:00<00:00, 43.29it/s]



Epoch: 072, Loss: 0.5150


100%|██████████| 14/14 [00:00<00:00, 43.77it/s]



Epoch: 073, Loss: 0.5180


100%|██████████| 14/14 [00:00<00:00, 43.75it/s]



Epoch: 074, Loss: 0.5134


100%|██████████| 14/14 [00:00<00:00, 43.31it/s]



Epoch: 075, Loss: 0.5028


100%|██████████| 14/14 [00:00<00:00, 42.33it/s]



Epoch: 076, Loss: 0.5090


100%|██████████| 14/14 [00:00<00:00, 42.54it/s]



Epoch: 077, Loss: 0.5100


100%|██████████| 14/14 [00:00<00:00, 42.88it/s]



Epoch: 078, Loss: 0.5130


100%|██████████| 14/14 [00:00<00:00, 41.91it/s]



Epoch: 079, Loss: 0.5130


100%|██████████| 14/14 [00:00<00:00, 41.70it/s]



Epoch: 080, Loss: 0.5117


100%|██████████| 14/14 [00:00<00:00, 38.57it/s]



Epoch: 081, Loss: 0.5033


100%|██████████| 14/14 [00:00<00:00, 41.03it/s]



Epoch: 082, Loss: 0.5087


100%|██████████| 14/14 [00:00<00:00, 43.75it/s]



Epoch: 083, Loss: 0.5053


100%|██████████| 14/14 [00:00<00:00, 43.78it/s]



Epoch: 084, Loss: 0.5073


100%|██████████| 14/14 [00:00<00:00, 42.33it/s]



Epoch: 085, Loss: 0.5090


100%|██████████| 14/14 [00:00<00:00, 40.87it/s]



Epoch: 086, Loss: 0.5015


100%|██████████| 14/14 [00:00<00:00, 41.80it/s]



Epoch: 087, Loss: 0.5073


100%|██████████| 14/14 [00:00<00:00, 41.85it/s]



Epoch: 088, Loss: 0.5078


100%|██████████| 14/14 [00:00<00:00, 43.40it/s]



Epoch: 089, Loss: 0.5058


100%|██████████| 14/14 [00:00<00:00, 43.21it/s]



Epoch: 090, Loss: 0.5069


100%|██████████| 14/14 [00:00<00:00, 44.61it/s]



Epoch: 091, Loss: 0.5118


100%|██████████| 14/14 [00:00<00:00, 43.69it/s]



Epoch: 092, Loss: 0.5043


100%|██████████| 14/14 [00:00<00:00, 44.02it/s]



Epoch: 093, Loss: 0.5105


100%|██████████| 14/14 [00:00<00:00, 44.02it/s]



Epoch: 094, Loss: 0.5124


100%|██████████| 14/14 [00:00<00:00, 43.76it/s]



Epoch: 095, Loss: 0.5181


100%|██████████| 14/14 [00:00<00:00, 44.65it/s]



Epoch: 096, Loss: 0.5238


100%|██████████| 14/14 [00:00<00:00, 44.60it/s]



Epoch: 097, Loss: 0.5076


100%|██████████| 14/14 [00:00<00:00, 44.05it/s]



Epoch: 098, Loss: 0.5115


100%|██████████| 14/14 [00:00<00:00, 44.01it/s]



Epoch: 099, Loss: 0.5062


100%|██████████| 14/14 [00:00<00:00, 41.77it/s]



Epoch: 100, Loss: 0.5098


100%|██████████| 6/6 [00:00<00:00, 45.02it/s]



Test AUC: 0.7836377987566464
Test F1: 0.7253176930596285
users
users shape: (14417, 4)
unique users: 1619
unique articles: 821
             contentId             personId  \
0   310515487419366995 -1130272294246983140   
1   310515487419366995   344280948527967603   
2   310515487419366995 -8763398617720485024   
3 -8864073373672512525  3609194402293569455   
4   310515487419366995  3609194402293569455   

                                                 url lang  
0  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
1  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
2  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  
3  http://www.startupsstars.com/2016/06/video-int...   pt  
4  http://blog.runrun.it/erros-de-portugues-em-e-...   pt  

items
items shape: (821, 3)
unique articles: 821
      contentId                                              title  \
0 -9.160000e+18  Bad financial situation of retailers puts pres...   
1 -5.920000e+18  Articles and Lectures 

100%|██████████| 14/14 [00:00<00:00, 43.73it/s]



Epoch: 001, Loss: 0.6877


100%|██████████| 14/14 [00:00<00:00, 44.26it/s]



Epoch: 002, Loss: 0.6758


100%|██████████| 14/14 [00:00<00:00, 42.99it/s]



Epoch: 003, Loss: 0.6745


100%|██████████| 14/14 [00:00<00:00, 43.41it/s]



Epoch: 004, Loss: 0.6703


100%|██████████| 14/14 [00:00<00:00, 43.52it/s]



Epoch: 005, Loss: 0.6548


100%|██████████| 14/14 [00:00<00:00, 45.10it/s]



Epoch: 006, Loss: 0.6365


100%|██████████| 14/14 [00:00<00:00, 43.14it/s]



Epoch: 007, Loss: 0.6216


100%|██████████| 14/14 [00:00<00:00, 42.49it/s]



Epoch: 008, Loss: 0.6201


100%|██████████| 14/14 [00:00<00:00, 44.02it/s]



Epoch: 009, Loss: 0.6104


100%|██████████| 14/14 [00:00<00:00, 43.07it/s]



Epoch: 010, Loss: 0.6072


100%|██████████| 14/14 [00:00<00:00, 44.52it/s]



Epoch: 011, Loss: 0.6118


100%|██████████| 14/14 [00:00<00:00, 44.01it/s]



Epoch: 012, Loss: 0.6096


100%|██████████| 14/14 [00:00<00:00, 41.86it/s]



Epoch: 013, Loss: 0.6042


100%|██████████| 14/14 [00:00<00:00, 44.18it/s]



Epoch: 014, Loss: 0.6060


100%|██████████| 14/14 [00:00<00:00, 43.35it/s]



Epoch: 015, Loss: 0.6017


100%|██████████| 14/14 [00:00<00:00, 43.28it/s]



Epoch: 016, Loss: 0.5918


100%|██████████| 14/14 [00:00<00:00, 42.62it/s]



Epoch: 017, Loss: 0.6008


100%|██████████| 14/14 [00:00<00:00, 42.53it/s]



Epoch: 018, Loss: 0.5957


100%|██████████| 14/14 [00:00<00:00, 44.57it/s]



Epoch: 019, Loss: 0.6021


100%|██████████| 14/14 [00:00<00:00, 41.90it/s]



Epoch: 020, Loss: 0.5843


100%|██████████| 14/14 [00:00<00:00, 40.59it/s]



Epoch: 021, Loss: 0.5992


100%|██████████| 14/14 [00:00<00:00, 42.92it/s]



Epoch: 022, Loss: 0.5930


100%|██████████| 14/14 [00:00<00:00, 42.57it/s]



Epoch: 023, Loss: 0.5929


100%|██████████| 14/14 [00:00<00:00, 43.39it/s]



Epoch: 024, Loss: 0.5968


100%|██████████| 14/14 [00:00<00:00, 41.52it/s]



Epoch: 025, Loss: 0.5913


100%|██████████| 14/14 [00:00<00:00, 40.71it/s]



Epoch: 026, Loss: 0.5911


100%|██████████| 14/14 [00:00<00:00, 43.41it/s]



Epoch: 027, Loss: 0.5803


100%|██████████| 14/14 [00:00<00:00, 45.35it/s]



Epoch: 028, Loss: 0.5779


100%|██████████| 14/14 [00:00<00:00, 43.69it/s]



Epoch: 029, Loss: 0.5830


100%|██████████| 14/14 [00:00<00:00, 42.75it/s]



Epoch: 030, Loss: 0.5897


100%|██████████| 14/14 [00:00<00:00, 42.97it/s]



Epoch: 031, Loss: 0.5807


100%|██████████| 14/14 [00:00<00:00, 42.54it/s]



Epoch: 032, Loss: 0.5758


100%|██████████| 14/14 [00:00<00:00, 45.63it/s]



Epoch: 033, Loss: 0.5718


100%|██████████| 14/14 [00:00<00:00, 42.09it/s]



Epoch: 034, Loss: 0.5706


100%|██████████| 14/14 [00:00<00:00, 43.14it/s]



Epoch: 035, Loss: 0.5763


100%|██████████| 14/14 [00:00<00:00, 43.52it/s]



Epoch: 036, Loss: 0.5722


100%|██████████| 14/14 [00:00<00:00, 43.77it/s]



Epoch: 037, Loss: 0.5714


100%|██████████| 14/14 [00:00<00:00, 42.42it/s]



Epoch: 038, Loss: 0.5731


100%|██████████| 14/14 [00:00<00:00, 43.40it/s]



Epoch: 039, Loss: 0.5740


100%|██████████| 14/14 [00:00<00:00, 43.12it/s]



Epoch: 040, Loss: 0.5742


100%|██████████| 14/14 [00:00<00:00, 42.78it/s]



Epoch: 041, Loss: 0.5748


100%|██████████| 14/14 [00:00<00:00, 43.10it/s]



Epoch: 042, Loss: 0.5692


100%|██████████| 14/14 [00:00<00:00, 43.62it/s]



Epoch: 043, Loss: 0.5740


100%|██████████| 14/14 [00:00<00:00, 43.12it/s]



Epoch: 044, Loss: 0.5664


100%|██████████| 14/14 [00:00<00:00, 43.62it/s]



Epoch: 045, Loss: 0.5656


100%|██████████| 14/14 [00:00<00:00, 43.09it/s]



Epoch: 046, Loss: 0.5726


100%|██████████| 14/14 [00:00<00:00, 42.64it/s]



Epoch: 047, Loss: 0.5637


100%|██████████| 14/14 [00:00<00:00, 44.33it/s]



Epoch: 048, Loss: 0.5709


100%|██████████| 14/14 [00:00<00:00, 44.39it/s]



Epoch: 049, Loss: 0.5735


100%|██████████| 14/14 [00:00<00:00, 43.65it/s]



Epoch: 050, Loss: 0.5629


100%|██████████| 14/14 [00:00<00:00, 42.68it/s]



Epoch: 051, Loss: 0.5616


100%|██████████| 14/14 [00:00<00:00, 42.99it/s]



Epoch: 052, Loss: 0.5561


100%|██████████| 14/14 [00:00<00:00, 41.78it/s]



Epoch: 053, Loss: 0.5563


100%|██████████| 14/14 [00:00<00:00, 43.14it/s]



Epoch: 054, Loss: 0.5654


100%|██████████| 14/14 [00:00<00:00, 42.20it/s]



Epoch: 055, Loss: 0.5599


100%|██████████| 14/14 [00:00<00:00, 43.67it/s]



Epoch: 056, Loss: 0.5653


100%|██████████| 14/14 [00:00<00:00, 45.86it/s]



Epoch: 057, Loss: 0.5680


100%|██████████| 14/14 [00:00<00:00, 43.53it/s]



Epoch: 058, Loss: 0.5646


100%|██████████| 14/14 [00:00<00:00, 42.89it/s]



Epoch: 059, Loss: 0.5575


100%|██████████| 14/14 [00:00<00:00, 46.11it/s]



Epoch: 060, Loss: 0.5549


100%|██████████| 14/14 [00:00<00:00, 44.55it/s]



Epoch: 061, Loss: 0.5508


100%|██████████| 14/14 [00:00<00:00, 43.03it/s]



Epoch: 062, Loss: 0.5536


100%|██████████| 14/14 [00:00<00:00, 42.00it/s]



Epoch: 063, Loss: 0.5610


100%|██████████| 14/14 [00:00<00:00, 43.05it/s]



Epoch: 064, Loss: 0.5540


100%|██████████| 14/14 [00:00<00:00, 40.61it/s]



Epoch: 065, Loss: 0.5582


100%|██████████| 14/14 [00:00<00:00, 44.17it/s]



Epoch: 066, Loss: 0.5506


100%|██████████| 14/14 [00:00<00:00, 43.43it/s]



Epoch: 067, Loss: 0.5723


100%|██████████| 14/14 [00:00<00:00, 42.77it/s]



Epoch: 068, Loss: 0.5544


100%|██████████| 14/14 [00:00<00:00, 43.57it/s]



Epoch: 069, Loss: 0.5543


100%|██████████| 14/14 [00:00<00:00, 39.98it/s]



Epoch: 070, Loss: 0.5514


100%|██████████| 14/14 [00:00<00:00, 39.65it/s]



Epoch: 071, Loss: 0.5559


100%|██████████| 14/14 [00:00<00:00, 43.56it/s]



Epoch: 072, Loss: 0.5519


100%|██████████| 14/14 [00:00<00:00, 43.60it/s]



Epoch: 073, Loss: 0.5517


100%|██████████| 14/14 [00:00<00:00, 42.64it/s]



Epoch: 074, Loss: 0.5488


100%|██████████| 14/14 [00:00<00:00, 42.72it/s]



Epoch: 075, Loss: 0.5611


100%|██████████| 14/14 [00:00<00:00, 44.31it/s]



Epoch: 076, Loss: 0.5528


100%|██████████| 14/14 [00:00<00:00, 41.62it/s]



Epoch: 077, Loss: 0.5557


100%|██████████| 14/14 [00:00<00:00, 44.43it/s]



Epoch: 078, Loss: 0.5484


100%|██████████| 14/14 [00:00<00:00, 41.81it/s]



Epoch: 079, Loss: 0.5492


100%|██████████| 14/14 [00:00<00:00, 42.30it/s]



Epoch: 080, Loss: 0.5521


100%|██████████| 14/14 [00:00<00:00, 44.14it/s]



Epoch: 081, Loss: 0.5476


100%|██████████| 14/14 [00:00<00:00, 44.64it/s]



Epoch: 082, Loss: 0.5479


100%|██████████| 14/14 [00:00<00:00, 41.61it/s]



Epoch: 083, Loss: 0.5438


100%|██████████| 14/14 [00:00<00:00, 44.18it/s]



Epoch: 084, Loss: 0.5434


100%|██████████| 14/14 [00:00<00:00, 43.70it/s]



Epoch: 085, Loss: 0.5451


100%|██████████| 14/14 [00:00<00:00, 42.90it/s]



Epoch: 086, Loss: 0.5536


100%|██████████| 14/14 [00:00<00:00, 42.65it/s]



Epoch: 087, Loss: 0.5547


100%|██████████| 14/14 [00:00<00:00, 44.81it/s]



Epoch: 088, Loss: 0.5589


100%|██████████| 14/14 [00:00<00:00, 41.98it/s]



Epoch: 089, Loss: 0.5478


100%|██████████| 14/14 [00:00<00:00, 43.55it/s]



Epoch: 090, Loss: 0.5497


100%|██████████| 14/14 [00:00<00:00, 43.30it/s]



Epoch: 091, Loss: 0.5507


100%|██████████| 14/14 [00:00<00:00, 42.91it/s]



Epoch: 092, Loss: 0.5426


100%|██████████| 14/14 [00:00<00:00, 44.36it/s]



Epoch: 093, Loss: 0.5460


100%|██████████| 14/14 [00:00<00:00, 44.48it/s]



Epoch: 094, Loss: 0.5513


100%|██████████| 14/14 [00:00<00:00, 43.30it/s]



Epoch: 095, Loss: 0.5490


100%|██████████| 14/14 [00:00<00:00, 44.67it/s]



Epoch: 096, Loss: 0.5448


100%|██████████| 14/14 [00:00<00:00, 44.68it/s]



Epoch: 097, Loss: 0.5465


100%|██████████| 14/14 [00:00<00:00, 44.32it/s]



Epoch: 098, Loss: 0.5422


100%|██████████| 14/14 [00:00<00:00, 43.52it/s]



Epoch: 099, Loss: 0.5429


100%|██████████| 14/14 [00:00<00:00, 43.03it/s]



Epoch: 100, Loss: 0.5468


100%|██████████| 6/6 [00:00<00:00, 43.71it/s]



Test AUC: 0.7554656170704187
Test F1: 0.7243486651656482
users
users shape: (26206, 4)
unique users: 1644
unique articles: 2154
             contentId             personId  \
0 -3499919498720038879 -8845298781299428018   
1  8890720798209849691 -1032019229384696495   
2 -7820640624231356730  -445337111692715325   
3 -1492913151930215984  4254153380739593270   
4  3064370296170038610  3609194402293569455   

                                                 url lang  
0             http://techcrunch.com/2016/06/07/hiri/   en  
1  https://www.nngroup.com/articles/top-intranet-...   en  
2  http://www.fastcompany.com/3058876/your-most-p...   en  
3  https://developer.chrome.com/devtools/docs/con...   en  
4  https://www.linkedin.com/pulse/google-amazon-u...   en  

items
items shape: (2154, 4)
unique articles: 2154
             contentId                                              title  \
0 -6451309518266745024  Ethereum, a Virtual Currency, Enables Transact...   
1 -4110354420726924665

100%|██████████| 25/25 [00:00<00:00, 27.19it/s]



Epoch: 001, Loss: 0.6830


100%|██████████| 25/25 [00:00<00:00, 28.18it/s]



Epoch: 002, Loss: 0.6621


100%|██████████| 25/25 [00:00<00:00, 27.55it/s]



Epoch: 003, Loss: 0.6562


100%|██████████| 25/25 [00:00<00:00, 28.48it/s]



Epoch: 004, Loss: 0.6432


100%|██████████| 25/25 [00:00<00:00, 28.17it/s]



Epoch: 005, Loss: 0.6100


100%|██████████| 25/25 [00:00<00:00, 28.22it/s]



Epoch: 006, Loss: 0.5681


100%|██████████| 25/25 [00:00<00:00, 27.78it/s]



Epoch: 007, Loss: 0.5537


100%|██████████| 25/25 [00:00<00:00, 27.97it/s]



Epoch: 008, Loss: 0.5518


100%|██████████| 25/25 [00:00<00:00, 27.82it/s]



Epoch: 009, Loss: 0.5508


100%|██████████| 25/25 [00:00<00:00, 27.95it/s]



Epoch: 010, Loss: 0.5370


100%|██████████| 25/25 [00:00<00:00, 27.86it/s]



Epoch: 011, Loss: 0.5353


100%|██████████| 25/25 [00:00<00:00, 27.98it/s]



Epoch: 012, Loss: 0.5412


100%|██████████| 25/25 [00:00<00:00, 28.26it/s]



Epoch: 013, Loss: 0.5267


100%|██████████| 25/25 [00:00<00:00, 28.03it/s]



Epoch: 014, Loss: 0.5280


100%|██████████| 25/25 [00:00<00:00, 27.95it/s]



Epoch: 015, Loss: 0.5303


100%|██████████| 25/25 [00:00<00:00, 27.09it/s]



Epoch: 016, Loss: 0.5206


100%|██████████| 25/25 [00:00<00:00, 27.70it/s]



Epoch: 017, Loss: 0.5194


100%|██████████| 25/25 [00:00<00:00, 27.88it/s]



Epoch: 018, Loss: 0.5101


100%|██████████| 25/25 [00:00<00:00, 27.86it/s]



Epoch: 019, Loss: 0.5153


100%|██████████| 25/25 [00:00<00:00, 28.41it/s]



Epoch: 020, Loss: 0.5100


100%|██████████| 25/25 [00:00<00:00, 28.16it/s]



Epoch: 021, Loss: 0.5104


100%|██████████| 25/25 [00:00<00:00, 27.95it/s]



Epoch: 022, Loss: 0.5084


100%|██████████| 25/25 [00:00<00:00, 28.36it/s]



Epoch: 023, Loss: 0.5086


100%|██████████| 25/25 [00:00<00:00, 28.49it/s]



Epoch: 024, Loss: 0.5099


100%|██████████| 25/25 [00:00<00:00, 28.19it/s]



Epoch: 025, Loss: 0.5084


100%|██████████| 25/25 [00:00<00:00, 27.75it/s]



Epoch: 026, Loss: 0.5011


100%|██████████| 25/25 [00:00<00:00, 27.99it/s]



Epoch: 027, Loss: 0.5029


100%|██████████| 25/25 [00:00<00:00, 28.04it/s]



Epoch: 028, Loss: 0.5022


100%|██████████| 25/25 [00:00<00:00, 28.31it/s]



Epoch: 029, Loss: 0.4923


100%|██████████| 25/25 [00:00<00:00, 28.35it/s]



Epoch: 030, Loss: 0.4963


100%|██████████| 25/25 [00:00<00:00, 28.00it/s]



Epoch: 031, Loss: 0.4978


100%|██████████| 25/25 [00:00<00:00, 28.92it/s]



Epoch: 032, Loss: 0.4975


100%|██████████| 25/25 [00:00<00:00, 27.58it/s]



Epoch: 033, Loss: 0.4965


100%|██████████| 25/25 [00:00<00:00, 28.01it/s]



Epoch: 034, Loss: 0.4993


100%|██████████| 25/25 [00:00<00:00, 27.80it/s]



Epoch: 035, Loss: 0.4949


100%|██████████| 25/25 [00:00<00:00, 27.14it/s]



Epoch: 036, Loss: 0.4936


100%|██████████| 25/25 [00:00<00:00, 27.71it/s]



Epoch: 037, Loss: 0.4939


100%|██████████| 25/25 [00:00<00:00, 27.35it/s]



Epoch: 038, Loss: 0.4962


100%|██████████| 25/25 [00:00<00:00, 27.40it/s]



Epoch: 039, Loss: 0.4865


100%|██████████| 25/25 [00:00<00:00, 28.22it/s]



Epoch: 040, Loss: 0.4927


100%|██████████| 25/25 [00:00<00:00, 27.54it/s]



Epoch: 041, Loss: 0.4874


100%|██████████| 25/25 [00:00<00:00, 28.06it/s]



Epoch: 042, Loss: 0.4914


100%|██████████| 25/25 [00:00<00:00, 27.85it/s]



Epoch: 043, Loss: 0.4871


100%|██████████| 25/25 [00:00<00:00, 27.53it/s]



Epoch: 044, Loss: 0.4866


100%|██████████| 25/25 [00:00<00:00, 28.62it/s]



Epoch: 045, Loss: 0.4857


100%|██████████| 25/25 [00:00<00:00, 27.83it/s]



Epoch: 046, Loss: 0.4930


100%|██████████| 25/25 [00:00<00:00, 28.02it/s]



Epoch: 047, Loss: 0.4928


100%|██████████| 25/25 [00:00<00:00, 28.00it/s]



Epoch: 048, Loss: 0.4824


100%|██████████| 25/25 [00:00<00:00, 27.60it/s]



Epoch: 049, Loss: 0.4870


100%|██████████| 25/25 [00:00<00:00, 26.73it/s]



Epoch: 050, Loss: 0.4856


100%|██████████| 25/25 [00:00<00:00, 27.35it/s]



Epoch: 051, Loss: 0.4789


100%|██████████| 25/25 [00:00<00:00, 27.00it/s]



Epoch: 052, Loss: 0.4804


100%|██████████| 25/25 [00:00<00:00, 25.92it/s]



Epoch: 053, Loss: 0.4869


100%|██████████| 25/25 [00:01<00:00, 20.84it/s]



Epoch: 054, Loss: 0.4832


100%|██████████| 25/25 [00:01<00:00, 19.92it/s]



Epoch: 055, Loss: 0.4809


100%|██████████| 25/25 [00:01<00:00, 20.29it/s]



Epoch: 056, Loss: 0.4789


100%|██████████| 25/25 [00:01<00:00, 20.37it/s]



Epoch: 057, Loss: 0.4883


100%|██████████| 25/25 [00:01<00:00, 19.58it/s]



Epoch: 058, Loss: 0.4851


100%|██████████| 25/25 [00:01<00:00, 20.79it/s]



Epoch: 059, Loss: 0.4750


100%|██████████| 25/25 [00:00<00:00, 27.65it/s]



Epoch: 060, Loss: 0.4723


100%|██████████| 25/25 [00:00<00:00, 28.48it/s]



Epoch: 061, Loss: 0.4830


100%|██████████| 25/25 [00:00<00:00, 28.51it/s]



Epoch: 062, Loss: 0.4810


100%|██████████| 25/25 [00:00<00:00, 27.90it/s]



Epoch: 063, Loss: 0.4810


100%|██████████| 25/25 [00:00<00:00, 27.81it/s]



Epoch: 064, Loss: 0.4730


100%|██████████| 25/25 [00:00<00:00, 27.17it/s]



Epoch: 065, Loss: 0.4878


100%|██████████| 25/25 [00:00<00:00, 26.89it/s]



Epoch: 066, Loss: 0.4769


100%|██████████| 25/25 [00:00<00:00, 28.24it/s]



Epoch: 067, Loss: 0.4796


100%|██████████| 25/25 [00:00<00:00, 28.61it/s]



Epoch: 068, Loss: 0.4771


100%|██████████| 25/25 [00:00<00:00, 28.12it/s]



Epoch: 069, Loss: 0.4704


100%|██████████| 25/25 [00:00<00:00, 28.36it/s]



Epoch: 070, Loss: 0.4769


100%|██████████| 25/25 [00:00<00:00, 28.62it/s]



Epoch: 071, Loss: 0.4752


100%|██████████| 25/25 [00:00<00:00, 28.14it/s]



Epoch: 072, Loss: 0.4722


100%|██████████| 25/25 [00:00<00:00, 28.45it/s]



Epoch: 073, Loss: 0.4736


100%|██████████| 25/25 [00:00<00:00, 27.88it/s]



Epoch: 074, Loss: 0.4769


100%|██████████| 25/25 [00:00<00:00, 28.56it/s]



Epoch: 075, Loss: 0.4747


100%|██████████| 25/25 [00:00<00:00, 29.17it/s]



Epoch: 076, Loss: 0.4702


100%|██████████| 25/25 [00:00<00:00, 28.85it/s]



Epoch: 077, Loss: 0.4640


100%|██████████| 25/25 [00:00<00:00, 27.90it/s]



Epoch: 078, Loss: 0.4774


100%|██████████| 25/25 [00:00<00:00, 28.39it/s]



Epoch: 079, Loss: 0.4739


100%|██████████| 25/25 [00:00<00:00, 28.53it/s]



Epoch: 080, Loss: 0.4730


100%|██████████| 25/25 [00:00<00:00, 27.86it/s]



Epoch: 081, Loss: 0.4711


100%|██████████| 25/25 [00:00<00:00, 28.21it/s]



Epoch: 082, Loss: 0.4735


100%|██████████| 25/25 [00:00<00:00, 27.76it/s]



Epoch: 083, Loss: 0.4761


100%|██████████| 25/25 [00:00<00:00, 28.44it/s]



Epoch: 084, Loss: 0.4712


100%|██████████| 25/25 [00:00<00:00, 28.38it/s]



Epoch: 085, Loss: 0.4721


100%|██████████| 25/25 [00:00<00:00, 27.89it/s]



Epoch: 086, Loss: 0.4660


100%|██████████| 25/25 [00:00<00:00, 28.14it/s]



Epoch: 087, Loss: 0.4669


100%|██████████| 25/25 [00:00<00:00, 28.24it/s]



Epoch: 088, Loss: 0.4661


100%|██████████| 25/25 [00:00<00:00, 28.04it/s]



Epoch: 089, Loss: 0.4593


100%|██████████| 25/25 [00:00<00:00, 27.94it/s]



Epoch: 090, Loss: 0.4708


100%|██████████| 25/25 [00:00<00:00, 27.69it/s]



Epoch: 091, Loss: 0.4638


100%|██████████| 25/25 [00:00<00:00, 28.16it/s]



Epoch: 092, Loss: 0.4650


100%|██████████| 25/25 [00:00<00:00, 28.38it/s]



Epoch: 093, Loss: 0.4690


100%|██████████| 25/25 [00:00<00:00, 27.92it/s]



Epoch: 094, Loss: 0.4624


100%|██████████| 25/25 [00:00<00:00, 27.62it/s]



Epoch: 095, Loss: 0.4636


100%|██████████| 25/25 [00:00<00:00, 28.03it/s]



Epoch: 096, Loss: 0.4663


100%|██████████| 25/25 [00:00<00:00, 27.38it/s]



Epoch: 097, Loss: 0.4595


100%|██████████| 25/25 [00:00<00:00, 27.36it/s]



Epoch: 098, Loss: 0.4653


100%|██████████| 25/25 [00:00<00:00, 27.61it/s]



Epoch: 099, Loss: 0.4673


100%|██████████| 25/25 [00:00<00:00, 27.37it/s]



Epoch: 100, Loss: 0.4582


100%|██████████| 11/11 [00:00<00:00, 29.10it/s]



Test AUC: 0.8326281976574791
Test F1: 0.7702604920405209
users
users shape: (40623, 4)
unique users: 1894
unique articles: 2975
             contentId             personId  \
0 -3499919498720038879 -8845298781299428018   
1  8890720798209849691 -1032019229384696495   
2 -7820640624231356730  -445337111692715325   
3 -1492913151930215984  4254153380739593270   
4  3064370296170038610  3609194402293569455   

                                                 url lang  
0             http://techcrunch.com/2016/06/07/hiri/   en  
1  https://www.nngroup.com/articles/top-intranet-...   en  
2  http://www.fastcompany.com/3058876/your-most-p...   en  
3  https://developer.chrome.com/devtools/docs/con...   en  
4  https://www.linkedin.com/pulse/google-amazon-u...   en  

items
items shape: (2975, 4)
unique articles: 2975
             contentId                                              title  \
0 -6451309518266745024  Ethereum, a Virtual Currency, Enables Transact...   
1 -4110354420726924665

100%|██████████| 39/39 [00:01<00:00, 20.80it/s]



Epoch: 001, Loss: 0.6546


100%|██████████| 39/39 [00:01<00:00, 20.83it/s]



Epoch: 002, Loss: 0.6036


100%|██████████| 39/39 [00:01<00:00, 20.31it/s]



Epoch: 003, Loss: 0.5721


100%|██████████| 39/39 [00:01<00:00, 20.63it/s]



Epoch: 004, Loss: 0.5603


100%|██████████| 39/39 [00:01<00:00, 20.54it/s]



Epoch: 005, Loss: 0.5487


100%|██████████| 39/39 [00:01<00:00, 19.81it/s]



Epoch: 006, Loss: 0.5427


100%|██████████| 39/39 [00:01<00:00, 20.57it/s]



Epoch: 007, Loss: 0.5290


100%|██████████| 39/39 [00:01<00:00, 20.06it/s]



Epoch: 008, Loss: 0.5283


100%|██████████| 39/39 [00:01<00:00, 20.65it/s]



Epoch: 009, Loss: 0.5243


100%|██████████| 39/39 [00:01<00:00, 21.01it/s]



Epoch: 010, Loss: 0.5257


100%|██████████| 39/39 [00:01<00:00, 21.01it/s]



Epoch: 011, Loss: 0.5205


100%|██████████| 39/39 [00:01<00:00, 20.92it/s]



Epoch: 012, Loss: 0.5199


100%|██████████| 39/39 [00:01<00:00, 20.65it/s]



Epoch: 013, Loss: 0.5192


100%|██████████| 39/39 [00:01<00:00, 20.85it/s]



Epoch: 014, Loss: 0.5125


100%|██████████| 39/39 [00:01<00:00, 20.94it/s]



Epoch: 015, Loss: 0.5053


100%|██████████| 39/39 [00:01<00:00, 20.74it/s]



Epoch: 016, Loss: 0.5082


100%|██████████| 39/39 [00:01<00:00, 20.67it/s]



Epoch: 017, Loss: 0.5129


100%|██████████| 39/39 [00:01<00:00, 20.95it/s]



Epoch: 018, Loss: 0.5011


100%|██████████| 39/39 [00:01<00:00, 20.97it/s]



Epoch: 019, Loss: 0.5018


100%|██████████| 39/39 [00:01<00:00, 20.94it/s]



Epoch: 020, Loss: 0.5036


100%|██████████| 39/39 [00:01<00:00, 20.56it/s]



Epoch: 021, Loss: 0.4983


100%|██████████| 39/39 [00:01<00:00, 20.48it/s]



Epoch: 022, Loss: 0.4975


100%|██████████| 39/39 [00:01<00:00, 20.62it/s]



Epoch: 023, Loss: 0.4952


100%|██████████| 39/39 [00:01<00:00, 20.61it/s]



Epoch: 024, Loss: 0.5000


100%|██████████| 39/39 [00:01<00:00, 20.72it/s]



Epoch: 025, Loss: 0.4894


100%|██████████| 39/39 [00:01<00:00, 20.68it/s]



Epoch: 026, Loss: 0.4928


100%|██████████| 39/39 [00:01<00:00, 20.66it/s]



Epoch: 027, Loss: 0.4900


100%|██████████| 39/39 [00:01<00:00, 20.98it/s]



Epoch: 028, Loss: 0.4885


100%|██████████| 39/39 [00:01<00:00, 20.20it/s]



Epoch: 029, Loss: 0.4842


100%|██████████| 39/39 [00:01<00:00, 20.53it/s]



Epoch: 030, Loss: 0.4825


100%|██████████| 39/39 [00:01<00:00, 20.94it/s]



Epoch: 031, Loss: 0.4857


100%|██████████| 39/39 [00:01<00:00, 20.87it/s]



Epoch: 032, Loss: 0.4738


100%|██████████| 39/39 [00:01<00:00, 20.81it/s]



Epoch: 033, Loss: 0.4827


100%|██████████| 39/39 [00:01<00:00, 20.76it/s]



Epoch: 034, Loss: 0.4864


100%|██████████| 39/39 [00:01<00:00, 20.79it/s]



Epoch: 035, Loss: 0.4786


100%|██████████| 39/39 [00:01<00:00, 20.95it/s]



Epoch: 036, Loss: 0.4784


100%|██████████| 39/39 [00:01<00:00, 20.84it/s]



Epoch: 037, Loss: 0.4787


100%|██████████| 39/39 [00:01<00:00, 20.57it/s]



Epoch: 038, Loss: 0.4747


100%|██████████| 39/39 [00:01<00:00, 20.85it/s]



Epoch: 039, Loss: 0.4751


100%|██████████| 39/39 [00:01<00:00, 20.78it/s]



Epoch: 040, Loss: 0.4755


100%|██████████| 39/39 [00:01<00:00, 21.08it/s]



Epoch: 041, Loss: 0.4739


100%|██████████| 39/39 [00:01<00:00, 20.93it/s]



Epoch: 042, Loss: 0.4716


100%|██████████| 39/39 [00:01<00:00, 20.73it/s]



Epoch: 043, Loss: 0.4777


100%|██████████| 39/39 [00:01<00:00, 20.72it/s]



Epoch: 044, Loss: 0.4723


100%|██████████| 39/39 [00:02<00:00, 19.27it/s]



Epoch: 045, Loss: 0.4696


100%|██████████| 39/39 [00:01<00:00, 19.63it/s]



Epoch: 046, Loss: 0.4662


100%|██████████| 39/39 [00:01<00:00, 20.28it/s]



Epoch: 047, Loss: 0.4636


100%|██████████| 39/39 [00:01<00:00, 20.40it/s]



Epoch: 048, Loss: 0.4625


100%|██████████| 39/39 [00:01<00:00, 20.72it/s]



Epoch: 049, Loss: 0.4587


100%|██████████| 39/39 [00:01<00:00, 20.60it/s]



Epoch: 050, Loss: 0.4638


100%|██████████| 39/39 [00:01<00:00, 20.68it/s]



Epoch: 051, Loss: 0.4617


100%|██████████| 39/39 [00:01<00:00, 20.59it/s]



Epoch: 052, Loss: 0.4598


100%|██████████| 39/39 [00:01<00:00, 20.44it/s]



Epoch: 053, Loss: 0.4581


100%|██████████| 39/39 [00:01<00:00, 20.43it/s]



Epoch: 054, Loss: 0.4631


100%|██████████| 39/39 [00:01<00:00, 20.64it/s]



Epoch: 055, Loss: 0.4569


100%|██████████| 39/39 [00:01<00:00, 20.35it/s]



Epoch: 056, Loss: 0.4584


100%|██████████| 39/39 [00:01<00:00, 20.82it/s]



Epoch: 057, Loss: 0.4519


100%|██████████| 39/39 [00:01<00:00, 20.71it/s]



Epoch: 058, Loss: 0.4557


100%|██████████| 39/39 [00:01<00:00, 20.64it/s]



Epoch: 059, Loss: 0.4619


100%|██████████| 39/39 [00:01<00:00, 20.18it/s]



Epoch: 060, Loss: 0.4587


100%|██████████| 39/39 [00:01<00:00, 19.61it/s]



Epoch: 061, Loss: 0.4520


100%|██████████| 39/39 [00:01<00:00, 20.07it/s]



Epoch: 062, Loss: 0.4590


100%|██████████| 39/39 [00:01<00:00, 20.72it/s]



Epoch: 063, Loss: 0.4534


100%|██████████| 39/39 [00:01<00:00, 20.59it/s]



Epoch: 064, Loss: 0.4483


100%|██████████| 39/39 [00:01<00:00, 20.91it/s]



Epoch: 065, Loss: 0.4465


100%|██████████| 39/39 [00:01<00:00, 20.60it/s]



Epoch: 066, Loss: 0.4486


100%|██████████| 39/39 [00:01<00:00, 20.94it/s]



Epoch: 067, Loss: 0.4531


100%|██████████| 39/39 [00:01<00:00, 20.94it/s]



Epoch: 068, Loss: 0.4378


100%|██████████| 39/39 [00:01<00:00, 20.40it/s]



Epoch: 069, Loss: 0.4468


100%|██████████| 39/39 [00:01<00:00, 21.04it/s]



Epoch: 070, Loss: 0.4445


100%|██████████| 39/39 [00:01<00:00, 20.88it/s]



Epoch: 071, Loss: 0.4447


100%|██████████| 39/39 [00:01<00:00, 20.87it/s]



Epoch: 072, Loss: 0.4344


100%|██████████| 39/39 [00:01<00:00, 20.92it/s]



Epoch: 073, Loss: 0.4408


100%|██████████| 39/39 [00:01<00:00, 20.78it/s]



Epoch: 074, Loss: 0.4376


100%|██████████| 39/39 [00:01<00:00, 20.72it/s]



Epoch: 075, Loss: 0.4382


100%|██████████| 39/39 [00:01<00:00, 20.60it/s]



Epoch: 076, Loss: 0.4357


100%|██████████| 39/39 [00:01<00:00, 20.66it/s]



Epoch: 077, Loss: 0.4446


100%|██████████| 39/39 [00:01<00:00, 20.95it/s]



Epoch: 078, Loss: 0.4364


100%|██████████| 39/39 [00:01<00:00, 20.71it/s]



Epoch: 079, Loss: 0.4348


100%|██████████| 39/39 [00:01<00:00, 20.47it/s]



Epoch: 080, Loss: 0.4345


100%|██████████| 39/39 [00:01<00:00, 20.79it/s]



Epoch: 081, Loss: 0.4455


100%|██████████| 39/39 [00:01<00:00, 20.69it/s]



Epoch: 082, Loss: 0.4344


100%|██████████| 39/39 [00:01<00:00, 20.66it/s]



Epoch: 083, Loss: 0.4436


100%|██████████| 39/39 [00:01<00:00, 20.23it/s]



Epoch: 084, Loss: 0.4308


100%|██████████| 39/39 [00:01<00:00, 20.33it/s]



Epoch: 085, Loss: 0.4395


100%|██████████| 39/39 [00:01<00:00, 20.33it/s]



Epoch: 086, Loss: 0.4339


100%|██████████| 39/39 [00:01<00:00, 20.44it/s]



Epoch: 087, Loss: 0.4293


100%|██████████| 39/39 [00:01<00:00, 20.26it/s]



Epoch: 088, Loss: 0.4352


100%|██████████| 39/39 [00:01<00:00, 20.24it/s]



Epoch: 089, Loss: 0.4363


100%|██████████| 39/39 [00:01<00:00, 20.92it/s]



Epoch: 090, Loss: 0.4323


100%|██████████| 39/39 [00:01<00:00, 20.57it/s]



Epoch: 091, Loss: 0.4296


100%|██████████| 39/39 [00:01<00:00, 20.41it/s]



Epoch: 092, Loss: 0.4283


100%|██████████| 39/39 [00:01<00:00, 20.32it/s]



Epoch: 093, Loss: 0.4226


100%|██████████| 39/39 [00:01<00:00, 20.66it/s]



Epoch: 094, Loss: 0.4249


100%|██████████| 39/39 [00:01<00:00, 20.71it/s]



Epoch: 095, Loss: 0.4267


100%|██████████| 39/39 [00:01<00:00, 20.71it/s]



Epoch: 096, Loss: 0.4209


100%|██████████| 39/39 [00:01<00:00, 20.77it/s]



Epoch: 097, Loss: 0.4256


100%|██████████| 39/39 [00:01<00:00, 20.16it/s]



Epoch: 098, Loss: 0.4328


100%|██████████| 39/39 [00:01<00:00, 20.62it/s]



Epoch: 099, Loss: 0.4168


100%|██████████| 39/39 [00:01<00:00, 20.39it/s]



Epoch: 100, Loss: 0.4213


100%|██████████| 16/16 [00:00<00:00, 20.21it/s]



Test AUC: 0.842172689632702
Test F1: 0.7785722658539445
users
users shape: (40623, 4)
unique users: 1894
unique articles: 2975
             contentId             personId  \
0 -3499919498720038879 -8845298781299428018   
1  8890720798209849691 -1032019229384696495   
2 -7820640624231356730  -445337111692715325   
3 -1492913151930215984  4254153380739593270   
4  3064370296170038610  3609194402293569455   

                                                 url lang  
0             http://techcrunch.com/2016/06/07/hiri/   en  
1  https://www.nngroup.com/articles/top-intranet-...   en  
2  http://www.fastcompany.com/3058876/your-most-p...   en  
3  https://developer.chrome.com/devtools/docs/con...   en  
4  https://www.linkedin.com/pulse/google-amazon-u...   en  

items
items shape: (2975, 4)
unique articles: 2975
             contentId                                              title  \
0 -6451309518266745024  Ethereum, a Virtual Currency, Enables Transact...   
1 -4110354420726924665 

100%|██████████| 39/39 [00:01<00:00, 20.73it/s]



Epoch: 001, Loss: 0.6728


100%|██████████| 39/39 [00:01<00:00, 20.39it/s]



Epoch: 002, Loss: 0.6510


100%|██████████| 39/39 [00:01<00:00, 20.71it/s]



Epoch: 003, Loss: 0.6333


100%|██████████| 39/39 [00:01<00:00, 20.41it/s]



Epoch: 004, Loss: 0.6063


100%|██████████| 39/39 [00:01<00:00, 20.13it/s]



Epoch: 005, Loss: 0.5841


100%|██████████| 39/39 [00:01<00:00, 19.72it/s]



Epoch: 006, Loss: 0.5576


100%|██████████| 39/39 [00:01<00:00, 20.54it/s]



Epoch: 007, Loss: 0.5517


100%|██████████| 39/39 [00:01<00:00, 20.70it/s]



Epoch: 008, Loss: 0.5389


100%|██████████| 39/39 [00:01<00:00, 20.57it/s]



Epoch: 009, Loss: 0.5347


100%|██████████| 39/39 [00:01<00:00, 20.46it/s]



Epoch: 010, Loss: 0.5217


100%|██████████| 39/39 [00:01<00:00, 20.35it/s]



Epoch: 011, Loss: 0.5346


100%|██████████| 39/39 [00:01<00:00, 19.99it/s]



Epoch: 012, Loss: 0.5159


100%|██████████| 39/39 [00:01<00:00, 19.96it/s]



Epoch: 013, Loss: 0.5133


100%|██████████| 39/39 [00:01<00:00, 19.70it/s]



Epoch: 014, Loss: 0.5135


100%|██████████| 39/39 [00:02<00:00, 19.26it/s]



Epoch: 015, Loss: 0.5077


100%|██████████| 39/39 [00:01<00:00, 20.18it/s]



Epoch: 016, Loss: 0.5071


100%|██████████| 39/39 [00:01<00:00, 20.52it/s]



Epoch: 017, Loss: 0.5055


100%|██████████| 39/39 [00:01<00:00, 20.72it/s]



Epoch: 018, Loss: 0.5076


100%|██████████| 39/39 [00:01<00:00, 20.78it/s]



Epoch: 019, Loss: 0.5054


100%|██████████| 39/39 [00:01<00:00, 20.52it/s]



Epoch: 020, Loss: 0.5082


100%|██████████| 39/39 [00:01<00:00, 20.31it/s]



Epoch: 021, Loss: 0.4967


100%|██████████| 39/39 [00:01<00:00, 20.25it/s]



Epoch: 022, Loss: 0.4977


100%|██████████| 39/39 [00:01<00:00, 19.94it/s]



Epoch: 023, Loss: 0.4969


100%|██████████| 39/39 [00:01<00:00, 20.24it/s]



Epoch: 024, Loss: 0.4934


100%|██████████| 39/39 [00:01<00:00, 20.59it/s]



Epoch: 025, Loss: 0.4958


100%|██████████| 39/39 [00:01<00:00, 20.58it/s]



Epoch: 026, Loss: 0.4897


100%|██████████| 39/39 [00:01<00:00, 20.93it/s]



Epoch: 027, Loss: 0.4862


100%|██████████| 39/39 [00:01<00:00, 20.53it/s]



Epoch: 028, Loss: 0.4912


100%|██████████| 39/39 [00:01<00:00, 20.68it/s]



Epoch: 029, Loss: 0.4903


100%|██████████| 39/39 [00:01<00:00, 20.50it/s]



Epoch: 030, Loss: 0.4890


100%|██████████| 39/39 [00:01<00:00, 20.35it/s]



Epoch: 031, Loss: 0.4871


100%|██████████| 39/39 [00:02<00:00, 17.81it/s]



Epoch: 032, Loss: 0.4885


100%|██████████| 39/39 [00:02<00:00, 13.94it/s]



Epoch: 033, Loss: 0.4868


100%|██████████| 39/39 [00:02<00:00, 14.07it/s]



Epoch: 034, Loss: 0.4814


100%|██████████| 39/39 [00:02<00:00, 18.15it/s]



Epoch: 035, Loss: 0.4734


100%|██████████| 39/39 [00:01<00:00, 20.24it/s]



Epoch: 036, Loss: 0.4817


100%|██████████| 39/39 [00:01<00:00, 20.56it/s]



Epoch: 037, Loss: 0.4794


100%|██████████| 39/39 [00:01<00:00, 20.78it/s]



Epoch: 038, Loss: 0.4806


100%|██████████| 39/39 [00:01<00:00, 20.61it/s]



Epoch: 039, Loss: 0.4805


100%|██████████| 39/39 [00:01<00:00, 20.65it/s]



Epoch: 040, Loss: 0.4751


100%|██████████| 39/39 [00:01<00:00, 20.63it/s]



Epoch: 041, Loss: 0.4748


100%|██████████| 39/39 [00:01<00:00, 20.77it/s]



Epoch: 042, Loss: 0.4777


100%|██████████| 39/39 [00:01<00:00, 20.85it/s]



Epoch: 043, Loss: 0.4805


100%|██████████| 39/39 [00:01<00:00, 20.50it/s]



Epoch: 044, Loss: 0.4712


100%|██████████| 39/39 [00:01<00:00, 20.31it/s]



Epoch: 045, Loss: 0.4695


100%|██████████| 39/39 [00:01<00:00, 20.72it/s]



Epoch: 046, Loss: 0.4691


100%|██████████| 39/39 [00:01<00:00, 20.47it/s]



Epoch: 047, Loss: 0.4728


100%|██████████| 39/39 [00:01<00:00, 20.31it/s]



Epoch: 048, Loss: 0.4728


100%|██████████| 39/39 [00:01<00:00, 20.32it/s]



Epoch: 049, Loss: 0.4691


100%|██████████| 39/39 [00:01<00:00, 20.63it/s]



Epoch: 050, Loss: 0.4719


100%|██████████| 39/39 [00:01<00:00, 20.19it/s]



Epoch: 051, Loss: 0.4714


100%|██████████| 39/39 [00:01<00:00, 20.42it/s]



Epoch: 052, Loss: 0.4689


100%|██████████| 39/39 [00:01<00:00, 19.91it/s]



Epoch: 053, Loss: 0.4683


100%|██████████| 39/39 [00:01<00:00, 20.70it/s]



Epoch: 054, Loss: 0.4618


100%|██████████| 39/39 [00:01<00:00, 20.57it/s]



Epoch: 055, Loss: 0.4670


100%|██████████| 39/39 [00:01<00:00, 20.40it/s]



Epoch: 056, Loss: 0.4675


100%|██████████| 39/39 [00:01<00:00, 20.62it/s]



Epoch: 057, Loss: 0.4657


100%|██████████| 39/39 [00:01<00:00, 20.64it/s]



Epoch: 058, Loss: 0.4672


100%|██████████| 39/39 [00:01<00:00, 20.31it/s]



Epoch: 059, Loss: 0.4604


100%|██████████| 39/39 [00:01<00:00, 20.05it/s]



Epoch: 060, Loss: 0.4633


100%|██████████| 39/39 [00:01<00:00, 20.12it/s]



Epoch: 061, Loss: 0.4670


100%|██████████| 39/39 [00:01<00:00, 20.22it/s]



Epoch: 062, Loss: 0.4659


100%|██████████| 39/39 [00:01<00:00, 20.34it/s]



Epoch: 063, Loss: 0.4552


100%|██████████| 39/39 [00:01<00:00, 20.22it/s]



Epoch: 064, Loss: 0.4625


100%|██████████| 39/39 [00:01<00:00, 20.35it/s]



Epoch: 065, Loss: 0.4569


100%|██████████| 39/39 [00:01<00:00, 20.33it/s]



Epoch: 066, Loss: 0.4652


100%|██████████| 39/39 [00:01<00:00, 20.30it/s]



Epoch: 067, Loss: 0.4651


100%|██████████| 39/39 [00:01<00:00, 19.57it/s]



Epoch: 068, Loss: 0.4495


100%|██████████| 39/39 [00:01<00:00, 19.98it/s]



Epoch: 069, Loss: 0.4569


100%|██████████| 39/39 [00:01<00:00, 20.38it/s]



Epoch: 070, Loss: 0.4552


100%|██████████| 39/39 [00:01<00:00, 20.59it/s]



Epoch: 071, Loss: 0.4525


100%|██████████| 39/39 [00:01<00:00, 20.10it/s]



Epoch: 072, Loss: 0.4540


100%|██████████| 39/39 [00:01<00:00, 20.36it/s]



Epoch: 073, Loss: 0.4575


100%|██████████| 39/39 [00:01<00:00, 20.52it/s]



Epoch: 074, Loss: 0.4529


100%|██████████| 39/39 [00:01<00:00, 20.14it/s]



Epoch: 075, Loss: 0.4564


100%|██████████| 39/39 [00:01<00:00, 20.63it/s]



Epoch: 076, Loss: 0.4587


100%|██████████| 39/39 [00:01<00:00, 20.27it/s]



Epoch: 077, Loss: 0.4565


100%|██████████| 39/39 [00:01<00:00, 20.20it/s]



Epoch: 078, Loss: 0.4613


100%|██████████| 39/39 [00:01<00:00, 20.59it/s]



Epoch: 079, Loss: 0.4497


100%|██████████| 39/39 [00:01<00:00, 20.26it/s]



Epoch: 080, Loss: 0.4471


100%|██████████| 39/39 [00:01<00:00, 21.06it/s]



Epoch: 081, Loss: 0.4516


100%|██████████| 39/39 [00:01<00:00, 20.11it/s]



Epoch: 082, Loss: 0.4498


100%|██████████| 39/39 [00:01<00:00, 19.68it/s]



Epoch: 083, Loss: 0.4570


100%|██████████| 39/39 [00:02<00:00, 19.24it/s]



Epoch: 084, Loss: 0.4489


100%|██████████| 39/39 [00:01<00:00, 20.19it/s]



Epoch: 085, Loss: 0.4453


100%|██████████| 39/39 [00:01<00:00, 20.73it/s]



Epoch: 086, Loss: 0.4567


100%|██████████| 39/39 [00:01<00:00, 20.43it/s]



Epoch: 087, Loss: 0.4502


100%|██████████| 39/39 [00:01<00:00, 20.37it/s]



Epoch: 088, Loss: 0.4514


100%|██████████| 39/39 [00:01<00:00, 20.42it/s]



Epoch: 089, Loss: 0.4478


100%|██████████| 39/39 [00:01<00:00, 20.57it/s]



Epoch: 090, Loss: 0.4475


100%|██████████| 39/39 [00:01<00:00, 20.51it/s]



Epoch: 091, Loss: 0.4453


100%|██████████| 39/39 [00:01<00:00, 20.31it/s]



Epoch: 092, Loss: 0.4463


100%|██████████| 39/39 [00:01<00:00, 20.44it/s]



Epoch: 093, Loss: 0.4525


100%|██████████| 39/39 [00:01<00:00, 20.41it/s]



Epoch: 094, Loss: 0.4468


100%|██████████| 39/39 [00:01<00:00, 20.75it/s]



Epoch: 095, Loss: 0.4482


100%|██████████| 39/39 [00:01<00:00, 20.82it/s]



Epoch: 096, Loss: 0.4460


100%|██████████| 39/39 [00:01<00:00, 20.93it/s]



Epoch: 097, Loss: 0.4498


100%|██████████| 39/39 [00:01<00:00, 20.53it/s]



Epoch: 098, Loss: 0.4417


100%|██████████| 39/39 [00:02<00:00, 19.29it/s]



Epoch: 099, Loss: 0.4447


100%|██████████| 39/39 [00:01<00:00, 19.56it/s]



Epoch: 100, Loss: 0.4465


100%|██████████| 16/16 [00:00<00:00, 19.83it/s]



Test AUC: 0.8433426400879911
Test F1: 0.7658001724775163
users
users shape: (4658157, 2)
unique users: 155855
unique articles: 67147
   User_ID  News_ID
0  U321454  N128643
1  U578952  N122359
2  U578952  N110096
3  U578952   N20583
4  U578952  N128736

items
items shape: (67147, 2)
unique articles: 67147
  News_ID                                              title
0  N88753  The Brands Queen Elizabeth, Prince Charles, an...
1  N45436    Walmart Slashes Prices on Last-Generation iPads
2  N23144                      50 Worst Habits For Belly Fat
3  N86255  Dispose of unwanted prescription drugs during ...
4  N93187  The Cost of Trump's Aid Freeze in the Trenches...

article set is same: True
feature tensor: torch.Size([67147, 768])
Mapping of user IDs to consecutive values:
    userId  mappedID
0  U321454         0
1  U578952         1
2  U100987         2
3   U55226         3
4  U211115         4

Mapping of item IDs to consecutive values:
   itemId  mappedID
0  N88753         0
1  N4

 79%|███████▉  | 3457/4368 [18:10<04:51,  3.13it/s]

# Draft

In [None]:
# Display the mini-batch object to inspect which node/edge stores and attributes it carries.
batch_data

HeteroData(
  [1muser[0m={ node_id=[224] },
  [1mitem[0m={
    node_id=[99],
    x=[99, 768]
  },
  [1m(user, rates, item)[0m={
    edge_index=[2, 205],
    edge_label=[20],
    edge_label_index=[2, 20],
    input_id=[20]
  },
  [1m(item, rev_rates, user)[0m={ edge_index=[2, 150] }
)

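What is `input_id`? Judging by the consecutive values printed below, it looks like the position of each seed edge of this batch within the loader's full list of label edges (to be confirmed against the PyG `LinkNeighborLoader` documentation).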

In [None]:
# Show the `input_id` attribute stored on the ('user', 'rates', 'item') edge type of the batch.
batch_data['user', 'rates', 'item'].input_id

tensor([557056, 557057, 557058, 557059, 557060, 557061, 557062, 557063, 557064,
        557065, 557066, 557067, 557068, 557069, 557070, 557071, 557072, 557073,
        557074, 557075, 557076, 557077, 557078, 557079, 557080, 557081, 557082,
        557083, 557084, 557085, 557086, 557087, 557088, 557089, 557090, 557091,
        557092, 557093, 557094, 557095, 557096, 557097, 557098, 557099, 557100,
        557101, 557102, 557103, 557104, 557105, 557106, 557107, 557108, 557109,
        557110, 557111, 557112, 557113, 557114, 557115, 557116, 557117, 557118,
        557119, 557120, 557121, 557122, 557123, 557124, 557125, 557126, 557127,
        557128, 557129, 557130, 557131, 557132, 557133, 557134, 557135, 557136,
        557137, 557138, 557139, 557140, 557141, 557142, 557143, 557144, 557145,
        557146, 557147, 557148, 557149, 557150, 557151, 557152, 557153, 557154,
        557155, 557156, 557157, 557158, 557159, 557160, 557161, 557162, 557163,
        557164, 557165, 557166, 557167, 

In [None]:
# Show the `edge_label_index` attribute of the ('user', 'rates', 'item') edge type
# (the output below is a 2-row integer tensor).
batch_data['user', 'rates', 'item'].edge_label_index

tensor([[113, 119, 125, 139,   0,  99,  80,  77,  98,  67, 128,  31,  47,  16,
          91,  33,  55,  24,  30,  74,  88, 146, 111,  41, 102,  73, 108,  53,
          60,  28, 137,  23,  38,  87, 100, 110,  97,  65,  51,  34, 118,  19,
         155,  44, 150,  13,  54,  27, 107,  25,  50, 123, 112,  64,  92,  43,
          39,  76,  52, 109, 147,  94,   3,  42,  75,   7, 157, 156,  96,   9,
         142, 129,  71,  22,  46,  10,  81, 122,  37,  78,  21,  11,   5,  69,
          14,  68, 144,  17,  90, 145, 154, 152, 149,  95,  12, 103, 104,  93,
          63, 106,  40,  29, 121, 140,  35,  49,  57, 126, 127,  15,  82,   1,
          59, 117,  89, 124, 120, 132,  85,  79,  45,  56,  20, 151, 114,  26,
          86,  32,  48,   6,  84,  66, 130, 116, 115,   8, 148, 153, 136, 141,
          58, 101,  61,  70, 143,  62,   4,   2, 135, 131,  72,  36, 134, 138,
          18, 133,  83, 105],
        [  2,  80, 100,  92,  25,  38,  74,  85, 144,  99, 113,  84,  17, 131,
          61,  58,   5

In [None]:
# Show the `edge_label` attribute of the ('user', 'rates', 'item') edge type for this batch.
batch_data['user', 'rates', 'item'].edge_label

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       device='cuda:0')

In [None]:
# Last entry of `ground_truths` — presumably the labels collected for the final evaluated batch (verify against the eval loop).
ground_truths[-1]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       device='cuda:0')

In [None]:
# Last entry of `preds` — presumably the raw model scores (logits) for the final evaluated batch (verify against the eval loop).
preds[-1]

tensor([ -3.6659,  -5.9209,  -3.8125,  -3.8156,  -4.9753,  -4.2238,  -4.2707,
        -11.7944,  -4.9251,  -3.5596,  -6.4042,  -0.0456,  -1.0130,  -7.3948,
        -13.3436,  -3.7321,  -6.4111,  -6.3421,  -2.7053,   1.7885,  -6.3836,
         -8.3329,  -6.3701,  -5.0134,  -1.7026,  -4.2668, -10.5365, -12.4031,
         -7.9567,  -4.2070,  -3.5657,  -1.4284, -10.8802,  -8.2110,  -6.4754,
         -3.1270,  -8.6796,  -4.9021,  -9.6323,  -8.6613, -10.1006,  -3.5280,
         -2.2421,  -3.4787,  -9.1882,  -2.7364,  -3.8944,  -9.9798,  -5.8470,
         -3.2481,  -8.0756, -11.7022,  -3.6880,  -6.8287,  -6.6260,  -8.0269,
          0.0511, -11.6987,  -0.9760,  -9.0100, -11.5480,  -8.1490,  -8.7247,
         -6.7875,   0.2181,  -7.4815,  -2.0227,  -9.6268,  -3.5110,  -7.6323,
         -1.2116,  -4.2475,  -8.2994,  -5.1759,  -2.0947,  -9.9370,  -4.4411,
         -5.1921,  -3.3492,  -8.0734,  -5.4666,  -3.7695,  -8.5921,  -5.9587,
         -3.9303,  -3.1428,  -4.4850,  -7.2361,  -6.2845,  -5.70

In [None]:
# Flatten the last entry of `preds`, squash it with a sigmoid, move it to a NumPy array
# and round every value to the nearest integer in a single step.
# Note: despite its name, `probs` ends up holding the rounded values, not the raw sigmoid outputs.
probs = np.rint(preds[-1].view(-1).sigmoid().cpu().numpy())
probs

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.], dtype=float32)

In [None]:
# (number of entries - sum of the rounded entries) / number of entries.
# When every value of `probs` lies in [0, 1] this is the share of entries that round to 0.
(len(probs) - np.sum(np.rint(probs))) / len(probs)

0.7552625491556789

In [None]:
# draft
def predictions(max = 1000, threshold = 0.99):
    """
    Collect candidate edges that do not exist yet but that the model scores above ``threshold``.

    Relies on the notebook-level globals ``data``, ``model``, ``batch_size`` and ``device``.
    For every sampled sub-graph, negative (non-existing) edges are drawn, scored with
    ``model.decode`` and kept when their sigmoid score exceeds ``threshold``.

    :param max: the maximum amount of predictions to output (the name shadows the
        builtin ``max`` inside this function and is kept for backward compatibility)
    :param threshold: sigmoid score a sampled edge has to exceed to be reported
    :return: DataFrame with the columns ``source`` and ``target``, one row per
        predicted edge and at most ``max`` rows; empty (but with both columns)
        when nothing passes the threshold
    """
    # Local imports keep the function self-contained: the notebook's visible import
    # cells bring in `tqdm` as a module (not the callable) and only `LinkNeighborLoader`.
    from tqdm import tqdm as progress_bar
    from torch_geometric.loader import NeighborLoader

    # Builtin `max` is shadowed by the parameter, so clamp manually.
    limit = max if max > 0 else 0
    pred_edges = []

    loader = NeighborLoader(data, num_neighbors = [10] * 2, shuffle = True, input_nodes = None, batch_size = batch_size)

    # Pure inference: no autograd graph is needed.
    # NOTE(review): the model is not switched to eval mode here; confirm the caller does so
    # if the model contains dropout / batch-norm layers.
    with torch.no_grad():
        for batch in progress_bar(loader):
            batch = batch.to(device)
            z = model.encode(batch.x, batch.edge_index)
            # Sampling negative edges ensures the decoded pairs are actual non-existing
            # edges of the sampled sub-graph.
            neg_edge_index = negative_sampling(edge_index = batch.edge_index, num_nodes = None, num_neg_samples = None, method = 'sparse')
            out = model.decode(z, neg_edge_index).view(-1).sigmoid()

            selected = neg_edge_index[:, out > threshold]
            if selected.numel() == 0:
                continue

            # Node indices inside a sampled batch are local to that sub-graph; map them
            # back to global node ids when the loader provides the mapping, so that edges
            # collected from different batches refer to the same id space.
            n_id = getattr(batch, 'n_id', None)
            if n_id is not None:
                selected = n_id[selected]

            pred_edges += selected.t().cpu().tolist()
            if len(pred_edges) >= limit:
                break

    # The last batch may overshoot the limit, so truncate; explicit columns keep the
    # schema stable even when no edge was found.
    return pd.DataFrame(pred_edges[:limit], columns = ['source', 'target'])