In [1]:
from companykg import CompanyKG

## Initialize CompanyKG

In [2]:
# Root folder for the CompanyKG data files.
DATA_ROOT_FOLDER = "./data"

# Load CompanyKG with "msbert" node features and with edge weights.
# This may take a long time if the data files have not been downloaded yet.
comkg = CompanyKG(
    nodes_feature_type="msbert",
    load_edges_weights=True,
    data_root_folder=DATA_ROOT_FOLDER,
)

In [3]:
# nodes_id is the list of node IDs; its length is the number of nodes in the graph
len(comkg.nodes_id)

1169931

## Inspect CompanyKG

In [4]:
# Print a summary of the loaded data: data folder, graph size, feature and
# edge-weight dimensions, and the number of samples in each evaluation task
comkg.describe()

data_root_folder=./data
n_nodes=1169931, n_edges=50815503
nodes_feature_type=msbert
nodes_feature_dimension=512
edges_weight_dimension=15
sp: 1764 samples
sr: 1856 samples
cr: 400 samples
ep: 40000 samples


In [5]:
# Node features: a 2-D tensor whose rows are ordered as in comkg.nodes_id
comkg.nodes_feature

tensor([[ 0.0642,  0.0100, -0.0427,  ...,  0.0580,  0.0156, -0.0645],
        [ 0.0563, -0.0086, -0.0114,  ..., -0.0381, -0.0020, -0.1615],
        [ 0.0668, -0.0803,  0.0296,  ...,  0.0003,  0.0872, -0.0662],
        ...,
        [-0.0072,  0.0593, -0.0401,  ...,  0.0861,  0.0681, -0.0302],
        [-0.0445, -0.1243,  0.0048,  ...,  0.0149, -0.0466, -0.0800],
        [-0.0396, -0.0113, -0.0121,  ..., -0.0240, -0.0342, -0.0039]])

In [6]:
# Edges: a two-column tensor, each row giving the pair of nodes joined by one edge
comkg.edges

tensor([[ 113091,  412357],
        [ 560244, 1164306],
        [ 388246, 1121544],
        ...,
        [  84160,  837013],
        [ 179090,  917143],
        [ 179090,  226260]])

In [7]:
# Edge weights: a 2-D tensor of weights (presumably aligned row-by-row with
# comkg.edges -- verify against the library documentation)
comkg.edges_weight

tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.7918],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.7918],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.7918],
        ...,
        [1.0000, 0.0000, 1.0000,  ..., 0.0000, 0.0000, 0.0000],
        [1.0000, 1.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [1.0000, 1.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])

## Inspect Evaluation Data

In [8]:
# SP samples: node pairs (node_id0, node_id1) with a 0/1 label.
# Index [1] selects the table of samples displayed below.
comkg.eval_tasks['sp'][1]

Unnamed: 0,node_id0,node_id1,label
0,1070786,136969,0
1,606364,184359,0
2,3848,223709,1
3,876994,242417,0
4,574071,242417,0
...,...,...,...
1759,771712,1027754,1
1760,1114431,1046037,0
1761,247018,1046339,0
1762,98621,1168288,1


In [9]:
# SR samples: a target node, two candidate nodes and a 0/1 label, each row
# tagged with a validation/test split. Index [1] selects the table of samples.
comkg.eval_tasks['sr'][1]

Unnamed: 0,target_node_id,candidate0_node_id,candidate1_node_id,label,split
0,201389,198435,797652,0,test
1,450703,618486,624384,0,test
2,1097415,297978,386584,0,validation
3,81000,244410,1016534,0,test
4,861572,1155658,1115208,0,test
...,...,...,...,...,...
1851,522257,669089,607981,1,test
1852,1083662,203070,482478,1,validation
1853,354276,551887,865995,0,test
1854,830672,504882,1046882,1,test


In [10]:
# CR samples: (target_node_id, competitor_node_id) pairs; the same target node
# can appear in several rows. Index [1] selects the table of samples.
comkg.eval_tasks['cr'][1]

Unnamed: 0,target_node_id,competitor_node_id
0,3843,34994
1,3843,263332
2,3843,1034500
3,4981,45823
4,4981,288480
...,...,...
395,1144634,1004440
396,1144634,1077443
397,1163522,172921
398,1163522,268689


In [11]:
# EP samples: node pairs with one 0/1 column per ET (et2 ... et15) and a
# validation/test split. Index [1] selects the table of samples.
comkg.eval_tasks['ep'][1]

Unnamed: 0,node_id0,node_id1,et2,et3,et4,et5,et8,et10,et14,et15,split
0,217024,716915,0,0,0,0,0,0,0,1,test
1,435690,994558,0,0,0,0,0,1,0,0,test
2,78531,691964,0,0,0,0,0,0,0,1,test
3,457175,456862,0,0,0,1,0,0,0,0,test
4,430325,84446,0,0,0,1,0,0,0,0,validation
...,...,...,...,...,...,...,...,...,...,...,...
39995,89061,609621,0,0,0,0,0,0,1,0,test
39996,197242,938619,0,0,0,0,0,0,1,0,test
39997,367197,465661,0,0,0,0,1,0,0,0,test
39998,597795,668947,0,0,0,0,0,1,0,0,test


## Evaluate Node Feature

In [12]:
# Run all evaluation tasks (SP, SR, CR and EP) on the loaded node features.
# The results are kept in eval_results and inspected key by key in the next cells.
eval_results = comkg.evaluate()

Evaluate Node Features msbert:
Evaluate SP ...
SP AUC: 0.7519511905744471
Evaluate SR ...
SR Validation ACC: 0.6956521739130435 SR Test ACC: 0.6713709677419355
Evaluate CR with top-K hit rate (K=[50, 100, 200, 500, 1000, 2000, 5000, 10000]) ...
CR Hit Rates: [0.12955922001974632, 0.18240535049745576, 0.23030967570441258, 0.31102329687856, 0.4143004291030607, 0.47711466165413524, 0.5583993126756285, 0.6349049707602339]
Evaluate EP ...
Trial 1
Using device: cuda
Epoch [1/100], Validation Loss: 1.6778, Overall Validation AUC-ROC: 0.7937
Class 0 Validation AUC-ROC: 0.7330
Class 1 Validation AUC-ROC: 0.7873
Class 2 Validation AUC-ROC: 0.7210
Class 3 Validation AUC-ROC: 0.9202
Class 4 Validation AUC-ROC: 0.9213
Class 5 Validation AUC-ROC: 0.7016
Class 6 Validation AUC-ROC: 0.7607
Class 7 Validation AUC-ROC: 0.8045
Epoch [2/100], Validation Loss: 1.6365, Overall Validation AUC-ROC: 0.8086
Class 0 Validation AUC-ROC: 0.7586
Class 1 Validation AUC-ROC: 0.7993
Class 2 Validation AUC-ROC: 0.7397


In [13]:
# SP task: AUC score (the same value evaluate() printed as "SP AUC")
eval_results["sp_auc"]

0.7519511905744471

In [14]:
# SR task: accuracy on the test split
eval_results["sr_test_acc"]

0.6713709677419355

In [15]:
# SR task: accuracy on the validation split
eval_results["sr_validation_acc"]

0.6956521739130435

In [16]:
# CR task: top-K hit rates, one value per K in the order printed by evaluate()
# (K=[50, 100, 200, 500, 1000, 2000, 5000, 10000])
eval_results["cr_topk_hit_rate"]

[0.12955922001974632,
 0.18240535049745576,
 0.23030967570441258,
 0.31102329687856,
 0.4143004291030607,
 0.47711466165413524,
 0.5583993126756285,
 0.6349049707602339]

In [17]:
# EP task: test AUC aggregated over all trials (overall and per-class mean/std),
# followed by the overall and per-class test AUC of each individual trial
eval_results["ep_test_auc"]

{'overall_mean': 0.8152710799319728,
 'overall_std': 0.0011013208302879237,
 'per_class_mean': array([0.75057417, 0.8148841 , 0.75387312, 0.92682654, 0.92663642,
        0.75045155, 0.78954263, 0.80938012]),
 'per_class_std': array([0.00467002, 0.00671789, 0.00189928, 0.00319254, 0.00073781,
        0.00627061, 0.00250782, 0.00553296]),
 'test_auc_scores': [{'overall_test_auc': 0.8168251581632653,
   'per_class_test_auc': array([0.75219244, 0.81377389, 0.75488691, 0.92551108, 0.92647168,
          0.75917381, 0.78688562, 0.81570583])},
  {'overall_test_auc': 0.8144046428571429,
   'per_class_test_auc': array([0.75531024, 0.80726787, 0.75121214, 0.93122468, 0.92761108,
          0.74747696, 0.79290557, 0.80222859])},
  {'overall_test_auc': 0.8145834387755102,
   'per_class_test_auc': array([0.74421981, 0.82361053, 0.75552031, 0.92374386, 0.9258265 ,
          0.74470388, 0.78883669, 0.81020594])}]}

## Evaluate Saved Embedding

In [18]:
# Run all evaluation tasks on embeddings saved to a file in torch.Tensor format.
# The example file is the msbert node-feature file inside the data root folder,
# so its path is built from DATA_ROOT_FOLDER instead of repeating "./data";
# changing the data folder in the configuration cell then keeps this cell valid.

EMBEDDINGS_FILE = f"{DATA_ROOT_FOLDER}/nodes_feature_msbert.pt"

# Note: this overwrites eval_results from the previous evaluation run.
eval_results = comkg.evaluate(embeddings_file=EMBEDDINGS_FILE)

Evaluate Node Embeddings ./data/nodes_feature_msbert.pt:
Evaluate SP ...
SP AUC: 0.7519511905744471
Evaluate SR ...
SR Validation ACC: 0.6956521739130435 SR Test ACC: 0.6713709677419355
Evaluate CR with top-K hit rate (K=[50, 100, 200, 500, 1000, 2000, 5000, 10000]) ...
CR Hit Rates: [0.12955922001974632, 0.18240535049745576, 0.23030967570441258, 0.31102329687856, 0.4143004291030607, 0.47711466165413524, 0.5583993126756285, 0.6349049707602339]
Evaluate EP ...
Trial 1
Using device: cuda
Epoch [1/100], Validation Loss: 1.6720, Overall Validation AUC-ROC: 0.7911
Class 0 Validation AUC-ROC: 0.7555
Class 1 Validation AUC-ROC: 0.7570
Class 2 Validation AUC-ROC: 0.7154
Class 3 Validation AUC-ROC: 0.9259
Class 4 Validation AUC-ROC: 0.9210
Class 5 Validation AUC-ROC: 0.6953
Class 6 Validation AUC-ROC: 0.7547
Class 7 Validation AUC-ROC: 0.8044
Epoch [2/100], Validation Loss: 1.7054, Overall Validation AUC-ROC: 0.8080
Class 0 Validation AUC-ROC: 0.7765
Class 1 Validation AUC-ROC: 0.7904
Class 2 V

In [19]:
# SP task: AUC score for the embeddings loaded from EMBEDDINGS_FILE
eval_results["sp_auc"]

0.7519511905744471

In [20]:
# SR task: test-split accuracy for the embeddings loaded from EMBEDDINGS_FILE
eval_results["sr_test_acc"]

0.6713709677419355

In [21]:
# SR task: validation-split accuracy for the embeddings loaded from EMBEDDINGS_FILE
eval_results["sr_validation_acc"]

0.6956521739130435

In [22]:
# CR task: top-K hit rates for the embeddings loaded from EMBEDDINGS_FILE,
# one value per K in the order printed by evaluate()
eval_results["cr_topk_hit_rate"]

[0.12955922001974632,
 0.18240535049745576,
 0.23030967570441258,
 0.31102329687856,
 0.4143004291030607,
 0.47711466165413524,
 0.5583993126756285,
 0.6349049707602339]

In [23]:
# EP task for the embeddings loaded from EMBEDDINGS_FILE: test AUC aggregated
# over all trials (overall and per-class mean/std) plus each trial's own scores
eval_results["ep_test_auc"]

{'overall_mean': 0.815198977648202,
 'overall_std': 0.0027160472182029384,
 'per_class_mean': array([0.74505209, 0.81318325, 0.75822825, 0.9289814 , 0.9262074 ,
        0.75893122, 0.7894433 , 0.80156491]),
 'per_class_std': array([0.0116706 , 0.00490813, 0.00156706, 0.00126533, 0.00032441,
        0.00461707, 0.00253323, 0.00406191]),
 'test_auc_scores': [{'overall_test_auc': 0.8139624424198251,
   'per_class_test_auc': array([0.74160419, 0.80841078, 0.75842525, 0.92827949, 0.92644288,
          0.76456039, 0.78603384, 0.79794273])},
  {'overall_test_auc': 0.8189666268221574,
   'per_class_test_auc': array([0.76075417, 0.81993438, 0.7562181 , 0.93075787, 0.92574866,
          0.758982  , 0.79210068, 0.80723715])},
  {'overall_test_auc': 0.8126678637026239,
   'per_class_test_auc': array([0.7327979 , 0.8112046 , 0.76004141, 0.92790685, 0.92643065,
          0.75325127, 0.79019539, 0.79951485])}]}

## Create DGL Graph

In [24]:
# Build the DGL graph. This takes about 15 minutes; the graph is saved to work_folder.
# The value displayed below is a list holding a single DGL graph.
g = comkg.get_dgl_graph(work_folder="./experiments")
g

[Graph(num_nodes=1169931, num_edges=50815503,
       ndata_schemes={'feat': Scheme(shape=(512,), dtype=torch.float32)}
       edata_schemes={'weight': Scheme(shape=(15,), dtype=torch.float32)})]

In [25]:
# Calling the same function again loads the graph directly from the saved file
g = comkg.get_dgl_graph(work_folder="./experiments")
g

[Graph(num_nodes=1169931, num_edges=50815503,
       ndata_schemes={'feat': Scheme(shape=(512,), dtype=torch.float32)}
       edata_schemes={'weight': Scheme(shape=(15,), dtype=torch.float32)})]

## Create iGraph

In [27]:
# Convert CompanyKG to an igraph graph. Note that g is rebound here: it now refers
# to the igraph.Graph rather than the list of DGL graphs from the previous section.
g = comkg.to_igraph()
g

<igraph.Graph at 0x7f6b93bf8040>