In [1]:
import numpy as np
import pandas as pd
from ogb.lsc import WikiKG90Mv2Dataset
import sys
import os

In [2]:
# Root directory holding the OGB-LSC raw data. Set the WIKIKG90M_ROOT
# environment variable to point somewhere else; the fallback is the original
# machine-specific location, so existing runs behave exactly as before.
rootdir = os.environ.get('WIKIKG90M_ROOT', '/db2/users/minjunpark/ogb/rawdata')
dataset = WikiKG90Mv2Dataset(root=rootdir)

In [3]:
# Headline sizes of the knowledge graph, printed one per line:
# number of entities (91230610), number of relation types (1387) and the
# dimensionality of the entity/relation features (768).
print(
    dataset.num_entities,
    dataset.num_relations,
    dataset.num_feat_dims,
    sep='\n',
)

# Entity feature matrix: np.array of shape (91230610, 768).
entity_feat = dataset.entity_feat

91230610
1387
768


In [4]:
# Validation and testing
valid_task = dataset.valid_dict['h,r->t'] # get a dictionary storing the h,r->t task.
hr = valid_task['hr']
t = valid_task['t']

testdev_task = dataset.test_dict(mode = 'test-dev')['h,r->t'] # get a dictionary storing the h,r->t task.
hr = testdev_task['hr']

In [5]:
# Peek at the two-column `hr` query array of the test-challenge split.
dataset.test_dict(mode='test-challenge')['h,r->t']['hr']

array([[42203917,      818],
       [52193345,       73],
       [19804008,      480],
       ...,
       [23350374,       18],
       [17369640,      388],
       [37290774,      997]])

In [5]:
# First column only of the test-challenge `hr` query array.
dataset.test_dict(mode='test-challenge')['h,r->t']['hr'][:, 0]

array([42203917, 52193345, 19804008, ..., 23350374, 17369640, 37290774])

In [6]:
# Look at the data from the perspective of the tail entity.

In [7]:
# Re-fetch the validation split of the h,r->t task and unpack its two arrays.
valid_task = dataset.valid_dict['h,r->t']
hr, t = valid_task['hr'], valid_task['t']

In [8]:
# Training triples; the following cell loads them into a DataFrame whose
# columns are named Head, Relation and Tail.
train_task = dataset.train_hrt

In [9]:
# Wrap the training triples in a DataFrame with one named column per position.
df = pd.DataFrame(
    data=train_task,
    columns=['Head', 'Relation', 'Tail'],
)
df

Unnamed: 0,Head,Relation,Tail
0,0,167,2648053
1,0,758,4103231
2,0,1336,4362459
3,0,1188,4507475
4,0,1188,4507661
...,...,...,...
601062806,91230609,879,5002505
601062807,91230609,534,9183678
601062808,91230609,215,9684464
601062809,91230609,879,50127613


In [10]:
# Keep only the triples whose head is entity 0 and preview the first rows.
df2 = df.loc[df['Head'] == 0]
df2.head()

Unnamed: 0,Head,Relation,Tail
0,0,167,2648053
1,0,758,4103231
2,0,1336,4362459
3,0,1188,4507475
4,0,1188,4507661


In [11]:
# How often each tail occurs among the triples headed by entity 0.
df2['Tail'].value_counts()

14271585    2
23719580    2
2648053     1
46174861    1
19162448    1
20384735    1
25098003    1
26275985    1
28376155    1
46194152    1
17180947    1
51527433    1
51711289    1
54417003    1
61286324    1
71734062    1
75798892    1
80906364    1
18935886    1
15031472    1
4103231     1
6301083     1
4362459     1
4507475     1
4507661     1
4848582     1
4913680     1
5495024     1
6569077     1
13254045    1
9749921     1
10836978    1
11021407    1
11603051    1
12706508    1
12788458    1
84998900    1
Name: Tail, dtype: int64

## Case 1: drop duplicate heads and keep only the first triple for each head

In [12]:
# One row per head: keep the first triple seen for each Head value, then
# renumber the index from 0.
df_uniq = (
    df
    .drop_duplicates(subset=['Head'], keep='first')
    .reset_index(drop=True)
)
df_uniq

Unnamed: 0,Head,Relation,Tail
0,0,167,2648053
1,1,1296,539765
2,2,215,2
3,3,534,4571655
4,4,88,8956884
...,...,...,...
91197927,91230605,1125,6005191
91197928,91230606,88,25700305
91197929,91230607,818,2188720
91197930,91230608,1347,2666612


In [13]:
# Validation split of the h,r->t task again; display its `hr` query array.
valid_task = dataset.valid_dict['h,r->t']
hr, t = valid_task['hr'], valid_task['t']

hr

array([[68027985,      943],
       [78805001,      934],
       [ 3742264,        8],
       ...,
       [77901263,      480],
       [68077338,      672],
       [45944824,      815]])

In [14]:
# Take the first column (named `h` here) of the validation `hr` queries.
dic = dataset.valid_dict['h,r->t']
hr = dic['hr']
h = hr[:, 0]

In [16]:
# Tail stored in df_uniq for the first validation head.
# The trailing [0] raises IndexError when that head has no row in df_uniq.
df_uniq.loc[df_uniq['Head'] == h[0], 'Tail'].tolist()[0]

8899443

In [21]:
# NOTE(review): `h2` is not defined in any cell shown here (execution counts
# jump from 16 to 21) — presumably an array of head ids like `h`; confirm it
# is defined before this cell when running on a fresh kernel.
#
# Build a Head -> Tail lookup once instead of running a full boolean scan of
# df_uniq for each of the 120 heads. df_uniq was produced by
# drop_duplicates(subset='Head'), so each head label maps to exactly one tail
# and the result keeps the order of the requested heads.
# A head that is missing from df_uniq now raises KeyError in current pandas
# (the per-head scan raised IndexError).
tail_by_head = df_uniq.set_index('Head')['Tail']
tail_by_head.loc[h2[0:120]].to_numpy()

array([ 8899443, 31774728, 42069230, 24338794,  4923899,  4923899,
        5370583,  8816900, 10642117,  5164798,  8451235,  7484636,
       42069230,  5518483,  9302442,  5319741,  7026824,  4923899,
       11217693,  6393284, 80249390,  4962138,  5055239,  4687586,
        8656361,  3174575,  9747495,   591529,  2086047,  6214131,
       13569667,  6393284, 37422588,  6393284,  3730347, 10660130,
        3598573,  4531709,  6134788,  6393284,  8651498,  5968685,
        7175094, 12667311,  6223683,  6739325,  5319741, 29696674,
        1866570, 33464694, 16930221,  1491280,  9294723,  5547405,
        7357331, 21626711,  4608354, 42069230, 10737551, 30581974,
       41815441,  5012695,  7672562,  4943914,  9792914,  1491280,
        8816900, 10016963,  2417138,  4548405,  1572964,  5820110,
        6393284, 11050852,  9804464, 12929065,  8816900,  7023434,
        7996271,  3174574,   227282,  5997237,  4274068,  1338135,
        6188353,  5542154, 11050852,  1303423,  1799039,  6393