In [27]:
import torch
import pandas as pd
import numpy as np
from torch import nn
from torch.nn.parameter import Parameter
from sklearn.model_selection import train_test_split
from torch.nn import MarginRankingLoss
from torch import ones_like
import networkx as nx
from tqdm.autonotebook import tqdm

In [2]:
x = nx.read_graphml('q42889.graphml')

In [3]:
head = []
tail = []
relation = []
for i in x.edges:
    head.append(i[0])
    tail.append(i[1])
    relation.append(i[2])

In [4]:
df = pd.DataFrame(columns=['from', 'to', 'rel'])
df['from'] = head
df['to'] = tail
df['rel'] = relation

In [5]:
df

Unnamed: 0,from,to,rel
0,n0,n0,0
1,n0,n60,0
2,n0,n61,0
3,n0,n62,0
4,n0,n63,0
...,...,...,...
182920,n51774,n7795,0
182921,n51774,n12099,0
182922,n51774,n14385,0
182923,n51774,n30611,0


In [6]:
d_1 = {}
k = 0
for i in df['from']:
    if i not in d_1:
        d_1[i] = k
        k+=1
for i in df['to']:
    if i not in d_1:
        d_1[i] = k
        k+=1

In [7]:
for i, j in enumerate(df['from']):
    df['from'][i] = d_1[j]
for i, j in enumerate(df['to']):
    df['to'][i] = d_1[j]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [8]:
df

Unnamed: 0,from,to,rel
0,0,0,0
1,0,24806,0
2,0,24807,0
3,0,24808,0
4,0,24809,0
...,...,...,...
182920,24805,2271,0
182921,24805,3809,0
182922,24805,4738,0
182923,24805,13386,0


In [9]:
E = df[['from','to']].to_numpy()

In [10]:
E

array([[0, 0],
       [0, 24806],
       [0, 24807],
       ...,
       [24805, 4738],
       [24805, 13386],
       [24805, 49980]], dtype=object)

In [11]:
r = df['rel'].to_numpy()

In [12]:
r = r.reshape(-1,1)

In [13]:
E_train, E_test, r_train, r_test = train_test_split(E, r)

In [14]:
E_train_1 = np.array([np.zeros(shape=(len(E_train),1)),np.zeros(shape=(len(E_train),1))])
E_test_1 = np.array([np.zeros(shape=(len(E_test),1)),np.zeros(shape=(len(E_test),1))])

In [15]:
for i in range(len(E_train)):
    E_train_1[0][i] = E_train[i][0]
    E_train_1[1][i] = E_train[i][1]
for i in range(len(E_test)):
    E_test_1[0][i] = E_test[i][0]
    E_test_1[1][i] = E_test[i][1]

In [16]:
E_train = torch.LongTensor(E_train_1)
E_test = torch.LongTensor(E_test_1)
r_train = torch.LongTensor(r_train)
r_test = torch.LongTensor(r_test)

In [17]:
def batch_generator(X, y, batch_size):
    np.random.seed(42)
    perm = np.random.permutation(len(X[0]))
    X_1 = X[0][perm]
    X_2 = X[1][perm]
    y = y[perm]
    num_samples = X[0].shape[0]
    num_batches = num_samples // batch_size
    if(num_samples <= batch_size):
        yield(X_1, X_2, y)
    else:
        if(num_samples % batch_size == 0):
            for i in range(num_batches):
                yield(X_1[batch_size*i:batch_size*(i+1)], X_2[batch_size*i:batch_size*(i+1)], y[batch_size*i:batch_size*(i+1)])
        else:
            for i in range(num_batches):
                yield(X_1[batch_size*i:batch_size*(i+1)], X_2[batch_size*i:batch_size*(i+1)], y[batch_size*i:batch_size*(i+1)])
            yield(X_1[batch_size*num_batches:], X_2[batch_size*num_batches:], y[batch_size*num_batches:])

In [18]:
class norm_1d(nn.Module):
    def __init__(self):
        super().__init__()
    def forward(self,X):
        for i in range(len(X)):
            X[i]= X[i]/torch.norm(X[i])
        return Parameter(X)

In [38]:
class TransE(nn.Module):
    def __init__(self, n_ent, n_rel, k):
        super().__init__()
        self.r = np.zeros(shape=(n_rel,k))
        for i in range(len(self.r)):
            self.r[i] = np.random.uniform(-6/np.sqrt(k), 6/np.sqrt(k), (1,k))
            self.r[i] = self.r[i]/np.linalg.norm(self.r[i])
        self.r = Parameter(torch.tensor(self.r, requires_grad = True))
        self.e = np.zeros(shape=(n_ent,k))
        for i in range(len(self.e)):
            self.e[i] = np.random.uniform(-6/np.sqrt(k), 6/np.sqrt(k), (1,k))
            self.e[i] = self.e[i]/np.linalg.norm(self.e[i])
        self.e = Parameter(torch.tensor(self.e, requires_grad = True))
        #self.norm = norm_1d()
    
    def forward(self, T, H, r, T_bad, H_bad):
        #for i in range(len(self.e)):
            #self.e[i] = self.e[i]/torch.norm(self.e[i])
        #self.e = self.norm(self.e)
        return nn.functional.pairwise_distance(self.e[H] + self.r[r], self.e[T]), nn.functional.pairwise_distance(self.e[H_bad] + self.r[r], self.e[T_bad])

In [39]:
def get_corrupted_batch(H_batch, T_batch):
    H_bad_batch = H_batch.clone()
    T_bad_batch = T_batch.clone()
    for i in range(len(H_batch)):
        p = np.random.randint(0,2)
        pp = np.random.randint(0,len(model.e))
        if(p==1):
            H_bad_batch[i] = pp
        else:
            T_bad_batch[i] = pp
    return H_bad_batch, T_bad_batch

In [40]:
def train(E_train, r_train, num_epoch):
    iterator = tqdm(range(num_epoch), unit='epoch')
    train_losses = []
    for epoch in iterator:
        epoch_train_losses = []
        for H_batch, T_batch, r_batch in batch_generator(E_train, r_train, 5000):
            H_bad_batch, T_bad_batch = get_corrupted_batch(H_batch, T_batch)
            model.train(True)
            positive_score, negative_score = model.forward(H_batch, T_batch, r_batch, H_bad_batch, T_bad_batch)
            loss = loss_fn(positive_score,negative_score, target=ones_like(positive_score))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_train_losses.append(loss.item())
        train_losses.append(np.mean(epoch_train_losses))
        print(np.mean(epoch_train_losses))
    return train_losses

In [41]:
model = TransE(len(d_1), len(df['rel'].unique()), 50)
loss_fn = MarginRankingLoss(margin=0.5)
optimizer = torch.optim.Adam(model.parameters(), 1e-1)

In [42]:
if(torch.cuda.is_available()):
    model=model.cuda()
    E_train, r_train = E_train.cuda(), r_train.cuda()

In [43]:
train_losses = train(E_train, r_train, 10)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

0.41485032198889904
0.267554299899398
0.22614545051975102
0.2082268393775111
0.19954813101286664
0.19337819266113346
0.1876495447058991
0.1855179289725725
0.18305191154003278
0.1801217313899847



In [44]:
model.e

Parameter containing:
tensor([[-1.9690e+00, -2.5020e-02,  1.4041e+00,  ...,  1.4582e+00,
         -7.9346e-01,  1.4963e+00],
        [ 1.7680e-01, -6.2436e-01,  8.0666e-01,  ...,  1.5215e-03,
          2.7196e-01, -5.0512e-01],
        [-1.1150e+00, -1.7116e+00, -1.3763e+00,  ...,  4.6120e-01,
         -4.9704e-01,  9.2473e-01],
        ...,
        [ 1.5034e+00, -7.7526e-01,  1.4366e+00,  ..., -2.3115e+00,
          1.7895e+00, -1.8363e+00],
        [ 7.8721e-01, -7.2621e-01, -9.0434e-01,  ..., -3.7629e-01,
          9.1154e-01, -6.4958e-01],
        [ 1.0715e+00,  9.1338e-01, -1.5496e+00,  ..., -1.4035e+00,
          6.2862e-01,  2.7258e-02]], device='cuda:0', dtype=torch.float64,
       requires_grad=True)

In [45]:
model.r

Parameter containing:
tensor([[ 8.7844e-01,  1.6697e-01, -1.0302e-01,  ..., -9.2849e-01,
          7.0632e-01, -1.8086e+00],
        [ 2.6495e-01, -3.4130e-01, -2.5624e-01,  ..., -3.0092e-01,
          2.8551e-01, -8.8260e-01],
        [ 3.1197e-02,  6.7299e-02,  8.3738e-02,  ...,  1.7662e-01,
          2.5073e-01, -9.8378e-01],
        ...,
        [ 1.7673e-01, -9.9844e-02,  1.6517e-01,  ..., -2.5897e-04,
         -1.9014e-01, -1.4624e-01],
        [ 1.3649e-01, -1.9418e-01, -4.8421e-03,  ...,  2.3598e-01,
         -1.6356e-01, -6.7364e-02],
        [-9.7498e-02,  3.4697e-01,  4.6006e-02,  ..., -1.0178e-01,
          1.4740e-01,  1.0185e-01]], device='cuda:0', dtype=torch.float64,
       requires_grad=True)