# TransE algorithm

This notebook uses the TransE algorithm to convert OpenIE triplets into embeddings.

In [1]:
import pandas as pd
import numpy as np
# Names for the four tab-separated fields of the OpenIE output; they are
# attached to the DataFrame in a later cell.
columns = ["confidence_score", "subject", "relation", "object"]
# A forward slash is used as the separator: the previous "\o" was an invalid
# escape sequence (a warning on current Python versions) and only resolved on
# Windows. "/" works on both Windows and POSIX systems.
text_file = pd.read_csv("text_articles/output.txt", sep="\t", encoding='latin1', header = None)

In [2]:
# Preview the raw table; it was read with header=None, so the columns are still labelled 0-3.
text_file

Unnamed: 0,0,1,2,3
0,1.0,state governments,verify customers due,poor response from state governments
1,1.0,state governments,verify,linked non-core customers
2,1.0,state governments,verify customers due,poor response from certain state governments
3,1.0,state governments,verify customers due,response from state governments
4,1.0,state governments,verify customers due,poor response
...,...,...,...,...
857,1.0,companies,would however have,obligation into agreement with existing market...
858,1.0,companies,would however have,obligation by entering into suitable agreement...
859,1.0,companies,would however have,obligation
860,1.0,companies,would however have,flexibility to discharge


In [3]:
# Replace the default integer column labels with the descriptive names defined in `columns`.
text_file.columns = columns

The DataFrame contains the <b>triplets (subject, relation, object)</b> and a <b>confidence score</b>. The confidence score indicates how confident the extraction algorithm is in the triplet it produced from the sentence.

In [4]:
# Show the same table again, now with the named columns.
text_file

Unnamed: 0,confidence_score,subject,relation,object
0,1.0,state governments,verify customers due,poor response from state governments
1,1.0,state governments,verify,linked non-core customers
2,1.0,state governments,verify customers due,poor response from certain state governments
3,1.0,state governments,verify customers due,response from state governments
4,1.0,state governments,verify customers due,poor response
...,...,...,...,...
857,1.0,companies,would however have,obligation into agreement with existing market...
858,1.0,companies,would however have,obligation by entering into suitable agreement...
859,1.0,companies,would however have,obligation
860,1.0,companies,would however have,flexibility to discharge


In [5]:
# Frequency of every distinct subject, most frequent first.
from collections import Counter
Counter(text_file['subject'].tolist()).most_common()

[('government', 56),
 ('companies', 40),
 ('lacklustre revenue collections', 39),
 ('cii', 39),
 ("nepal 's prince dipendra", 25),
 ('customs duties', 19),
 ('average customs duties', 19),
 ('india', 16),
 ('it', 14),
 ('interest rates', 14),
 ('vajpayee', 14),
 ('state governments', 13),
 ('delegates', 13),
 ('board', 10),
 ('us', 9),
 ('lazard capital', 9),
 ('capital', 9),
 ('control', 9),
 ('indian companies', 8),
 ('rana talwar', 8),
 ('bids', 8),
 ('deficit', 7),
 ('2002', 7),
 ('yashwant sinha harping', 6),
 ('demand stimulus needed', 6),
 ('demand stimulus', 6),
 ('fiscal deficit', 6),
 ('capacities', 6),
 ('benicio del toro', 6),
 ('sachin tendulkar', 6),
 ('sir', 6),
 ('hamid karzai', 6),
 ('karzai', 6),
 ('oil sector regulator', 6),
 ('enron', 5),
 ('clinton', 5),
 ('bsf men', 5),
 ('3', 5),
 ('steven s reinemund', 5),
 ('hewlett packard', 5),
 ('afghanistan', 5),
 ('its two subsidiaries', 5),
 ('its subsidiaries', 5),
 ('proposed oil sector regulator', 5),
 ('private', 5),


In [6]:
# Frequency of every distinct object, most frequent first.
Counter(text_file['object'].tolist()).most_common()

[('india', 12),
 ('trade', 5),
 ('power', 5),
 ('range', 4),
 ('range of', 4),
 ('much more trade', 4),
 ('china', 4),
 ('more trade', 4),
 ('margins', 4),
 ('serious proportions', 4),
 ('proportions', 4),
 ('decision', 4),
 ('82', 4),
 ('20 long years', 4),
 ('20 years', 4),
 ('while injured', 4),
 ('injured', 4),
 ('april 2002', 4),
 ('us', 3),
 ('pakistan president pervez musharraf', 3),
 ('president musharraf', 3),
 ('president pervez musharraf', 3),
 ('pakistan president musharraf', 3),
 ("prime minister 's invitation", 3),
 ("minister 's invitation", 3),
 ('new york', 3),
 ('j', 3),
 ('head of government', 3),
 ('head', 3),
 ('pakistan', 3),
 ('tranche', 3),
 ('public interest', 3),
 ('customers', 2),
 ('deadline', 2),
 ('one month', 2),
 ('2001', 2),
 ('fresh sector investments', 2),
 ('need impetus', 2),
 ('private sector investments', 2),
 ('much need impetus', 2),
 ('sector investments', 2),
 ('fresh private sector investments', 2),
 ('will region of 5.0 % to 5.1 %', 2),
 ('w

In [7]:
# Frequency of every distinct relation, most frequent first.
Counter(text_file['relation'].tolist()).most_common()

[('is in', 36),
 ('fall from', 36),
 ('has', 34),
 ('is expected', 26),
 ('would have', 18),
 ('considers', 16),
 ('would however have', 16),
 ('will', 13),
 ('will in', 12),
 ('result in', 12),
 ('may invites', 12),
 ('massacres', 12),
 ('allegedly massacres', 12),
 ('is', 11),
 ('is with', 11),
 ('expects', 10),
 ('takes office as', 10),
 ('may', 10),
 ('select karzai as', 9),
 ('prohibit', 9),
 ('giving need impetus to', 8),
 ('be', 8),
 ('dies at', 8),
 ('may takes 0over as', 8),
 ('may sworn in as', 8),
 ('retires after', 8),
 ('decide', 8),
 ('quits as', 8),
 ('adopt', 8),
 ('forced out', 8),
 ('have', 8),
 ('keep leash', 8),
 ('ensure', 7),
 ('offers', 7),
 ('verify customers due', 6),
 ('also doing', 6),
 ('doing', 6),
 ('wins', 6),
 ('visit', 6),
 ('may accepts', 6),
 ('backs', 6),
 ('chargesheeted by', 6),
 ('were', 6),
 ('invited bids for', 6),
 ('verify', 5),
 ('surrender', 5),
 ('faces', 5),
 ('has proposed transition period after', 5),
 ('fails in', 5),
 ('has become', 4)

In [8]:
# Assign an integer id to every entity and every relation; the TransE model
# works on these ids rather than on the raw strings.

# Entities are the distinct subjects and distinct objects pooled together.
subjects = text_file.subject.unique()
objects = text_file.object.unique()
entities = np.concatenate((subjects, objects), axis=0)

# np.unique returns the sorted distinct values, so ids follow sorted order
# and a string that is both a subject and an object gets a single id.
unique_relation = np.unique(text_file.relation)
unique_entity = np.unique(entities)

relation2id = {relation: idx for idx, relation in enumerate(unique_relation)}
entity2id = {entity: idx for idx, entity in enumerate(unique_entity)}

In [9]:
# Number of distinct entities (subjects and objects pooled together).
unique_entity.shape

(781,)

In [10]:
# Inverse lookup: relation id -> relation string.
id2relation = dict(zip(relation2id.values(), relation2id.keys()))

In [11]:
# Inverse lookup: entity id -> entity string.
id2entity = dict(zip(entity2id.values(), entity2id.keys()))

In [12]:
import tensorflow as tf
import numpy as np
import json
import os
import time
import pickle
import random

In [13]:
# entity2id
# relation2id
# id2relation
# id2entity

In [14]:
# Reminder of the named triplet table before it is encoded as integer ids.
text_file

Unnamed: 0,confidence_score,subject,relation,object
0,1.0,state governments,verify customers due,poor response from state governments
1,1.0,state governments,verify,linked non-core customers
2,1.0,state governments,verify customers due,poor response from certain state governments
3,1.0,state governments,verify customers due,response from state governments
4,1.0,state governments,verify customers due,poor response
...,...,...,...,...
857,1.0,companies,would however have,obligation into agreement with existing market...
858,1.0,companies,would however have,obligation by entering into suitable agreement...
859,1.0,companies,would however have,obligation
860,1.0,companies,would however have,flexibility to discharge


In [15]:
# Encode every row as a (subject id, relation id, object id) tuple.
# Positional columns 1, 2 and 3 hold subject, relation and object;
# column 0 is the confidence score and is not used here.
final = [
    (entity2id[subject_text], relation2id[relation_text], entity2id[object_text])
    for subject_text, relation_text, object_text in zip(
        text_file.iloc[:, 1], text_file.iloc[:, 2], text_file.iloc[:, 3]
    )
]

In [16]:
# Inspect the encoded (subject id, relation id, object id) tuples.
final

[(641, 196, 542),
 (641, 195, 433),
 (641, 196, 541),
 (641, 196, 579),
 (641, 196, 540),
 (641, 196, 577),
 (641, 195, 432),
 (641, 195, 197),
 (641, 195, 491),
 (641, 196, 578),
 (641, 62, 253),
 (474, 83, 202),
 (474, 84, 511),
 (641, 62, 690),
 (474, 82, 511),
 (641, 47, 741),
 (474, 81, 202),
 (478, 195, 197),
 (87, 131, 351),
 (87, 131, 479),
 (778, 31, 31),
 (778, 93, 167),
 (778, 215, 31),
 (778, 93, 775),
 (777, 87, 62),
 (778, 93, 168),
 (778, 93, 774),
 (178, 80, 391),
 (178, 80, 200),
 (178, 80, 392),
 (178, 80, 129),
 (679, 26, 5),
 (282, 210, 574),
 (282, 210, 295),
 (397, 37, 223),
 (397, 37, 224),
 (223, 78, 293),
 (223, 77, 484),
 (222, 78, 293),
 (223, 78, 559),
 (222, 77, 481),
 (222, 78, 621),
 (397, 37, 222),
 (223, 77, 481),
 (222, 77, 484),
 (222, 78, 559),
 (223, 78, 292),
 (397, 37, 225),
 (397, 204, 755),
 (222, 78, 292),
 (223, 78, 621),
 (34, 118, 779),
 (386, 11, 572),
 (257, 11, 572),
 (257, 11, 573),
 (295, 110, 572),
 (295, 110, 573),
 (295, 50, 573),
 (

In [17]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
import torch
from torch import nn, optim
import matplotlib.pyplot as plt

In [18]:
# Vocabulary sizes: number of distinct relations and distinct entities.
# They are passed to TransE below to size its two embedding tables.
rel_len = len(relation2id)
entity_len = len(entity2id)

In [19]:
# Alias for the sorted array of distinct entity strings; it is handed to the
# negative-sampling routine further down.
all_entity = unique_entity

In [20]:
# Inspect the entity vocabulary.
all_entity

array(['$ 3bn', '$ 3bn in damages', "'92 stock market scam",
       "'98 sanctions", "'99", '...', '0over', '10 hotels',
       '10 per cent stake', '10 per cent stake in l', '11 members',
       '11 members in kathmandu',
       '11 members in kathmandu including his parents',
       '11 members including his parents', '11 members of family',
       '11 members of family in kathmandu',
       '11 members of family in kathmandu including his parents',
       '11 members of family including his parents',
       '11 members of royal family',
       '11 members of royal family in kathmandu',
       '11 members of royal family in kathmandu including his parents',
       '11 members of royal family including his parents', '14',
       '142 members', '149 rooms', '16', '196 rooms',
       '196 rooms along hotel airport ashok at kolkata', '20 long years',
       '20 others', '20 years', '2001', '2001 02', '2002', '2002 03',
       '23', '25bn in stock become', '25bn in stock to become', '29',

## Implementation of TransE algorithm:

Link to the original research paper:
https://proceedings.neurips.cc/paper/2013/file/1cecc7a77928ca8133fa24680a88d2f9-Paper.pdf

In [21]:
class TransE(nn.Module):
    """TransE knowledge-graph embedding model.

    Every entity and every relation is represented by a vector of size
    ``embedding_dim``. A triplet ``(head, relation, tail)`` is scored by the
    L1 distance ``||h + r - t||``; a lower score means a more plausible
    triplet. Training uses a margin ranking loss that asks observed triplets
    to score at least ``margin`` lower than corrupted ones.

    Triplet batches are integer tensors/arrays of shape ``(N, 3)`` whose
    columns are ``(head id, relation id, tail id)``.
    """

    # Column positions of head, relation and tail inside an (N, 3) batch.
    HEAD_COL, REL_COL, TAIL_COL = 0, 1, 2

    def __init__(self, entity_len, rel_len, embedding_dim, margin=0.5):
        """Create and initialise the embedding tables.

        Args:
            entity_len: number of distinct entities (rows of the entity table).
            rel_len: number of distinct relations (rows of the relation table).
            embedding_dim: size of every embedding vector.
            margin: margin of the ranking loss.
        """
        super(TransE, self).__init__()

        self.entity_embeddings = nn.Embedding(entity_len, embedding_dim)
        self.rel_embeddings = nn.Embedding(rel_len, embedding_dim)

        # Uniform initialisation in [-6/sqrt(k), 6/sqrt(k)].
        embeddings_init_bound = 6 / np.sqrt(embedding_dim)
        for table in (self.entity_embeddings, self.rel_embeddings):
            nn.init.uniform_(
                table.weight.data,
                a=-embeddings_init_bound,
                b=+embeddings_init_bound,
            )

        self.criterion = nn.MarginRankingLoss(margin=margin)

        # Relation vectors are L2-normalised once, at construction time.
        self._normalize_rows(self.rel_embeddings)

        # Use the GPU when one is present, otherwise stay on the CPU instead
        # of failing on an unconditional .cuda() call.
        self.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

    @staticmethod
    def _normalize_rows(embedding):
        """Scale every row of ``embedding.weight`` to unit L2 norm, in place."""
        weight = embedding.weight.data
        # clamp_min guards against dividing an all-zero row by zero.
        norms = torch.norm(weight, p=2, dim=1, keepdim=True).clamp_min(1e-12)
        weight.div_(norms)

    def _as_index_tensor(self, triplets):
        """Return ``triplets`` as a long tensor on the model's device."""
        device = self.entity_embeddings.weight.device
        return torch.as_tensor(triplets, dtype=torch.long).to(device)

    def entities_to_ids(self, entities):
        """Map ``entities[0]`` (an entity string) to its integer id.

        The lookup uses the notebook-level ``entity2id`` dictionary.
        """
        return entity2id[entities[0]]

    def generate_negative_triplets(self, pos_batch, all_entities):
        """Build one corrupted triplet for every positive triplet.

        The first half of the batch gets its head replaced by a randomly
        drawn entity, the remainder gets its tail replaced. The relation
        column is never modified.

        Args:
            pos_batch: ``(N, 3)`` batch of ``(head, relation, tail)`` ids.
            all_entities: 1-D array of entity strings to sample from.

        Returns:
            Long tensor of shape ``(N, 3)`` on the model's device.
        """
        neg_batch = self._as_index_tensor(pos_batch).clone()
        batch_len = neg_batch.shape[0]
        if batch_len == 0:
            return neg_batch

        num_subj_corrupt = batch_len // 2

        # Draw one replacement entity per row, then translate strings to ids.
        all_entities = np.asarray(all_entities)
        sampled_positions = np.random.choice(
            np.arange(0, all_entities.shape[0]), size=batch_len)
        sampled_ids = torch.tensor(
            [self.entities_to_ids((entity,)) for entity in all_entities[sampled_positions]],
            dtype=torch.long,
            device=neg_batch.device,
        )

        neg_batch[:num_subj_corrupt, self.HEAD_COL] = sampled_ids[:num_subj_corrupt]
        neg_batch[num_subj_corrupt:, self.TAIL_COL] = sampled_ids[num_subj_corrupt:]
        return neg_batch

    def forward(self, pos_batch, neg_batch):
        """Return the margin ranking loss for a positive/negative batch pair."""
        pos_score = self.score_triplets(pos_batch)
        neg_score = self.score_triplets(neg_batch)
        return self.compute_loss(pos_score, neg_score)

    def train(self, triplets=True, all_entities=None, batchsize=32, epochs=1, mode=None):
        """Run the training loop, or toggle training mode.

        This method has the same name as ``nn.Module.train``, which
        ``nn.Module.eval`` calls internally with a boolean. To keep both uses
        working, a boolean first argument (or an explicit ``mode=``) is
        forwarded to ``nn.Module.train``; anything else is treated as
        training data and forwarded to :meth:`fit`.

        Returns:
            ``self`` when used as a mode switch, otherwise the list of
            per-batch losses returned by :meth:`fit`.

        Raises:
            ValueError: if triplets are given without ``all_entities``.
        """
        if mode is not None:
            return super(TransE, self).train(mode)
        if isinstance(triplets, bool):
            return super(TransE, self).train(triplets)
        if all_entities is None:
            raise ValueError("all_entities is required when training on triplets")
        return self.fit(triplets, all_entities, batchsize=batchsize, epochs=epochs)

    def fit(self, triplets, all_entities, batchsize=32, epochs=1):
        """Optimise the embeddings with SGD.

        Args:
            triplets: ``(N, 3)`` positive triplets as ``(head, relation, tail)`` ids.
            all_entities: 1-D array of entity strings used for negative sampling.
            batchsize: number of triplets per optimisation step.
            epochs: number of passes over ``triplets``.

        Returns:
            List with one detached scalar loss tensor per batch.
        """
        triplets = self._as_index_tensor(triplets)
        triplets_len = triplets.shape[0]
        optimiser = optim.SGD(self.parameters(), lr=0.01, momentum=0.9)
        loss_hist = []
        for epoch in range(epochs):
            print("Epoch: {} is started.".format(epoch))
            for i in range(0, triplets_len, batchsize):
                pos_batch = triplets[i:i + batchsize]
                neg_batch = self.generate_negative_triplets(pos_batch, all_entities)
                optimiser.zero_grad()

                loss = self.forward(pos_batch, neg_batch)
                # Store a detached copy so the history does not keep every
                # batch's autograd graph alive.
                loss_hist.append(loss.detach())
                print("Calculated loss for iteration {}: {}".format(i, loss))
                loss.backward()
                optimiser.step()

        return loss_hist

    def compute_loss(self, pos_scores, neg_scores):
        """Margin ranking loss ``mean(max(0, pos - neg + margin))``.

        Scores are distances, so positives should be *smaller* than
        negatives. ``MarginRankingLoss`` ranks its first input higher when
        the target is +1, hence the target here is -1.
        """
        neg_scores = neg_scores.to(pos_scores.device)
        target = torch.full_like(pos_scores, -1.0)
        return self.criterion(pos_scores, neg_scores, target)

    def split_triplets(self, triplets):
        """Slice an ``(N, 3)`` batch into ``(N, 1)`` head, tail and relation columns.

        Returns:
            Tuple ``(heads, tails, relations)``.
        """
        h = triplets[:, self.HEAD_COL:self.HEAD_COL + 1]
        t = triplets[:, self.TAIL_COL:self.TAIL_COL + 1]
        r = triplets[:, self.REL_COL:self.REL_COL + 1]
        return h, t, r

    def get_embedding_of_triplets(self, triplets):
        """Look up the embedding vectors of a batch of triplets.

        The id columns are fed straight to the embedding tables, so the
        lookup is vectorised and the results are ``(N, embedding_dim)``
        tensors. Heads and tails come from the entity table, relations from
        the relation table.

        Returns:
            Tuple ``(head_vectors, tail_vectors, relation_vectors)``.
        """
        heads, tails, relations = self.split_triplets(self._as_index_tensor(triplets))
        return (
            self.entity_embeddings(heads.reshape(-1)),
            self.entity_embeddings(tails.reshape(-1)),
            self.rel_embeddings(relations.reshape(-1)),
        )

    def score_triplets(self, triplets):
        """Return the L1 distance ``||h + r - t||`` for every triplet.

        Entity vectors are re-normalised to unit L2 norm before scoring.
        The table size is read from the weights themselves rather than from
        a notebook-level variable.

        Returns:
            1-D tensor with one distance per triplet.
        """
        self._normalize_rows(self.entity_embeddings)
        heads, tails, rels = self.get_embedding_of_triplets(triplets)
        sum_res = heads + rels - tails
        distances = torch.norm(sum_res, p=1, dim=1)
        return distances.view(size=(-1,))

In [22]:
# Training frame: a new DataFrame holding only the triplet columns
# (drop returns a new object, so text_file itself is left untouched).
train_df = text_file.drop(columns=['confidence_score'])

def map_triplets(df, entity2id, relation2id):
    """Replace the textual triplet columns of ``df`` with integer ids, in place.

    Columns are addressed by name instead of by position inside each row, so
    the function does not depend on positional indexing of a label-indexed
    row, and it also works on an empty frame.

    Args:
        df: DataFrame with ``subject``, ``relation`` and ``object`` columns
            holding strings; it is modified in place.
        entity2id: dict mapping entity string -> integer id.
        relation2id: dict mapping relation string -> integer id.

    Returns:
        None.

    Raises:
        KeyError: if a value is missing from the corresponding dictionary.
    """
    print("mapping started")
    # A callable (rather than the dict itself) is passed to map so that an
    # unknown key raises KeyError instead of silently becoming NaN.
    df['subject'] = df['subject'].map(entity2id.__getitem__)
    df['relation'] = df['relation'].map(relation2id.__getitem__)
    df['object'] = df['object'].map(entity2id.__getitem__)
    print("mapping end")
# Encode the string triplets as ids; map_triplets overwrites the columns of train_df in place.
map_triplets(train_df, entity2id, relation2id)

# Preview the encoded triplets.
train_df.head()

mapping started
mapping end


Unnamed: 0,subject,relation,object
0,641,196,542
1,641,195,433
2,641,196,541
3,641,196,579
4,641,196,540


In [23]:
# Triplet columns of the raw table (everything except the confidence score).
train_data = text_file.iloc[:, 1:]

embedding_dim = 300
model = TransE(entity_len, rel_len,  embedding_dim, margin=1)
# Put the encoded triplets on the same device as the model's parameters
# instead of calling .cuda() unconditionally.
pos_triplets = torch.from_numpy(train_df.values).to(model.entity_embeddings.weight.device)

In [24]:
# Shape check: one row per triplet, three id columns.
pos_triplets.shape

torch.Size([862, 3])

In [25]:
len_pos_triplets = pos_triplets.shape[0]
# Smoke test of negative sampling on the full set; the result is discarded.
model.generate_negative_triplets(pos_triplets, all_entity)
# Training is currently disabled, so the parameters printed below are the
# initial values. The entity array is named `all_entity` in this notebook.
#loss_hist = model.train(pos_triplets, all_entity, len_pos_triplets, epochs=100)


# Dump every parameter tensor; parameters excluded from training are listed
# by name only.
for name, param in model.named_parameters():
    if param.requires_grad:
        print (name, param.data)
    else:
        # `name` is the parameter's name; `param.name` is a tensor attribute
        # and does not carry it.
        print(name)

entity_embeddings.weight tensor([[ 0.1745, -0.3000,  0.1776,  ...,  0.0933, -0.3138,  0.0355],
        [-0.1722, -0.2806,  0.0762,  ..., -0.3423,  0.1884, -0.2712],
        [ 0.0332,  0.0229, -0.3115,  ..., -0.0683,  0.2563,  0.1248],
        ...,
        [-0.1128, -0.2084, -0.2280,  ..., -0.2191, -0.0746, -0.1599],
        [-0.2634, -0.0877,  0.0197,  ...,  0.1486,  0.2586, -0.3276],
        [ 0.0413, -0.2985,  0.1191,  ...,  0.0284,  0.0844,  0.1672]],
       device='cuda:0')
rel_embeddings.weight tensor([[-0.0757, -0.0918, -0.0936,  ...,  0.0636, -0.0194, -0.0145],
        [ 0.0603, -0.0318,  0.0122,  ..., -0.0257, -0.0944,  0.0513],
        [-0.0103,  0.0244, -0.0359,  ..., -0.0349,  0.0602,  0.0424],
        ...,
        [ 0.0253,  0.0104,  0.0274,  ..., -0.0710,  0.0703, -0.0319],
        [-0.0220, -0.0359, -0.0820,  ...,  0.0142,  0.0230, -0.0272],
        [-0.0351, -0.0286,  0.0068,  ..., -0.0003, -0.0952,  0.0425]],
       device='cuda:0')




In [26]:
# First rows of the source table; the first triplet is used as the example below.
text_file.head()

Unnamed: 0,confidence_score,subject,relation,object
0,1.0,state governments,verify customers due,poor response from state governments
1,1.0,state governments,verify,linked non-core customers
2,1.0,state governments,verify customers due,poor response from certain state governments
3,1.0,state governments,verify customers due,response from state governments
4,1.0,state governments,verify customers due,poor response


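This is the final result.
The embeddings of one example triplet (its subject entity, its relation, and its object entity) are printed below. Because the `model.train(...)` call above is commented out, these are the initial values rather than trained embeddings.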

In [27]:
# Ids of one example triplet taken from the first row of the data.
subj, rel, obj = (
    entity2id['state governments'],
    relation2id['verify customers due'],
    entity2id['poor response from state governments'],
)
for label, index in (("subj index: ", subj), ("rel index: ", rel), ("obj index: ", obj)):
    print(label, index)

# Embedding rows for those ids: entities from the entity table, the relation
# from the relation table.
entity_table = model.entity_embeddings.weight
subj_embed, obj_embed = entity_table[subj], entity_table[obj]
rel_embed = model.rel_embeddings.weight[rel]

for label, vector in (("subj embed: ", subj_embed), ("rel embed: ", rel_embed), ("obj embed: ", obj_embed)):
    print(label, vector)
# Residual h + r - t of the example triplet.
print("sum embed: ", subj_embed+rel_embed - obj_embed)

subj index:  641
rel index:  196
obj index:  542
subj embed:  tensor([ 1.0169e-01,  1.5760e-01,  1.0111e-01, -1.6504e-01, -1.9245e-01,
         1.7219e-01, -2.4491e-01,  1.4607e-01, -2.8628e-01, -2.1877e-01,
        -1.5458e-01,  3.1030e-01,  4.1511e-02,  2.6320e-01, -2.7863e-01,
        -3.3082e-01, -1.9666e-01, -2.3439e-01,  1.4035e-01,  3.0261e-01,
         1.1460e-01,  2.0834e-01,  2.8014e-01,  2.3050e-01,  6.1045e-02,
        -2.3531e-01, -1.8656e-01, -8.2295e-03, -3.2618e-01, -2.3911e-01,
         2.8170e-01, -3.1591e-01, -1.4738e-01, -6.5919e-02, -1.4020e-01,
         3.3690e-01,  3.3305e-01,  1.0934e-01, -2.8562e-01, -2.2550e-01,
        -2.1800e-01,  2.2501e-01, -6.3844e-02, -3.3279e-02, -3.3343e-01,
        -2.6974e-02, -3.0901e-01, -5.2196e-02, -5.2966e-02, -1.1670e-01,
         5.5271e-02,  1.4091e-01,  3.0245e-02, -1.2806e-01, -1.0957e-01,
         2.0285e-01,  1.0852e-02, -1.9151e-01,  3.3796e-01,  3.2782e-01,
        -9.2541e-02,  1.9469e-01,  2.0260e-01,  3.2807e-01,  2