In [None]:
# Mount Google Drive into the Colab runtime; the checkpoints and data files
# loaded further down are read from paths under this mount point.
from google.colab import drive
# force_remount=True is passed so the mount is redone even on a cell re-run.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
%cd gdrive/MyDrive/LibKGE Test/kge

/content/gdrive/.shortcut-targets-by-id/12lK9g6Ccl-njCvAuL28xCjV1XOyHp9bU/LibKGE Test/kge


In [None]:
! pip install transformers
! pip install path
! pip install rapidfuzz
! pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch
from kge.model.kge_model import KgeModel
from kge.util.io import load_checkpoint

import spacy

from sentence_transformers import SentenceTransformer, util
import numpy as np

In [None]:
def getEntityEmbeddings(kge_model, entity_dict):
    """Map every entity name to its embedding vector from a trained KGE model.

    Args:
        kge_model: Model exposing ``_entity_embedder._embeddings``, a callable
            that maps a LongTensor of entity ids to their embedding rows.
        entity_dict: Path to a tab-separated file with one ``<id>\\t<name>``
            record per line.

    Returns:
        dict: entity name -> 1-D embedding tensor, in file order.
    """
    embedder = kge_model._entity_embedder
    e = {}
    # `with` guarantees the file is closed even if a record fails to parse.
    with open(entity_dict, 'r') as f:
        for line in f:
            # Strip only the line terminator. Slicing with [:-1] would cut the
            # last character of the final name when the file lacks a trailing
            # newline.
            line = line.rstrip('\r\n')
            if not line:
                # Tolerate blank lines (e.g. a stray empty line at end of file).
                continue
            fields = line.split('\t')
            ent_id = int(fields[0])
            ent_name = fields[1]
            # Look up a batch of one id and take its single row.
            e[ent_name] = embedder._embeddings(torch.LongTensor([ent_id]))[0]
    return e

In [None]:
def prepare_embeddings(embedding_dict):
    """Build index lookups and an ordered embedding list from a name->embedding dict.

    Entities are numbered 0..n-1 in the dict's iteration order, so row ``i`` of
    the returned list belongs to ``idx2entity[i]``.

    Returns:
        tuple: ``(entity2idx, idx2entity, embedding_matrix)`` where the first
        two are dicts (name -> index, index -> name) and the third is a list of
        the embeddings in index order.
    """
    names = list(embedding_dict.keys())
    entity2idx = {name: position for position, name in enumerate(names)}
    idx2entity = dict(enumerate(names))
    embedding_matrix = [embedding_dict[name] for name in names]
    return entity2idx, idx2entity, embedding_matrix

In [None]:
def get_answer(question, head_entity, k_answers, device, dataloader, model, entity2idx, idx2entity):
    """Return the ``k_answers`` best-scoring answer entities for a question.

    Args:
        question: Natural-language question text.
        head_entity: Name of the entity the question is about; must be a key
            of ``entity2idx``.
        k_answers: Number of answers wanted. Values larger than the number of
            entities are clamped.
        device: Torch device the input tensors are moved to.
        dataloader: Object providing ``tokenize_question(question)`` which
            returns ``(token_ids, attention_mask)`` tensors.
        model: Object providing ``get_score_ranked(head=..., question_tokenized=...,
            attention_mask=...)``; row 0 of its result is used as one score per
            entity.
        entity2idx: dict, entity name -> index.
        idx2entity: dict, index -> entity name.

    Returns:
        dict: entity name -> sigmoid score, inserted from best to worst.

    Raises:
        KeyError: if ``head_entity`` is not in ``entity2idx``.
    """
    question_tokenized, attention_mask = dataloader.tokenize_question(question)
    head = torch.tensor(entity2idx[head_entity], dtype=torch.long).to(device)
    question_tokenized = question_tokenized.to(device)
    attention_mask = attention_mask.to(device)

    # Pure inference: no autograd graph is needed for ranking.
    with torch.no_grad():
        scores = model.get_score_ranked(head=head, question_tokenized=question_tokenized, attention_mask=attention_mask)[0]

        # Push the head entity's own score far down so the question entity is
        # not returned as its own answer.
        mask = torch.zeros(len(entity2idx)).to(device)
        mask[head] = 1
        new_scores = torch.sigmoid(scores - (mask * 99999))

        # topk ranks by position, so entities that share the same score are all
        # kept (a dict keyed by score would silently collapse them).
        k = max(0, min(int(k_answers), new_scores.numel()))
        top_scores, top_indices = torch.topk(new_scores, k)

    return {
        idx2entity[entity_index]: score
        for score, entity_index in zip(top_scores.tolist(), top_indices.tolist())
    }

In [None]:
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
from transformers import RobertaTokenizer

# process questions
class DatasetMetaQA(Dataset):
    """Dataset of (head entity, question, answer entities) examples for RoBERTa-based QA.

    Each item of ``data`` is indexed as ``[0]`` head entity name, ``[1]``
    question text, ``[2]`` iterable of answer entity names. The instance is
    also used on its own as a question tokenizer via ``tokenize_question``.
    """

    def __init__(self, data, entities, entity2idx):
        self.data = data
        self.entities = entities
        self.entity2idx = entity2idx
        self.pos_dict = defaultdict(list)
        self.neg_dict = defaultdict(list)
        self.index_array = list(self.entities.keys())
        self.tokenizer_class = RobertaTokenizer
        self.pretrained_weights = 'roberta-base'
        self.tokenizer = self.tokenizer_class.from_pretrained(self.pretrained_weights)

    def __len__(self):
        return len(self.data)

    def pad_sequence(self, arr, max_len=128):
        """Pad ``arr`` in place with '<pad>' tokens up to ``max_len`` and return it.

        A list that is already ``max_len`` or longer is returned unchanged
        (it is not truncated).
        """
        arr.extend(['<pad>'] * (max_len - len(arr)))
        return arr

    def toOneHot(self, indices):
        """Return a float vector over all entities with 1 at each of ``indices``."""
        indices = torch.LongTensor(indices)
        one_hot = torch.zeros(len(self.entity2idx), dtype=torch.float32)
        one_hot.scatter_(0, indices, 1)
        return one_hot

    def __getitem__(self, index):
        """Return ``(token_ids, attention_mask, head_id, tail_onehot)`` for one example."""
        data_point = self.data[index]
        question_text = data_point[1]
        question_tokenized, attention_mask = self.tokenize_question(question_text)
        head_id = self.entity2idx[data_point[0].strip()]
        # Answers that are not known entities are dropped rather than raising.
        # TODO (kept from the original author): confirm this is the desired handling.
        tail_ids = [
            self.entity2idx[tail_name.strip()]
            for tail_name in data_point[2]
            if tail_name.strip() in self.entity2idx
        ]
        tail_onehot = self.toOneHot(tail_ids)
        return question_tokenized, attention_mask, head_id, tail_onehot

    def tokenize_question(self, question):
        """Tokenize a question and pad it to at least 64 tokens.

        Returns:
            tuple: ``(token_ids, attention_mask)``; both are 1-D tensors of the
            same length, and the mask is a long tensor holding 0 at padding
            positions and 1 elsewhere.
        """
        question = "<s> " + question + " </s>"
        tokens = self.pad_sequence(self.tokenizer.tokenize(question), 64)
        question_tokenized = torch.tensor(self.tokenizer.encode(tokens, add_special_tokens=False))
        # Ask the tokenizer for its padding id instead of hard-coding it, and
        # build the mask in one vectorized comparison.
        attention_mask = (question_tokenized != self.tokenizer.pad_token_id).long()
        return question_tokenized, attention_mask

class DataLoaderMetaQA(DataLoader):
    """DataLoader that swaps in the module-level ``_collate_fn`` after construction.

    NOTE(review): ``_collate_fn`` is not defined in the visible part of this
    notebook, so instantiating this class raises NameError unless it is
    defined elsewhere before use.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.collate_fn = _collate_fn

In [None]:
import torch
import torch.nn as nn
import torch.nn.utils
import torch.nn.functional as F
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np
from torch.nn.init import xavier_normal_
from transformers import RobertaModel
import random

# contains question embedding process and score ranking functions
class RelationExtractor(nn.Module):
    """Scores every entity as a possible answer to a natural-language question.

    The question is encoded with RoBERTa, projected by a feed-forward network
    into the relation space, and combined with the head-entity embedding using
    the ComplEx scoring function against the whole entity embedding table.

    Attribute names of the sub-modules (``roberta_model``, ``embedding``,
    ``lin1``..``lin4``, ``hidden2rel``, ``bn0``, ``bn2``) are part of the
    saved state-dict layout and must not be renamed.

    Args:
        embedding_dim: Accepted for interface compatibility; not read here.
        relation_dim: Size of one half (real or imaginary) of the relation
            vector; the projected vector has ``relation_dim * 2`` components.
        num_entities: Number of entities (stored on the instance).
        pretrained_embeddings: Sequence of 1-D tensors, one per entity, stacked
            into the entity embedding table.
        device: Stored on the instance.
        entdrop, reldrop, scoredrop: Dropout probabilities for the head
            embedding, relation vector and combined score respectively.
        l3_reg: Weight of the L3-norm penalty on the entity embeddings; only
            applied when the embeddings are not frozen.
        model: Scoring function name; only ``"ComplEx"`` is supported.
        ls: Label-smoothing amount (0 disables it).
        do_batch_norm: Whether ComplEx applies batch normalization.
        freeze: Whether the entity embedding table is kept fixed.

    Raises:
        ValueError: if ``model`` is not a supported scoring function.
    """

    def __init__(self, embedding_dim, relation_dim, num_entities, pretrained_embeddings, device,
    entdrop=0.0, reldrop=0.0, scoredrop=0.0, l3_reg=0.0, model="ComplEx", ls=0.0, do_batch_norm=True, freeze=True):
        super(RelationExtractor, self).__init__()
        # Fail fast, before the RoBERTa weights are loaded, with an error the
        # caller can see and handle (instead of exiting the interpreter).
        if model != "ComplEx":
            raise ValueError("Incorrect model specified: {}".format(model))
        self.device = device
        self.model = model
        self.freeze = freeze
        self.label_smoothing = ls
        self.l3_reg = l3_reg
        self.do_batch_norm = do_batch_norm
        if not self.do_batch_norm:
            print("Not doing batch norm")
        self.roberta_pretrained_weights = "roberta-base"
        self.roberta_model = RobertaModel.from_pretrained(self.roberta_pretrained_weights)
        # The question encoder is always trainable.
        for param in self.roberta_model.parameters():
            param.requires_grad = True
        # ComplEx vectors hold a real and an imaginary half.
        multiplier = 2
        self.getScores = self.ComplEx
        print("Model is", self.model)
        self.hidden_dim = 768
        self.relation_dim = relation_dim * multiplier
        self.num_entities = num_entities
        self.loss = self.kge_loss
        self.rel_dropout = torch.nn.Dropout(reldrop)
        self.ent_dropout = torch.nn.Dropout(entdrop)
        self.score_dropout = torch.nn.Dropout(scoredrop)
        self.fcnn_dropout = torch.nn.Dropout(0.1)
        print("Frozen:", self.freeze)
        self.embedding = nn.Embedding.from_pretrained(torch.stack(pretrained_embeddings, dim=0), freeze=self.freeze)
        print(self.embedding.weight.shape)
        self.mid1 = 512
        self.mid2 = 512
        self.mid3 = 512
        self.mid4 = 512
        self.lin1 = nn.Linear(self.hidden_dim, self.mid1)
        self.lin2 = nn.Linear(self.mid1, self.mid2)
        self.lin3 = nn.Linear(self.mid2, self.mid3)
        self.lin4 = nn.Linear(self.mid3, self.mid4)
        self.hidden2rel = nn.Linear(self.mid4, self.relation_dim)
        # Batch norm runs over the 2 "channels" (real, imaginary) created in ComplEx.
        self.bn0 = torch.nn.BatchNorm1d(multiplier)
        self.bn2 = torch.nn.BatchNorm1d(multiplier)
        self.logsoftmax = torch.nn.LogSoftmax(dim=-1)
        self._klloss = torch.nn.KLDivLoss(reduction="sum")

    def set_bn_eval(self):
        """Put only the two batch-norm layers into evaluation mode."""
        self.bn0.eval()
        self.bn2.eval()

    def kge_loss(self, scores, targets):
        """Summed KL divergence between softmax(scores) and L1-normalized targets."""
        return self._klloss(
            F.log_softmax(scores, dim=1), F.normalize(targets.float(), p=1, dim=1)
        )

    def ComplEx(self, head, relation):
        """Score every entity as the tail of (head, relation) with ComplEx.

        Args:
            head: 2-D tensor of head embeddings; the two halves along dim 1 are
                the real and imaginary parts.
            relation: 2-D tensor of relation vectors, split the same way.

        Returns:
            2-D tensor with one row per input row and one column per entity in
            the embedding table.
        """
        # Stack real/imaginary halves as two channels so batch norm sees (batch, 2, dim).
        head = torch.stack(list(torch.chunk(head, 2, dim=1)), dim=1)
        if self.do_batch_norm:
            head = self.bn0(head)
        head = self.ent_dropout(head)
        relation = self.rel_dropout(relation)
        head = head.permute(1, 0, 2)
        re_head = head[0]
        im_head = head[1]
        re_relation, im_relation = torch.chunk(relation, 2, dim=1)
        re_tail, im_tail = torch.chunk(self.embedding.weight, 2, dim=1)
        # Complex product head * relation, kept as separate real/imaginary parts.
        re_score = re_head * re_relation - im_head * im_relation
        im_score = re_head * im_relation + im_head * re_relation
        score = torch.stack([re_score, im_score], dim=1)
        if self.do_batch_norm:
            score = self.bn2(score)
        score = self.score_dropout(score)
        score = score.permute(1, 0, 2)
        re_score = score[0]
        im_score = score[1]
        # Dot product against every candidate tail embedding.
        score = torch.mm(re_score, re_tail.transpose(1, 0)) + torch.mm(im_score, im_tail.transpose(1, 0))
        return score

    def getQuestionEmbedding(self, question_tokenized, attention_mask):
        """Encode a batch of tokenized questions; returns the first-token state of each."""
        roberta_last_hidden_states = self.roberta_model(question_tokenized, attention_mask=attention_mask)[0]
        # Move the sequence axis first so index 0 selects the first token of every question.
        states = roberta_last_hidden_states.transpose(1, 0)
        cls_embedding = states[0]
        question_embedding = cls_embedding
        return question_embedding

    def forward(self, question_tokenized, attention_mask, p_head, p_tail):
        """Compute the training loss for a batch.

        Args:
            question_tokenized: Batch of question token ids.
            attention_mask: Matching attention masks.
            p_head: Head entity indices.
            p_tail: Target distribution over entities (e.g. multi-hot answers).

        Returns:
            Scalar loss tensor.
        """
        question_embedding = self.getQuestionEmbedding(question_tokenized, attention_mask)
        # Project the question embedding into the relation space.
        rel_embedding = self.applyNonLinear(question_embedding)
        p_head = self.embedding(p_head)
        pred = self.getScores(p_head, rel_embedding)
        actual = p_tail
        if self.label_smoothing:
            actual = ((1.0-self.label_smoothing)*actual) + (1.0/actual.size(1))
        loss = self.loss(pred, actual)
        # Regularize the entity embeddings only when they are being trained.
        if not self.freeze:
            if self.l3_reg:
                norm = torch.norm(self.embedding.weight, p=3, dim=-1)
                loss = loss + self.l3_reg * torch.sum(norm)
        return loss

    def applyNonLinear(self, outputs):
        """Map a question embedding to a relation vector.

        Four linear+ReLU layers (dropout after the first two linear layers)
        followed by a final linear projection to ``relation_dim``.
        """
        outputs = self.fcnn_dropout(self.lin1(outputs))
        outputs = F.relu(outputs)
        outputs = self.fcnn_dropout(self.lin2(outputs))
        outputs = F.relu(outputs)
        outputs = self.lin3(outputs)
        outputs = F.relu(outputs)
        outputs = self.lin4(outputs)
        outputs = F.relu(outputs)
        outputs = self.hidden2rel(outputs)
        return outputs

    def get_score_ranked(self, head, question_tokenized, attention_mask):
        """Score all entities for a single (unbatched) question.

        A batch axis is added to every input. The dropout and batch-norm
        layers follow the module's current train/eval mode, so call
        ``.eval()`` on the model before using this for inference.

        Returns:
            Tensor with one row holding a score per entity.
        """
        question_embedding = self.getQuestionEmbedding(question_tokenized.unsqueeze(0), attention_mask.unsqueeze(0))
        rel_embedding = self.applyNonLinear(question_embedding)
        head = self.embedding(head).unsqueeze(0)
        scores = self.getScores(head, rel_embedding)
        return scores

For the half dataset:

In [None]:
# Load the best LibKGE checkpoint and rebuild the KGE model from it. The log
# output of this cell shows which dataset configuration the checkpoint uses.
checkpoint = load_checkpoint('checkpoint_best.pt')
model = KgeModel.create_from(checkpoint)

Loading configuration of dataset fb_natural_language_data_half from /content/gdrive/.shortcut-targets-by-id/12lK9g6Ccl-njCvAuL28xCjV1XOyHp9bU/LibKGE Test/kge/data/fb_natural_language_data_half ...




In [None]:
# Entity name -> embedding tensor for the half dataset, read from the KGE model
# using the dataset's id/name listing.
e = getEntityEmbeddings(model, "data/fb_natural_language_data_half/entity_ids.del")

In [None]:
# Name<->index lookups plus the embeddings as a list in index order (half dataset).
entity2idx, idx2entity, embedding_matrix = prepare_embeddings(e)

In [None]:
# Sanity check: both lookup tables should report the same number of entities.
len(entity2idx), len(idx2entity)

(479812, 479812)

In [None]:
from transformers import RobertaModel
device = torch.device("cpu")

# Build the question-answering model on top of the half-dataset entity embeddings.
model2 = RelationExtractor(embedding_dim=256, num_entities=len(idx2entity), relation_dim=50,
                           pretrained_embeddings=embedding_matrix, device=device)

# Trained weights of the half model.
fname = "best_score_model.pt"

# This notebook only runs inference: switch dropout off and make batch norm use
# its stored running statistics instead of per-sample statistics.
model2.eval()

# Restore the trained parameters. Kept as the last expression so the cell still
# displays the key-matching report.
model2.load_state_dict(torch.load(fname, map_location=device))

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model is ComplEx
Frozen: True
torch.Size([479812, 100])


<All keys matched successfully>

For the full dataset:

In [None]:
# Load the best LibKGE checkpoint for the full dataset.
checkpoint_full = load_checkpoint('new_checkpoint_best.pt')


In [None]:
# Rebuild the KGE model from the full-dataset checkpoint.
full_model = KgeModel.create_from(checkpoint_full)



Loading configuration of dataset fb_natural_language_data_full from /content/gdrive/.shortcut-targets-by-id/12lK9g6Ccl-njCvAuL28xCjV1XOyHp9bU/LibKGE Test/kge/data/fb_natural_language_data_full ...




In [None]:
# Entity name -> embedding tensor for the full dataset.
e_full = getEntityEmbeddings(full_model, "data/fb_natural_language_data_full/entity_ids.del")


In [None]:
# Name<->index lookups plus the embeddings as a list in index order (full dataset).
entity2idx_full, idx2entity_full, embedding_matrix_full = prepare_embeddings(e_full)

In [None]:
# Sanity check: both lookup tables should report the same number of entities.
len(entity2idx_full), len(idx2entity_full)

(701166, 701166)

In [None]:

from transformers import RobertaModel
device = torch.device("cpu")

# Build the question-answering model on top of the full-dataset entity embeddings.
model2_full = RelationExtractor(embedding_dim=256, num_entities=len(idx2entity_full), relation_dim=50,
                                pretrained_embeddings=embedding_matrix_full, device=device)

# Trained weights of the full model.
fname = "retrained_best_score_model.pt"

# This notebook only runs inference: switch dropout off and make batch norm use
# its stored running statistics instead of per-sample statistics.
model2_full.eval()

# Restore the trained parameters. Kept as the last expression so the cell still
# displays the key-matching report.
model2_full.load_state_dict(torch.load(fname, map_location=device))

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model is ComplEx
Frozen: True
torch.Size([701166, 100])


<All keys matched successfully>

Accuracy evaluation starts here. Given the correct head entity, how often does the model return a correct answer to the natural-language question?

In [None]:
# Read the test questions into memory, one example per list element
# (each element keeps its trailing newline).
with open('qa_test_nl.txt') as f:  
  qa_test_lines = f.readlines()
  

Compute the top-1 accuracy. Inputs: the test data, one example per line in the form `natural language question [head entity] answer 1|answer 2|...`; the entity embeddings; the RelationExtractor model; and the `entity2idx` / `idx2entity` lookup tables.

In [None]:

def accuracy(test_data, e, model2, entity2idx, idx2entity, device=None):
    """Top-1 answer accuracy of a QA model when the correct head entity is given.

    Args:
        test_data: Iterable of lines shaped like
            ``question [head entity]<sep>answer 1|answer 2|...``.
        e: Entity name -> embedding dict (passed to ``DatasetMetaQA``).
        model2: The question-answering model used for scoring.
        entity2idx: dict, entity name -> index.
        idx2entity: dict, index -> entity name.
        device: Torch device for the inputs. Defaults to the device of the
            model's parameters, so no notebook-level global is required.

    Returns:
        float: fraction of non-blank test lines whose best-scoring entity is
        one of the listed answers; ``0.0`` when there are no such lines.

    Raises:
        ValueError: if a non-blank line lacks the ``[head entity]`` part.
    """
    # Ignore blank lines (e.g. a trailing empty line in the test file).
    examples = [line for line in test_data if line.strip()]
    if not examples:
        return 0.0

    if device is None:
        device = next(model2.parameters()).device

    # Only `tokenize_question` is needed; build the tokenizer once rather than
    # once per test line.
    question_tokenizer = DatasetMetaQA([], e, entity2idx)

    correct_count = 0
    for line in examples:
        if '[' not in line or ']' not in line:
            raise ValueError("Malformed test line (expected 'question [head] answers'): {!r}".format(line))
        question = line.split(' [')[0]
        given_head_entity = line.split('[', 1)[1].split(']')[0]
        raw_answers = line.split(']')[-1].replace('\t', '').replace('\n', '')
        correct_answers = {answer.strip() for answer in raw_answers.split("|")}

        # A head entity the model has no embedding for cannot be answered;
        # the example stays in the denominator and counts as incorrect.
        if given_head_entity not in entity2idx:
            continue

        top_answers = get_answer(question, given_head_entity, 1, device, question_tokenizer,
                                 model2, entity2idx, idx2entity)
        predicted = next(iter(top_answers))
        if predicted.strip() in correct_answers:
            correct_count += 1

    return correct_count / len(examples)
    

In [None]:
# Top-1 accuracy of the full-dataset model on the test questions
# (the returned fraction is shown as the cell output).
accuracy(qa_test_lines, e_full, model2_full, entity2idx_full, idx2entity_full)

0


Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

1
correct count so far 1
2
3
correct count so far 2
4
correct count so far 3
5
correct count so far 4
6
7
correct count so far 5
8
correct count so far 6
9
correct count so far 7
10
correct count so far 8
11
correct count so far 9
12
13
correct count so far 10
14
correct count so far 11
15
16
17
18
correct count so far 12
19
correct count so far 13
20
correct count so far 14
21
correct count so far 15
22
23
correct count so far 16
24
correct count so far 17
25
26
27
correct count so far 18
28
29
correct count so far 19
30
correct count so far 20
31
32
correct count so far 21
33
correct count so far 22
34
35
36
correct count so far 23
37
38
correct count so far 24
39
correct count so far 25
40
correct count so far 26
41
correct count so far 27
42
correct count so far 28
43
correct count so far 29
44
45
46
47
correct count so far 30
48
49
correct count so far 31
50
51
correct count so far 32
52
53
correct count so far 33
54
correct count so far 34
55
56
correct count so far 35
57
correct

0.636986301369863

In [None]:
# Top-1 accuracy of the half-dataset model on the same test questions
# (the returned fraction is shown as the cell output).
accuracy(qa_test_lines, e, model2, entity2idx, idx2entity)

0
correct count so far 1
1
2
correct count so far 2
3
4
5
correct count so far 3
6
7
correct count so far 4
8
correct count so far 5
9
10
11
correct count so far 6
12
13
14
correct count so far 7
15
16
17
18
correct count so far 8
19
20
correct count so far 9
21
correct count so far 10
22
23
correct count so far 11
24
25
26
27
correct count so far 12
28
correct count so far 13
29
correct count so far 14
30
31
32
correct count so far 15
33
34
correct count so far 16
35
36
37
38
correct count so far 17
39
correct count so far 18
40
41
42
correct count so far 19
43
44
45
46
47
48
49
50
51
correct count so far 20
52
53
54
55
56
correct count so far 21
57
58
59
correct count so far 22
60
correct count so far 23
61
62
63
64
65
66
correct count so far 24
67
68
69
70
71
72
correct count so far 25
73
74
75
76
77
correct count so far 26
78
correct count so far 27
79
correct count so far 28
80
81
82
83
84
85
86
correct count so far 29
87
correct count so far 30
88
89
90
91
92
93
94
95
96
97
98
co

0.3356164383561644

Result: 33.6% top-1 accuracy with the half model versus 63.7% with the full model.

--------------------------------------------------