<a href="https://colab.research.google.com/github/parvathysarat/kg-qa/blob/master/qa_task_metaqa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Size of our transG embeddings : 50 (both entities and relations)

This notebook contains:
- utility functions to get entities, relations and documents from the processed MetaQA files
- training functions
- embedding functions
- main

<br> TODO: split this notebook into util, training and main files, and add a config file with the specifications.

In [0]:
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import os
import time, datetime
import sys
import json
from tqdm import tqdm
os.chdir('./kg-qa/')
from dataloader import DataLoader
from questionanswering import QuestionAnswering

In [0]:
# !git clone https://github.com/parvathysarat/kg-qa
# ./kg-qa/data/transg/ has the embeddings obtained by TransG for MetaQA dataset

In [0]:
# Move into the TransG embedding directory so the loaders below can open
# 'entity.txt', 'weight.txt' and 'relation_<name>.txt' by bare file name.
# NOTE(review): this path is spelled 'transG' while other cells use 'transg';
# on a case-sensitive filesystem only one of them can be right — confirm.
os.chdir('./data/transG/')

### Loading MetaQA datasets - entities, relations

In [0]:
import glob
import shutil
# Move every .txt file found directly under ./kg-qa/data/ into ./kg-qa/data/transg/.
# NOTE(review): both paths are relative to the current working directory, which
# other cells change with os.chdir — confirm this cell is run from the directory
# that contains kg-qa/.
for file in glob.glob('./kg-qa/data/*.txt'):
  shutil.move(file,'./kg-qa/data/transg/')

In [5]:
# entities.txt from MetaQa : list of entities

def get_entities_relns():
  """Load the MetaQA entity and relation lists as ``{line_number: name}`` dicts.

  Reads ``../MetaQA/entities.txt`` and ``../MetaQA/relations.txt`` (relative to
  the current working directory), one name per line, stripping surrounding
  whitespace. For each file the number of entries and the first entry are
  printed.

  Returns
  -------
  (dict, dict)
      ``entities`` and ``relations``, each keyed by the 0-based line number.
  """
  def _read_indexed(path):
    # One stripped name per line, keyed by its position in the file.
    with open(path) as handle:
      return dict(enumerate(row.strip() for row in handle))

  entities = _read_indexed('../MetaQA/entities.txt')
  print(len(entities),entities[0])

  relations = _read_indexed('../MetaQA/relations.txt')
  print(len(relations),relations[0])

  return entities, relations
  
# Module-level lookup tables {id: name}; `entities` is read by get_num_entity below.
entities, relations = get_entities_relns()

43234 Kismet
9 has_imdb_rating


### Loading pretrained entity weights (transG) for MetaQA

In [0]:
# entity.txt storing TransG embeddings of MetaQA

def get_num_entity(param='num'):
  """Read TransG's ``entity.txt`` from the current working directory.

  Parameters
  ----------
  param : str
      ``'num'`` (default) returns the number of lines in the file.
      ``'weights'`` parses the embedding vector stored on each line.

  Returns
  -------
  int
      When ``param == 'num'``.
  (dict, torch.FloatTensor)
      When ``param == 'weights'``: ``weights_dict`` maps entity name to its
      float32 vector, and ``pretrained_weights`` holds the same vectors as
      rows, in file order.

  Raises
  ------
  ValueError
      If ``param`` is neither ``'num'`` nor ``'weights'`` (previously this
      surfaced as an UnboundLocalError on ``arr``).

  Notes
  -----
  Line ``i`` of the file is matched against ``entities[i]`` (the module-level
  dict). A line is ``<entity name, possibly several words> <v1> <v2> ...``; the
  name is located by finding the token prefix that equals the expected name.
  Lines whose prefix never matches are skipped silently, so row ``k`` of the
  returned tensor is not guaranteed to be entity id ``k`` once a line has been
  skipped.
  NOTE(review): names are compared after whitespace normalisation by
  ``split()``/``join``; an entity whose name contains repeated spaces would
  presumably never match — confirm against the data.
  """
  if param not in ('num', 'weights'):
    raise ValueError("param must be 'num' or 'weights', got %r" % (param,))

  with open('entity.txt') as f:
    if param=='num':
      return sum(1 for _ in f)

    arr = []
    weights_dict = {}
    ct=0
    for line in f:
      entity = entities[ct]
      ct+=1
      # Split once per line instead of once per candidate prefix.
      tokens = line.split()
      for idx in range(len(tokens)):
        # A numeric first token is never tested as the (empty) name prefix.
        if idx==0 and tokens[0].isnumeric():
          continue

        if ' '.join(tokens[:idx])==entity:
          vector = np.array(tokens[idx:],dtype=np.float32)
          arr.append(vector)
          weights_dict[entity] = vector
          break
    print(len(weights_dict),ct)

  # Convert through one contiguous ndarray rather than a list of ndarrays.
  pretrained_weights = torch.FloatTensor(np.array(arr,dtype=np.float32))
  return weights_dict, pretrained_weights
# Count the lines of entity.txt (default 'num' mode of get_num_entity).
num_entities = get_num_entity()

In [7]:
# Compare the number of names in entities.txt with the number of embedding rows
# actually parsed from entity.txt. In the recorded output the two differ by one
# (43234 names vs 43233 rows), i.e. one line did not match its expected name.
num_entities = get_num_entity()
print(len(entities))
entity_weights_dict, pretrained_entity_weights = get_num_entity('weights')
print(pretrained_entity_weights.shape[0])

43234
43233 43234
43233


### Store pretrained weights in embedding layers

In [0]:
def initialize_embeddings(pretrained_weights):
  """Wrap pretrained vectors in a frozen ``nn.Embedding`` with a padding row.

  Parameters
  ----------
  pretrained_weights : torch.Tensor
      2-D tensor of shape ``(num_items, dim)``; row ``i`` is the vector of id ``i``.

  Returns
  -------
  nn.Embedding
      Layer with ``num_items + 1`` rows of width ``dim``. Rows ``0..num_items-1``
      are the pretrained vectors; the extra last row (``padding_idx ==
      num_items``) is all zeros. The weight does not require gradients.

  Notes
  -----
  The weight used to be replaced by the ``num_items``-row tensor itself, which
  dropped the padding row and made ``padding_idx`` an out-of-range index. The
  embedding width is now taken from the input instead of being fixed at 50.
  """
  num_items, dim = pretrained_weights.shape
  embeddings = nn.Embedding(num_embeddings=num_items + 1, embedding_dim=dim, padding_idx=num_items)
  # Append the zero padding row so the weight really has num_items + 1 rows.
  padding_row = pretrained_weights.new_zeros(1, dim)
  padded_weights = torch.cat([pretrained_weights, padding_row], dim=0)
  embeddings.weight = nn.Parameter(padded_weights, requires_grad=False)
  return embeddings

# entity_embeddings = initialize_embeddings(pretrained_entity_weights)

In [0]:

# entity_embeddings

### Loading pretrained relation weights

In [0]:
# function to return the cluster number for each relation
# (n_cluster # of embeddings for each relation from TransG based on GMM)

def get_relations_clust():
  """Pick, for every relation, the index of its highest-weighted cluster.

  Reads ``weight.txt`` from the current working directory. Each line is
  ``<relation name> <w0> <w1> ...``; the index of the largest weight is kept.

  Returns
  -------
  dict
      ``{relation name: index of the largest weight on that line}``.
  """
  dominant_cluster = {}
  with open('weight.txt') as handle:
    for row in handle:
      fields = row.split()
      mixture_weights = np.array(fields[1:], dtype = np.float32)
      dominant_cluster[fields[0]] = np.argmax(mixture_weights)
  return dominant_cluster

# Module-level {relation name: chosen cluster index}; read by get_rel_embeddings.
rel_clusters = get_relations_clust()

(TransG uses a Gaussian mixture model, so each relation has a mixture of embedding vectors representing its multiple semantic meanings. The weight of each mixture component/cluster is stored in `weight.txt`; `rel_clusters` keeps, for each relation, the index of the highest-weighted cluster. Here we have used num_clusters=4.)

In [11]:
# Display the relation -> selected cluster index mapping.
rel_clusters

{'directed_by': 2,
 'has_genre': 3,
 'has_imdb_rating': 3,
 'has_imdb_votes': 3,
 'has_tags': 3,
 'in_language': 2,
 'release_year': 3,
 'starred_actors': 2,
 'written_by': 2}

In [0]:
def get_rel_embeddings(relation):
  """Return the embedding of the cluster selected for ``relation``.

  Reads ``relation_<relation>.txt`` from the current working directory. Each
  non-blank line is ``<cluster index> <v1> <v2> ...``; the line whose cluster
  index equals ``rel_clusters[relation]`` (module-level dict) is returned.

  Parameters
  ----------
  relation : str
      Relation name; must be a key of ``rel_clusters``.

  Returns
  -------
  numpy.ndarray
      float32 vector of the selected cluster.

  Raises
  ------
  ValueError
      If no line carries the selected cluster index (previously ``None`` was
      returned silently and only failed later during tensor conversion).
  """
  with open('relation_'+relation+'.txt') as f:
    target_cluster = rel_clusters[relation]
    for line in f:
      tokens = line.split()
      if not tokens:
        continue  # tolerate blank lines
      # Parse the whole first token: int(line[0]) only saw its first character,
      # so cluster indices >= 10 could never match.
      if int(tokens[0])==target_cluster:
        return np.array(tokens[1:],dtype=np.float32)
  raise ValueError("cluster %s not found in relation_%s.txt" % (target_cluster, relation))

# Collect one vector per relation (keyed by name), then stack them into a single
# tensor whose row order follows the iteration order of rel_clusters.
rel_weights_dict = {}
for rel in rel_clusters:
  rel_weights_dict[rel] = get_rel_embeddings(rel)

pretrained_relation_weights = torch.FloatTensor(list(rel_weights_dict.values()))

In [0]:
# Wrap the relation vectors in a frozen nn.Embedding via initialize_embeddings.
relation_embeddings = initialize_embeddings(pretrained_relation_weights)

In [0]:
# print(entity_embeddings,relation_embeddings)

Map vocabulary words, entities and relations to integer ids.
Each mapping is a dict of the form `{value: id}`.

In [0]:
def map_ids(file):
  """Build a ``{value: id}`` lookup from ``../MetaQA/<file>.txt``.

  Each line is stripped and assigned the current size of the mapping as its
  id, so distinct lines receive 0, 1, 2, ... in file order.

  Parameters
  ----------
  file : str
      Base name (without ``.txt``) of the file inside ``../MetaQA/``.

  Returns
  -------
  dict
      Mapping from stripped line text to integer id.
  """
  source_path = '../MetaQA/' + file + '.txt'
  value_to_id = {}
  with open(source_path) as handle:
    for raw in handle:
      key = raw.strip()
      value_to_id[key] = len(value_to_id)
  return value_to_id

# Module-level {value: id} tables, each read from ../MetaQA/<name>.txt.
# vocab_ids and entity_ids are used by index_doc_entities; all three are passed
# to DataLoader further down.
vocab_ids = map_ids('vocab')
relation_ids = map_ids('relations')
entity_ids = map_ids('entities')

In [16]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize
from pprint import pprint

def map_documents(file):
  """Load a JSON-lines document file and tokenize every document.

  Each non-blank line is parsed as one JSON object. The object is stored under
  its ``'documentId'`` and gains a ``'tokens'`` list:

  * without a title: the tokens of ``document.text``;
  * with a title: title tokens, then the separator ``"/"``, then the tokens of
    ``document.text``.

  The title goes first because ``index_doc_entities`` numbers title words from
  0 and shifts document words by ``len(title tokens) + 1``; the previous order
  (document, separator, title) did not line up with those positions.
  NOTE(review): ``index_doc_entities`` applies the ``+ 1`` shift even when a
  document has no title, while no separator is emitted here in that case —
  confirm which of the two is intended.

  Parameters
  ----------
  file : str
      Path of the JSON-lines file (read as UTF-8).

  Returns
  -------
  dict
      ``{documentId: parsed document dict including 'tokens'}``.
  """
  documents = {}
  with open(file,encoding='utf-8') as f:
    for line in f:
      if not line.strip():
        continue  # tolerate blank lines between records
      doc = json.loads(line)
      tokens = word_tokenize(doc['document']['text'])
      if 'title' in doc:
        tokens = word_tokenize(doc['title']['text']) + ["/"] + tokens
      doc['tokens'] = tokens
      documents[doc['documentId']] = doc

  return documents

# documents = map_documents('../MetaQA/documents.json')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
def index_doc_entities(documents, max_len=40):
  """Index entity mentions and word ids for every document.

  Relies on the module-level ``vocab_ids``, ``entity_ids`` and ``word_tokenize``.

  Parameters
  ----------
  documents : dict
      ``{doc id: document dict}``. Each document needs ``'tokens'`` and
      ``document.entities``; ``title.entities`` / ``title.text`` are used when a
      ``'title'`` key is present. Every entity has ``'text'``, ``'start'`` and
      ``'end'`` (word positions, end exclusive).
  max_len : int, optional
      Number of token slots kept per document (default 40, the former
      hard-coded value).

  Returns
  -------
  (dict, dict)
      ``doc_entity_index[doc id] = (entity ids, word positions, word weights)``
      — three parallel lists with one item per word of each mention; a word's
      weight is ``1 / (number of words in its mention)``.
      ``all_doc_texts[doc id]`` is an int array of length ``max_len`` holding
      vocabulary ids (``vocab_ids['__unk__']`` for unknown words); unused slots
      hold ``len(vocab_ids)``. Key ``-1`` is an all-padding document.

  Raises
  ------
  ValueError
      If the weight and position lists end up with different lengths.

  Notes
  -----
  Title mentions keep their own positions; document mentions are shifted by
  ``len(title tokens) + 1`` and truncated at ``max_len``. This assumes the token
  layout "title, one separator, document text".
  Title mentions are not truncated at ``max_len`` (unchanged behaviour).
  A zero-length title mention no longer raises ZeroDivisionError; it is ignored
  for the weights exactly like a zero-length document mention.
  """
  pad_word_id = len(vocab_ids)
  doc_entity_index = {}
  all_doc_texts = {}
  all_doc_texts[-1] = np.full(max_len, pad_word_id, dtype=int)
  for doc_id, doc in documents.items():
    word_ids, entity_ids_global = [],[]
    word_weights = []
    if 'title' in doc:
      for ent in doc['title']['entities'] :
        # number of words in the mention (end is exclusive)
        len_entity = ent['end'] - ent['start']
        entity_ids_global.extend([entity_ids[ent['text']]]*len_entity)
        word_ids.extend(range(ent['start'],ent['end']))
        # word weight = 1/(# of words); guard mirrors the document branch
        if len_entity!=0:
          word_weights.extend([1.0/len_entity]*len_entity)
      len_title = len(word_tokenize(doc['title']['text']))
    else:
      len_title = 0

    # document words sit after the title tokens and one separator token
    offset = len_title + 1
    for ent in doc['document']['entities']:
      start = ent['start'] + offset
      # mentions starting beyond the cap are dropped; others are truncated
      if start >= max_len: continue
      len_entity = min(max_len, ent['end'] + offset) - start
      entity_ids_global.extend([entity_ids[ent['text']]]*len_entity)
      word_ids.extend(range(start, start + len_entity))
      if len_entity!=0:
        word_weights.extend([1.0/len_entity]*len_entity)

    if len(word_weights)!=len(word_ids):
      print(len(word_weights),len(word_ids))
      raise ValueError("Lengths of weights and id's do not match!")
    doc_entity_index[doc_id] = (entity_ids_global,word_ids,word_weights)

    each_doc = np.full(max_len, pad_word_id, dtype=int)
    for ind, word in enumerate(doc['tokens'][:max_len]):
      if word in vocab_ids:
        each_doc[ind] = vocab_ids[word]
      else:
        each_doc[ind] = vocab_ids['__unk__']

    all_doc_texts[doc_id] = each_doc

  return doc_entity_index,all_doc_texts

# doc_entity_index[doc_id] == ([entity id, repeated once per word of the entity], [word positions of the entity in the text], [entity weight, repeated once per word of the entity])

# doc_entity_index, all_doc_texts = index_doc_entities(documents)

In [0]:
# assert (len(all_doc_texts.keys())-1) == len(documents)== len(doc_entity_index)

In [0]:
# all_doc_texts[0],documents[0],doc_entity_index[0]

In [0]:
# len(vocab_ids)

In [0]:
def use_cuda(var):
    """Return ``var.cuda()`` when CUDA is available, otherwise ``var`` unchanged."""
    return var.cuda() if torch.cuda.is_available() else var

In [0]:
def get_model(pretrained_entity_weights,pretrained_relation_weights, num_entities):
  """Build the QuestionAnswering model and pass it through ``use_cuda``.

  Parameters
  ----------
  pretrained_entity_weights, pretrained_relation_weights
      Forwarded positionally to ``QuestionAnswering``.
  num_entities : int
      Forwarded as ``num_entity``.

  Returns
  -------
  The ``QuestionAnswering`` instance returned by ``use_cuda``. ``num_word`` is
  taken from the size of the module-level ``vocab_ids``.
  """
  # TODO: optionally restore a previously saved state (e.g. "./model/best_model_doc"),
  # dropping 'entity_embedding.weight' and loading with strict=False.
  model = QuestionAnswering(
      pretrained_entity_weights,
      pretrained_relation_weights,
      num_word=len(vocab_ids),
      num_entity=num_entities,
  )
  return use_cuda(model)

In [0]:
# os.chdir('./data/transg/')

In [24]:
# def train():
# Flat setup script: reload every resource, build the training DataLoader, then
# create the model and optimizer. The statements are order-dependent because the
# working directory changes part-way through (see os.chdir below).

# Entities, relations and TransG embeddings. These loaders open files by bare
# name or via '../MetaQA/', i.e. relative to the current working directory.
entities, relations = get_entities_relns()
num_entities = get_num_entity()
entity_weights_dict, pretrained_entity_weights = get_num_entity('weights')  
entity_embeddings = initialize_embeddings(pretrained_entity_weights)  
rel_clusters = get_relations_clust()  
# One vector per relation: the cluster chosen in rel_clusters.
rel_weights_dict= {}
pretrained_relation_weights = []
for rel in rel_clusters:
  rel_weights_dict[rel] = get_rel_embeddings(rel)
  pretrained_relation_weights.append(rel_weights_dict[rel])

pretrained_relation_weights = torch.FloatTensor(pretrained_relation_weights)  
# {value: id} lookup tables.
vocab_ids = map_ids('vocab')
relation_ids = map_ids('relations')
entity_ids = map_ids('entities')
# Move two directories up; every path below is relative to that directory.
os.chdir('../../')
documents = map_documents('./data/MetaQA/documents.json')
doc_entity_index, all_doc_texts = index_doc_entities(documents)
# all_doc_texts carries one extra entry: the -1 padding document.
assert (len(all_doc_texts.keys())-1) == len(documents)== len(doc_entity_index)
train_json = './data/MetaQA/train.json'
# DataLoader argument order: (documents, data, doc_entity_index, all_doc_texts, vocab_ids, relation_ids, entity_ids)
train_data = DataLoader(documents, train_json, doc_entity_index, all_doc_texts,vocab_ids, relation_ids, entity_ids)

# Generators
  
# test_documents = documents
# test_doc_entity_index = doc_entity_index
# test_all_doc_text = all_doc_texts

#creating model
qa_model = get_model(pretrained_entity_weights,pretrained_relation_weights, num_entities)
# Only parameters that require gradients are handed to the optimizer.
train_params = [param for param in qa_model.parameters() if param.requires_grad]
# optimizer
# learning rate 0.01 preliminary expt
optimizer = torch.optim.Adam(train_params, lr=0.01)

# best_dev_acc is initialised here but not updated anywhere in this cell.
best_dev_acc = 0.0
num_epochs = 5


43234 Kismet
9 has_imdb_rating
43233 43234
10000
Data  10000


  2%|▏         | 180/10000 [00:00<00:05, 1789.62it/s]

Max facts 124
KB relation 9
Local entities (10000, 600)
KB fact rels size (10000, 124)
0


 13%|█▎        | 1338/10000 [00:00<00:04, 1764.11it/s]

1000


 23%|██▎       | 2308/10000 [00:01<00:04, 1912.52it/s]

2000


 33%|███▎      | 3290/10000 [00:01<00:03, 1865.36it/s]

3000


 43%|████▎     | 4262/10000 [00:02<00:03, 1879.02it/s]

4000


 52%|█████▏    | 5213/10000 [00:02<00:02, 1801.49it/s]

5000


 64%|██████▍   | 6410/10000 [00:03<00:01, 1869.70it/s]

6000


 74%|███████▍  | 7381/10000 [00:04<00:01, 1793.42it/s]

7000


 84%|████████▍ | 8394/10000 [00:04<00:00, 1798.59it/s]

8000


 94%|█████████▍| 9381/10000 [00:05<00:00, 1967.25it/s]

9000


100%|██████████| 10000/10000 [00:05<00:00, 1848.44it/s]


In [25]:
# Training loop: `num_epochs` passes over the training data in shuffled
# mini-batches, saving the model weights after every epoch.
# Uses train_data, qa_model and optimizer created by the setup cell.
# NOTE(review): `accuracy` is not defined or imported anywhere in the visible
# notebook — confirm where it comes from before running.
num_epochs = 3
batch_size = 50
save_path = "./model/best_model_doc"
for epoch in range(num_epochs):
  print("Epoch ",epoch)
  train_data.reset_batches(is_sequential=False)

  qa_model.train()
  train_loss, train_accuracy, train_max_accuracy = [],[],[]
  # A trailing partial batch is dropped by the integer division.
  num_batches = train_data.num_data//batch_size
  print(train_data.num_data,num_batches)
  for iteration in tqdm(range(num_batches)):
    # get batch
    batch = train_data.get_batch(iteration, batch_size, fact_dropout = False)
    loss, pred, _ = qa_model(batch)
    pred = pred.data.cpu().numpy()
    # save accuracies
    acc, max_acc = accuracy(pred,batch[-1])
    # .item() extracts the Python float; indexing a 0-dim loss tensor with [0]
    # is rejected by current PyTorch.
    train_loss.append(loss.item())
    train_accuracy.append(acc)
    train_max_accuracy.append(max_acc)

    # back propagation (one zero_grad is enough: the optimizer holds every
    # trainable parameter of qa_model)
    optimizer.zero_grad()
    loss.backward()

    # clip gradient (in-place variant; clip_grad_norm without the trailing
    # underscore is deprecated)
    torch.nn.utils.clip_grad_norm_(qa_model.parameters(),1)
    optimizer.step()

  print("Avg train loss :", sum(train_loss)/len(train_loss))
  print("Max train accuracy :", sum(train_max_accuracy)/len(train_max_accuracy))
  print("Avg training accuracy :", sum(train_accuracy)/len(train_accuracy))

  # The file is overwritten after every epoch, whether or not it is the best one.
  print("Saving model to ",save_path)
  os.makedirs(os.path.dirname(save_path), exist_ok=True)
  torch.save(qa_model.state_dict(), save_path)

    # return train_accuracy

# if __name__ == "__main__":
  # train()





Epoch  0
10000 200
Padding 13545
124 torch.Size([50, 124]) torch.Size([50, 124])
Padding 9
After flatten torch.Size([50, 124, 100])
Padding 13545


  0%|          | 0/200 [00:01<?, ?it/s]

torch.Size([50, 600]) Embedding(43234, 50, padding_idx=43233)
Padding 43233





AssertionError: ignored

In [0]:
# Go back into the TransG embedding directory so get_rel_embeddings can open
# 'relation_<name>.txt' by bare file name again.
# NOTE(review): the path is relative to the current working directory — confirm
# this cell runs from the repository root.
os.chdir('./data/transg/')
# os.getcwd()

In [0]:
# Build a frozen relation embedding table with one extra zero row for padding.
# Assigning the bare (num_relations x dim) tensor as the weight would drop the
# padding row and leave padding_idx pointing past the end of the table, so the
# zero row is appended explicitly. Sizes are taken from the data instead of the
# former hard-coded 9 relations / 50 dimensions.
rel_weights_dict= {}
pretrained_relation_weights = []
for rel in rel_clusters:
  rel_weights_dict[rel] = get_rel_embeddings(rel)
  pretrained_relation_weights.append(rel_weights_dict[rel])

# Convert through one contiguous ndarray rather than a list of ndarrays.
pretrained_relation_weights = torch.FloatTensor(np.array(pretrained_relation_weights, dtype=np.float32))
num_relations, relation_dim = pretrained_relation_weights.shape
relation_embedding = nn.Embedding(num_embeddings=num_relations + 1, embedding_dim=relation_dim, padding_idx=num_relations)
relation_embedding.weight = nn.Parameter(
    torch.cat([pretrained_relation_weights, pretrained_relation_weights.new_zeros(1, relation_dim)], dim=0),
    requires_grad=False,
)


In [0]:
# Look up the vector stored for relation id 4. nn.Embedding expects a tensor of
# integer indices, not a bare Python int.
relation_embedding(torch.LongTensor([4]))

In [0]:
# Vocabulary size; index_doc_entities uses the same value to fill unused token slots.
len(vocab_ids)