# initializations

In [329]:
import networkx as nx
from graph_embeddings.data_loader import DataLoader
import os
import pickle
import numpy as np
from random import choice

# observing dataset 

In [12]:
data_loader = DataLoader(dataset='MetaQA', reverse_rel=True)

In [13]:
# Load the entity and relation vocabularies provided by the data loader.
entities_voc, relations_voc = data_loader.load_entity_relations_vocab()
# Inverted entity vocabulary (value -> key); later cells index it with entity ids
# to recover the entity name.
entities_inv_voc = {v: k for k, v in entities_voc.items()}

In [318]:
relations_inv_voc = {v: k for k, v in relations_voc.items()}
relations_voc


{'written_by_reverse': 0,
 'has_imdb_rating_reverse': 1,
 'has_imdb_votes_reverse': 2,
 'starred_actors': 3,
 'release_year_reverse': 4,
 'directed_by_reverse': 5,
 'directed_by': 6,
 'has_tags': 7,
 'in_language': 8,
 'in_language_reverse': 9,
 'starred_actors_reverse': 10,
 'written_by': 11,
 'has_imdb_rating': 12,
 'has_genre': 13,
 'has_imdb_votes': 14,
 'has_tags_reverse': 15,
 'release_year': 16,
 'has_genre_reverse': 17}

In [5]:
#entities_voc

# creating embeddings from KGE

## loading trained kge model

In [13]:
from kg_env import load_kge_model
import torch

# Load the trained TuckER knowledge-graph-embedding model for MetaQA through the
# project helper. The device falls back to the CPU when CUDA is not available.
# NOTE(review): `path` is presumably the checkpoint location and the dropout /
# regularisation values presumably have to match the training run — confirm
# against kg_env.load_kge_model.
model = load_kge_model(
    dataset_name='MetaQA',
    model_name='TuckER',
    ent_vec_dim=200,
    rel_vec_dim=200,
    loss_type='CE',
    device=('cuda' if torch.cuda.is_available() else 'cpu'),
    path='TuckER_MetaQA',
    input_dropout=0.3,
    hidden_dropout1=0.4,
    hidden_dropout2=0.5,
    l3_reg=0.2,
)

operating on cpu
building tucker model for embedding generation
TuckER(
  (input_dropout): Dropout(p=0.3, inplace=False)
  (hidden_dropout1): Dropout(p=0.4, inplace=False)
  (hidden_dropout2): Dropout(p=0.5, inplace=False)
  (bn0): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn1): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (E): Embedding(43234, 200, padding_idx=0)
  (R): Embedding(18, 200, padding_idx=0)
)
operating on cpu
building tucker model for embedding generation


In [14]:
# Sample (name -> id) pairs, presumably copied from entities_voc:
#   'Adrian Moat': 970,
#   'Adrian Pasdar': 971,
#   'Adrian Rawlins': 972,
#   'Adrian Shergold': 973

# Sanity check: look up three rows of the entity embedding table and inspect
# the shape of the result.
model.E(torch.Tensor([970, 971, 972]).long()).shape


torch.Size([3, 200])

## generating embeddings

In [24]:
from kg_env import generate_entity_embeddings
import pickle

# Build the embedding lookups from the trained model and the vocabularies.
ent_emb_dict = generate_entity_embeddings(model, entities_voc)
# NOTE(review): the relation embeddings are produced with the *entity* helper.
# If that helper reads model.E rather than model.R, these vectors are rows of
# the entity table, not relation embeddings — confirm in kg_env.
rel_emb_dict = generate_entity_embeddings(model, relations_voc)

# Persist both lookups so later sessions can skip the model-loading step.
with open('data/MetaQA/entity_emb.pickle', 'wb') as f:
    pickle.dump(ent_emb_dict, f)


with open('data/MetaQA/relation_emb.pickle', 'wb') as f:
    pickle.dump(rel_emb_dict, f)
    

    


## load embeddings

In [75]:
# Reload the embedding lookups written by the "generating embeddings" cell.
# CAUTION: pickle.load executes arbitrary code from the file; only load pickle
# files that were produced locally or come from a trusted source.
with open('data/MetaQA/entity_emb.pickle', 'rb') as f:
    ent_emb_dict = pickle.load(f)


with open('data/MetaQA/relation_emb.pickle', 'rb') as f:
    rel_emb_dict = pickle.load(f)
    

In [81]:
ent_emb_dict[list(ent_emb_dict.keys())[0]].shape

(200,)

## generating question embeddings

## analyze qa dataset

In [15]:
def load_raw_qa(path, mode='train_1hop'):
    """Read one QA split file and return its lines.

    Args:
        path: Directory that contains the ``qa_<mode>.txt`` files.
        mode: Split identifier inserted into the file name, e.g.
            ``'train_1hop'`` or ``'dev_3hop'``.

    Returns:
        A list with one string per line of the file (leading and trailing
        whitespace of the whole file is stripped first). An empty or
        whitespace-only file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Explicit encoding so the result does not depend on the platform's
    # default locale (entity names may contain non-ASCII characters).
    with open(os.path.join(path, f'qa_{mode}.txt'), 'r', encoding='utf-8') as f:
        text = f.read().strip()
    # ''.split('\n') would return [''], i.e. one bogus empty question.
    return text.split('\n') if text else []

qa_train_raw = load_raw_qa('./data/QA_data/MetaQA/')

In [28]:
import re


def extract_question_entity_target(raw_questions):
    """Split raw QA lines into questions, topic entities and answer lists.

    Every line must consist of a question and a ``|``-separated answer list,
    joined by a single tab character. The topic entity is the first
    bracketed span of the question, e.g. ``who directed [Jaws]``.

    Args:
        raw_questions: Iterable of raw lines as returned by ``load_raw_qa``.

    Returns:
        A tuple ``(questions, entities, targets)`` of three parallel lists:
        the question text with the square brackets removed, the topic entity
        text, and the list of answer strings.

    Raises:
        ValueError: If a line does not contain exactly one tab, or if the
            question has no ``[entity]`` marker.
    """
    # Raw string: '\[' inside a normal literal is an invalid escape sequence
    # (a warning today, an error in future Python versions). Compiled once
    # instead of on every loop iteration.
    entity_pattern = re.compile(r'\[.*?\]')

    all_questions = []
    all_entities = []
    all_targets = []

    for raw_q in raw_questions:
        question, targets = raw_q.split('\t')

        match = entity_pattern.search(question)
        if match is None:
            raise ValueError(f'no [entity] marker found in question: {question!r}')
        entity = match.group(0).replace('[', '').replace(']', '')

        # TODO: should the entity be replaced with some special token?
        question = question.replace(']', '').replace('[', '')
        targets = targets.strip().split('|')

        all_questions.append(question)
        all_targets.append(targets)
        all_entities.append(entity)

    return all_questions, all_entities, all_targets


# NOTE: despite its name, qa_train_raw is rebound here to the dev 3-hop split.
qa_train_raw = load_raw_qa('./data/QA_data/MetaQA/', mode='dev_3hop')
questions, _, _ = extract_question_entity_target(qa_train_raw)
# Longest question in this split, measured in characters (not tokens).
print(max([len(s) for s in questions]))

150


In [47]:
from typing import List, Dict
from transformers import RobertaTokenizer, RobertaModel
import torch
from tqdm import tqdm


# Run RoBERTa on the GPU when one is available, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Directory passed as cache_dir when loading the RoBERTa tokenizer and model.
MODELS_CACHE_PATH = '.'



def tokenize_sentences(sentences: List[str], tokenizer_path=MODELS_CACHE_PATH, max_len=160):
    """Tokenize sentences with the ``roberta-base`` tokenizer.

    Args:
        sentences: Sentences to tokenize.
        tokenizer_path: Cache directory handed to ``from_pretrained``.
        max_len: Every sequence is truncated or padded to this many tokens.

    Returns:
        The tokenizer output with PyTorch tensors, moved to ``device``.
    """
    # Honour the tokenizer_path argument; it used to be ignored in favour of
    # the module-level MODELS_CACHE_PATH.
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base', cache_dir=tokenizer_path)
    tokenizer_res = tokenizer(
        sentences, return_tensors='pt', max_length=max_len, truncation=True, padding='max_length')
    tokenizer_res.to(device)
    return tokenizer_res

def get_batch_embeddings(tokenizer_results: Dict, batch_size=128):
    """Compute RoBERTa pooled embeddings for tokenized sentences, batch by batch.

    Args:
        tokenizer_results: Mapping of tokenizer outputs as returned by
            ``tokenize_sentences``; every value is sliced along its first
            dimension and must contain an ``'input_ids'`` entry.
        batch_size: Number of sentences per forward pass; must be positive.

    Returns:
        A list with one NumPy array per batch, each holding the model's
        ``pooler_output`` for that batch.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size}')

    roberta = RobertaModel.from_pretrained('roberta-base', cache_dir=MODELS_CACHE_PATH).to(device)
    roberta.eval()

    n_samples = len(tokenizer_results['input_ids'])
    # Ceiling division so a trailing partial batch is counted by the progress bar.
    n_batches = (n_samples + batch_size - 1) // batch_size

    embeddings = []
    # Inference only: without no_grad every forward pass builds an autograd
    # graph, which wastes memory and time. The context manager closes the bar.
    with torch.no_grad(), tqdm(total=n_batches) as pbar:
        for start in range(0, n_samples, batch_size):
            model_input = {
                key: tokenizer_results[key][start:start + batch_size]
                for key in tokenizer_results
            }
            outputs = roberta(**model_input)
            embeddings.append(outputs.pooler_output.cpu().numpy())
            pbar.update(1)

    return embeddings



In [None]:
tokenizer_res = tokenize_sentences(questions)
print(tokenizer_res['input_ids'].shape)

In [48]:
# batch_size=1 runs one forward pass per question; the recorded progress bar
# shows 14274 iterations at roughly 1.3 it/s, so a larger batch is advisable.
embeddings = get_batch_embeddings(tokenizer_res, batch_size=1)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).





  0%|                                                                                                                                                                             | 0/14274 [00:00<?, ?it/s][A[A[A[A[A




  0%|                                                               

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0486,  0.0761, -0.0288,  ..., -0.1312, -0.0418, -0.0443],
         [ 0.0131, -0.1777, -0.0614,  ..., -0.3306,  0.1801,  0.0568],
         [ 0.0847,  0.1649,  0.0865,  ...,  0.2064, -0.0259, -0.0863],
         ...,
         [ 0.0565,  0.0920,  0.0201,  ..., -0.1220, -0.0156, -0.0208],
         [ 0.0565,  0.0920,  0.0201,  ..., -0.1220, -0.0156, -0.0208],
         [ 0.0565,  0.0920,  0.0201,  ..., -0.1220, -0.0156, -0.0208]]],
       grad_fn=<NativeLayerNormBackward>), pooler_output=tensor([[-1.1188e-02, -2.0919e-01, -2.2745e-01, -1.1578e-01,  1.6160e-01,
          1.9390e-01,  2.5668e-01, -7.2665e-02, -5.9200e-02, -1.7820e-01,
          2.3287e-01, -1.2546e-03, -8.6979e-02,  8.9313e-02, -1.3310e-01,
          5.0997e-01,  2.2478e-01, -4.6840e-01,  7.0964e-03, -3.6931e-02,
         -2.4895e-01,  5.7112e-02,  4.6074e-01,  2.9635e-01,  1.3442e-01,
          7.6506e-02, -1.2433e-01, -3.5329e-02,  1.8925e-01,  2.2929






  0%|                                                                                                                                                                   | 2/14274 [00:01<3:06:13,  1.28it/s][A[A[A[A[A

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0483,  0.0658, -0.0396,  ..., -0.0925, -0.0512, -0.0151],
         [ 0.0408, -0.3234, -0.2024,  ..., -0.2671,  0.1068, -0.0395],
         [ 0.0705, -0.1757, -0.1514,  ..., -0.5613, -0.1411, -0.0526],
         ...,
         [ 0.0405, -0.0021,  0.0159,  ..., -0.0421, -0.0428,  0.0964],
         [ 0.0405, -0.0021,  0.0159,  ..., -0.0421, -0.0428,  0.0964],
         [ 0.0405, -0.0021,  0.0159,  ..., -0.0421, -0.0428,  0.0964]]],
       grad_fn=<NativeLayerNormBackward>), pooler_output=tensor([[-3.0601e-03, -2.1779e-01, -2.0223e-01, -9.0303e-02,  1.1932e-01,
          1.8957e-01,  2.5946e-01, -7.7788e-02, -7.1099e-02, -1.6559e-01,
          2.2084e-01, -1.8129e-02, -1.0156e-01,  8.3201e-02, -1.3346e-01,
          4.9150e-01,  2.0551e-01, -4.5526e-01,  2.3696e-02, -3.3467e-02,
         -2.4801e-01,  6.0791e-02,  4.5524e-01,  3.0210e-01,  1.1699e-01,
          7.6753e-02, -1.2701e-01, -1.6016e-02,  1.9760e-01,  2.1011






  0%|                                                                                                                                                                   | 3/14274 [00:02<3:01:31,  1.31it/s][A[A[A[A[A

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0443,  0.0958, -0.0258,  ..., -0.1095, -0.0524, -0.0792],
         [ 0.0289, -0.2903, -0.0484,  ..., -0.3300,  0.1950, -0.2918],
         [ 0.3139,  0.2377,  0.0223,  ..., -0.1324,  0.3695, -0.1022],
         ...,
         [-0.0213, -0.0294,  0.0015,  ...,  0.0400, -0.0248, -0.0330],
         [-0.0213, -0.0294,  0.0015,  ...,  0.0400, -0.0248, -0.0330],
         [-0.0213, -0.0294,  0.0015,  ...,  0.0400, -0.0248, -0.0330]]],
       grad_fn=<NativeLayerNormBackward>), pooler_output=tensor([[-1.1580e-02, -2.2915e-01, -2.1842e-01, -1.1189e-01,  1.5850e-01,
          2.0361e-01,  2.5357e-01, -1.0500e-01, -8.0442e-02, -1.5308e-01,
          2.3498e-01, -4.6535e-02, -9.5518e-02,  9.9269e-02, -1.4516e-01,
          4.9914e-01,  2.1731e-01, -4.7995e-01,  1.4769e-02, -3.3811e-02,
         -2.4594e-01,  4.8498e-02,  4.6983e-01,  3.0698e-01,  1.1170e-01,
          7.3765e-02, -1.1803e-01, -2.7452e-02,  2.2611e-01,  2.2226






  0%|                                                                                                                                                                   | 4/14274 [00:03<3:00:54,  1.31it/s][A[A[A[A[A

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0427,  0.0858, -0.0220,  ..., -0.1157, -0.0512, -0.0309],
         [ 0.0114, -0.2390, -0.0557,  ..., -0.3332,  0.1852,  0.0751],
         [ 0.0879,  0.1554,  0.1251,  ...,  0.1847, -0.0294, -0.0489],
         ...,
         [-0.0561,  0.0998, -0.0271,  ..., -0.2057,  0.0201, -0.1525],
         [-0.0561,  0.0998, -0.0271,  ..., -0.2057,  0.0201, -0.1525],
         [-0.0561,  0.0998, -0.0271,  ..., -0.2057,  0.0201, -0.1525]]],
       grad_fn=<NativeLayerNormBackward>), pooler_output=tensor([[-1.3738e-02, -2.0784e-01, -2.1963e-01, -1.1118e-01,  1.5323e-01,
          1.8332e-01,  2.5949e-01, -7.3172e-02, -5.0565e-02, -1.7314e-01,
          2.2505e-01, -3.7901e-05, -8.0733e-02,  7.8247e-02, -1.2294e-01,
          5.1120e-01,  2.2906e-01, -4.6926e-01,  1.0668e-02, -3.9428e-02,
         -2.3985e-01,  7.2482e-02,  4.5569e-01,  2.9660e-01,  1.3645e-01,
          7.0573e-02, -1.2183e-01, -3.9758e-02,  1.9024e-01,  2.2503




In [49]:
embeddings[0].shape

(1, 768)

## gather all questions for embeddings generation

In [51]:
# Collect the question text of every split (train/dev/test) and hop count
# (1/2/3) into one flat list.
all_questions = []

for ds in ['train', 'dev', 'test']:
    for nhop in ['1', '2', '3']:
        raw = load_raw_qa('./data/QA_data/MetaQA/', mode=f'{ds}_{nhop}hop')
        # NOTE: this rebinds the notebook-level `questions` on every iteration.
        questions, _, _ = extract_question_entity_target(raw)
        all_questions.extend(questions)

print(len(all_questions))

746105


In [52]:
all_questions[100]

'which movies can be described by robert schwentke'

In [56]:
with open('all-questions.pickle', 'wb') as f:
    pickle.dump(all_questions, f)

## load question embeddings

In [57]:
# Load the precomputed question-embedding lookup (question text -> vector).
# NOTE(review): no visible cell writes 'q-emb-dict.pickle'; it is presumably
# produced elsewhere from 'all-questions.pickle' — document that step.
# CAUTION: pickle.load executes arbitrary code; only load trusted files.
with open('q-emb-dict.pickle', 'rb') as f:
    question_embs = pickle.load(f)

In [60]:
question_embs['which movies can be described by robert schwentke'].shape

(768,)

In [74]:
np.min(np.array(list(question_embs.values())))

-1.0

# Knowledge graph

In [61]:
graph = data_loader.build_graph()

In [62]:
graph.adj[4]

AtlasView({767: {'relation_id': 6, 'relation_text': 'directed_by'}, 7125: {'relation_id': 13, 'relation_text': 'has_genre'}, 184: {'relation_id': 16, 'relation_text': 'release_year'}})

In [63]:
# Indexing the graph directly is equivalent to graph.adj[1024]: it returns the
# neighbours of node 1024 together with the attributes of the connecting edges.
graph[1024]

AtlasView({1121: {'relation_id': 11, 'relation_text': 'written_by'}, 167: {'relation_id': 16, 'relation_text': 'release_year'}, 15595: {'relation_id': 8, 'relation_text': 'in_language'}, 31150: {'relation_id': 6, 'relation_text': 'directed_by'}, 40514: {'relation_id': 7, 'relation_text': 'has_tags'}})

In [64]:
# NOTE(review): the output recorded for this cell shows entity_text
# 'has_tags_reverse', which is a relation name in relations_voc rather than an
# entity name — check how build_graph assigns node labels.
graph.nodes[1024]

{'entity_text': 'has_tags_reverse'}

In [65]:
for nbr, eattr in graph[5].items():
    print(nbr, eattr)

37824 {'relation_id': 11, 'relation_text': 'written_by'}
165 {'relation_id': 16, 'relation_text': 'release_year'}
28874 {'relation_id': 6, 'relation_text': 'directed_by'}


## moving in graph

In [66]:
def traverse(entity_name, relation):
    """Follow one edge labelled ``relation`` starting at ``entity_name``.

    Args:
        entity_name: Name of the start entity (a key of ``entities_voc``).
        relation: Relation text to match against each edge's
            ``'relation_text'`` attribute.

    Returns:
        The name of the first neighbour reached through a matching edge, or
        ``None`` when the entity has no such edge.
    """
    source_id = entities_voc[entity_name]
    reachable = (
        entities_inv_voc[target_id]
        for target_id, edge in graph[source_id].items()
        if edge['relation_text'] == relation
    )
    # The first match wins; None mirrors the implicit return of a loop that
    # finds nothing.
    return next(reachable, None)
        
traverse('After the Rain', 'written_by')

'Akira Kurosawa'

## creating knowledge graph environment

In [330]:
import gym
from gym import spaces

class KGEnv(gym.Env):
    """Gym environment in which an agent walks the knowledge graph to answer a question.

    An episode starts at the topic entity of a randomly drawn question. Each
    action is a relation id; taking it moves the agent to a random neighbour
    connected by that relation, or leaves it in place when the current entity
    has no such edge. The episode ends with reward 1 as soon as the agent
    stands on one of the question's answer entities.

    A state is the concatenation of the question embedding and the embedding
    of the current entity.
    """
    metadata = {'render.modes': ['human']}

    def __init__(self, n_relations, observation_space, q_embeddings, ent_embeddings,
                 entities_vocab, entities_inv_vocab, rel_inv_voc, rel_voc, mode='train',
                 kg_graph=None, qa_path='./data/QA_data/MetaQA/'):
        """Load the QA split and set up the action and observation spaces.

        Args:
            n_relations: Number of relation ids, i.e. size of the action space.
            observation_space: Length of the state vector.
            q_embeddings: Mapping from question text to its embedding.
            ent_embeddings: Mapping from entity id to its embedding.
            entities_vocab: Mapping from entity name to entity id.
            entities_inv_vocab: Mapping from entity id to entity name.
            rel_inv_voc: Mapping from relation id to relation name.
            rel_voc: Mapping from relation name to relation id.
            mode: Split prefix (``'train'``, ``'dev'`` or ``'test'``); the
                1-, 2- and 3-hop files of that split are all loaded.
            kg_graph: Knowledge graph to walk on. Defaults to the
                notebook-level ``graph`` for backward compatibility.
            qa_path: Directory containing the ``qa_<mode>_<n>hop.txt`` files.
        """
        super(KGEnv, self).__init__()

        self.entities_vocab = entities_vocab
        self.entities_inv_vocab = entities_inv_vocab
        self.relations_inv_voc = rel_inv_voc
        self.relations_voc = rel_voc
        self.action_space = spaces.Discrete(n_relations)
        self.observation_space = spaces.Box(low=-3, high=3, shape=(observation_space,))
        self.q_embeddings = q_embeddings
        self.ent_embeddings = ent_embeddings
        # Fall back to the notebook-level graph only when none is injected.
        self.graph = graph if kg_graph is None else kg_graph

        # Accumulate all hop counts. Assigning inside the loop would keep only
        # the last (3-hop) file.
        self.questions, self.entities, self.targets = [], [], []
        for nhop in ['1', '2', '3']:
            raw = load_raw_qa(qa_path, mode=f'{mode}_{nhop}hop')
            questions, entities, targets = extract_question_entity_target(raw)
            self.questions.extend(questions)
            self.entities.extend(entities)
            self.targets.extend(targets)

        # Use the injected vocabulary rather than the notebook global.
        self.entities_idx = [self.entities_vocab[t] for t in self.entities]
        self.targets_idx = [[self.entities_vocab[t] for t in ts] for ts in self.targets]
        self.env_size = len(self.targets)
        self.goal = None
        self.current_q = self.current_ent = None

    def step(self, action):
        """Take the relation ``action`` from the current entity.

        Args:
            action: Relation id to follow.

        Returns:
            ``(next_state, reward, done, info)`` where ``reward`` is 1 and
            ``done`` is True when the reached entity is one of the answers,
            otherwise 0 and False. ``info`` is always an empty dict.
        """
        next_ent_id = self.traverse(self.current_ent, action)
        if next_ent_id == -1:
            # No edge with this relation: the agent stays where it is.
            next_ent_id = self.entities_vocab[self.current_ent]

        # TODO: there is no step limit and no penalty yet; an episode only
        # ends when an answer entity is reached.
        done = next_ent_id in self.goal
        reward = 1 if done else 0

        self.current_ent = self.entities_inv_vocab[next_ent_id]
        next_state = self.build_state_vec(self.current_q, next_ent_id)

        return next_state, reward, done, {}

    def build_state_vec(self, question: str, entity: int):
        """Concatenate the embedding of ``question`` with that of entity id ``entity``."""
        q_emb = self.q_embeddings[question]
        e_emb = self.ent_embeddings[entity]
        return np.concatenate([q_emb, e_emb])

    def traverse(self, entity_name, relation):
        """Pick a random neighbour of ``entity_name`` reachable via ``relation``.

        Args:
            entity_name: Name of the start entity.
            relation: Relation id compared with each edge's ``'relation_id'``.

        Returns:
            The id of a randomly chosen matching neighbour, or -1 when the
            entity has no edge with that relation.
        """
        entity_id = self.entities_vocab[entity_name]
        possible_moves = [
            nbr_id
            for nbr_id, eattr in self.graph[entity_id].items()
            if eattr['relation_id'] == relation
        ]

        if possible_moves:
            return choice(possible_moves)
        return -1

    def reset(self):
        """Start a new episode on a uniformly drawn question and return its first state."""
        # randint's upper bound is exclusive: env_size (not env_size - 1) is
        # needed so the last question can be drawn and a one-question
        # environment does not raise.
        init_state = np.random.randint(low=0, high=self.env_size)
        self.goal = self.targets_idx[init_state]
        self.current_q = self.questions[init_state]
        self.current_ent = self.entities[init_state]
        return self.build_state_vec(self.current_q, self.entities_idx[init_state])

    def render(self, mode='human', close=False):
        """Print the question, the current entity and every outgoing edge."""
        print(self.current_q)
        print(f"current entity: {self.current_ent}")
        entity_id = self.entities_vocab[self.current_ent]
        neighbors = self.graph[entity_id]

        print("available moves:")
        for nbr_id, eattr in neighbors.items():
            print(f"--{eattr['relation_text']}({eattr['relation_id']})--> {self.entities_inv_vocab[nbr_id]}")
    
    
# State length = 768 (question embedding) + 200 (entity embedding), matching
# the shapes inspected in the earlier cells.
kg_env = KGEnv(len(relations_voc), observation_space=768+200, q_embeddings=question_embs,
               ent_embeddings=ent_emb_dict, entities_vocab=entities_voc, entities_inv_vocab=entities_inv_voc,
                rel_inv_voc=relations_inv_voc, rel_voc=relations_voc)
# Start one episode and check the shape of the initial state vector.
s = kg_env.reset()
s.shape

(968,)

In [331]:
def manual_play_in_env():
    """Play one episode of ``kg_env`` interactively from the keyboard.

    Before every move the environment is rendered and a relation id is read
    from standard input. Entering ``q`` or ``quit`` abandons the episode;
    any other non-integer input is rejected and the prompt is repeated
    instead of crashing with a ValueError.
    """
    kg_env.reset()
    done = False

    while not done:
        kg_env.render()
        raw_action = input().strip()
        if raw_action.lower() in ('q', 'quit'):
            break
        try:
            action = int(raw_action)
        except ValueError:
            print(f"invalid action {raw_action!r}: enter a relation id, or 'q' to quit")
            continue
        _, _, done, _ = kg_env.step(action)
        
manual_play_in_env()

what types are the movies starred by actors in What's the Worst That Could Happen?
current entity: What's the Worst That Could Happen?
available moves:
--written_by(11)--> Donald E. Westlake
--directed_by(6)--> Sam Weisman
--release_year(16)--> 2001
--starred_actors(3)--> Martin Lawrence
--starred_actors(3)--> John Leguizamo
--has_tags(7)--> martin lawrence
--starred_actors(3)--> Danny DeVito
--has_tags(7)--> sam weisman
--has_genre(13)--> Comedy
3
what types are the movies starred by actors in What's the Worst That Could Happen?
current entity: John Leguizamo
available moves:
--starred_actors_reverse(10)--> King of the Jungle
--starred_actors_reverse(10)--> Executive Decision
--starred_actors_reverse(10)--> Spawn
--starred_actors_reverse(10)--> Carlito's Way
--starred_actors_reverse(10)--> The Pest
--starred_actors_reverse(10)--> Titan A.E.
--starred_actors_reverse(10)--> Super Mario Bros.
--starred_actors_reverse(10)--> Body Count
--starred_actors_reverse(10)--> Ride Along
--starred_