In [1]:
import json
import nltk
nltk.download('punkt')
import torch
from models import InferSent
import numpy as np

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\erazu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.neighbors import NearestNeighbors
import numpy as np
import pickle as pkl

### Loading test data

In [3]:
test_labels = json.load(open('data/data_test.json', 'r', encoding='utf-8'))

In [4]:
all_labels = []
for key in list(test_labels.keys()):
    curr_labels = []
    for key1 in ['label', 'synonyms']:
        if isinstance(test_labels[key][key1], list):
            curr_labels += test_labels[key][key1]
        else:
            curr_labels += [test_labels[key][key1]]
        
    all_labels.append(curr_labels)
    


In [5]:
all_labels[:9]

[['applies to jurisdiction',
  'of jurisdiction',
  'linked to jurisdiction',
  'belongs to jurisdiction',
  'jurisdiction',
  'country of jurisdiction',
  'valid in jurisdiction',
  'applies to territorial jurisdiction',
  'applied to jurisdiction'],
 ['member of political party',
  'political party',
  'party',
  'member of',
  'member of party',
  'party membership'],
 ['conferred by',
  'awarded by',
  'bestowed by',
  'given by',
  'granted by',
  'presented by'],
 ['donated by', 'given by', 'bestowed by', 'gift from', 'donation from'],
 ['native language',
  'first language',
  'mother tongue',
  'language native',
  'L1 speaker of'],
 ['manager/director', 'art director', 'manager', 'director'],
 ['relative', 'family', 'family member', 'kinsman', 'relation'],
 ['medical condition',
  'disability',
  'ailment',
  'health problem',
  'disorder',
  'illness',
  'disease',
  'paralympic disability',
  'health condition'],
 ['product or material produced',
  'material produced',
  'pr

In [11]:
questions = [['Which jurisdiction does European Bank belong to ?', 'Which jurisdiction works in the United Kingdom ?'], ['Which party does Donald Trump belong to ?', 'What are main parties in the United Kingdom ?'],
['Who received the Oscar for Titanik ?',
'Who was awarded Nobel Prize in Physics in 1972 ?'],
['Which fund makes the largest donations to the British Cancer Research ?',
'Whose donations was the Carnegie Museum founded on ?'],
['What is the first language of Albert Einstein ?',
'What was Tolstai native spaker of ?']]





In [12]:
len(questions)

5

In [13]:
all_labels[:5]

[['applies to jurisdiction',
  'of jurisdiction',
  'linked to jurisdiction',
  'belongs to jurisdiction',
  'jurisdiction',
  'country of jurisdiction',
  'valid in jurisdiction',
  'applies to territorial jurisdiction',
  'applied to jurisdiction'],
 ['member of political party',
  'political party',
  'party',
  'member of',
  'member of party',
  'party membership'],
 ['conferred by',
  'awarded by',
  'bestowed by',
  'given by',
  'granted by',
  'presented by'],
 ['donated by', 'given by', 'bestowed by', 'gift from', 'donation from'],
 ['native language',
  'first language',
  'mother tongue',
  'language native',
  'L1 speaker of']]

In [14]:
list(zip(all_labels[:5], questions))

[(['applies to jurisdiction',
   'of jurisdiction',
   'linked to jurisdiction',
   'belongs to jurisdiction',
   'jurisdiction',
   'country of jurisdiction',
   'valid in jurisdiction',
   'applies to territorial jurisdiction',
   'applied to jurisdiction'],
  ['Which jurisdiction does European Bank belong to ?',
   'Which jurisdiction works in the United Kingdom ?']),
 (['member of political party',
   'political party',
   'party',
   'member of',
   'member of party',
   'party membership'],
  ['Which party does Donald Trump belong to ?',
   'What are main parties in the United Kingdom ?']),
 (['conferred by',
   'awarded by',
   'bestowed by',
   'given by',
   'granted by',
   'presented by'],
  ['Who received the Oscar for Titanik ?',
   'Who was awarded Nobel Prize in Physics in 1972 ?']),
 (['donated by', 'given by', 'bestowed by', 'gift from', 'donation from'],
  ['Which fund makes the largest donations to the British Cancer Research ?',
   'Whose donations was the Carnegie 

### Infersent loading and processing

In [15]:
V = 2
MODEL_PATH = 'encoder/infersent%s.pkl' % V
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
W2V_PATH = 'fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)

### Training data load and collecting all sentences together

In [16]:
training = json.load(open('data/dataset.json', 'r', encoding='utf-8'))

In [17]:
all_sents = []
for key in list(training.keys()):
    for key1 in ['labels', 'questions']:
        for element in training[key][key1]:
            all_sents.append(element)
print(len(all_sents))

52849


In [18]:
for list_of_questions in questions:
    for question in list_of_questions:
        all_sents.append(question)
print(len(all_sents))

52859


In [22]:
for list_of_labels in all_labels[:5]:
    for label in list_of_labels:
        all_sents.append(label)

In [23]:
len(all_sents)

52921

### Building vocab for all sentences and labels

In [24]:
infersent.build_vocab(all_sents, tokenize=True)

Found 24994(/38793) words with w2v vectors
Vocab size : 24994


### Getting tuples for test questions and first 5 test labels

In [25]:
tuples_with_labels = []
tuples_questions_labels = list(zip(all_labels[:5], questions))
for tuple_1 in tuples_questions_labels:
    embedding_labels = infersent.encode(tuple_1[0], tokenize=True)
    embedding_label_av = np.mean(embedding_labels, axis=0)
    #print(embedding_label_av.shape)
    embedding_quest = infersent.encode(tuple_1[1], tokenize=True)
    print(embedding_quest.shape)
    tuples_with_labels.append((tuple_1[0], tuple_1[1], embedding_label_av, embedding_quest))

(2, 4096)
(2, 4096)
(2, 4096)
(2, 4096)
(2, 4096)


In [26]:
tuples_with_labels[0]

(['applies to jurisdiction',
  'of jurisdiction',
  'linked to jurisdiction',
  'belongs to jurisdiction',
  'jurisdiction',
  'country of jurisdiction',
  'valid in jurisdiction',
  'applies to territorial jurisdiction',
  'applied to jurisdiction'],
 ['Which jurisdiction does European Bank belong to ?',
  'Which jurisdiction works in the United Kingdom ?'],
 array([ 0.00746889, -0.03260348,  0.02242357, ...,  0.00092162,
        -0.05534602, -0.03559341], dtype=float32),
 array([[ 0.00746889, -0.03840831, -0.01202221, ...,  0.03808134,
          0.00890901, -0.01986518],
        [ 0.00746889, -0.04294045,  0.0971196 , ...,  0.0226282 ,
          0.01503134, -0.0214828 ]], dtype=float32))

### Model and finding probs for each label index

In [43]:
class ZS_net(nn.Module):
        def __init__(self, question_emb_size):
            super(ZS_net, self).__init__()
            self.W = nn.Parameter(torch.nn.init.xavier_uniform_(torch.Tensor(question_emb_size, question_emb_size)).double())
            
        
        def forward(self, question_emb, label_emb):
            mult = torch.mm(question_emb, self.W)
            output = torch.bmm(mult.unsqueeze(1), label_emb.unsqueeze(2))
            return output

In [25]:
model = ZS_net(4096).float()
model.load_state_dict(torch.load('my_model.ckpt'))

In [26]:
all_classes = []
for tuple_1 in tuples_with_labels:
    all_classes.append(tuple_1[2])
    
all_questions = []
for tuple_1 in tuples_with_labels:
    all_questions+=list(tuple_1[3])

In [29]:
for quest_number in range(len(all_questions)):
    for class_number in range(len(all_classes)):
        sim = model(torch.from_numpy(all_questions[quest_number]).float().unsqueeze(0), torch.from_numpy(all_classes[class_number]).unsqueeze(0))
        output = torch.sigmoid(sim)
        print('*********')
        print(str(quest_number) + ' ' + str(class_number) )
        print(output)
        

*********
0 0
tensor([[[0.9387]]], grad_fn=<SigmoidBackward>)
*********
0 1
tensor([[[0.6724]]], grad_fn=<SigmoidBackward>)
*********
0 2
tensor([[[0.6257]]], grad_fn=<SigmoidBackward>)
*********
0 3
tensor([[[0.6227]]], grad_fn=<SigmoidBackward>)
*********
0 4
tensor([[[0.0366]]], grad_fn=<SigmoidBackward>)
*********
1 0
tensor([[[0.7245]]], grad_fn=<SigmoidBackward>)
*********
1 1
tensor([[[0.2854]]], grad_fn=<SigmoidBackward>)
*********
1 2
tensor([[[0.1588]]], grad_fn=<SigmoidBackward>)
*********
1 3
tensor([[[0.2277]]], grad_fn=<SigmoidBackward>)
*********
1 4
tensor([[[0.0231]]], grad_fn=<SigmoidBackward>)
*********
2 0
tensor([[[0.5101]]], grad_fn=<SigmoidBackward>)
*********
2 1
tensor([[[0.5150]]], grad_fn=<SigmoidBackward>)
*********
2 2
tensor([[[0.6486]]], grad_fn=<SigmoidBackward>)
*********
2 3
tensor([[[0.5931]]], grad_fn=<SigmoidBackward>)
*********
2 4
tensor([[[0.0240]]], grad_fn=<SigmoidBackward>)
*********
3 0
tensor([[[0.7005]]], grad_fn=<SigmoidBackward>)
********

### Construct "dictionary" associating topic labels, URIs and their embeddings ('uri_topic_dict.pkl')

In [37]:
uri_topic_dict = []
k=0
for key in list(training.keys()):
    values = training[key]["labels"]
    embedding_labels = infersent.encode(values, tokenize=True)
    embedding_label_av = np.mean(embedding_labels, axis=0)
    uri_topic_dict.append((k, key, values, embedding_label_av))
    k+=1
    
for key in list(test_labels.keys()):
    curr_labels = []
    for key1 in ['label', 'synonyms']:
        if isinstance(test_labels[key][key1], list):
            curr_labels += test_labels[key][key1]
        else:
            curr_labels += [test_labels[key][key1]]
    embedding_labels = infersent.encode(curr_labels, tokenize=True)
    embedding_label_av = np.mean(embedding_labels, axis=0)
    uri_topic_dict.append((k, key, curr_labels, embedding_label_av))
    k+=1
    

In [38]:
uri_topic_dict

[(0,
  'http://www.wikidata.org/entity/P101',
  ['field of study',
   'fields',
   'discipline',
   'subject',
   'area',
   'specialism',
   'domain',
   'academic discipline',
   'scientific discipline',
   'academic subject',
   'academic area',
   'scientific area',
   'FOW',
   'field of work'],
  array([ 0.00746889, -0.09179498,  0.00967923, ..., -0.03022508,
         -0.05071909, -0.02850063], dtype=float32)),
 (1,
  'http://www.wikidata.org/entity/P1029',
  ['crew member'],
  array([ 0.00746889, -0.09595255, -0.02173634, ..., -0.01069365,
         -0.05936687, -0.06587837], dtype=float32)),
 (2,
  'http://www.wikidata.org/entity/P1040',
  ['editor', 'edited by', 'film editor'],
  array([ 0.00746889, -0.09455415,  0.01290073, ..., -0.04508448,
          0.04699417, -0.04150699], dtype=float32)),
 (3,
  'http://www.wikidata.org/entity/P105',
  ['taxonomic rank', 'rank', 'type of taxon', 'taxon rank'],
  array([ 0.00746889, -0.12191994,  0.05286136, ..., -0.02282107,
         -0.0

In [42]:
pkl.dump(uri_topic_dict, open('uri_topic_dict.pkl', 'wb') )

In [44]:
uri_topic_dict = pkl.load(open('uri_topic_dict.pkl', 'rb'))
print(uri_topic_dict)

[(0, 'http://www.wikidata.org/entity/P101', ['field of study', 'fields', 'discipline', 'subject', 'area', 'specialism', 'domain', 'academic discipline', 'scientific discipline', 'academic subject', 'academic area', 'scientific area', 'FOW', 'field of work'], array([ 0.00746889, -0.09179498,  0.00967923, ..., -0.03022508,
       -0.05071909, -0.02850063], dtype=float32)), (1, 'http://www.wikidata.org/entity/P1029', ['crew member'], array([ 0.00746889, -0.09595255, -0.02173634, ..., -0.01069365,
       -0.05936687, -0.06587837], dtype=float32)), (2, 'http://www.wikidata.org/entity/P1040', ['editor', 'edited by', 'film editor'], array([ 0.00746889, -0.09455415,  0.01290073, ..., -0.04508448,
        0.04699417, -0.04150699], dtype=float32)), (3, 'http://www.wikidata.org/entity/P105', ['taxonomic rank', 'rank', 'type of taxon', 'taxon rank'], array([ 0.00746889, -0.12191994,  0.05286136, ..., -0.02282107,
       -0.02680972, -0.0277157 ], dtype=float32)), (4, 'http://www.wikidata.org/entit

       -0.02665507, -0.02976383], dtype=float32))]


### Getting URI of top N topics

In [79]:
# !!! assuming InferSent has been loaded before
uri_topic_dict = pkl.load(open('uri_topic_dict.pkl', 'rb'))

def getting_most_prob_topics(question, n):
    model = ZS_net(4096).float()
    model.load_state_dict(torch.load('my_model.ckpt'))
    question_embedding = infersent.encode(question, tokenize=True)
    class_probs = np.zeros(len(uri_topic_dict),)
    for i in range(len(uri_topic_dict)):
#         print(question_embedding.shape)
#         print(uri_topic_dict[i][3].shape)
        sim = model(torch.from_numpy(question_embedding).float(), torch.from_numpy(uri_topic_dict[i][3]).unsqueeze(0))
        output = torch.sigmoid(sim)
        class_probs[i] = output.detach().numpy()
    n_best = class_probs.argsort()[-n:][::-1]
    uris = [uri_topic_dict[index][1] for index in n_best]
    return uris
        

In [80]:
getting_most_prob_topics(['What is the population of this city?'], 5)

['http://www.wikidata.org/entity/P1082',
 'http://www.wikidata.org/entity/P30',
 'http://www.wikidata.org/entity/P532',
 'http://www.wikidata.org/entity/P1383',
 'http://www.wikidata.org/entity/P2046']

In [59]:
class_probs

NameError: name 'class_probs' is not defined