1. Coreference Resolution

In [9]:
# Load your usual SpaCy model (one of SpaCy English models)

import spacy
import neuralcoref
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.pipeline import EntityRuler
from spacy.tokens import Token


stop_words_getter = lambda token: token.is_stop or token.lower_ in STOP_WORDS \
                                                or token.lemma_ in STOP_WORDS
Token.set_extension('is_stop', getter=stop_words_getter, force=True)

nlp = spacy.load("en_core_web_lg")
ruler = EntityRuler(nlp)
nlp.add_pipe(ruler)

neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x7f5914461f90>

2. load Trie

In [2]:
import marisa_trie
# with open ("/dccstor/myu/experiments/knowledge_trie/openie_dev/full_str_20_5.txt") as f:
#     # full_str_list = [line.strip().split('\t') for line in f]
#     full_str_list = [" ".join(line.strip().split('\t')[:2]) for line in f]
#     ext_trie = marisa_trie.Trie(full_str_list)


import pickle
ext_trie = pickle.load(open('/dccstor/myu/experiments/knowledge_trie/openie_dev/dummy_trie.pickle', 'rb'))

In [4]:
a = {"trie": ext_trie}
pickle.dump(a, open('/dccstor/myu/experiments/knowledge_trie/openie_dev/dummy_id2trie.pickle', 'wb'))

In [4]:
ext_trie.keys("driving")

['driving while under influence of impairing substance driving',
 'driving while under influence of impairing substance driving Drug',
 'driving while under influence of impairing substance impaired driving',
 'driving while under influence of impairing substance impaired driving Drug',
 'driving while under influence of substance driving',
 'driving while under influence of substance driving Drug',
 'driving while under influence of substance impaired driving',
 'driving while under influence of substance impaired driving Drug',
 'driving while under influence driving',
 'driving while under influence driving Drug',
 'driving while under influence impaired driving',
 'driving while under influence impaired driving Drug',
 'driving while driving',
 'driving while driving Drug',
 'driving while impaired driving',
 'driving while impaired driving Drug',
 'driving Cannabinoids effects',
 'driving Cannabinoids effects on driver',
 'driving Cannabinoids effects on driver similar',
 'driving

In [19]:

import graphviz
g = graphviz.Digraph('G', filename="/dccstor/myu/experiments/knowledge_trie/asqa_dev/graph_3_0-100.gv")
edge_set = set()
for line in full_str_list:
    tokens = line.split(' ')
    for i in range(len(tokens)-1):
        if f"{tokens[i]} {tokens[i+1]}" not in edge_set:
            g.edge(tokens[i], tokens[i+1])
            edge_set.add(f"{tokens[i]} {tokens[i+1]}")
g.view()


'/dccstor/myu/experiments/knowledge_trie/asqa_dev/graph_3_0-10.gv.pdf'

In [32]:

import graphviz
g = graphviz.Digraph('G', filename="/dccstor/myu/experiments/knowledge_trie/asqa_dev/graph_3_0-100_subjobj.gv")
edge_set = set()
for subj,obj,score in full_str_list:
    tokens = line.split(' ')
  
    if f"{subj} {obj}" not in edge_set:
        g.edge(subj, obj)
        edge_set.add(f"{subj} {obj}" )
g.view()

'/dccstor/myu/experiments/knowledge_trie/asqa_dev/graph_3_0-100_subjobj.gv.pdf'

In [2]:
for token in nlp("Does marijuana impair driving ability?"):
    print(token.text, token.pos_)

Does VERB
marijuana NOUN
impair VERB
driving NOUN
ability NOUN
? PUNCT


3. query from Trie
In the KID paper they use max_hops=3 and the window size for local memory is 3

In [7]:
import marisa_trie
with open ("/dccstor/myu/experiments/knowledge_trie/openie_dev/full_str_20_5.txt") as f:
    # full_str_list = [line.strip().split('\t') for line in f]
    full_str_list = [" ".join(line.strip().split('\t')[:2]) for line in f]
    ext_trie = marisa_trie.Trie(full_str_list)

max_hops = 2
question = "Does marijuana impair driving ability?"
cur_gen_toks = question
local_kg = [
        token.lemma_.lower() for token in nlp(cur_gen_toks)
        if token.pos_ in ['PROPN', 'NOUN'] and not token.is_stop
]
local_kg = list(set(local_kg))
print("initial local memory:", local_kg)
tmp_kg = local_kg
related_kgs = set() # all accquired words through multi-hop querying, will serve as the vocab to guide next step generation
for i in range(max_hops):
    new_knowledge = [] # record new words that are queried from this hop
    for ent in tmp_kg:
        for span in ext_trie.keys(ent):
            new_knowledge.extend(span.split(' '))
    new_knowledge = set(new_knowledge)
    print(">>> new knowledge from hop", i+1,":",len(new_knowledge))
    print(new_knowledge)
    tmp_kg = list(new_knowledge) # reset tmp queries as new knowledge for next hop
    related_kgs |= new_knowledge

# print("queried knowledge demonstrations", list(related_kgs)) 

initial local memory: ['ability', 'driving', 'marijuana']
>>> new knowledge from hop 1 : 23
{'while', 'impaired', 'Cannabinoids', 'alcohol', 'Code', 'driving', 'impairing', 'driver', 'influence', 'effects', 'marijuana', 'to', 'users', 'substance', 'Utah', 'role', 'those', 'Section', 'similar', 'under', 'of', 'Drug', 'on'}
>>> new knowledge from hop 2 : 92
{'law', '-9-tetahydrocannabinol', 'impaired', 'by', 'test', 'least', 'person', 'crashes', 'after', 'often', 'conducted', 'marijuana', 'role', 'for', 'those', 'blood', 'fluids', 'intoxication', '10', '58', 'ml', 'on', 'played', 'no', 'THC', 'Cannabinoids', 'convicted', 'obtain', 'driving', 'impairing', 'driver', 'with', 'drivers', 'more', 'enforcement', 'Delta', 'to', 'its', 'site', 'detectable', 'Utah', 'their', 'definition', 'substantial', 'of', 'Drug', 'Ohio', 'National', 'accurate', 'because', 'statute', 'likely', 'prescription', 'alcohol', 'Code', 'detected', 'legal', 'body', 'amount', 'impairment', 'ng', 'illicit', 'it', 'cannabi

In [1]:
import marisa_trie
with open ("/dccstor/myu/experiments/knowledge_trie/asqa_dev/full_str_3_0-10.txt") as f:
    # full_str_list = [line.strip().split('\t') for line in f]
    full_str_list = [" ".join(line.strip().split('\t')[:2]) for line in f]
    ext_trie = marisa_trie.Trie(full_str_list)

max_hops = 2
question = "Who has the highest goals in world football?"
cur_gen_toks = question
local_kg = [
        token.lemma_.lower() for token in nlp(cur_gen_toks)
        if token.pos_ in ['PROPN', 'NOUN'] and not token.is_stop
]
local_kg = list(set(local_kg))
print("initial local memory:", local_kg)
tmp_kg = local_kg
related_kgs = set() # all accquired words through multi-hop querying, will serve as the vocab to guide next step generation
for i in range(max_hops):
    new_knowledge = [] # record new words that are queried from this hop
    for ent in tmp_kg:
        for span in ext_trie.keys(ent):
            new_knowledge.extend(span.split(' '))
    new_knowledge = set(new_knowledge)
    print(">>> new knowledge from hop", i+1,":",len(new_knowledge))
    print(new_knowledge)
    tmp_kg = list(new_knowledge) # reset tmp queries as new knowledge for next hop
    related_kgs |= new_knowledge

# print("queried knowledge demonstrations", list(related_kgs)) 

In [2]:
import marisa_trie
import pickle
trie = pickle.load(open("/dccstor/myu/experiments/knowledge_trie/eli5_openie_dev_pkl/id2kg_0_1507.pkl", "rb"))

In [11]:
ext_trie = trie["1ircew"]
max_hops = 2
question = "Why are some animals' flesh (beef, salmon, etc.) fine to eat raw, whilst others (chicken, pork, etc.) cause food poisoning?""Why are some animals' flesh (beef, salmon, etc.) fine to eat raw, whilst others (chicken, pork, etc.) cause food poisoning?"
cur_gen_toks = question
local_kg = [
        token.lemma_.lower() for token in nlp(cur_gen_toks)
        if token.pos_ in ['PROPN', 'NOUN'] and not token.is_stop
]
local_kg = list(set(local_kg))
print("initial local memory:", local_kg)
tmp_kg = local_kg
related_kgs = set() # all accquired words through multi-hop querying, will serve as the vocab to guide next step generation
for i in range(max_hops):
    new_knowledge = [] # record new words that are queried from this hop
    for ent in tmp_kg:
        for span in ext_trie.keys(ent):
            new_knowledge.extend(span.split(' '))
    new_knowledge = set(new_knowledge)
    new_knowledge -= related_kgs
    print(">>> new knowledge from hop", i+1,":",len(new_knowledge))
    print(new_knowledge)
    tmp_kg = list(new_knowledge) # reset tmp queries as new knowledge for next hop
    related_kgs |= new_knowledge

initial local memory: ['poisoning', 'beef', 'flesh', 'food', 'chicken', 'poisoning?why', 'salmon', 'animal', 'pork']
>>> new knowledge from hop 1 : 20
{'food', 'salmonella', 'commercial', 'raw', 'range', 'wide', 'bacteria', 'poisoning', 'malnutrition', 'chicken', 'foodborne', 'contaminated', 'animal', 'illness', 'value', 'adulterated', 'considered', 'shrimp', 'articles', 'found'}
>>> new knowledge from hop 2 : 10
{'seen', 'spp', 'products', 'bacterial', 'cl', 'contamination', 'infecting', '.', 'genera', 'meat'}


In [13]:
ext_trie = trie["3ji34o"]
max_hops = 2
question = "How does the rear view mirror work after flipping it up at night?"
cur_gen_toks = question
local_kg = [
        token.lemma_.lower() for token in nlp(cur_gen_toks)
        if token.pos_ in ['PROPN', 'NOUN'] and not token.is_stop
]
local_kg = list(set(local_kg))
print("initial local memory:", local_kg)
tmp_kg = local_kg
related_kgs = set() # all accquired words through multi-hop querying, will serve as the vocab to guide next step generation
for i in range(max_hops):
    new_knowledge = [] # record new words that are queried from this hop
    for ent in tmp_kg:
        for span in ext_trie.keys(ent):
            new_knowledge.extend(span.split(' '))
    new_knowledge = set(new_knowledge)
    new_knowledge -= related_kgs
    print(">>> new knowledge from hop", i+1,":",len(new_knowledge))
    print(new_knowledge)
    tmp_kg = list(new_knowledge) # reset tmp queries as new knowledge for next hop
    related_kgs |= new_knowledge

initial local memory: ['mirror', 'night', 'view', 'work']
>>> new knowledge from hop 1 : 55
{'mirror', 'affixed', 'catetus', 'unequal', 'pass', 'mirrors', 'mount', 'helmet', 'automobiles', 'user', 'positioned', 'image', 'frame', 'line', 'area', 'usually', 'windshield', 'tilt', 'appears', 'recently', 'equipped', 'double', 'video', '1970s', 'early', 'position', '1930s', 'bicycle', 'new', 'driver', 'built', 'night', 'swivel', 'allowing', 'material', 'large', 'screen', 'model', 'spectator', 'moved', 'vehicle', 'cameras', 'fitted', 'view', 'resulting', 'additional', 'convex', 'designed', 'tilted', 'purchase', 'unviewable', 'day', 'manual', 'cars', 'arm'}
>>> new knowledge from hop 2 : 22
{'turn', 'bicycles', 'surface', 'seen', 'actually', 'point', 'tilting', 'eyes', 'license', 'fixed', 'perfect', 'window', 'reflection', 'signal', 'handlebar', 'low', 'rear', 'repeaters', 'technique', 'mounted', 'cinematographic', 'stays'}


In [1]:
import torch
import numpy
lm_logits = torch.rand(4,2,5)


In [2]:
lm_logits

tensor([[[0.4987, 0.1781, 0.9329, 0.1802, 0.4110],
         [0.5884, 0.0087, 0.6253, 0.0630, 0.5300]],

        [[0.4661, 0.9510, 0.3345, 0.2594, 0.4666],
         [0.5377, 0.5169, 0.0254, 0.7068, 0.5137]],

        [[0.3831, 0.2606, 0.4062, 0.0303, 0.5544],
         [0.6495, 0.6991, 0.7450, 0.9325, 0.3657]],

        [[0.3691, 0.1282, 0.6169, 0.6170, 0.2063],
         [0.4647, 0.5559, 0.3278, 0.1534, 0.2684]]])

In [21]:
kg_embeds = torch.rand(4,5)
kg_embeds

tensor([[0.0256, 0.5137, 0.1751, 0.1179, 0.2028],
        [0.9846, 0.8263, 0.7400, 0.6195, 0.8569],
        [0.4272, 0.9955, 0.4984, 0.2734, 0.7437],
        [0.3437, 0.1794, 0.9025, 0.6721, 0.1655]])

In [22]:
kg_embeds = [x for x in kg_embeds]

In [23]:
kg_embeds

[tensor([0.0256, 0.5137, 0.1751, 0.1179, 0.2028]),
 tensor([0.9846, 0.8263, 0.7400, 0.6195, 0.8569]),
 tensor([0.4272, 0.9955, 0.4984, 0.2734, 0.7437]),
 tensor([0.3437, 0.1794, 0.9025, 0.6721, 0.1655])]

In [24]:
torch.stack(kg_embeds)

tensor([[0.0256, 0.5137, 0.1751, 0.1179, 0.2028],
        [0.9846, 0.8263, 0.7400, 0.6195, 0.8569],
        [0.4272, 0.9955, 0.4984, 0.2734, 0.7437],
        [0.3437, 0.1794, 0.9025, 0.6721, 0.1655]])

In [13]:
new = lm_logits/torch.sum(lm_logits, dim=-1).unsqueeze(-1).expand_as(lm_logits)

In [14]:
torch.sum(new, dim=-1)

tensor([[1.0000, 1.0000],
        [1.0000, 1.0000],
        [1.0000, 1.0000],
        [1.0000, 1.0000]])

In [44]:
token_ids = [0,1,2,3]
max_logits = torch.max(lm_logits[0], dim=-1)[0].unsqueeze(-1).expand(lm_logits.shape[1], len(token_ids))
max_logits

tensor([[0.9881, 0.9881, 0.9881, 0.9881]])

In [15]:
kg_logits

NameError: name 'kg_logits' is not defined