In [1]:
%matplotlib widget
import re
import json
import tqdm
import queue
import urllib
from nltk.tokenize import word_tokenize
from collections import defaultdict
import networkx as nx
import time
import socks
import socket
import pickle
import matplotlib.pyplot as plt
from multiprocessing.pool import ThreadPool as Pool

In [2]:
# Register a SOCKS5 proxy at localhost:10087 as the default, then replace
# socket.socket with the proxy-aware class so sockets created through
# socket.socket after this point use it.
socks.set_default_proxy(socks.SOCKS5, "localhost", 10087)
socket.socket = socks.socksocket

In [3]:
def instance2dict(instance):
    """Turn one raw record (a list of tab-separated lines) into a dict.

    The text after the first tab of lines 0-3 supplies, in that order, the
    question, the logical form, the parameter list and the question type.

    Returns a dict with the keys:
      'question'      -- stripped question text
      'logical_form'  -- re.findall matches of the relation pattern
                         (a list of 5-tuples, one per matched relation)
      'parameters'    -- tokens that are directly followed by " (entity)"
      'question_type' -- stripped question-type text
    """
    relation_regex = r"[\(] [r\-]*mso:([\w\.]*) (([\?][xy])|[\S]+) (([\?][xy])|[\S]+) [\)]"
    # Alternative pattern kept for reference (not used):
    # r"([\S]+) (?:\(entity\)|\(value\)|\(type\)) \[([\d]+,[\d]+)\]"
    entity_regex = r"([\S]+) \(entity\)"

    def second_field(row):
        # Payload of a record line: the second tab-separated field, trimmed.
        return instance[row].split('\t')[1].strip()

    return {
        'question': second_field(0),
        'logical_form': re.findall(relation_regex, second_field(1)),
        'parameters': re.findall(entity_regex, second_field(2)),
        'question_type': second_field(3),
    }

def fetch_data(file):
    """Read an open dataset file and parse it into a list of instance dicts.

    Lines are accumulated until a separator line (a row of '=' characters,
    compared after stripping whitespace) is seen; the accumulated lines are
    then handed to instance2dict and the buffer is reset. Lines that follow
    the last separator are never emitted.
    """
    separator = "=================================================="
    parsed = []
    pending = []
    for raw_line in file.readlines():
        if raw_line.strip() != separator:
            pending.append(raw_line)
            continue
        # Separator reached: convert the buffered record and start a new one.
        parsed.append(instance2dict(pending))
        pending = []
    return parsed

In [4]:
# Parse each split inside a `with` block so every file handle is closed as
# soon as its contents have been read (the handles were previously left open).
with open("DATA/EMNLP.dev", 'r', encoding='utf-8') as dev_file:
    dev_data = fetch_data(dev_file)
with open("DATA/EMNLP.train", 'r', encoding='utf-8') as train_file:
    train_data = fetch_data(train_file)
with open("DATA/EMNLP.test", 'r', encoding='utf-8') as test_file:
    test_data = fetch_data(test_file)

In [5]:
def get_logical_forms(samples):
    """Count how often each relation occurs across the given samples.

    For every entry in a sample's "logical_form" list, the first element of
    that entry is used as the key. Returns a defaultdict(int) mapping that
    key to its number of occurrences.
    """
    relation_counts = defaultdict(int)
    relation_names = (
        match[0]
        for record in samples
        for match in record.get("logical_form")
    )
    for relation_name in relation_names:
        relation_counts[relation_name] += 1
    return relation_counts

def get_entities(samples):
    """Count how often each entity occurs in the samples' "parameters" lists.

    Each parameter may be a plain string (what a one-group re.findall yields)
    or a tuple whose first element is the entity name (what a multi-group
    pattern yields). Indexing a plain string with [0] would count only its
    first character, so strings are used whole.

    A sample whose "parameters" entry is missing or empty contributes nothing.
    Returns a defaultdict(int) mapping entity name -> occurrence count.
    """
    all_entities = defaultdict(int)
    for sample in samples:
        for entity in sample.get("parameters") or ():
            name = entity if isinstance(entity, str) else entity[0]
            all_entities[name] += 1
    return all_entities

def get_category_tree(all_logical_form_stats):
    """Build a directed category tree from dotted relation names.

    Every key of `all_logical_form_stats` (e.g. "book.author.works_written",
    optionally prefixed with "mso:") is split on "." and inserted as a path
    below a "root" node. Node names are the full path joined with "#"
    ("root#book", "root#book#author", ...), so equal segment names under
    different parents stay distinct.

    The tree is built from the argument; the module-level
    `train_logical_form_stats` is no longer read, so each data split gets
    its own tree.

    Returns a networkx.DiGraph with edges pointing from parent to child.
    """
    G_cat = nx.DiGraph()
    for relation in all_logical_form_stats:
        # Strip the "mso:" namespace marker only when it is actually there;
        # slicing four characters unconditionally would eat the start of an
        # unprefixed name.
        if relation.startswith("mso:"):
            relation = relation[4:]
        parent_node = "root"
        for segment in relation.split("."):
            if "#" in segment:
                # "#" is the path separator used in node names; report
                # segments that would make those names ambiguous.
                print(segment)
            node = parent_node + "#" + segment
            G_cat.add_node(node)
            G_cat.add_edge(parent_node, node)
            parent_node = node
    return G_cat

In [6]:
# Count relation occurrences separately for the dev and train samples.
dev_logical_form_stats = get_logical_forms(dev_data)
train_logical_form_stats = get_logical_forms(train_data)

# Build one category tree per set of statistics.
dev_category_tree = get_category_tree(dev_logical_form_stats)
train_category_tree = get_category_tree(train_logical_form_stats)

# Materialise the shortest paths between all node pairs of the dev tree
# as a dict (source node -> {target node -> path}).
shortest_path = dict(nx.all_pairs_shortest_path(dev_category_tree))

In [17]:
# Show the dev relation counts as (relation, count) pairs, highest count first.
sorted(dev_logical_form_stats.items(), key=lambda x:-x[1])

[('book.author.works_written', 144),
 ('baseball.player.batting_statistics', 57),
 ('comic_books.creator.series', 48),
 ('music.artist.album', 47),
 ('time.event.start_date', 47),
 ('tv.regular_tv_appearance.actor', 37),
 ('location.sovereign.calling_code', 35),
 ('baseball.batting_statistics.season', 34),
 ('location.sovereign.official_language', 29),
 ('film.film.writer', 28),
 ('sports.pro_athlete.team', 28),
 ('tv.character.appeared_in_tv_program', 28),
 ('time.event.person', 26),
 ('people.person.education', 26),
 ('biology.domesticated_animal.breeds', 26),
 ('book.written_work.date_of_first_publication', 25),
 ('baseball.batting_statistics.team', 24),
 ('sports.sports_team_roster.team', 24),
 ('time.event.end_date', 22),
 ('film.film.music', 21),
 ('film.actor.film', 20),
 ('organization.board_membership.organization', 19),
 ('organization.board_member.leader_of', 19),
 ('cvg.computer_videogame.developer', 19),
 ('people.person.date_of_birth', 19),
 ('sports.team.color', 19),
 ('

In [7]:
# Unbounded FIFO queue. NOTE(review): presumably collects results from the
# ThreadPool workers imported above — confirm against the cells that use it.
result_queue = queue.Queue()

In [8]:
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file when it comes from a trusted source.
# The `with` block closes the file handle, which was previously left open.
with open("DATA/dict_entity_lookup.pkl", "rb") as lookup_file:
    dict_entity_lookup = pickle.load(lookup_file)
# Rebuild the loaded mapping as a plain dict with the same items.
dict_entity_lookup = {ky: val for ky, val in dict_entity_lookup.items()}
print(len(dict_entity_lookup))

21097


In [77]:
# Inspect the first "itemListElement" entry stored for the key "chuck_connors".
dict_entity_lookup["chuck_connors"]["itemListElement"][0]

{'@type': 'EntitySearchResult',
 'result': {'@id': 'kg:/m/02hg53',
  'name': 'Chuck Connors',
  '@type': ['Thing', 'Person'],
  'description': 'American actor',
  'detailedDescription': {'articleBody': 'Kevin Joseph Aloysius "Chuck" Connors was an American actor, writer and professional basketball and baseball player. He is one of only 13 athletes in the history of American professional sports to have played both Major League Baseball and in the National Basketball Association. ',
   'url': 'https://en.wikipedia.org/wiki/Chuck_Connors',
   'license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'},
  'image': {'contentUrl': 'http://t1.gstatic.com/images?q=tbn:ANd9GcQy1LqHRx9f0hthm-ZKa3_go2UQ3MSkdCUb3zNltqeMkEkoUufZ',
   'url': 'https://commons.wikimedia.org/wiki/File:Chuck_Connors_Brooklyn_Dodgers.JPG'},
  'url': 'http://www.ourchuckconnors.com/'},
 'resultScore': 602.20166}

In [10]:
# Flatten each split's "parameters" lists into (1, entity) pairs.
dev_jobs_to_do = [
    (1, entity) for sample in dev_data for entity in sample.get("parameters")
]
train_jobs_to_do = [
    (1, entity) for sample in train_data for entity in sample.get("parameters")
]
test_jobs_to_do = [
    (1, entity) for sample in test_data for entity in sample.get("parameters")
]
# Concatenate train, dev and test (in that order) and append each pair's
# position in the combined list as a third field.
all_jobs_to_do = [
    (first, entity, position)
    for position, (first, entity) in enumerate(
        train_jobs_to_do + dev_jobs_to_do + test_jobs_to_do
    )
]

In [11]:
# Count the jobs whose entity either has no entry in dict_entity_lookup or
# has an entry with an empty "itemListElement" list.
tally = 0
for job in all_jobs_to_do:
    entity = job[1]
    has_hits = (
        entity in dict_entity_lookup
        and len(dict_entity_lookup[entity]["itemListElement"]) >= 1
    )
    if not has_hits:
        tally += 1
print(tally)

4239


In [12]:
# Count the "@type" tags of the first result stored for each entity.
KB_tag_stats = defaultdict(int)
for entity_key in dict_entity_lookup:
    candidates = dict_entity_lookup[entity_key]["itemListElement"]
    # Only the first candidate is counted; later ones are ignored.
    for top_hit in candidates[:1]:
        try:
            for kb_type in top_hit["result"]["@type"]:
                KB_tag_stats[kb_type] += 1
        except (KeyError, TypeError):
            # Entries without a usable "result"/"@type" are printed for
            # inspection. Catching only lookup/type errors keeps
            # KeyboardInterrupt and SystemExit from being swallowed.
            print(top_hit)

In [13]:
# Display the first parsed dev sample as a sanity check of the parsing step.
dev_data[0]

{'question': 'the music composition sinfonia concertante for violin, viola & orchestra in e major, k. 320d/364: iii. presto is of which track?',
 'logical_form': [('mso:music.composition.track',
   'sinfonia_concertante_for_violin,_viola_&_orchestra_in_e_major,_k._320d/364:_iii._presto',
   '',
   '?x',
   '?x')],
 'parameters': ['sinfonia_concertante_for_violin,_viola_&_orchestra_in_e_major,_k._320d/364:_iii._presto'],
 'question_type': 'single-relation'}