In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

import pandas as pd
import numpy as np

In [39]:
# Unfitted TF-IDF featuriser and label encoder; both are fitted further down on the question data.
vect, le = TfidfVectorizer(), LabelEncoder()

In [40]:
# Load the sampled Freebase questions and preview the first rows.
questions = pd.read_json(path_or_buf='data/freebase_samp_questions.json')
questions.head()

Unnamed: 0,e1,property,e2,question,hun_question
0,www.freebase.com/m/01xdf5,www.freebase.com/book/author/works_written,www.freebase.com/m/02qxck5,What book did stephen colbert write?,Milyen könyvet írt Stephen Colbert?
1,www.freebase.com/m/03m9c9l,www.freebase.com/book/author/works_written,www.freebase.com/m/04wbprh,whats the name of michael mcgarrity's books,mi a neve Michael McCgarrity könyveinek
2,www.freebase.com/m/041h0,www.freebase.com/book/author/works_written,www.freebase.com/m/04t05ry,what was written by j. r. r. tolkien,amit j írt. r. r. tolkien
3,www.freebase.com/m/031qmr,www.freebase.com/book/author/works_written,www.freebase.com/m/04w95tk,what was one of pete hamill's works,mi volt Pete Hamill egyik műve
4,www.freebase.com/m/01st1f,www.freebase.com/book/author/works_written,www.freebase.com/m/05qcvw1,What book did andre norton write?,Milyen könyvet írt andre norton?


In [43]:
# Inputs are the Hungarian question strings; targets are the Freebase property strings.
X = questions['hun_question'].tolist()
y = np.array(questions['property'])

In [61]:
# Show the distinct Freebase properties, i.e. the classes the model has to separate.
questions['property'].unique()

array(['www.freebase.com/book/author/works_written',
       'www.freebase.com/cvg/computer_videogame/publisher',
       'www.freebase.com/military/military_person/participated_in_conflicts',
       'www.freebase.com/music/group_member/instruments_played',
       'www.freebase.com/music/release/label',
       'www.freebase.com/people/deceased_person/cause_of_death',
       'www.freebase.com/people/deceased_person/place_of_burial',
       'www.freebase.com/people/deceased_person/place_of_death',
       'www.freebase.com/people/person/children',
       'www.freebase.com/people/person/ethnicity',
       'www.freebase.com/people/person/gender',
       'www.freebase.com/people/person/nationality',
       'www.freebase.com/people/person/place_of_birth',
       'www.freebase.com/people/person/profession',
       'www.freebase.com/people/person/religion',
       'www.freebase.com/royalty/monarch/royal_line',
       'www.freebase.com/sports/sports_team/sport',
       'www.freebase.com/transporta

In [45]:
# Encode each property string as an integer class id.
y_trf = le.fit_transform(y)
# Fit TF-IDF on the questions and convert the sparse result to a dense array.
X_trf = vect.fit_transform(X).toarray()

In [47]:
# Property classifier; constructed with scikit-learn's default settings (no arguments passed).
logreg = LogisticRegression()

In [48]:
# Train the classifier on the dense TF-IDF features and the integer-encoded property labels.
logreg.fit(X_trf, y_trf)

LogisticRegression()

In [69]:
# Sanity check on a single Hungarian question ("What did Attila József die of?").
vec = vect.transform(['Miben halt meg József Attila?'])
predicted_ids = logreg.predict(vec)

# Decode the predicted class id back into its Freebase property string.
le.inverse_transform(predicted_ids)

array(['www.freebase.com/people/deceased_person/cause_of_death'],
      dtype=object)

# Single-hop QA

In [74]:
import spacy
from spaczz.matcher import FuzzyMatcher

import pickle

In [70]:
# Load the (source, edge, destination) knowledge-graph triplets and preview the first rows.
triplets = pd.read_json(path_or_buf='data/poet_triplets.json')
triplets.head()

Unnamed: 0,source,edge,destination
0,"{'name': 'Bencze József', 'wiki_url': '/wiki/B...","{'wikidata_id': 'P31', 'name': 'osztály, amely...","{'wikidata_id': 'Q5', 'name': 'ember', 'descri..."
1,"{'name': 'Bencze József', 'wiki_url': '/wiki/B...","{'wikidata_id': 'P21', 'name': 'nem', 'descrip...","{'wikidata_id': 'Q6581097', 'name': 'férfi', '..."
2,"{'name': 'Bencze József', 'wiki_url': '/wiki/B...","{'wikidata_id': 'P735', 'name': 'utónév', 'des...","{'wikidata_id': 'Q17498051', 'name': 'József',..."
3,"{'name': 'Bencze József', 'wiki_url': '/wiki/B...","{'wikidata_id': 'P570', 'name': 'halálozási id...","{'time': '+1988-02-02T00:00:00Z', 'timezone': ..."
4,"{'name': 'Bencze József', 'wiki_url': '/wiki/B...","{'wikidata_id': 'P106', 'name': 'foglalkozás',...","{'wikidata_id': 'Q49757', 'name': 'költő', 'de..."


In [78]:
# Load the pickled property mapping.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only load this file
# if it comes from a trusted source.
with open('data/freebase_mapping.p', 'rb') as f:
    freebase_mapping = pickle.load(f)

# Inverted mapping (value -> key). If several keys share a value, the key iterated last wins.
# Presumably this maps Freebase property strings to Wikidata property ids, since it is indexed
# with the classifier's predicted Freebase property below — verify against the data file.
freebase_mapping_rev = {v: k for k, v in freebase_mapping.items()}

In [92]:
def filter_kg(nodes, edge, kg=None):
    """Return the destinations reachable from ``nodes`` via ``edge`` in the knowledge graph.

    Parameters
    ----------
    nodes : iterable of hashable
        ``wikidata_id`` values of the candidate source entities.
    edge : str
        ``wikidata_id`` of the relation to follow.
    kg : pandas.DataFrame, optional
        Triplet table whose ``source``, ``edge`` and ``destination`` columns hold dicts.
        Defaults to the notebook-level ``triplets`` frame, so existing two-argument calls
        behave as before.

    Returns
    -------
    list
        One entry per matching triplet, in table order: the destination's ``name`` when the
        destination dict has one, otherwise the whole destination dict. Duplicates are kept.
    """
    if kg is None:
        kg = triplets

    # Constant-time membership test instead of scanning the node list for every row.
    node_ids = set(nodes)

    # Single pass over the rows; plain iteration also copes with an empty table or an empty
    # selection without relying on how pandas interprets an empty mask.
    res = []
    for src, rel, dest in zip(kg['source'], kg['edge'], kg['destination']):
        if src['wikidata_id'] in node_ids and rel['wikidata_id'] == edge:
            res.append(dest['name'] if 'name' in dest else dest)

    return res

In [102]:
# Blank Hungarian spaCy pipeline; its vocab backs the fuzzy matcher and it tokenises the patterns.
nlp = spacy.blank("hu")
source_matcher = FuzzyMatcher(nlp.vocab)

# Unique (wikidata_id, name) pairs in first-seen order. dict.fromkeys de-duplicates in linear
# time, replacing a quadratic "if tup not in list" scan while yielding the same list.
sources = list(dict.fromkeys(
    (d['wikidata_id'], d['name']) for d in triplets.source.tolist()
))

# Register every entity name as a fuzzy pattern labelled with its Wikidata id, so that a
# match's label can be used directly as a node id when querying the graph.
for _id, name in sources:
    source_matcher.add(_id, [nlp(name)])


def ask_single_hop(text, min_ratio=80):
    """Answer a single-hop question against the triplet knowledge graph.

    The question is processed in two independent steps: entity names are fuzzy-matched in the
    text with ``source_matcher``, and the relation is predicted by the TF-IDF + logistic
    regression classifier and translated through ``freebase_mapping_rev``. The graph is then
    filtered by the matched entities and the predicted relation.

    Parameters
    ----------
    text : str
        The question to answer.
    min_ratio : int or float, optional
        Minimum fuzzy-match ratio an entity match needs in order to be used. Defaults to 80.

    Returns
    -------
    list
        Whatever ``filter_kg`` returns for the selected entities and relation; an empty list
        when no entity match reaches ``min_ratio`` or when the predicted property has no entry
        in ``freebase_mapping_rev``.
    """
    # Entity linking: fuzzy-match known source names inside the question.
    matches = source_matcher(nlp(text))

    # Sort (ratio, id) pairs so the strongest matches come first. The ``*_`` tolerates matcher
    # results that carry extra trailing fields beyond (label, start, end, ratio).
    scored = sorted(
        ((ratio, match_id) for match_id, _start, _end, ratio, *_ in matches),
        reverse=True,
    )
    source_ids = [match_id for ratio, match_id in scored if ratio >= min_ratio]
    if not source_ids:
        # Nothing confident enough to look up; skip the classifier altogether.
        return []

    # Relation prediction: classify the question, then translate the predicted property.
    freebase_pred = le.inverse_transform(logreg.predict(vect.transform([text])))[0]
    edge_id = freebase_mapping_rev.get(freebase_pred)
    if edge_id is None:
        # The predicted property has no counterpart in the mapping, so there is no edge to follow.
        return []

    return filter_kg(source_ids, edge_id)

In [103]:
# Query (Hungarian): "What did Attila József die of?"
ask_single_hop('Miben halt meg József Attila?')

['']

In [104]:
# Query (Hungarian): "Where did Attila József die?"
ask_single_hop('Hol halt meg József Attila?')

['Balatonszárszó']

In [111]:
# Query (Hungarian): "What was Attila József's nationality?"
ask_single_hop('Mi volt József Attila nemzetisége?')

['Magyarország', 'Románia']

In [112]:
# Query (Hungarian): "What nationality is Attila József?" — a rephrasing of the nationality question.
ask_single_hop('Milyen nemzetiségű József Attila?')

['Magyarország']

In [113]:
# Query (Hungarian): "Where does Attila József come from?"
# NOTE(review): 'szárzamik' looks like a misspelling of 'származik' — confirm whether the typo
# is an intentional robustness probe before correcting the query string.
ask_single_hop('Honnan szárzamik József Attila?')

['Magyarország']

In [106]:
# Query (Hungarian): "What is Sándor Petőfi's occupation?"
ask_single_hop('Mi Petőfi Sándor foglalkozása?')

['költő', 'műfordító', 'színész', 'író']

In [108]:
# Query (Hungarian): "What did Sándor Petőfi die of?"
ask_single_hop('Miben halt meg Petőfi Sándor?')

['csatában elesett']

In [109]:
# Query (Hungarian): "Where did Sándor Petőfi die?"
ask_single_hop('Hol halt meg Petőfi Sándor?')

['Fehéregyháza']

In [118]:
# Query (Hungarian): "Which denomination did Petőfi belong to?" — surname only, no given name.
ask_single_hop('Milyen felekezethez tartozott Petőfi?')

[]

In [119]:
# Query (Hungarian): "Which denomination did Sándor Petőfi belong to?" — same question with the full name.
ask_single_hop('Milyen felekezethez tartozott Petőfi Sándor?')

['evangélikus kereszténység']

In [120]:
# Query (Hungarian): "What was Sándor Petőfi's religion?"
ask_single_hop('Mi volt Petőfi Sándor vallása?')

['evangélikus kereszténység']

In [121]:
# Query (Hungarian): "What is Dezső Kosztolányi's gender?"
ask_single_hop('Mi Kosztolányi Dezső neme?')

['férfi', 'férfi', 'férfi']

In [122]:
# Query (Hungarian): "Was Dezső Kosztolányi male?" — a yes/no phrasing of the gender question.
ask_single_hop('Kosztolányi Dezső férfi volt?')

['férfi', 'férfi', 'férfi']