In [1]:
import pandas as pd

import spacy
from spaczz.matcher import FuzzyMatcher

In [2]:
# Load the knowledge-graph triplets from JSON and preview the first rows;
# each row carries a `source`, an `edge` and a `destination` record.
triplets = pd.read_json('data/poet_triplets.json')
triplets.head()

Unnamed: 0,source,edge,destination
0,"{'name': 'Bencze József', 'wiki_url': '/wiki/B...","{'wikidata_id': 'P31', 'name': 'osztály, amely...","{'wikidata_id': 'Q5', 'name': 'ember', 'descri..."
1,"{'name': 'Bencze József', 'wiki_url': '/wiki/B...","{'wikidata_id': 'P21', 'name': 'nem', 'descrip...","{'wikidata_id': 'Q6581097', 'name': 'férfi', '..."
2,"{'name': 'Bencze József', 'wiki_url': '/wiki/B...","{'wikidata_id': 'P735', 'name': 'utónév', 'des...","{'wikidata_id': 'Q17498051', 'name': 'József',..."
3,"{'name': 'Bencze József', 'wiki_url': '/wiki/B...","{'wikidata_id': 'P570', 'name': 'halálozási id...","{'time': '+1988-02-02T00:00:00Z', 'timezone': ..."
4,"{'name': 'Bencze József', 'wiki_url': '/wiki/B...","{'wikidata_id': 'P106', 'name': 'foglalkozás',...","{'wikidata_id': 'Q49757', 'name': 'költő', 'de..."


In [3]:
# Inspect a single edge record to see which fields an edge carries.
triplets.edge.iloc[5]

{'wikidata_id': 'P569',
 'name': 'születési idő',
 'description': 'mikor született az alany',
 'aliases': ['született', 'születési dátum']}

In [4]:
def filter_kg(node, edge, kg=None):
    """Return the destinations reachable from ``node`` through ``edge``.

    Args:
        node: ``wikidata_id`` of the source entity to look up.
        edge: ``wikidata_id`` of the relation to follow.
        kg: Optional DataFrame with ``source``, ``edge`` and ``destination``
            columns. When omitted, the notebook-level ``triplets`` frame is
            used.

    Returns:
        A list with one item per matching row, in row order: the
        destination's ``'name'`` when the destination is a dict that has one,
        otherwise the destination value unchanged. The list is empty when no
        row matches.
    """
    if kg is None:
        kg = triplets

    res = []
    for src, rel, dest in zip(kg.source, kg.edge, kg.destination):
        if src['wikidata_id'] != node or rel['wikidata_id'] != edge:
            continue
        # Only dicts are probed for 'name': on a plain string, `'name' in dest`
        # would be a substring test and `dest['name']` would raise TypeError.
        if isinstance(dest, dict) and 'name' in dest:
            res.append(dest['name'])
        else:
            res.append(dest)

    return res

In [5]:
# Distinct (wikidata_id, name) pairs of all source nodes, in first-seen order.
# dict.fromkeys de-duplicates by hashing, so each row costs O(1) instead of a
# scan over the list built so far.
sources = list(dict.fromkeys(
    (d['wikidata_id'], d['name']) for d in triplets.source.tolist()
))

len(sources)

2052

In [6]:
# Distinct (wikidata_id, name) pairs of all edges, in first-seen order.
# dict.fromkeys de-duplicates by hashing, so each row costs O(1) instead of a
# scan over the list built so far.
edges = list(dict.fromkeys(
    (d['wikidata_id'], d['name']) for d in triplets.edge.tolist()
))

len(edges)

381

In [7]:
# Blank spaCy pipeline for the "hu" language code; used below to turn names
# and questions into Doc objects.
nlp = spacy.blank("hu")
# Fuzzy matcher built on the pipeline's vocab. Despite its name, it is loaded
# with both source-entity and edge patterns in the next cell.
source_matcher = FuzzyMatcher(nlp.vocab)

In [None]:
# Register every source entity and then every edge as a fuzzy pattern on the
# shared matcher, using the Wikidata id as the pattern label.
for pairs in (sources, edges):
    for label, phrase in pairs:
        source_matcher.add(label, [nlp(phrase)])

In [None]:
def ask_single_hop(text):
    """Answer a one-hop question by fuzzy-matching an entity and a relation.

    The question is run through ``source_matcher``. Matches whose id starts
    with ``'Q'`` are treated as source entities, all other matches as edges.
    The best-scoring entity and the best-scoring edge are then looked up with
    ``filter_kg``.

    Args:
        text: The question string.

    Returns:
        The list produced by ``filter_kg`` for the best (entity, edge) pair,
        or an empty list when no entity or no edge was matched.
    """
    doc = nlp(text)

    source_matches = []
    edge_matches = []

    # Match tuples start with (match_id, start, end, ratio). Newer spaczz
    # releases appear to append the matched pattern as a fifth element, so
    # the starred tail accepts either shape instead of failing to unpack.
    for match_id, _start, _end, ratio, *_extra in source_matcher(doc):
        bucket = source_matches if match_id.startswith('Q') else edge_matches
        bucket.append((ratio, match_id))

    if not source_matches or not edge_matches:
        return []

    # Highest ratio wins; ties fall back to comparing the id strings.
    _, source_m = max(source_matches)
    _, edge_m = max(edge_matches)

    return filter_kg(source_m, edge_m)

In [9]:
# Demo question in Hungarian, roughly "What is Sándor Petőfi's occupation?"
ask_single_hop('Mi Petőfi Sándor foglalkozása?')

['költő', 'műfordító', 'színész', 'író']

In [11]:
# The same question phrased with a verb, roughly "What did Sándor Petőfi do
# for a living?"
ask_single_hop('Mivel foglalkozott Petőfi Sándor?')

['költő', 'műfordító', 'színész', 'író']