In [2]:
import pandas as pd
from tqdm import tqdm

# Get entities

In [1]:
from kg_tools.wikitools import get_entities_of_category

In [2]:
# Hungarian Wikipedia category page to crawl. The percent-encoded title decodes
# to "Kategória:Magyar_költők"; the `from=A` query parameter is kept as given.
BASE_CAT_URL = 'https://hu.wikipedia.org/w/index.php?title=Kateg%C3%B3ria:Magyar_k%C3%B6lt%C5%91k&from=A'

In [None]:
# Build the entity table for the category page configured in BASE_CAT_URL.
# NOTE(review): the "<name> failed." lines in the recorded output presumably
# come from get_entities_of_category; whether those entries are dropped or kept
# with missing fields is not visible here — confirm before assuming completeness.
entity_df = get_entities_of_category(BASE_CAT_URL)
entity_df

Vitéz Ferenc failed.
Bék Timur failed.
Halmi Tibor failed.


In [None]:
# Derive the bare Wikidata identifier: the text after the last '/' of each URL.
entity_df['wikidata_id'] = entity_df['wikidata_url'].apply(
    lambda url: url.rsplit('/', 1)[-1]
)

In [None]:
# Inspect the table again, now including the derived wikidata_id column.
entity_df

In [None]:
# Persist the entity table as a JSON list of records.
# NOTE(review): the file name is spelled 'poet_entites' (sic). Later cells read
# this exact path, so rename it everywhere at once or not at all.
entity_df.to_json('data/poet_entites.json', orient='records')

# Get triplets

In [16]:
from kg_tools.kg_creation import triplet_for_id

In [17]:
# Reload the entity table from the JSON file written in the "Get entities"
# section, so this section does not depend on the in-memory frame.
entity_df = pd.read_json('data/poet_entites.json', orient='records')

In [20]:
# Distinct Wikidata ids, in order of first appearance, as a plain list.
poet_ids = entity_df['wikidata_id'].drop_duplicates().tolist()

In [22]:
# Collect one result per poet id, then stack them into a single frame.
# The explicit loop is kept on purpose: if the run is interrupted, `triplets`
# still holds everything fetched so far.
triplets = []

for poet_id in tqdm(poet_ids):
    poet_triplets = triplet_for_id(poet_id)
    triplets.append(poet_triplets)

triplets_df = pd.concat(triplets)

100%|██████████| 2053/2053 [15:31<00:00,  2.20it/s]


In [23]:
# First snapshot of the triplet table (JSON list of records).
# NOTE(review): every later section writes back to this same path, so the file
# on disk only reflects the most recent stage that was run.
triplets_df.to_json('data/poet_triplets.json', orient='records')

# Query edge properties

In [1]:
from kg_tools.kg_creation import get_properties

In [3]:
# Reload the triplets from disk and preview the first rows.
# NOTE(review): the cells below index edge_properties by the values of the
# `edge` column, i.e. they expect plain ids such as 'P31'. The same file is
# overwritten further down with expanded edges, so re-running this section
# after a later stage will not see plain ids any more.
triplets_df = pd.read_json('data/poet_triplets.json', orient='records')
triplets_df.head()

Unnamed: 0,source,edge,destination
0,Q17361654,P31,"{'entity-type': 'item', 'numeric-id': 5, 'id':..."
1,Q17361654,P21,"{'entity-type': 'item', 'numeric-id': 6581097,..."
2,Q17361654,P735,"{'entity-type': 'item', 'numeric-id': 17498051..."
3,Q17361654,P570,"{'time': '+1988-02-02T00:00:00Z', 'timezone': ..."
4,Q17361654,P106,"{'entity-type': 'item', 'numeric-id': 49757, '..."


In [4]:
# Distinct edge ids, in order of first appearance, as a plain list.
edges = triplets_df['edge'].drop_duplicates().tolist()

In [7]:
# Look up the properties of every distinct edge id. Lookups are best-effort:
# an id whose lookup fails is reported and left out of edge_properties.
# (In the recorded run the skipped ids were 'T_description' and 'T_alias'.)
edge_properties = {}

for edge_id in tqdm(edges):
    try:
        edge_properties[edge_id] = get_properties(edge_id)
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate and this
        # multi-minute loop can be interrupted from the kernel.
        print(edge_id, 'skipped.')

  3%|▎         | 11/381 [00:07<03:11,  1.93it/s]

T_description skipped.


  9%|▉         | 36/381 [00:22<03:16,  1.75it/s]

T_alias skipped.


100%|██████████| 381/381 [03:19<00:00,  1.91it/s]


In [9]:
# Hand-written entries for the two edge ids that the lookup loop skipped.
edge_properties.update({
    'T_alias': {'name': 'alias'},
    'T_description': {'name': 'description'},
})

In [10]:
# Swap each edge id for its property record; an id without a record raises KeyError.
triplets_df['edge'] = triplets_df['edge'].map(
    lambda edge_id: edge_properties[edge_id]
)

In [12]:
# Write the triplets back with the expanded `edge` column.
# NOTE(review): this overwrites the snapshot saved by the "Get triplets" section.
triplets_df.to_json('data/poet_triplets.json', orient='records')

# Merge source properties

In [18]:
# Reload both tables from disk: the triplets and the entity table that
# describes their sources.
triplets_df = pd.read_json('data/poet_triplets.json', orient='records')
source_df = pd.read_json('data/poet_entites.json', orient='records')

In [19]:
# Index the entity records by their wikidata_id (a later duplicate id would
# replace an earlier one).
source_dict = {
    record['wikidata_id']: record
    for record in source_df.to_dict(orient='records')
}

In [19]:
# Swap each source id for its full entity record; an unknown id raises KeyError.
triplets_df['source'] = triplets_df['source'].map(
    lambda wikidata_id: source_dict[wikidata_id]
)

In [21]:
# Preview: `source` should now hold entity records instead of bare ids.
triplets_df.head()

Unnamed: 0,source,edge,destination
0,"{'name': 'Bencze József (költő)', 'wiki_url': ...","{'wikidata_id': 'P31', 'name': 'osztály, amely...","{'entity-type': 'item', 'numeric-id': 5, 'id':..."
1,"{'name': 'Bencze József (költő)', 'wiki_url': ...","{'wikidata_id': 'P21', 'name': 'nem', 'descrip...","{'entity-type': 'item', 'numeric-id': 6581097,..."
2,"{'name': 'Bencze József (költő)', 'wiki_url': ...","{'wikidata_id': 'P735', 'name': 'utónév', 'des...","{'entity-type': 'item', 'numeric-id': 17498051..."
3,"{'name': 'Bencze József (költő)', 'wiki_url': ...","{'wikidata_id': 'P570', 'name': 'halálozási id...","{'time': '+1988-02-02T00:00:00Z', 'timezone': ..."
4,"{'name': 'Bencze József (költő)', 'wiki_url': ...","{'wikidata_id': 'P106', 'name': 'foglalkozás',...","{'entity-type': 'item', 'numeric-id': 49757, '..."


In [22]:
# Write the triplets back with the expanded `source` column (same path as the
# earlier stages, which this overwrites).
triplets_df.to_json('data/poet_triplets.json', orient='records')

# Expand and normalize destination

In [3]:
from kg_tools.kg_creation import get_properties

In [4]:
# Reload the triplets from disk; the cells below work on the `destination` column.
triplets_df = pd.read_json('data/poet_triplets.json', orient='records')

In [5]:
# All destination values, one per triplet row, as a plain list.
destinations = triplets_df['destination'].tolist()

In [6]:
# Collect the distinct ids of all destinations that reference a Wikidata item.
# Only dict values are inspected: on a string, `'entity-type' in dest` would be
# a substring test (and indexing it would then fail), and on a number or None
# it raises TypeError, so non-dict destinations are skipped.
dest_nodes = list({
    dest['id']
    for dest in destinations
    if isinstance(dest, dict) and dest.get('entity-type') == 'item'
})

In [7]:
# Number of distinct item ids that need a property lookup.
len(dest_nodes)

3145

In [8]:
# Fetch the properties of every distinct destination item, keyed by item id.
# The explicit loop is kept on purpose: if the run is interrupted,
# dest_nodes_dct still holds everything fetched so far.
dest_nodes_dct = {}

for node_id in tqdm(dest_nodes):
    node_properties = get_properties(node_id)
    dest_nodes_dct[node_id] = node_properties

100%|██████████| 3145/3145 [28:17<00:00,  1.85it/s]


In [11]:
def expand_wikidata_node(d):
    """Replace a Wikidata item reference with its looked-up properties.

    Parameters
    ----------
    d : any
        A destination value from the triplets table. Only a dict whose
        'entity-type' is 'item' is treated as an item reference.

    Returns
    -------
    The entry of the module-level ``dest_nodes_dct`` stored under ``d['id']``
    for item references; otherwise ``d`` itself, unchanged.

    Raises
    ------
    KeyError
        If an item reference has no 'id' key or its id is missing from
        ``dest_nodes_dct``.
    """
    # Check for dict first: on a string, `'entity-type' in d` is a substring
    # test, and on a number or None it raises TypeError. Such values are not
    # item references and are passed through untouched.
    if isinstance(d, dict) and d.get('entity-type') == 'item':
        return dest_nodes_dct[d['id']]

    return d

In [13]:
# Expand item references in `destination`; every other value is kept as it is.
triplets_df['destination'] = triplets_df['destination'].apply(expand_wikidata_node)

In [16]:
# Preview: item destinations should now be property records; other values
# (e.g. time values) stay as they were.
triplets_df.head()

Unnamed: 0,source,edge,destination
0,"{'name': 'Bencze József (költő)', 'wiki_url': ...","{'wikidata_id': 'P31', 'name': 'osztály, amely...","{'wikidata_id': 'Q5', 'name': 'ember', 'descri..."
1,"{'name': 'Bencze József (költő)', 'wiki_url': ...","{'wikidata_id': 'P21', 'name': 'nem', 'descrip...","{'wikidata_id': 'Q6581097', 'name': 'férfi', '..."
2,"{'name': 'Bencze József (költő)', 'wiki_url': ...","{'wikidata_id': 'P735', 'name': 'utónév', 'des...","{'wikidata_id': 'Q17498051', 'name': 'József',..."
3,"{'name': 'Bencze József (költő)', 'wiki_url': ...","{'wikidata_id': 'P570', 'name': 'halálozási id...","{'time': '+1988-02-02T00:00:00Z', 'timezone': ..."
4,"{'name': 'Bencze József (költő)', 'wiki_url': ...","{'wikidata_id': 'P106', 'name': 'foglalkozás',...","{'wikidata_id': 'Q49757', 'name': 'költő', 'de..."


In [15]:
# Write the triplets back with expanded destinations (same path as the earlier
# stages, which this overwrites).
triplets_df.to_json('data/poet_triplets.json', orient='records')