In [1]:
import pandas as pd
from tqdm import tqdm

# Get entities

In [1]:
from kg_tools.wikitools import get_entities_of_category

In [2]:
BASE_CAT_URL = 'https://hu.wikipedia.org/w/index.php?title=Kateg%C3%B3ria:Magyar_k%C3%B6lt%C5%91k&from=A'

In [None]:
entity_df = get_entities_of_category(BASE_CAT_URL)
entity_df

Vitéz Ferenc failed.
Bék Timur failed.
Halmi Tibor failed.


In [None]:
entity_df['wikidata_id'] = entity_df.wikidata_url.apply(lambda s: s.split('/')[-1])

In [None]:
entity_df

In [None]:
entity_df.to_json('data/poet_entites.json', orient='records')

# Get triplets

In [16]:
from kg_tools.kg_creation import triplet_for_id

In [17]:
entity_df = pd.read_json('data/poet_entites.json', orient='records')

In [20]:
poet_ids = entity_df.wikidata_id.unique().tolist()

In [22]:
triplets = []

for _id in tqdm(poet_ids):
    triplets.append(triplet_for_id(_id))
    
triplets_df = pd.concat(triplets)

100%|██████████| 2053/2053 [15:31<00:00,  2.20it/s]


In [23]:
triplets_df.to_json('data/poet_triplets.json', orient='records')

# Query edge properties

In [2]:
from kg_tools.kg_creation import get_edge_properties

In [3]:
triplets_df = pd.read_json('data/poet_triplets.json', orient='records')
triplets_df.head()

Unnamed: 0,source,edge,destination
0,Q17361654,P31,"{'entity-type': 'item', 'numeric-id': 5, 'id':..."
1,Q17361654,P21,"{'entity-type': 'item', 'numeric-id': 6581097,..."
2,Q17361654,P735,"{'entity-type': 'item', 'numeric-id': 17498051..."
3,Q17361654,P570,"{'time': '+1988-02-02T00:00:00Z', 'timezone': ..."
4,Q17361654,P106,"{'entity-type': 'item', 'numeric-id': 49757, '..."


In [4]:
edges = triplets_df.edge.unique().tolist()

In [7]:
edge_properties = {}

for edge_id in tqdm(edges):
    try:
        edge_properties[edge_id] = get_edge_properties(edge_id)
    except:
        print(edge_id, 'skipped.')

  3%|▎         | 11/381 [00:07<03:11,  1.93it/s]

T_description skipped.


  9%|▉         | 36/381 [00:22<03:16,  1.75it/s]

T_alias skipped.


100%|██████████| 381/381 [03:19<00:00,  1.91it/s]


In [9]:
edge_properties['T_alias'] = {'name': 'alias'}
edge_properties['T_description'] = {'name': 'description'}

In [10]:
triplets_df['edge'] = triplets_df['edge'].apply(lambda e: edge_properties[e])

In [12]:
triplets_df.to_json('data/poet_triplets.json', orient='records')

# Merge source properties

In [13]:
triplets_df = pd.read_json('data/poet_triplets.json', orient='records')
source_df = pd.read_json('data/poet_entites.json', orient='records')

In [16]:
source_dict = dict([(d['wikidata_id'], d) for d in source_df.to_dict(orient='records'))
                    source_dict

[{'name': 'Bencze József (költő)',
  'wiki_url': '/wiki/Bencze_J%C3%B3zsef_(k%C3%B6lt%C5%91)',
  'wikidata_url': 'https://www.wikidata.org/wiki/Special:EntityPage/Q17361654',
  'wikidata_id': 'Q17361654'},
 {'name': 'Erg Ágoston',
  'wiki_url': '/wiki/Erg_%C3%81goston',
  'wikidata_url': 'https://www.wikidata.org/wiki/Special:EntityPage/Q990595',
  'wikidata_id': 'Q990595'},
 {'name': 'Szokolay Zoltán',
  'wiki_url': '/wiki/Szokolay_Zolt%C3%A1n',
  'wikidata_url': 'https://www.wikidata.org/wiki/Special:EntityPage/Q1299686',
  'wikidata_id': 'Q1299686'},
 {'name': 'Payer Imre (költő)',
  'wiki_url': '/wiki/Payer_Imre_(k%C3%B6lt%C5%91)',
  'wikidata_url': 'https://www.wikidata.org/wiki/Special:EntityPage/Q1231706',
  'wikidata_id': 'Q1231706'},
 {'name': 'Nógrádi Gábor',
  'wiki_url': '/wiki/N%C3%B3gr%C3%A1di_G%C3%A1bor',
  'wikidata_url': 'https://www.wikidata.org/wiki/Special:EntityPage/Q1221507',
  'wikidata_id': 'Q1221507'},
 {'name': 'Fekete Anna',
  'wiki_url': '/wiki/Fekete_Anna',