In [45]:
import json

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Get wikidata ids

In [29]:
WIKI_URL = "https://ja.wikipedia.org/wiki/%E4%BA%BA%E7%89%A9%E5%8F%A2%E6%9B%B8"


def get_jinbutsu_list(timeout=30):
    """Scrape the Wikipedia jinbutsu sousho list page into a list of entries.

    Every ``<li>`` inside the page's ``mw-content-text`` div becomes one
    entry: the first link of the item is recorded as the author, all
    following links as titles.  Anchors without an ``href`` are ignored and
    list items without any usable link are skipped, so an entry never
    inherits the author of the previous item.

    Args:
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of dicts shaped like
        ``{"author": {"name": ..., "link": ...},
           "title": [{"name": ..., "link": ...}, ...]}``
        where ``name`` is the anchor text and ``link`` its raw ``href``.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status.
        ValueError: if the page has no ``mw-content-text`` div.
    """
    response = requests.get(WIKI_URL, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    content = soup.find("div", {"id": "mw-content-text"})
    if content is None:
        raise ValueError("no 'mw-content-text' div found at " + WIKI_URL)

    entry_list = []
    for li in content.find_all("li"):
        links = [
            {"name": a.text, "link": a["href"]}
            for a in li.find_all("a", href=True)
        ]
        if not links:
            # Nothing to attribute this item to; skip it instead of reusing
            # whatever author the previous iteration happened to leave behind.
            continue
        author, *titles = links
        entry_list.append({"author": author, "title": titles})
    return entry_list

In [52]:
JA_WIKI = "https://ja.wikipedia.org"


def get_wikidata_id(wiki_link, timeout=30):
    """Return the Wikidata item link shown on a Japanese Wikipedia page.

    Note that the return value is the full ``href`` of the link, not a bare
    id; callers that need the id must take the last path segment themselves.

    Args:
        wiki_link: Site-relative page link (e.g. ``"/wiki/..."``); it is
            appended to ``JA_WIKI`` to build the request URL.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The ``href`` of the first link inside the page's ``t-wikibase`` list
        item, or ``None`` when the page has no such item, no link in it, or
        the link has no ``href``.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    response = requests.get(JA_WIKI + wiki_link, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    wikibase = soup.find("li", {"id": "t-wikibase"})
    # Explicit "not found" checks instead of a bare except, so that real
    # errors (and Ctrl-C during the long scraping loop) are not swallowed.
    if wikibase is None:
        return None
    anchor = wikibase.find("a")
    if anchor is None:
        return None
    return anchor.get("href")

In [253]:
# Resolve every scraped link to the Wikidata link of its page.
# Author links and title links are collected in separate lists; links for
# which no Wikidata link is found are left out.
wikidata_ids = []
author_ids = []

entries = get_jinbutsu_list()
for entry in tqdm(entries):
    author_wikidata = get_wikidata_id(entry["author"]["link"])
    if author_wikidata:
        author_ids.append(author_wikidata)

    # Lazy generator: the title pages are still fetched one by one, in order,
    # after the author page of the same entry.
    title_lookups = (get_wikidata_id(title["link"]) for title in entry["title"])
    wikidata_ids.extend(found for found in title_lookups if found)


100%|██████████| 290/290 [03:35<00:00,  1.46it/s]


In [254]:
# Keep only the last path segment of each collected Wikidata link; these
# values are later substituted for the entity id in the SPARQL query.
wkps = [url.rsplit("/", 1)[-1].strip() for url in wikidata_ids]
author_wkps = [url.rsplit("/", 1)[-1].strip() for url in author_ids]

# Setup wikidata sparql functions

In [114]:
from SPARQLWrapper import SPARQLWrapper, JSON

In [162]:
# SPARQL template for looking up one Wikidata entity.
# It is filled in with ``QUERY.format(id=...)``: ``{id}`` is the entity-id
# placeholder, and the doubled braces ``{{`` / ``}}`` are literal SPARQL braces
# escaped for ``str.format``.
# Each property (gender, country, birth, death, place of birth, occupation,
# VIAF) sits in its own OPTIONAL block, so an entity lacking one of them can
# still return the others. The label service is configured for English, which
# is the language of the selected ``...Label`` variables.
QUERY = """
SELECT ?genderLabel ?countryLabel ?birth ?death ?place_birthLabel ?occupationLabel ?viaf
WHERE 
{{
  OPTIONAL {{
    wd:{id} wdt:P21 ?gender .
  }}
  OPTIONAL {{
    wd:{id} wdt:P27 ?country .
  }}
  OPTIONAL {{
    wd:{id} wdt:P569 ?birth .
  }}
  OPTIONAL {{
    wd:{id} wdt:P570 ?death .
  }}
  OPTIONAL {{
    wd:{id} wdt:P19 ?place_birth .
  }}
  OPTIONAL {{
    wd:{id} wdt:P106 ?occupation .
  }}
   OPTIONAL {{
    wd:{id} wdt:P214 ?viaf .
  }}
  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
}}
"""

In [163]:
def get_field(results, field):
    """Return the first value bound to ``field`` in a SPARQL result set.

    The result is deterministic: when several rows carry different values
    for the field, the value from the earliest row wins.

    Args:
        results: Iterable of binding rows, each a dict mapping a variable
            name to a dict that has a ``"value"`` key.
        field: Name of the variable to read.

    Returns:
        The ``"value"`` of the first row that binds ``field``, or ``None``
        if no row does (including an empty result set).
    """
    for result in results:
        if field in result:
            return result[field]["value"]
    return None

In [164]:
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")


def query_info(wkp):
    """Fetch person info for one entity from the Wikidata SPARQL endpoint.

    Fills the ``QUERY`` template with ``wkp``, runs it on the shared
    ``sparql`` client and reduces the returned bindings to one value per
    field with ``get_field`` (``None`` where the field is not bound).

    Args:
        wkp: Wikidata entity id that is substituted into the query.

    Returns:
        A dict with the keys ``wkp``, ``gender``, ``birth_date``,
        ``birth_place``, ``death_date``, ``occupation``, ``viaf`` and
        ``country``.
    """
    sparql.setQuery(QUERY.format(id=wkp))
    sparql.setReturnFormat(JSON)
    bindings = sparql.query().convert()["results"]["bindings"]

    return {
        "wkp": wkp,
        "gender": get_field(bindings, "genderLabel"),
        "birth_date": get_field(bindings, "birth"),
        "birth_place": get_field(bindings, "place_birthLabel"),
        "death_date": get_field(bindings, "death"),
        "occupation": get_field(bindings, "occupationLabel"),
        "viaf": get_field(bindings, "viaf"),
        # The query already selects ?countryLabel; expose it as well. It is
        # added last so the order of the pre-existing keys is unchanged.
        "country": get_field(bindings, "countryLabel"),
    }

# Retrieve jinbutsu person info

In [165]:
# One Wikidata query per collected title id; results keep the input order.
jinbutsu = []
jinbutsu.extend(query_info(wkp) for wkp in tqdm(wkps))

100%|██████████| 289/289 [01:09<00:00,  4.37it/s]


In [None]:
# Turn the list of per-person dicts into a table (one row per dict) and save
# it as CSV. Needs pandas imported as `pd` in the notebook's import cell.
df = pd.DataFrame(data=jinbutsu)
df.to_csv("jinbutsu.csv")

# Retrieve jinbutsu author info

In [255]:
# One Wikidata query per collected author id; results keep the input order.
jinbutsu_authors = []
jinbutsu_authors.extend(query_info(wkp) for wkp in tqdm(author_wkps))

100%|██████████| 216/216 [00:50<00:00,  4.34it/s]


In [258]:
# Turn the list of per-author dicts into a table (one row per dict) and save
# it as CSV. Needs pandas imported as `pd` in the notebook's import cell.
df = pd.DataFrame(data=jinbutsu_authors)
df.to_csv("jinbutsu_authors.csv")