In [None]:
#!/usr/bin/python
# -*- coding: utf-8 -*-

from WikidataTreeBuilderSPARQL import WikidataTreeBuilder
from WikidataEntityExplorer import WikidataGetEntities

from datetime import datetime
import pandas as pd
import simplejson as json

Recursively explore the node 'Computer Science':

In [None]:
# Build the tree explorer. NOTE(review): keepTraceVisitedNodes=True is presumably
# what attaches the "visitedNodes" entry to every node -- confirm in WikidataTreeBuilder.
tree = WikidataTreeBuilder(keepTraceVisitedNodes=True)

# Time-stamp both ends of the crawl so its duration can be read off the output.
print("Exploring 'Computer Science' start: " + str(datetime.now()))
flareComputerScience = tree.fromRoot(
    root=("Q21198", "Computer Science"),
    forbidden=[]
)
print("Exploring 'Computer Science' end: " + str(datetime.now()))

# Persist the raw extraction result as JSON.
with open("outputComputerScience.json", "w") as outputFile:
    json.dump(flareComputerScience, outputFile)

Convert the nested flare to a flat list, and load it into a pandas DataFrame:

In [None]:
def nestedNodeExplore(nested, accumulator=None):
    """Flatten a nested flare tree into a list of row dicts, in pre-order.

    Each visited node contributes one dict with the keys ``"id"`` and ``"name"``
    (taken from positions 0 and 1 of ``nested["name"]``) and ``"visitedNodes"``
    (copied by reference from ``nested["visitedNodes"]``). Nodes whose id is the
    string ``"-1"`` are not recorded, but their children are still explored.

    Parameters
    ----------
    nested : dict
        A node with a ``"name"`` pair, a ``"visitedNodes"`` entry and,
        optionally, a ``"children"`` list of nodes of the same shape.
    accumulator : list, optional
        List that receives the row dicts. When omitted, rows are appended to
        the notebook-level ``flatFrame`` list, which must already exist
        (this is the original behaviour).

    Returns
    -------
    str
        Always the string ``"DONE"``; the useful result is the side effect on
        the accumulator.
    """
    if accumulator is None:
        # Backward-compatible default: fall back to the notebook global.
        accumulator = flatFrame
    if nested["name"][0] != "-1":
        accumulator.append({
            "id": nested["name"][0],
            "name": nested["name"][1],
            "visitedNodes": nested["visitedNodes"],
        })
    for node in nested.get("children", []):
        # Pass the same list down so the whole subtree lands in one place.
        nestedNodeExplore(node, accumulator)
    return "DONE"

In [None]:
# Start from an empty accumulator so re-running this cell does not duplicate rows.
flatFrame = []
nestedNodeExplore(flareComputerScience)
df = pd.DataFrame(flatFrame)

# Lists are unhashable and would make drop_duplicates() fail,
# so each path is stored as a tuple before de-duplicating.
df["visitedNodes"] = df["visitedNodes"].apply(tuple)
df = df.drop_duplicates()

df.head()

There are many paths from the root to "Natural Language Processing":

In [None]:
# Show every row (one per distinct path) recorded for this node name.
df.loc[df["name"] == "natural language processing"]

For better readability of the result, we'll extract a conversion table from Q-id to name:

In [None]:
# Lookup table from Wikidata Q-id to node name, built straight from the two
# columns. This avoids DataFrame.drop with a positional axis argument, which
# pandas >= 2.0 rejects. As before, if an id appears on several rows the last
# row's name wins.
Q2name = dict(zip(df["id"], df["name"]))

and make the 'visited nodes' column human-readable:

In [None]:
def _labelPath(path):
    """Pair every id in `path` with its name from Q2name (the id itself if unknown)."""
    return tuple([(nodeId, Q2name.get(nodeId, nodeId)) for nodeId in path])


df["visitedNodes"] = df["visitedNodes"].apply(_labelPath)

We will group all paths that lead to the same node into a single entry:

In [None]:
# Collect, for every node id, the tuple of all paths recorded for it.
# Aggregating the single "visitedNodes" column always yields one value per id,
# whereas DataFrameGroupBy.apply returning a Series changes shape depending on
# the groups and warns about operating on the grouping column in recent pandas.
grouped = df.groupby("id")["visitedNodes"].apply(tuple)

# Give every row of a node the same combined tuple of paths ...
df["visitedNodes"] = df["id"].apply(lambda nodeId: grouped[nodeId])
# ... so that rows of the same node become identical and collapse into one.
df = df.drop_duplicates()

In [None]:
# After grouping, the node's paths are expected to sit together in a single row.
df.loc[df["name"] == "natural language processing"]

And now we will enrich the table with several other properties:

In [None]:
getNewData = WikidataGetEntities()

# Work on plain dicts (one per row) so new keys can be merged in freely.
dfDict = df.to_dict(orient='records')
dfUpdated = []

print("Enriching database. Start : " + str(datetime.now()))
for record in dfDict:
    # enrichTable's result is merged into the row in place; what keys it
    # returns is defined by WikidataGetEntities, not visible here.
    extraProperties = getNewData.enrichTable(record)
    record.update(extraProperties)
    dfUpdated.append(record)
print("Enriching database. End : " + str(datetime.now()))

In [None]:
# Rebuild the table from the enriched records; cells with no value become "".
df = pd.DataFrame(dfUpdated).fillna("")

Make several columns human-readable:

In [None]:
def fillMissing(c):
    """Return the English label of the Wikidata id `c`, or `c` itself if none is found.

    The result of ``getNewData.query(c)`` is read as a nested dict along the
    path entities -> c -> labels -> en -> value; any missing level falls back
    to returning `c` unchanged.
    """
    response = getNewData.query(c)
    entity = response.get("entities", {}).get(c, {})
    return entity.get("labels", {}).get("en", {}).get("value", c)


def makeHumanReadable(x):
    """Turn an iterable of ids into a tuple of ``(id, label)`` pairs.

    Labels come from the local Q2name table when the id is known there, and
    from a Wikidata lookup via fillMissing otherwise. An empty cell (``""``
    after fillna) yields an empty tuple.

    The result is a flat tuple of pairs, matching the format of the
    "visitedNodes" column. The previous version wrapped every pair in its own
    one-element list because of a misplaced bracket.

    NOTE(review): assumes each cell is an iterable of ids (or ""); a bare
    string id would be iterated character by character -- confirm against
    the values returned by enrichTable.
    """
    return tuple([
        (i, Q2name[i]) if i in Q2name else (i, fillMissing(i))
        for i in x
    ])


# Property columns whose values are Wikidata ids to be paired with labels.
for propertyColumn in ("P306", "P366", "P277", "P31", "P275", "P178", "P101"):
    df[propertyColumn] = df[propertyColumn].apply(makeHumanReadable)

Make column labels human-readable:

In [None]:
# Build the complete old-name -> new-name mapping first, then rename once.
columnLabels = {}
for c in df.columns:
    label = fillMissing(c)
    # Append the label only when the lookup returned something other than
    # the column name itself (i.e. a label was actually found).
    columnLabels[c] = (c + " (" + label + ")") if label != c else c
df = df.rename(columns=columnLabels)

# Order columns in reverse alphabetical order. DataFrame.reindex_axis was
# removed in pandas 1.0; reindex(columns=...) is the supported equivalent.
df = df.reindex(columns=sorted(df.columns, reverse=True))

In [None]:
# Preview the first rows of the table after enrichment and column relabelling.
df.head()

In [None]:
# Export the final table to an Excel workbook (path is relative to the working directory).
df.to_excel("ComputerScienceTable.xlsx")