In [1]:
#!/usr/bin/python
# -*- coding: utf-8 -*-

from WikidataTreeBuilderSPARQL import WikidataTreeBuilder
from WikidataEntityExplorer import WikidataGetEntities

from datetime import datetime
import pandas as pd
import simplejson as json

In [2]:
# Log the wall-clock time at which the run begins.
print("Program starts at " + str(datetime.now()))

Program starts at 2017-07-11 16:51:29.313976


Recursively explore the node 'Computer Science':

In [3]:
# Build the explorer with visited-node tracing requested; presumably this is
# what fills the "visitedNodes" entry read by the flattening step later on
# (TODO confirm against WikidataTreeBuilder).
tree = WikidataTreeBuilder(keepTraceVisitedNodes=True)

print("Exploring 'Computer Science' start: " + str(datetime.now()))
flareComputerScience = tree.fromRoot(
    root=("Q21198", "Computer Science"),
    forbidden=[],
)
print("Exploring 'Computer Science' end: " + str(datetime.now()))

# Persist the nested result to disk so it can be inspected separately.
with open("outputComputerScience.json", "w") as out_file:
    json.dump(flareComputerScience, out_file)

Exploring 'Computer Science' start: 2017-07-11 16:51:29.328603
Exploring 'Computer Science' end: 2017-07-11 16:51:33.719879


Convert the nested flare to a flat list of nodes, and load it into a pandas DataFrame:

In [4]:
def nestedNodeExplore(nested):
    """Flatten a nested flare node into the module-level ``flatFrame`` list.

    The node itself is handled first, then every entry of its optional
    "children" list is visited recursively (depth-first, in list order).
    A node is appended to ``flatFrame`` as a dict with the keys "id",
    "name" and "visitedNodes" unless its id, ``nested["name"][0]``, is
    the string "-1".

    Parameters
    ----------
    nested : dict
        Node holding a "name" pair ``(id, label)``, a "visitedNodes" entry
        and, optionally, a "children" list of nodes of the same form.

    Returns
    -------
    str
        Always the string "DONE"; the useful result is the side effect on
        ``flatFrame``.
    """
    id_and_label = nested["name"]
    skip = id_and_label[0] == "-1"
    if not skip:
        record = {
            "id": id_and_label[0],
            "name": id_and_label[1],
            "visitedNodes": nested["visitedNodes"],
        }
        flatFrame.append(record)

    children = nested.get("children", [])
    for child in children:
        nestedNodeExplore(child)
    return "DONE"

In [5]:
# nestedNodeExplore appends into the module-level flatFrame, so reset it to an
# empty list before walking the tree.
flatFrame = []
nestedNodeExplore(flareComputerScience)
df = pd.DataFrame(flatFrame)

# Lists are not hashable, which makes drop_duplicates fail; tuples are.
df["visitedNodes"] = df["visitedNodes"].apply(tuple)
df = df.drop_duplicates()

df.head()

Unnamed: 0,id,name,visitedNodes
0,Q21198,Computer Science,()
1,Q2539,machine learning,"(Q21198,)"
2,Q334384,supervised learning,"(Q21198, Q2539)"
3,Q282453,Support vector machine,"(Q21198, Q2539, Q334384)"
4,Q910067,chemometrics,"(Q21198, Q2539)"


There can be several distinct paths from the root to the same node, for example to "Natural Language Processing":

In [6]:
# Show every row (one per path) recorded for this node.
df.loc[df["name"] == "natural language processing"]

Unnamed: 0,id,name,visitedNodes
28,Q30642,natural language processing,"(Q21198,)"
60,Q30642,natural language processing,"(Q21198, Q182557)"


For better readability of the result, we'll extract a conversion table from Wikidata Q-identifier to name:

In [7]:
# Lookup table from Wikidata Q-id to label, built straight from the two
# columns. The previous df.drop("visitedNodes", 1) passed ``axis``
# positionally, which pandas 2.0 no longer accepts. When an id occurs on
# several rows the last row wins, exactly as in a dict comprehension over the
# records.
Q2name = dict(zip(df["id"], df["name"]))

and make the 'visited nodes' column human-readable:

In [8]:
# Replace every Q-id on a path by an (id, label) pair; an id with no entry in
# Q2name keeps the id itself as its label.
# NOTE: this overwrites the column in place, so the cell is meant to run once;
# a second run would iterate over the pairs instead of the ids.
df["visitedNodes"] = df["visitedNodes"].apply(
    lambda path: tuple((qid, Q2name.get(qid, qid)) for qid in path)
)

We will group all paths leading to the same node into a single entry:

In [9]:
# Collect, for every id, all of its root-to-node paths into one tuple (paths
# keep their row order). Aggregating the column directly always gives a Series
# indexed by id. The former groupby("id").apply(lambda x: x.visitedNodes)
# returned a DataFrame instead of a Series when the table held a single id,
# which made the grouped[x] lookup fail.
grouped = df.groupby("id")["visitedNodes"].apply(tuple)

# Give every row the full tuple of paths of its id; rows that differed only
# by their path are now identical and collapse to one.
df["visitedNodes"] = df["id"].map(grouped)
df = df.drop_duplicates()

In [10]:
# The node now appears once, with all of its paths gathered in one cell.
df.loc[df["name"] == "natural language processing"]

Unnamed: 0,id,name,visitedNodes
28,Q30642,natural language processing,"(((Q21198, Computer Science),), ((Q21198, Comp..."


And now we will enrich the table with several other properties:

In [11]:
getNewData = WikidataGetEntities()
dfDict = df.to_dict(orient="records")
dfUpdated = []

print("Enriching database. Start : " + str(datetime.now()))
for record in dfDict:
    # Merge whatever enrichTable returns for this row into the row itself
    # (the dicts in dfDict are updated in place and shared with dfUpdated).
    extra_fields = getNewData.enrichTable(record)
    record.update(extra_fields)
    dfUpdated.append(record)
print("Enriching database. End : " + str(datetime.now()))

Enriching database. Start : 2017-07-11 16:51:33.920598
Enriching database. End : 2017-07-11 16:51:51.263014


In [12]:
# Rebuild the table from the enriched records; properties a row lacks come
# out as NaN and are replaced by empty strings.
df = pd.DataFrame(dfUpdated).fillna("")

Make several columns human-readable:

In [13]:
# Labels already fetched through fillMissing, so that an id occurring in many
# cells is only looked up once.
_labelCache = {}


def fillMissing(c):
    """Return the English label found for identifier ``c``, or ``c`` itself.

    The label is read from ``getNewData.query(c)`` at
    ``["entities"][c]["labels"]["en"]["value"]``; if any level of that path
    is absent, ``c`` is returned unchanged. Results are memoised in
    ``_labelCache``.
    """
    if c not in _labelCache:
        _labelCache[c] = (
            getNewData.query(c)
            .get("entities", {})
            .get(c, {})
            .get("labels", {})
            .get("en", {})
            .get("value", c)
        )
    return _labelCache[c]


def makeHumanReadable(x):
    """Turn an iterable of ids into a tuple of ``(id, label)`` pairs.

    Labels come from ``Q2name`` when the id is known there and from
    ``fillMissing`` otherwise. The result has the same shape as the
    "visitedNodes" column, i.e. a flat tuple of pairs; the previous version
    had a misplaced bracket that wrapped every pair in a one-element list.

    NOTE(review): assumes each cell is an iterable of ids (the empty string
    left by fillna yields an empty tuple) -- confirm what enrichTable stores.
    """
    return tuple(
        (i, Q2name[i]) if i in Q2name else (i, fillMissing(i))
        for i in x
    )


# Property columns whose cells hold ids to be labelled.
for prop in ("P306", "P366", "P277", "P31", "P275",
             "P178", "P101", "P170", "P1324"):
    df[prop] = df[prop].apply(makeHumanReadable)

Make column labels human-readable:

In [14]:
# Append the label found for each column name to its header, e.g.
# "Pxxx (label)"; columns for which no label is found keep their name.
renamed = {}
for c in df.columns:
    label = fillMissing(c)
    renamed[c] = (c + " (" + label + ")") if label != c else c
df = df.rename(columns=renamed)

# Order the columns in reverse alphabetical order. DataFrame.reindex_axis was
# removed in pandas 1.0; reindex(columns=...) is its replacement.
df = df.reindex(columns=sorted(df.columns, reverse=True))

In [15]:
# Write the final table to an Excel workbook in the working directory.
df.to_excel(excel_writer="ComputerScienceTable.xlsx")

In [16]:
# Log the wall-clock time at which the run finishes.
print("Program ends at " + str(datetime.now()))

Program ends at 2017-07-11 16:52:15.342681
