In [1]:
import numpy as np
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
from apyori import apriori
from mlxtend.preprocessing import TransactionEncoder
from tqdm import tqdm

In [2]:
queryString = "SELECT * WHERE { ?s ?p ?o. }"
sparql = SPARQLWrapper("http://localhost:3030/taxon/sparql")
sparql.setQuery(queryString)

try :
   ret = sparql.query()
   # ret is a stream with the results in XML, see <http://www.w3.org/TR/rdf-sparql-XMLres/>
except :
   pass

In [3]:
sparql.setQuery("""
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
SELECT ?s (COUNT(?prop) AS ?total) {
  
  SELECT DISTINCT ?s ?prop
    WHERE {
    ?s wdt:P31 wd:Q16521 .
    ?s ?prop ?value .
    }

} GROUP BY ?s
ORDER BY DESC(?total) 
""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

results

{'head': {'vars': ['s', 'total']},
 'results': {'bindings': [{'s': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q140'},
    'total': {'type': 'literal',
     'datatype': 'http://www.w3.org/2001/XMLSchema#integer',
     'value': '27'}},
   {'s': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q152'},
    'total': {'type': 'literal',
     'datatype': 'http://www.w3.org/2001/XMLSchema#integer',
     'value': '22'}},
   {'s': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q14443'},
    'total': {'type': 'literal',
     'datatype': 'http://www.w3.org/2001/XMLSchema#integer',
     'value': '18'}},
   {'s': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q19075'},
    'total': {'type': 'literal',
     'datatype': 'http://www.w3.org/2001/XMLSchema#integer',
     'value': '16'}},
   {'s': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q19078'},
    'total': {'type': 'literal',
     'datatype': 'http://www.w3.org/2001/XMLSchema#integer',
   

In [4]:
res = []
for results in results["results"]["bindings"]:
    print('%s: %s' % (results["s"]["value"], results["total"]["value"]))
    entity = str(results["s"]["value"]).split('/')
    res.append(entity[-1])
print('---------------------------')

http://www.wikidata.org/entity/Q140: 27
http://www.wikidata.org/entity/Q152: 22
http://www.wikidata.org/entity/Q14443: 18
http://www.wikidata.org/entity/Q19075: 16
http://www.wikidata.org/entity/Q19078: 16
http://www.wikidata.org/entity/Q19070: 15
http://www.wikidata.org/entity/Q5176: 15
http://www.wikidata.org/entity/Q14272: 14
http://www.wikidata.org/entity/Q5174: 14
http://www.wikidata.org/entity/Q5182: 14
http://www.wikidata.org/entity/Q10630: 13
http://www.wikidata.org/entity/Q10639: 13
http://www.wikidata.org/entity/Q10642: 13
http://www.wikidata.org/entity/Q10631: 12
http://www.wikidata.org/entity/Q10635: 12
http://www.wikidata.org/entity/Q10637: 11
http://www.wikidata.org/entity/Q10669: 9
http://www.wikidata.org/entity/Q14532: 8
---------------------------


In [5]:
db = []

for i in range(len(res)):
    query_string = """
    PREFIX wd: <http://www.wikidata.org/entity/>
    SELECT DISTINCT ?s ?prop {
    VALUES ?s {wd:""" + res[i] + """}
    ?s ?prop ?value .
    }
    """

    sparql.setQuery(query_string)
    sparql.setReturnFormat(JSON)
    results_entity = sparql.query().convert()
    propLabel = []
    for results in results_entity["results"]["bindings"]:
#         print('%s: %s' % (results["country"]["value"], results["propLabel"]["value"]))
        propLabel.append(results["prop"]["value"])
#     print('---------------------------')
    db.append(propLabel)

In [6]:
te = TransactionEncoder()
te_ary = te.fit(db).transform(db)
df = pd.DataFrame(te_ary, columns=te.columns_)
df

Unnamed: 0,http://www.wikidata.org/prop/P105,http://www.wikidata.org/prop/P1343,http://www.wikidata.org/prop/P141,http://www.wikidata.org/prop/P1421,http://www.wikidata.org/prop/P1552,http://www.wikidata.org/prop/P171,http://www.wikidata.org/prop/P18,http://www.wikidata.org/prop/P181,http://www.wikidata.org/prop/P1843,http://www.wikidata.org/prop/P1889,...,http://www.wikidata.org/prop/direct/P2579,http://www.wikidata.org/prop/direct/P279,http://www.wikidata.org/prop/direct/P2974,http://www.wikidata.org/prop/direct/P31,http://www.wikidata.org/prop/direct/P3512,http://www.wikidata.org/prop/direct/P361,http://www.wikidata.org/prop/direct/P427,http://www.wikidata.org/prop/direct/P5008,http://www.wikidata.org/prop/direct/P5125,http://www.wikidata.org/prop/direct/P910
0,True,False,True,False,False,True,True,True,True,False,...,False,True,False,True,False,False,False,False,False,True
1,True,False,False,False,False,True,True,False,False,False,...,True,True,False,True,True,False,False,True,True,True
2,True,False,True,False,False,True,True,False,True,False,...,False,True,False,True,False,False,False,False,False,True
3,True,False,True,False,False,True,True,False,True,False,...,False,False,False,True,False,False,False,False,False,True
4,True,False,False,False,False,True,True,False,False,True,...,False,False,True,True,False,True,False,False,False,True
5,True,False,False,True,False,True,True,False,True,False,...,False,False,False,True,False,False,True,False,False,True
6,True,False,False,False,True,True,True,False,True,False,...,False,False,False,True,False,False,False,False,False,True
7,True,False,False,False,False,True,True,False,True,False,...,False,False,False,True,False,False,True,False,False,True
8,True,False,False,False,False,True,True,False,False,False,...,False,False,False,True,False,False,False,False,False,True
9,True,True,False,False,False,True,True,False,False,False,...,False,False,False,True,False,False,False,False,False,True


In [7]:
wikidata = SPARQLWrapper("https://query.wikidata.org/sparql")

propList = df.columns.tolist()
for i in range(len(propList)):
    propList[i]=propList[i].split('/')[-1]

In [8]:
propLabel = []

for i in tqdm(range(len(propList))):
    query_string = """
    SELECT DISTINCT ?propLabel {
      VALUES ?p {wdt:""" + propList[i] + """}
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en". } 
      ?prop wikibase:directClaim ?p .
    }
    """

    wikidata.setQuery(query_string)
    wikidata.setReturnFormat(JSON)
    results_prop = wikidata.query().convert()
    for results in results_prop["results"]["bindings"]:
#         print('%s: %s' % (results["country"]["value"], results["propLabel"]["value"]))
        propLabel.append(results["propLabel"]["value"])
#     print('---------------------------')

100%|██████████| 53/53 [00:26<00:00,  1.97it/s]


In [9]:
df.columns = propLabel
df

Unnamed: 0,taxon rank,described by source,IUCN conservation status,GRIN URL,has quality,parent taxon,image,taxon range map image,taxon common name,different from,...,studied by,subclass of,habitat,instance of,means of locomotion,part of,taxonomic type,on focus list of Wikimedia project,Wikimedia outline,topic's main category
0,True,False,True,False,False,True,True,True,True,False,...,False,True,False,True,False,False,False,False,False,True
1,True,False,False,False,False,True,True,False,False,False,...,True,True,False,True,True,False,False,True,True,True
2,True,False,True,False,False,True,True,False,True,False,...,False,True,False,True,False,False,False,False,False,True
3,True,False,True,False,False,True,True,False,True,False,...,False,False,False,True,False,False,False,False,False,True
4,True,False,False,False,False,True,True,False,False,True,...,False,False,True,True,False,True,False,False,False,True
5,True,False,False,True,False,True,True,False,True,False,...,False,False,False,True,False,False,True,False,False,True
6,True,False,False,False,True,True,True,False,True,False,...,False,False,False,True,False,False,False,False,False,True
7,True,False,False,False,False,True,True,False,True,False,...,False,False,False,True,False,False,True,False,False,True
8,True,False,False,False,False,True,True,False,False,False,...,False,False,False,True,False,False,False,False,False,True
9,True,True,False,False,False,True,True,False,False,False,...,False,False,False,True,False,False,False,False,False,True


In [10]:
from mlxtend.frequent_patterns import association_rules, fpmax, fpgrowth

In [11]:
frequent_itemsets = fpgrowth(df, min_support=0.5, use_colnames=True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,1.000000,(instance of)
1,1.000000,(parent taxon)
2,1.000000,(instance of)
3,1.000000,(taxon name)
4,1.000000,(parent taxon)
...,...,...
2554,0.722222,"(taxon rank, parent taxon, image, instance of,..."
2555,0.722222,"(taxon rank, parent taxon, image, instance of,..."
2556,0.722222,"(taxon rank, parent taxon, image, instance of,..."
2557,0.722222,"(taxon rank, parent taxon, image, instance of,..."


In [12]:
res = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
res

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(parent taxon),(instance of),1.000000,1.000000,1.000000,1.000000,1.000000,0.000000,inf
1,(instance of),(parent taxon),1.000000,1.000000,1.000000,1.000000,1.000000,0.000000,inf
2,(instance of),(taxon name),1.000000,1.000000,1.000000,1.000000,1.000000,0.000000,inf
3,(taxon name),(instance of),1.000000,1.000000,1.000000,1.000000,1.000000,0.000000,inf
4,(parent taxon),(taxon name),1.000000,1.000000,1.000000,1.000000,1.000000,0.000000,inf
...,...,...,...,...,...,...,...,...,...
2535,"(taxon name, topic's main category)","(taxon rank, Commons category, parent taxon, i...",0.777778,0.666667,0.666667,0.857143,1.285714,0.148148,2.333333
2536,(taxon rank),"(Commons category, parent taxon, image, instan...",0.944444,0.666667,0.666667,0.705882,1.058824,0.037037,1.133333
2537,(Commons category),"(taxon rank, parent taxon, image, instance of,...",0.777778,0.722222,0.666667,0.857143,1.186813,0.104938,1.944444
2538,(image),"(taxon rank, Commons category, parent taxon, i...",0.944444,0.722222,0.666667,0.705882,0.977376,-0.015432,0.944444
