In [1]:
from SPARQLWrapper import SPARQLWrapper2 # SPARQLWrapper2 works with a JSON SELECT return result only
import pandas as pds

## define function for converting sparql results to a dataframe

In [3]:
def sparql_to_df(results):
    """Convert a SPARQLWrapper2 SELECT result into a pandas DataFrame.

    Parameters
    ----------
    results
        Object returned by ``SPARQLWrapper2.query()``. Two attributes are
        read: ``variables`` (the selected variable names) and ``bindings``
        (one dict per result row, mapping a variable name to an object
        whose ``.value`` holds the bound value).

    Returns
    -------
    pandas.DataFrame
        One column per selected variable, in the order given by
        ``results.variables``, and one row per result row. A variable that
        is not bound in a given row (e.g. it comes from an OPTIONAL
        pattern) is stored as ``None`` in that row.
    """
    # Walk the rows directly instead of calling results.getValues(): a
    # per-row lookup lets an unbound variable become None for that row
    # rather than making the whole column unavailable.
    data = {
        variable: [
            row[variable].value if variable in row else None
            for row in results.bindings
        ]
        for variable in results.variables
    }
    return pds.DataFrame(data)

## set connection to the triple store (RI server; the local server URL is kept as a comment)

In [10]:
# Repository endpoint of the triple store that all later queries run against.
# To use the local server instead, set the URL to
# "http://192.168.1.243:7200/repositories/EDR_NO_ES".
endpoint_url = "http://10.16.128.24:7200/repositories/EDR_NO_ES" # RI server
sparql = SPARQLWrapper2(endpoint_url)

## test getting some results

In [11]:
# Smoke test: ask the repository for any five triples.
q = "select ?s ?p ?o where { ?s ?p ?o } limit 5"
sparql.setQuery(q)
res = sparql.query()
# Last expression of the cell, so the resulting DataFrame is displayed.
sparql_to_df(res)

Unnamed: 0,s,p,o
0,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...
1,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://www.w3.org/2002/07/owl#Thing
2,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://www.w3.org/2002/07/owl#ObjectProperty
3,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://www.w3.org/2000/01/rdf-schema#Class
4,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://www.w3.org/2002/07/owl#Class


## define some standard prefixes to use in queries

In [16]:
# SPARQL PREFIX declarations shared by the queries in this notebook; the
# string is interpolated at the top of a query through its `%s` placeholder.
# Apart from `obo:`, `rdfs:`, `owl:` and `sesame:` (namespace prefixes), each
# prefix here is bound to one complete IRI, so a query refers to that IRI
# with an empty local part, e.g. `patient:` or `part_of:`.
prefixes = """
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX part_of: <http://purl.obolibrary.org/obo/BFO_0000050>
PREFIX has_part: <http://purl.obolibrary.org/obo/BFO_0000051>
PREFIX material: <http://purl.obolibrary.org/obo/OHD_0000000>
PREFIX tooth: <http://purl.obolibrary.org/obo/FMA_12516>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX ada_num: <http://purl.obolibrary.org/obo/OHD_0000065>
PREFIX patient: <http://purl.obolibrary.org/obo/OHD_0000012>
PREFIX male_patient: <http://purl.obolibrary.org/obo/OHD_0000054>
PREFIX female_patient: <http://purl.obolibrary.org/obo/OHD_0000049>
PREFIX provider: <http://purl.obolibrary.org/obo/OHD_0000051>
PREFIX procedure: <http://purl.obolibrary.org/obo/OHD_0000002>
PREFIX restoration_procedure: <http://purl.obolibrary.org/obo/OHD_0000004>
PREFIX participates_in: <http://purl.obolibrary.org/obo/BFO_0000056>
PREFIX has_participant: <http://purl.obolibrary.org/obo/BFO_0000057>
PREFIX has_specified_input: <http://purl.obolibrary.org/obo/OBI_0000293>
PREFIX has_specified_output: <http://purl.obolibrary.org/obo/OBI_0000299>
PREFIX sesame: <http://www.openrdf.org/schema/sesame#>
PREFIX birth_date: <http://purl.obolibrary.org/obo/OHD_0000050>
PREFIX npbrn_id: <http://purl.obolibrary.org/obo/OHD_0000273>
"""

## find patients that had a procedure

In [17]:
# Query for patients that have at least one tooth used as the specified input
# of a procedure. Pattern, in order:
#   - ?patient_i is a patient with a birth date, an npbrn id, and a
#     sesame:directType that is one of the two gender classes listed in the
#     `values` clause (OHD_0000049 / OHD_0000054);
#   - ?tooth_i is a tooth that is part_of that patient;
#   - ?proc_i is a procedure with that tooth as has_specified_input;
#   - blank-node patients are filtered out.
# The `bind` clauses derive the output columns from the matched values:
#   patient_id = text after "patient/" in the patient IRI,
#   dob        = text before "T" in the birth date,
#   practice   = text after "NDPBRN practice " in the npbrn id,
#   gender     = "female" when the type is OHD_0000049, otherwise "male".
# The leading `%s` is replaced with the shared `prefixes` string.
q = """
%s
select distinct ?patient_id ?gender ?dob ?practice where {
    values ?gender_type {obo:OHD_0000049 obo:OHD_0000054}
    ?patient_i a patient:;
               birth_date: ?bdate;
               npbrn_id: ?npbrn_id;
             sesame:directType ?gender_type .
    ?tooth_i a tooth:;
           part_of: ?patient_i .
    ?proc_i a procedure:;
          has_specified_input: ?tooth_i .
    filter (!isblank(?patient_i))

    bind(strafter(str(?patient_i), "patient/") as ?patient_id)
    bind(strbefore(str(?bdate), "T") as ?dob)
    bind(strafter(str(?npbrn_id), "NDPBRN practice ") as ?practice)
    bind (if(?gender_type = obo:OHD_0000049, "female", "male") as ?gender )
}
""" % prefixes
# print(q)

In [66]:
# Send the query currently held in `q` to the endpoint and convert the
# result into a DataFrame with one column per selected variable.
sparql.setQuery(q)
res = sparql.query()
patient_df = sparql_to_df(res)

In [90]:
# Removes fully duplicated rows from patient_df in place.
patient_df.drop_duplicates(inplace=True) # make sure there are no duplicates
# Row count after de-duplication (displayed as the cell output).
len(patient_df)

226800

## calculate patient ages as of 2017; include only patients with age > 0

In [305]:
## calculate age of patient as of 2017
# Year that ages are measured against.
REFERENCE_YEAR = 2017
# Parse the dob strings into datetimes once; the parsed column is reused below.
# Bracket assignment is used so the column is always targeted explicitly.
patient_df['dob'] = pds.to_datetime(patient_df['dob'])
# Age is a plain difference of calendar years; month and day of birth are ignored.
patient_df['patient_age'] = REFERENCE_YEAR - patient_df['dob'].dt.year

In [306]:
# keep only patients whose age is strictly greater than 0
# (patients with an age of 0 or less are dropped)
# NOTE(review): rows with a missing patient_age should also fail this
# comparison and be dropped -- confirm that is intended.
patient_df = patient_df.query('patient_age > 0')

## save local copy of patient info

In [307]:
# Write the patient table as a tab-separated file in the working directory.
output_path = 'triplestore-patient-info.tsv'
patient_df.to_csv(output_path, sep='\t')