In [1]:
from SPARQLWrapper import SPARQLWrapper2 # SPARQLWrapper2 works with a JSON SELECT return result only
import pandas as pds

## define function for converting sparql results to a dataframe

In [2]:
def sparql_to_df(results):
    """Convert SPARQLWrapper2 SELECT results into a pandas DataFrame.

    The frame has one column per name in ``results.variables`` (in that
    order) and one row per solution in ``results.bindings``.

    Rows are built solution by solution so that every column has the same
    length and stays aligned. A variable that is unbound in a given solution
    (e.g. it comes from an OPTIONAL clause) is stored as ``None`` for that
    row instead of being skipped, which would otherwise shift or shorten
    that column relative to the others.

    Parameters
    ----------
    results : object
        Query result exposing ``variables`` (list of variable names) and
        ``bindings`` (list of dicts mapping a variable name to an object
        with a ``.value`` attribute), as returned by ``SPARQLWrapper2.query()``.

    Returns
    -------
    pandas.DataFrame
        The bound values as a table; unbound cells are ``None``.
    """
    data = {
        variable: [
            binding[variable].value if variable in binding else None
            for binding in results.bindings
        ]
        for variable in results.variables
    }
    return pds.DataFrame(data)

## define function for retrieving sparql query

In [3]:
def get_sparql(filename):
    """Read a SPARQL query from disk and return its full text as a string.

    Parameters
    ----------
    filename : str
        Path of the text file holding the query.

    Returns
    -------
    str
        The complete, unmodified contents of the file.
    """
    with open(filename, 'r') as query_file:
        return query_file.read()

## set connection to local triple store

In [4]:
# Shared query client bound to the EDR_NO_ES repository endpoint; the query cells below reuse `sparql`.
sparql = SPARQLWrapper2("http://192.168.1.243:7200/repositories/EDR_NO_ES") # local server
# Alternate endpoint for the same repository; swap with the line above to target it.
# sparql = SPARQLWrapper2("http://10.16.128.24:7200/repositories/EDR_NO_ES") # RI server

## test getting some results

In [5]:
# Smoke test: read the test query from disk, run it against the endpoint,
# and display the result as a DataFrame (last expression of the cell).
q = get_sparql('queries/test-query.sparql')
sparql.setQuery(q)
res = sparql.query()
sparql_to_df(res)

Unnamed: 0,s,p,o
0,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...
1,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://www.w3.org/2002/07/owl#Thing
2,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://www.w3.org/2002/07/owl#ObjectProperty
3,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://www.w3.org/2000/01/rdf-schema#Class
4,http://www.w3.org/1999/02/22-rdf-syntax-ns#Pro...,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://www.w3.org/2002/07/owl#Class


## define some standard prefixes to use in queries

In [6]:
# Prefix text read from disk; it is prepended to the patient query text below.
prefixes = get_sparql('queries/prefixes.sparql')
# Uncomment to inspect the loaded prefix text:
# print(prefixes)

## find patients that had a procedure

In [7]:
# Build the patient query: the shared prefix text followed by the query body read from disk.
q = prefixes + get_sparql('queries/patient-info.sparql')
# Uncomment to inspect the assembled query text:
# print(q)

In [8]:
# Run the patient query on the shared client and convert the result into `patient_df`.
sparql.setQuery(q)
res = sparql.query()
patient_df = sparql_to_df(res)

In [15]:
# Remove exact duplicate rows. Rebind the name instead of mutating in place
# (`inplace=True`), so the cell does not alter the frame object produced by
# the query cell and can be composed with further chained calls.
patient_df = patient_df.drop_duplicates()
len(patient_df)

216905

## calculate patients' ages as of 2017; include only patients with age > 0

In [16]:
## calculate age of patient as of the reference year
# Age is the difference between calendar years only; the month and day of
# birth are not taken into account.
REFERENCE_YEAR = 2017
# Parse `dob` once and reuse the parsed column. Bracket indexing is used for
# the assignment so it always targets the column rather than an attribute.
patient_df['dob'] = pds.to_datetime(patient_df['dob'])
patient_df['patient_age'] = REFERENCE_YEAR - patient_df['dob'].dt.year

In [17]:
# keep only patients whose age is strictly positive (ages of 0 or below are dropped)
patient_df = patient_df.loc[patient_df['patient_age'] > 0]
len(patient_df)

216903

## save local copy of patient info

In [12]:
# Write the patient frame to a tab-separated file, omitting the index column.
# NOTE(review): this cell's execution count (12) is lower than those of the
# dedupe/age cells (15-17) — confirm it was re-run after them so the saved
# file reflects the cleaned frame.
patient_df.to_csv('triplestore-patient-info.tsv', sep='\t', index=False)