In [2]:
from SPARQLWrapper import SPARQLWrapper2
import pandas as pds

## define function for converting sparql results to a dataframe

In [3]:
def sparql_to_df(results):
    """Convert SPARQLWrapper2 query results into a pandas DataFrame.

    One column is produced per query variable, in ``results.variables`` order,
    and one row per solution in ``results.bindings``. Each cell holds the
    ``.value`` of the term bound to that variable.

    The table is built solution by solution rather than by collecting each
    column independently, so a variable that is unbound in some solutions
    (e.g. one that only appears inside an OPTIONAL clause) yields ``None`` in
    those rows instead of producing columns of different lengths or shifting
    values into the wrong rows.

    Parameters
    ----------
    results : object
        Query result exposing ``variables`` (list of variable names) and
        ``bindings`` (list of dicts mapping a variable name to an object
        with a ``.value`` attribute), as returned by ``SPARQLWrapper2.query()``.

    Returns
    -------
    pandas.DataFrame
        Frame with one column per variable; empty (but with all columns)
        when the query returned no solutions.
    """
    variables = list(results.variables)
    solutions = results.bindings
    data = {
        variable: [
            solution[variable].value if variable in solution else None
            for solution in solutions
        ]
        for variable in variables
    }
    # Passing columns explicitly keeps the header even when there are no rows.
    return pds.DataFrame(data, columns=variables)

## define function for retrieving sparql query

In [4]:
def get_sparql(filename):
    """Return the complete text of the SPARQL file at ``filename``.

    The file is decoded explicitly as UTF-8 so that query text containing
    non-ASCII characters reads identically on every platform, instead of
    depending on the locale's default encoding.

    Parameters
    ----------
    filename : str or path-like
        Path of the query file to read.

    Returns
    -------
    str
        The file contents, unmodified.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. it does not exist).
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return f.read()

## set connection to local triple store

In [5]:
# Endpoint of the triple store repository that every query in this notebook is sent to.
# Alternate (RI server) endpoint: "http://10.16.128.24:7200/repositories/EDR_NO_ES"
endpoint_url = "http://192.168.1.243:7200/repositories/EDR_NO_ES"  # local server
sparql = SPARQLWrapper2(endpoint_url)

## define some standard prefixes to use in queries

In [6]:
# Read the shared prefix declarations once; they are prepended to each query built below.
prefixes_file = 'queries/prefixes.sparql'
prefixes = get_sparql(prefixes_file)

## load patient info: this is needed because patients have been filtered for ages > 0, and we want to join our results on these patients

In [7]:
# Tab-separated patient table; its patient_id column is what the joins below filter on.
patient_info_file = 'triplestore-patient-info.tsv'
patient_df = pds.read_csv(patient_info_file, sep='\t')

# Query/download data from triplestore

## find teeth (and their surfaces) that underwent a procedure

In [8]:
# Full query text = shared prefixes followed by the tooth/procedure query body.
tooth_procedure_query = get_sparql('queries/tooth-procedure-info.sparql')
q = prefixes + tooth_procedure_query

In [8]:
# Send the query text held in `q` through the `sparql` connection and tabulate the result.
sparql.setQuery(q)  # must come before query(): sets the text that will be submitted
res = sparql.query()
tooth_proc_df = sparql_to_df(res)  # one DataFrame column per query variable

In [9]:
# Drop exact duplicate rows in place, then display how many rows remain.
tooth_proc_df.drop_duplicates(inplace=True)
tooth_proc_df.shape[0]

1645416

## filter out invalid dates and convert datatypes (note: ignore warnings)

In [10]:
# Keep only rows whose event_date text begins with "19" or "20".
# na=False treats a missing date as invalid instead of propagating NaN into the mask,
# and .copy() makes the result an independent frame so that assigning columns to it
# afterwards does not raise pandas' SettingWithCopyWarning.
event_date_text = tooth_proc_df['event_date'].str
has_valid_century = (
    event_date_text.startswith('19', na=False)
    | event_date_text.startswith('20', na=False)
)
tooth_proc_df = tooth_proc_df[has_valid_century].copy()

In [11]:
# Parse the event_date strings into pandas datetime values and store them back in the same column.
parsed_event_dates = pds.to_datetime(tooth_proc_df['event_date'])
tooth_proc_df['event_date'] = parsed_event_dates

In [12]:
# Force the identifier and code/label columns to plain strings, one column at a time.
for column in ('patient_id', 'tooth_id', 'tooth_num', 'cdt_code', 'cdt_label'):
    tooth_proc_df[column] = tooth_proc_df[column].astype(str)

## join with patient info to filter out invalid patients

In [None]:
# Keep only the rows whose patient_id appears in patient_df (the join acts as a row filter).
# tooth_proc_df.patient_id holds strings, whereas read_csv may have inferred a numeric
# dtype for patient_df.patient_id; merging mismatched key dtypes raises a ValueError, so
# the key is cast to str here. It is also de-duplicated so that a patient listed more
# than once in patient_df cannot multiply the procedure rows.
valid_patients = patient_df[['patient_id']].astype(str).drop_duplicates()
tooth_proc_df = pds.merge(tooth_proc_df, valid_patients, how='inner', on='patient_id')
len(tooth_proc_df)

## save local copy of tooth/procedure info

In [13]:
# Write the tooth/procedure table as a tab-separated file, omitting the index column.
tooth_proc_tsv = 'triplestore-tooth-procedure-info.tsv'
tooth_proc_df.to_csv(tooth_proc_tsv, index=False, sep='\t')

## find surfaces restored by procedures

In [11]:
# Full query text = shared prefixes followed by the procedure/surface query body.
procedure_surface_query = get_sparql('queries/procedure-surface-info.sparql')
q = prefixes + procedure_surface_query

In [13]:
# Send the query text held in `q` through the `sparql` connection and tabulate the result.
sparql.setQuery(q)  # must come before query(): sets the text that will be submitted
res = sparql.query()
proc_surface_df = sparql_to_df(res)  # one DataFrame column per query variable

In [9]:
# Drop exact duplicate rows in place, then display how many rows remain.
proc_surface_df.drop_duplicates(inplace=True)
proc_surface_df.shape[0]

2626650

## join with patient info to filter out invalid patients

In [None]:
# Keep only the rows whose patient_id appears in patient_df (the join acts as a row filter).
# Values produced by sparql_to_df are strings, whereas read_csv may have inferred a
# numeric dtype for patient_df.patient_id; merging mismatched key dtypes raises a
# ValueError, so the key is cast to str here. It is also de-duplicated so that a patient
# listed more than once in patient_df cannot multiply the procedure/surface rows.
valid_patients = patient_df[['patient_id']].astype(str).drop_duplicates()
proc_surface_df = pds.merge(proc_surface_df, valid_patients, how='inner', on='patient_id')
len(proc_surface_df)

## save local copy of procedure/surface info

In [10]:
# Write the procedure/surface table as a tab-separated file, omitting the index column.
proc_surface_tsv = 'triplestore-procedure-surface-info.tsv'
proc_surface_df.to_csv(proc_surface_tsv, index=False, sep='\t')

## find tooth type metadata for each tooth

In [15]:
# Full query text = shared prefixes followed by the tooth-type query body.
tooth_type_query = get_sparql('queries/tooth-type-info.sparql')
q = prefixes + tooth_type_query

In [16]:
# Send the query text held in `q` through the `sparql` connection and tabulate the result.
sparql.setQuery(q)  # must come before query(): sets the text that will be submitted
res = sparql.query()
tooth_type_df = sparql_to_df(res)  # one DataFrame column per query variable

In [17]:
# NOTE(review): this re-assigns tooth_type_df from a TSV on disk, discarding the query
# result built in the previous cell. The file name is the same one this notebook writes
# in its "save local copy of tooth type metadata info" cell, so on a fresh run the file
# may not exist yet. Confirm whether this cell is a leftover shortcut that should be removed.
tooth_type_df = pds.read_csv('triplestore-tooth-type-info.tsv', sep='\t')

In [13]:
# Drop exact duplicate rows in place, then display how many rows remain.
tooth_type_df.drop_duplicates(inplace=True)
tooth_type_df.shape[0]

1210430

## add maxillary (teeth 1-16) / mandibular (teeth 17-32) metadata

In [14]:
# Make tooth_num an integer column, then derive 0/1 arch flags from it:
# numbers below 17 are flagged maxillary, numbers above 16 mandibular.
tooth_number = tooth_type_df['tooth_num'].astype(int)
tooth_type_df['tooth_num'] = tooth_number
tooth_type_df['maxillary'] = (tooth_number < 17).astype('int64')
tooth_type_df['mandibular'] = (tooth_number > 16).astype('int64')

## add left/right metadata

In [15]:
# Derive 0/1 side flags from the tooth number: 9 through 24 are flagged left,
# everything below 9 or above 24 is flagged right.
tooth_number = tooth_type_df['tooth_num']
is_left = (tooth_number > 8) & (tooth_number < 25)
is_right = (tooth_number < 9) | (tooth_number > 24)
tooth_type_df['left'] = is_left.astype('int64')
tooth_type_df['right'] = is_right.astype('int64')

## join with patient info to filter out invalid patients

In [None]:
# Keep only the rows whose patient_id appears in patient_df (the join acts as a row filter).
# tooth_type_df may hold string ids (from the SPARQL query) or ids whose dtype was
# inferred by read_csv, and the same is true of patient_df; merging mismatched key
# dtypes raises a ValueError, so the key is cast to str on both sides. The patient key
# is also de-duplicated so that a patient listed more than once in patient_df cannot
# multiply the tooth rows.
valid_patients = patient_df[['patient_id']].astype(str).drop_duplicates()
tooth_type_df = pds.merge(
    tooth_type_df.astype({'patient_id': str}),
    valid_patients,
    how='inner',
    on='patient_id',
)
len(tooth_type_df)

## save local copy of tooth type metadata info

In [16]:
# Write the tooth-type table as a tab-separated file, omitting the index column.
tooth_type_tsv = 'triplestore-tooth-type-info.tsv'
tooth_type_df.to_csv(tooth_type_tsv, index=False, sep='\t')

## find materials used in crowns (including onlays), fillings (including inlays), and veneers

In [22]:
# Full query text = shared prefixes followed by the procedure/material query body.
proc_material_query = get_sparql('queries/proc-material-info.sparql')
q = prefixes + proc_material_query

In [23]:
# Send the query text held in `q` through the `sparql` connection and tabulate the result.
sparql.setQuery(q)  # must come before query(): sets the text that will be submitted
res = sparql.query()
proc_material_df = sparql_to_df(res)  # one DataFrame column per query variable

In [19]:
# Drop exact duplicate rows in place, then display how many rows remain.
proc_material_df.drop_duplicates(inplace=True)
proc_material_df.shape[0]

1686822

## join with patient info to filter out invalid patients

In [None]:
# Keep only the rows whose patient_id appears in patient_df (the join acts as a row filter).
# Values produced by sparql_to_df are strings, whereas read_csv may have inferred a
# numeric dtype for patient_df.patient_id; merging mismatched key dtypes raises a
# ValueError, so the key is cast to str here. It is also de-duplicated so that a patient
# listed more than once in patient_df cannot multiply the procedure/material rows.
valid_patients = patient_df[['patient_id']].astype(str).drop_duplicates()
proc_material_df = pds.merge(proc_material_df, valid_patients, how='inner', on='patient_id')
len(proc_material_df)

## save local copy of procedure/material info

In [20]:
# Write the procedure/material table as a tab-separated file, omitting the index column.
proc_material_tsv = 'triplestore-proc-material-info.tsv'
proc_material_df.to_csv(proc_material_tsv, index=False, sep='\t')