In [1]:
from SPARQLWrapper import SPARQLWrapper2
import pandas as pds

## define function for converting sparql results to a dataframe

In [3]:
def sparql_to_df(results):
    """Convert SPARQLWrapper2 query results into a pandas DataFrame.

    Parameters
    ----------
    results : object
        Result object exposing ``variables`` (the selected variable names, in
        order) and ``bindings`` (one dict per result row, mapping a variable
        name to an object with a ``.value`` attribute), as returned by
        ``SPARQLWrapper2.query()``.

    Returns
    -------
    pandas.DataFrame
        One column per variable and one row per binding. A variable that is
        unbound in a given row is stored as ``None``.
    """
    variables = list(results.variables or [])
    # Build every column from the same row-wise bindings so that all columns
    # have equal length and stay row-aligned. Collecting each variable on its
    # own (``getValues``) skips the rows in which that variable is unbound,
    # which produces ragged (ValueError) or silently shifted columns.
    data = {
        variable: [
            row[variable].value if variable in row else None
            for row in results.bindings
        ]
        for variable in variables
    }
    return pds.DataFrame(data, columns=variables)

## set connection to the triple store (local server or RI server)

In [5]:
# Alternative endpoint (local server): http://192.168.1.243:7200/repositories/EDR_NO_ES
endpoint_url = "http://10.16.128.24:7200/repositories/EDR_NO_ES"  # RI server
# Wrapper object through which every query in this notebook is submitted.
sparql = SPARQLWrapper2(endpoint_url)

## define some standard prefixes to use in queries

In [6]:
# SPARQL PREFIX declarations that are prepended to every query in this
# notebook. Each prefix expands to one complete IRI, so a term is written in a
# query as the bare prefix followed by a colon (e.g. ``tooth:``).
# ``restored_tooh:`` is a misspelling; it is kept as an alias so existing
# queries keep working, and the correctly spelled ``restored_tooth:`` is
# declared next to it for the same IRI.
prefixes = """
PREFIX obo: <http://purl.obolibrary.org/obo/>
PREFIX part_of: <http://purl.obolibrary.org/obo/BFO_0000050>
PREFIX has_part: <http://purl.obolibrary.org/obo/BFO_0000051>
PREFIX material: <http://purl.obolibrary.org/obo/OHD_0000000>
PREFIX tooth: <http://purl.obolibrary.org/obo/FMA_12516>
PREFIX restored_tooh: <http://purl.obolibrary.org/obo/OHD_0000189>
PREFIX restored_tooth: <http://purl.obolibrary.org/obo/OHD_0000189>
PREFIX surface: <http://purl.obolibrary.org/obo/FMA_no_fmaid_Surface_enamel_of_tooth>
PREFIX restored_surface: <http://purl.obolibrary.org/obo/OHD_0000208>
PREFIX restored_buccal: <http://purl.obolibrary.org/obo/OHD_0000222>
PREFIX restored_distal: <http://purl.obolibrary.org/obo/OHD_0000223>
PREFIX restored_mesial: <http://purl.obolibrary.org/obo/OHD_0000227>
PREFIX restored_labial: <http://purl.obolibrary.org/obo/OHD_0000225>
PREFIX restored_lingual: <http://purl.obolibrary.org/obo/OHD_0000226>
PREFIX restored_incisal: <http://purl.obolibrary.org/obo/OHD_0000224>
PREFIX restored_occlusal: <http://purl.obolibrary.org/obo/OHD_0000228>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX ada_num: <http://purl.obolibrary.org/obo/OHD_0000065>
PREFIX patient: <http://purl.obolibrary.org/obo/OHD_0000012>
PREFIX male_patient: <http://purl.obolibrary.org/obo/OHD_0000054>
PREFIX female_patient: <http://purl.obolibrary.org/obo/OHD_0000049>
PREFIX provider: <http://purl.obolibrary.org/obo/OHD_0000051>
PREFIX procedure: <http://purl.obolibrary.org/obo/OHD_0000002>
PREFIX restoration_procedure: <http://purl.obolibrary.org/obo/OHD_0000004>
PREFIX crown_procedure: <http://purl.obolibrary.org/obo/OHD_0000033>
PREFIX filling_procedure: <http://purl.obolibrary.org/obo/OHD_0000006>
PREFIX veneer_procedure: <http://purl.obolibrary.org/obo/OHD_0000027>
PREFIX endodontic_procedure: <http://purl.obolibrary.org/obo/OHD_0000003>
PREFIX endodontic_restorative_procedure:<http://purl.obolibrary.org/obo/OHD_0000242>
PREFIX root_canal_treatment: <http://purl.obolibrary.org/obo/OHD_0000230>
PREFIX tooth_extraction: <http://purl.obolibrary.org/obo/OHD_0000057>
PREFIX participates_in: <http://purl.obolibrary.org/obo/BFO_0000056>
PREFIX has_participant: <http://purl.obolibrary.org/obo/BFO_0000057>
PREFIX has_specified_input: <http://purl.obolibrary.org/obo/OBI_0000293>
PREFIX has_specified_output: <http://purl.obolibrary.org/obo/OBI_0000299>
PREFIX sesame: <http://www.openrdf.org/schema/sesame#>
PREFIX birth_date: <http://purl.obolibrary.org/obo/OHD_0000050>
PREFIX npbrn_id: <http://purl.obolibrary.org/obo/OHD_0000273>
PREFIX occur_date: <http://purl.obolibrary.org/obo/OHD_0000015>
PREFIX cdt_code: <http://purl.obolibrary.org/obo/CDT_1000001>
PREFIX is_about: <http://purl.obolibrary.org/obo/IAO_0000136>
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX anterior_tooth: <http://purl.obolibrary.org/obo/OHD_0000307>
PREFIX posterior_tooth: <http://purl.obolibrary.org/obo/OHD_0000308>
PREFIX incisor: <http://purl.obolibrary.org/obo/FMA_12823>
PREFIX canine: <http://purl.obolibrary.org/obo/FMA_55636>
PREFIX premolar: <http://purl.obolibrary.org/obo/FMA_55637>
PREFIX molar: <http://purl.obolibrary.org/obo/FMA_55638>
"""

# Query/download data from triplestore

## find teeth (and their surfaces) that underwent a procedure

In [7]:
# Query text for the tooth/procedure table. The shared PREFIX block is
# substituted for the leading %s.
# Pattern: a patient, a tooth that is part of that patient, a procedure whose
# specified input is that tooth, and a CDT code instance that is about the
# procedure. Blank-node patients, teeth and procedures are filtered out.
# Projected columns:
#   patient_id / tooth_id / proc_id - text after "patient/", "tooth/" and
#                                     "procedure/" in the respective IRIs
#   tooth_num  - text after "Tooth " in the tooth type's ada_num value
#   event      - label of the procedure's direct type, as a plain string
#   event_date - occur_date up to (not including) the first "T"
#   cdt_code / cdt_label - identifier and label of the code's direct type
q = """
%s
select 
    ?patient_id
    ?tooth_id
    ?tooth_num
    ?proc_id
    ?event
    ?event_date
    ?cdt_code
    ?cdt_label
where {
    ?patient_i a patient: .
    ?tooth_i a tooth:;
             part_of: ?patient_i;
             sesame:directType ?tooth_t .
    ?tooth_t ada_num: ?ada_num .
            
    ?proc_i a procedure:;
            sesame:directType ?proc_t;
            has_specified_input: ?tooth_i;
            occur_date: ?occur_date .
    ?proc_t rdfs:label ?proc_label .
    
    ?code_i a cdt_code:;
               	sesame:directType ?code_t;
                is_about: ?proc_i .
    
    ?code_t dc:identifier ?cdt_code;
            obo:cdt_label ?cdt_label .
    
    
    filter (!isblank(?patient_i))
    filter (!isblank(?tooth_i))
    filter (!isblank(?proc_i))
    
    bind(strafter(str(?patient_i), "patient/") as ?patient_id)
    bind(strafter(str(?tooth_i), "tooth/") as ?tooth_id)
    bind(strafter(str(?proc_i), "procedure/") as ?proc_id)
    bind(str(?proc_label) as ?event) # removes the laguage tags
    bind(strafter(?ada_num, "Tooth ") as ?tooth_num)
    bind(strbefore(str(?occur_date), "T") as ?event_date)
    
} 
""" % prefixes
# print(q)

In [8]:
# Submit the tooth/procedure query text held in q and turn the returned
# bindings into a DataFrame (one column per selected variable).
sparql.setQuery(q)
res = sparql.query()
tooth_proc_df = sparql_to_df(res)

In [9]:
# Collapse rows that are exact duplicates, then display the remaining row count.
tooth_proc_df = tooth_proc_df.drop_duplicates()
len(tooth_proc_df)

1645416

## filter out invalid dates and convert datatypes (note: ignore warnings)

In [10]:
# Keep only rows whose event_date text starts with "19" or "20".
# na=False makes a missing event_date count as "no match" instead of putting
# NaN into the boolean mask (which boolean indexing rejects).
has_valid_century = (
    tooth_proc_df.event_date.str.startswith('19', na=False)
    | tooth_proc_df.event_date.str.startswith('20', na=False)
)
# .copy() gives the filtered rows their own data, so the column assignments
# that follow modify this frame directly and do not trigger pandas'
# SettingWithCopyWarning.
tooth_proc_df = tooth_proc_df[has_valid_century].copy()

In [11]:
# Parse the event_date strings into pandas datetime values.
tooth_proc_df['event_date'] = pds.to_datetime(tooth_proc_df['event_date'])

In [12]:
# Cast the identifier, tooth-number and CDT columns to str, one column at a time.
for text_column in ('patient_id', 'tooth_id', 'tooth_num', 'cdt_code', 'cdt_label'):
    tooth_proc_df[text_column] = tooth_proc_df[text_column].astype(str)

## save local copy of tooth/procedure info

In [13]:
# Write the tooth/procedure table as a tab-separated file in the current
# working directory. The index argument is left at its default, so the row
# index is written as the first (unnamed) column.
tooth_proc_df.to_csv('triplestore-tooth-procedure-info.tsv', sep='\t')

## find surfaces restored by procedures

In [7]:
# Query text for the procedure/surface table. The shared PREFIX block is
# substituted for the leading %s.
# Pattern: a patient, a tooth that is part of that patient, and a procedure
# whose specified input is that tooth and whose specified output is a restored
# surface. Blank-node patients, teeth, procedures, surfaces and surface types
# are filtered out.
# ?m ?o ?d ?b ?f ?l ?i are 1/0 flags derived from the surface's direct type
# (?f is set from restored_labial:). Each result row describes one surface, so
# at most one flag is 1 per row.
# NOTE(review): ?proc_id is bound near the end of the pattern but is not
# listed in the select clause; the result carries the full IRIs ?tooth_i,
# ?proc_i and ?surface_i instead. Confirm that omitting ?proc_id is intended.
q = """
%s
select distinct
    ?patient_id
    ?tooth_id
    ?tooth_i
    ?tooth_num
    ?proc_i
    ?surface_i
    ?m
    ?o
    ?d
    ?b
    ?f
    ?l
    ?i
where {
    ?patient_i a patient: .
    ?tooth_i a tooth:;
             part_of: ?patient_i;
             sesame:directType ?tooth_t .
    ?tooth_t ada_num: ?ada_num .
    
    ?surface_i a restored_surface:;
               sesame:directType ?surface_t .
    
    ?proc_i a procedure:;
            sesame:directType ?proc_t;
            has_specified_input: ?tooth_i;
            has_specified_output: ?surface_i;
            occur_date: ?occur_date .
    
    filter (!isblank(?patient_i))
    filter (!isblank(?tooth_i))
    filter (!isblank(?proc_i))
    filter (!isblank(?surface_i))
    filter (!isblank(?surface_t))
    
    bind (strafter(str(?patient_i), "patient/") as ?patient_id)
    bind (strafter(str(?tooth_i), "tooth/") as ?tooth_id)
    bind (strafter(str(?proc_i), "procedure/") as ?proc_id)
    bind (strafter(?ada_num, "Tooth ") as ?tooth_num)
    bind(if(?surface_t = restored_buccal:, 1, 0) as ?b)
    bind(if(?surface_t = restored_distal:, 1, 0) as ?d)
    bind(if(?surface_t = restored_mesial:, 1, 0) as ?m)
    bind(if(?surface_t = restored_labial:, 1, 0) as ?f)
    bind(if(?surface_t = restored_lingual:, 1, 0) as ?l)
    bind(if(?surface_t = restored_incisal:, 1, 0) as ?i)
    bind(if(?surface_t = restored_occlusal:, 1, 0) as ?o)
    
}
""" % prefixes
# print(q)

In [8]:
# Submit the procedure/surface query text held in q and turn the returned
# bindings into a DataFrame (one column per selected variable).
sparql.setQuery(q)
res = sparql.query()
proc_surface_df = sparql_to_df(res)

In [9]:
# Collapse rows that are exact duplicates, then display the remaining row count.
proc_surface_df = proc_surface_df.drop_duplicates()
len(proc_surface_df)

2626650

## save local copy of procedure/surface info

In [10]:
# Write the procedure/surface table as a tab-separated file in the current
# working directory. The index argument is left at its default, so the row
# index is written as the first (unnamed) column.
proc_surface_df.to_csv('triplestore-procedure-surface-info.tsv', sep='\t')

## find tooth type metadata for each tooth

In [11]:
# Query text for the tooth-type table. The shared PREFIX block is substituted
# for the leading %s.
# Pattern: ?tooth_type ranges over incisor:, canine:, premolar: and molar:;
# a tooth of that type must be part of a patient and be the specified input of
# at least one procedure. Blank-node patients, teeth, tooth types and
# procedures are filtered out.
# ?incisor ?canine ?premolar ?molar are 1/0 flags for the matched tooth type;
# ?anterior is 1 for incisor/canine and ?posterior is 1 for premolar/molar.
q = """
%s
select distinct
    ?patient_id
    ?tooth_id
    ?tooth_num
    ?anterior
    ?posterior
    ?incisor
    ?canine
    ?premolar
    ?molar
where {
    values ?tooth_type {incisor: canine: premolar: molar:}
    ?patient_i a patient: .
    ?tooth_i a ?tooth_type;
             part_of: ?patient_i;
             sesame:directType ?tooth_t . 
    ?tooth_t ada_num: ?ada_num .
    
    ?proc_i a procedure:;
            has_specified_input: ?tooth_i.
    
    filter (!isblank(?patient_i))
    filter (!isblank(?tooth_i))
    filter (!isblank(?tooth_t))
    filter (!isblank(?proc_i))
    
    bind (strafter(str(?patient_i), "patient/") as ?patient_id)
    bind (strafter(str(?tooth_i), "tooth/") as ?tooth_id)
    bind (strafter(?ada_num, "Tooth ") as ?tooth_num)
    bind (if(?tooth_type in (incisor:, canine:), 1, 0) as ?anterior)
    bind (if(?tooth_type in (premolar:, molar:), 1, 0) as ?posterior)
    bind (if(?tooth_type = incisor:, 1, 0) as ?incisor)
    bind (if(?tooth_type = canine:, 1, 0) as ?canine)
    bind (if(?tooth_type = premolar:, 1, 0) as ?premolar)
    bind (if(?tooth_type = molar:, 1, 0) as ?molar)
}
""" % prefixes
# print(q)


In [None]:
# Submit the tooth-type query text held in q and turn the returned bindings
# into a DataFrame (one column per selected variable).
# NOTE(review): this cell carries no execution count in the saved notebook, so
# it appears not to have been run in that session; the following cell loads
# tooth_type_df from a previously saved TSV file instead.
sparql.setQuery(q)
res = sparql.query()
tooth_type_df = sparql_to_df(res)

In [12]:
# Reload the tooth-type table from its saved TSV copy. This notebook writes
# that file with DataFrame.to_csv's default index, so the file's first column
# is the saved row index. index_col=0 restores it as the index; without it the
# column is loaded as an extra 'Unnamed: 0' data column whose unique values
# prevent drop_duplicates from matching any rows and which is written out
# again on the next save.
tooth_type_df = pds.read_csv('triplestore-tooth-type-info.tsv', sep='\t', index_col=0)

In [13]:
# Collapse rows that are exact duplicates, then display the remaining row count.
tooth_type_df = tooth_type_df.drop_duplicates()
len(tooth_type_df)

1210430

## add maxillary (maxilla, teeth 1-16) / mandibular (mandible, teeth 17-32) metadata

In [14]:
# Tooth numbers must be integers before they can be compared numerically.
tooth_type_df['tooth_num'] = tooth_type_df['tooth_num'].astype(int)
# 1/0 flags: tooth numbers below 17 are marked maxillary, numbers above 16
# are marked mandibular.
tooth_type_df['maxillary'] = tooth_type_df['tooth_num'].lt(17).astype('int64')
tooth_type_df['mandibular'] = tooth_type_df['tooth_num'].gt(16).astype('int64')

## add left/right metadata

In [15]:
# 1/0 flags: tooth numbers 9 through 24 are marked left; numbers below 9 or
# above 24 are marked right.
tooth_type_df['left'] = (
    tooth_type_df['tooth_num'].gt(8) & tooth_type_df['tooth_num'].lt(25)
).astype('int64')
tooth_type_df['right'] = (
    tooth_type_df['tooth_num'].lt(9) | tooth_type_df['tooth_num'].gt(24)
).astype('int64')

## save local copy of tooth type metadata info

In [16]:
# Write the tooth-type table (including the jaw and side flags) as a
# tab-separated file in the current working directory. The index argument is
# left at its default, so the row index is written as the first (unnamed)
# column. An earlier cell reads this same file name back in.
tooth_type_df.to_csv('triplestore-tooth-type-info.tsv', sep='\t')

## find materials used in crowns (including onlays), fillings (including inlays), and veneers

In [17]:
# Query text for the procedure/material table. The shared PREFIX block is
# substituted for the leading %s.
# Pattern: ?restoration_proc ranges over crown_procedure:, filling_procedure:
# and veneer_procedure:; a procedure of that type must have both a tooth
# (part of a patient) and a material instance as specified inputs. Blank-node
# patients, teeth and procedures are filtered out.
# Projected columns:
#   restoration_type - label of ?restoration_proc up to " restoration procedure"
#   material         - label of the material's direct type up to
#                      " dental restoration material"
# The variable ?restoraton_label is spelled the same way in both places it
# appears, so the query is self-consistent. ?proc_label is matched but not
# selected.
q = """
%s
select distinct
    ?patient_id
    ?tooth_id
    ?tooth_num
    ?proc_id
    ?restoration_type
    ?material
where {
    values ?restoration_proc {crown_procedure: filling_procedure: veneer_procedure:}
    ?patient_i a patient: .
    ?tooth_i a tooth:;
             part_of: ?patient_i;
             sesame:directType ?tooth_t .
    ?tooth_t ada_num: ?ada_num .
    
    ?material_i a material:;
                sesame:directType ?material_t .
    ?material_t rdfs:label ?material_label .
    
    ?proc_i a ?restoration_proc;
            sesame:directType ?proc_t;
            has_specified_input: ?tooth_i;
            has_specified_input: ?material_i .
    ?proc_t rdfs:label ?proc_label .
    ?restoration_proc rdfs:label ?restoraton_label .
    
    filter (!isblank(?patient_i))
    filter (!isblank(?tooth_i))
    filter (!isblank(?proc_i))
    
    bind (strafter(str(?patient_i), "patient/") as ?patient_id)
    bind (strafter(str(?tooth_i), "tooth/") as ?tooth_id)
    bind (strafter(str(?proc_i), "procedure/") as ?proc_id)
    bind (strbefore(str(?restoraton_label), " restoration procedure") as ?restoration_type)
    bind (strafter(?ada_num, "Tooth ") as ?tooth_num)
    bind (strbefore(str(?material_label), " dental restoration material") as ?material)
    
}
""" % prefixes
# print(q)

In [18]:
# Submit the procedure/material query text held in q and turn the returned
# bindings into a DataFrame (one column per selected variable).
sparql.setQuery(q)
res = sparql.query()
proc_material_df = sparql_to_df(res)

In [19]:
# Collapse rows that are exact duplicates, then display the remaining row count.
proc_material_df = proc_material_df.drop_duplicates()
len(proc_material_df)

1686822

## save local copy of procedure/material info

In [20]:
# Write the procedure/material table as a tab-separated file in the current
# working directory. The index argument is left at its default, so the row
# index is written as the first (unnamed) column.
proc_material_df.to_csv('triplestore-proc-material-info.tsv', sep='\t')