In [128]:
# import libaries 
%pip install pandas rdflib scipy
import pandas as pd
from rdflib import Dataset
from string import Template
from datetime import datetime, timedelta
from IPython.display import display

# verbosity
verbose = True

# data
synicadf = pd.read_csv("data/syntheticican2/ucanNeurovascSimulatedData_14042025.csv", sep=";")
if verbose == True:
    display(synicadf.head())

# ontology terms
## for coding tracking use, because there are too many terms to memorize
ontologyTermsExplained = {
    'hpo:0007029': 'Cerebral berry aneurysm', 
    'hpo:0100546': 'Carotid artery stenosis', 
    'hpo:0004942': 'Aortic aneurysm',
    'hpo:0000822': 'Arterial hypertension', 
    'hpo:0008071': 'Maternal hypertension', 
    'hpo:0032320' : 'Affected family member', 
    'pato:0002118' : 'Multiple quantity (here, aneursysms)', 
    'hpo:0100651': 'Diabetes mellitus type 1',
    'hpo:0005978': 'Diabetes mellitus type 2',
    'hpo:0009800': 'Maternal diabetes mellitus', 
    'hpo:0003119': 'Dyslipidemia'}
## for the knowledge graph
ontologyTerms = {'presenceOfAneurysm' : 'hpo:0007029', 
                 'carotidArteryStenosis' : 'hpo:0100546', 
                 'aorticAneurysm' : 'hpo:0004942', 
                 'arterialHypertension' : 'hpo:0000822', 
                 'maternalHypertension' : 'hpo:0008071', 
                 'familialCase' : 'hpo:0032320', 
                 'multipleAneurysms' : 'pato:0002118', 
                 'diabetesMellitusType1' : 'hpo:0100651',
                 'diabetesMellitusType2' : 'hpo:0005978',
                 'maternalDiabetes' : 'hpo:0009800', 
                 'dyslipidemia' : 'hpo:0003119'}

Note: you may need to restart the kernel to use updated packages.


Unnamed: 0,biosampleId,sampleType-arterialBlood,sampleType-veinousBlood,patientId,inclusionNb,referralCenter,consent,sex,lifeStatus,firstDiagnosisAge,...,aic-5-locationCode,aic-5-locationDetail,aic-5-locationVessel,aic-5-locationSide,aic-5-size,aic-5-rupture,aic-5-patientAgeAtRupture,aic-5-treatment,aic-5-patientAgeAtTreatment,aic-5-treatmentType
0,HG00096,True,False,pid_HG00096,AIC_09_0,Dijon,Yes,M,alive,40,...,,,,,,,,,,
1,HG00097,True,False,pid_HG00097,AIC_01_1,Angers,Yes,F,alive,70,...,,,,,,,,,,
2,HG00099,True,False,pid_HG00099,AIC_08_2,Paris – Hopital Sainte Anne,Yes,F,alive,63,...,,,,,,,,,,
3,HG00100,True,False,pid_HG00100,AIC_06_3,Tours,Yes,M,alive,49,...,,,,,,,,,,
4,HG00101,True,False,pid_HG00101,AIC_05_4,Nantes,Yes,F,alive,45,...,,,,,,,,,,


In [129]:
synicadf.rename(columns={"patientId": "pid", 
                         "bodyMassIndex": "bmi", 
                         "medicalHistory-cardioVascular-carotidArteryStenosis" : "carotidArteryStenosis",
                         "medicalHistory-cardioVascular-aorticAneurysm" : "aorticAneurysm", 
                         "medicalHistory-arterialHypertension" : "arterialHypertension", 
                         "medicalHistory-arterialHypertension-gestational" : "maternalHypertension", 
                         "medicalHistory-diabetes_gestational" : "maternalDiabetes",
                         "medicalHistory-diabetes_type1" : "diabetesMellitusType1",
                         "medicalHistory-diabetes_type2" : "diabetesMellitusType2", 
                         "medicalHistory-dyslipidemia" : "dyslipidemia"}, 
                        inplace=True)
synicadf['age'] = synicadf.loc[:,'firstDiagnosisAge'] # for simulated data, age and ageat diagnosis of aneurysms are the same
synicadf['pid'] = synicadf['pid'].replace('HG', 'SIM', regex=True) # SIM pour Simulated
synicadf['biosampleId'] = synicadf['biosampleId'].replace('HG', 'SIM', regex=True) # SIM pour Simulated
synicadf['pid'] = synicadf['pid'].replace('NA', 'SIM', regex=True) # SIM pour Simulated
synicadf['biosampleId'] = synicadf['biosampleId'].replace('NA', 'SIM', regex=True) # SIM pour Simulated
synicadf['presenceOfAneurysm'] = True
if verbose == True:
    display(synicadf.head())

Unnamed: 0,biosampleId,sampleType-arterialBlood,sampleType-veinousBlood,pid,inclusionNb,referralCenter,consent,sex,lifeStatus,firstDiagnosisAge,...,aic-5-locationVessel,aic-5-locationSide,aic-5-size,aic-5-rupture,aic-5-patientAgeAtRupture,aic-5-treatment,aic-5-patientAgeAtTreatment,aic-5-treatmentType,age,presenceOfAneurysm
0,SIM00096,True,False,pid_SIM00096,AIC_09_0,Dijon,Yes,M,alive,40,...,,,,,,,,,40,True
1,SIM00097,True,False,pid_SIM00097,AIC_01_1,Angers,Yes,F,alive,70,...,,,,,,,,,70,True
2,SIM00099,True,False,pid_SIM00099,AIC_08_2,Paris – Hopital Sainte Anne,Yes,F,alive,63,...,,,,,,,,,63,True
3,SIM00100,True,False,pid_SIM00100,AIC_06_3,Tours,Yes,M,alive,49,...,,,,,,,,,49,True
4,SIM00101,True,False,pid_SIM00101,AIC_05_4,Nantes,Yes,F,alive,45,...,,,,,,,,,45,True


In [133]:
# jinja style templates 
# prefix and templates 
prefix = """@prefix sphn: <https://biomedit.ch/rdf/sphn-schema/sphn#> .
@prefix ican: <http://ican.ressource.org/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix efo: <http://www.ebi.ac.uk/efo/EFO_> .
@prefix obo: <http://purl.obolibrary.org/obo/> .
@prefix obi: <http://purl.obolibrary.org/obo/OBI_> .
@prefix iao: <http://purl.obolibrary.org/obo/IAO_> .
@prefix ncit: <http://purl.obolibrary.org/obo/NCIT_> .
@prefix hpo: <http://purl.obolibrary.org/obo/HPO_> .
@prefix ucum: <https://ucum.org/ucum#> .
@prefix pato: <http://purl.obolibrary.org/obo/PATO_> . 
@prefix duo: <http://purl.obolibrary.org/obo/DUO_> .
"""

# ican individual identifier
sphn_person_template = Template(
    """
    $person_iri a ncit:C25190 .
    $person_iri rdfs:label "person profile simulatd with probabilities from the ICAN Biocollection"^^xsd:string . 
    $person_iri rdf:type efo:0022043 . 
    $person_iri rdf:type ncit:C16960 .
    $person_iri rdf:type sphn:SubjectPseudoIdentifier .
    $person_iri sphn:hasIdentifier "$person_id"^^xsd:string .
    $person_iri sphn:hasDataProvider $dataProvider_iri .
    """
)
# age of individual at study time
sphn_age_template = Template(
    """
    $age_iri a sphn:Age .
    $age_iri a obi:0001167 .
    $age_iri rdfs:label "age of the individual at study time"^^xsd:string .
    $age_iri sphn:hasDeterminationDateTime "$age_determination_date"^^xsd:dateTime .
    $age_iri sphn:hasQuantity $ageQuantity_iri . 
    $ageQuantity_iri a sphn:Quantity ;
            sphn:hasValue "$age_value"^^xsd:decimal ;
            sphn:hasUnit ucum:$ageUnit .
    $age_iri sphn:hasSubjectPseudoIdentifier $person_iri .
    """
)
# administrative sex (no info about gender roles tbh)
sphn_sex_template = Template(
    """
    $sex_iri a sphn:AdministrativeSex, ncit:C17357 ;
            sphn:hasCode $sex_value .
    $sex_iri sphn:hasSubjectPseudoIdentifier $person_iri .
    """
)
# body mass index
sphn_bmi_template = Template(
    """
    $bmi_iri a sphn:BodyMassIndex.
    $bmi_iri a  ncit:C16358 .
    $bmi_iri rdfs:label "Body Mass Index of the individual at study time"^^xsd:string .
    $bmi_iri sphn:hasDeterminationDateTime "$bmi_determination_date"^^xsd:dateTime .
    $bmi_iri sphn:hasQuantity $bmiQuantity_iri .
    $bmiQuantity_iri a sphn:Quantity ;
            sphn:hasValue "$bmi_value"^^xsd:decimal ;
            sphn:hasUnit ucum:$bmiUnit .
    $bmi_iri sphn:hasSubjectPseudoIdentifier $person_iri .
    """
)
# diagnosis
# https://www.biomedit.ch/rdf/sphn-schema/sphn#Diagnosis
# Aneurysm is special because there is an age at diagnosis, cardinality, familial status, other info etc
sphn_diagnosisAneurysmAgeDetail_code_template = Template(
    """
    $diagnosis_iri sphn:hasSubjectAge $ageAtDiagnosis_iri .
    $ageAtDiagnosis_iri a sphn:Age .
    $ageAtDiagnosis_iri a obi:0001167 .
    $ageAtDiagnosis_iri rdfs:label "age of the individual at first diagnosis of intracranial aneurysm"^^xsd:string .
    $ageAtDiagnosis_iri sphn:hasQuantity $ageAtDiagnosisQuantity_iri . 
    $ageAtDiagnosisQuantity_iri a sphn:Quantity ;
            sphn:hasValue "$ageAtDiagnosis_value"^^xsd:decimal ;
            sphn:hasUnit ucum:$ageUnit .
    $ageAtDiagnosis_iri sphn:hasSubjectPseudoIdentifier $person_iri .
    $diagnosis_iri sphn:hasSubjectPseudoIdentifier $person_iri .
    """
)
sphn_diagnosisAneurysmFormDetail_code_template = Template(
    """
    $diagnosis_iri sphn:hasCode $diagnosisFormDetail_code .
    $diagnosis_iri rdfs:label "$diagnosisFormDetail_label"^^xsd:string .
    $diagnosis_iri sphn:hasSubjectPseudoIdentifier $person_iri .
    """
)
# All diagnosis, boolean, unique code being enough to identify the diagnosis
sphn_diagnosis_code_template = Template(
    """
    $diagnosis_iri a sphn:Diagnosis .
    $diagnosis_iri rdfs:label "$diagnosis_label"^^xsd:string .
    $diagnosis_iri sphn:hasCode $diagnosis_code .
    $diagnosis_iri sphn:hasSubjectPseudoIdentifier $person_iri .
    """
)
# Data provider
sphn_data_provider_template = Template(
    """
    $dataProvider_iri a sphn:DataProvider ;
        sphn:hasInstitutionCode "l institut du thorax UMR1087 Inserm"^^xsd:string ;
        sphn:hasCategory "research institute"^^xsd:string .
    """
)
# Biobanksample
sphn_biobanksample_template = Template(
    """
    $biobanksample_iri a sphn:BiobankSample .
    $biobanksample_iri rdfs:label "Biobank sample of the simulated individual from the ICAN Biocollection"^^xsd:string .
    $biobanksample_iri sphn:hasIdentifier "$biobanksample_id"^^xsd:string .
    $biobanksample_iri sphn:hasSubjectPseudoIdentifier $person_iri .
    """
)
# Consent 
sphn_consent_template = Template(
    """
    $consent_iri a sphn:Consent .
    $consent_iri sphn:hasStatusCode "active"^^xsd:string .
    $consent_iri sphn:hasTypeCode $consent_code .
    $consent_iri sphn:hasSubjectPseudoIdentifier $person_iri .
    """
)
# Imaging Procedure (to include the ICAN/UCAN inclusion number, which then can be used to retrieve imaging metadata with Shanoir)
sphn_imagingProcedure_template = Template(
    """
    $imagingProcedure_iri a sphn:ImagingProcedure .
    $imagingProcedure_iri rdfs:label "Imaging procedure for the simulated individual from the ICAN Biocollection, the identifier being the [I|U]CAN inclusion number"^^xsd:string .
    $imagingProcedure_iri sphn:hasIdentifier "$imagingProcedure_id"^^xsd:string .
    $imagingProcedure_iri sphn:hasSubjectPseudoIdentifier $person_iri .
    $imagingProcedure_iri sphn:hasBodySite $imagingProcedureBodySite_iri .
    $imagingProcedureBodySite_iri a sphn:BodySite .
    $imagingProcedureBodySite_iri rdfs:label "Intracranial vasculature"^^xsd:string .
    $imagingProcedureBodySite_iri sphn:hasCode $bodySite .
    $imagingProcedure_iri sphn:hasSubjectPseudoIdentifier $person_iri .
    """
)

In [None]:
# semantic ICAN individual
class SemanticPerson:
    def __init__(self, row):
        self.row = row

    # iri contructors
    def get_pid(self):
        return {'iri': "<http://ican.ressource.org/individual/" + str(self.row['pid']) + ">" , 
                'value' : self.row['pid']}
    
    def get_biobanksample(self):
        return {'iri': "<http://ican.ressource.org/individual/"+str(self.row['pid'])+"/biobanksample/" + str(self.row['biosampleId']) + ">" , 
                'value' : self.row['biosampleId']}
    
    def get_consent(self):
        return {'iri': "<http://ican.ressource.org/individual/" + str(self.row['pid']) + '/consent/' + str(self.row['consent']) + ">", 
                'value' :  self.row['consent'], 
                'code' : "<http://purl.obolibrary.org/obo/DUO_0000037>"}
    
    def get_imagingProcedure(self):
        return {'iri': "<http://ican.ressource.org/individual/" + str(self.row['pid']) + '/imagingProcedure/' + str(self.row['inclusionNb']) + ">", 
                'id' :  self.row['inclusionNb'], 
                'bodySite_iri' : "<http://ican.ressource.org/individual/" + str(self.row['pid']) + '/imagingProcedure/' + str(self.row['inclusionNb']) + "/bodySite/brain>",
                'bodySite' : "<http://purl.obolibrary.org/obo/UBERON_0000955>" }
    
    def get_age(self):
        return {'iri': "<http://ican.ressource.org/individual/" + str(self.row['pid']) + '/age/' + str(self.row['age']) + ">", 
                'value' :  self.row['age'] }
    
    def get_ageAtDiagnosis(self, diagnosis_column):
        if diagnosis_column == 'presenceOfAneurysm':
            ageAtDiagnosis = self.row['firstDiagnosisAge']
            return {'iri': "<http://ican.ressource.org/individual/" + str(self.row['pid']) + '/ageAtDiagnosis/' + str(ageAtDiagnosis) + ">", 
                    'value' :  ageAtDiagnosis }
        else:
            exit("Age at diagnosis is only available for presence of aneurysm, other diagnoses do not have an age at diagnosis in the simulated data.")

    def get_bmi(self):
        return {'iri': "<http://ican.ressource.org/individual/" + str(self.row['pid']) + '/bmi/' + str(self.row['bmi']).replace(".", "_") + ">", 
                'value' :  self.row['bmi'] }

    def get_sex(self):
        sexCode = {'M' : 'ncit:C20197', 'F' : 'ncit:C16576', 'U' : 'ncit:C17998'}
        return {'iri': "<http://ican.ressource.org/individual/" + str(self.row['pid']) + '/sex/' + str(self.row['sex']) + ">", 
                'value' :  sexCode[self.row['sex']] }

    def get_dataProvider(self):
        return {'iri': "<http://ican.ressource.org/organisation/dataProvider#" + 'ITXUMR1087' + '>' , 
                'value' : 'ITXUMR1087'}
    
    def get_diagnosis(self, diagnosis_column):
        diagnosis_status = self.row[diagnosis_column]
        diagnosis_label = ontologyTermsExplained[ontologyTerms[diagnosis_column]]
        diagnosis_code = ontologyTerms[diagnosis_column]
        diagnosis_iri = "<http://ican.ressource.org/individual/" + str(self.row['pid']) + '/diagnosis/' + diagnosis_label.replace(' ', '_') + ">"
        return {'status': diagnosis_status, 
                'iri': diagnosis_iri, 
                'label' : diagnosis_label, 
                'code' : diagnosis_code}
    
    def get_quantity(self, quantityName, unitName):
        ucum = {
            'ageAtDiagnosis' : 'a_j',
            'ageAtStudyTime' : 'a_j', 
            'bmi' : 'm-2.g',}
        return {'iri': "<http://ican.ressource.org/individual/" + str(self.row['pid']) + '/quantity/' + str(quantityName).replace(".", "_") + ">", 
                'unit' : ucum[unitName]}
    
    # diagnosis rdf generation
    def add_diagnosis_rdf(semanticPerson, rdf, diagnosis_manualLabel):
        diagnosis_status, diagnosis_iri, diagnosis_label, diagnosis_code = semanticPerson.get_diagnosis(diagnosis_manualLabel).values()
        if diagnosis_status == True:
            rdf += sphn_diagnosis_code_template.substitute(
                person_iri=semanticPerson.get_pid()['iri'],
                diagnosis_iri=diagnosis_iri,
                diagnosis_label=diagnosis_label,
                diagnosis_code=diagnosis_code
            )
            if diagnosis_manualLabel == 'presenceOfAneurysm':
                # age and additional descriptors
                ageAtDiagnosis_iri, ageAtDiagnosis_value = semanticPerson.get_ageAtDiagnosis('presenceOfAneurysm').values()
                ageAtDiagnosisQuantity_iri, ageAtDiagnosisUnit = semanticPerson.get_quantity('ageAtDiagnosis','ageAtDiagnosis').values()
                #
                rdf += sphn_diagnosisAneurysmAgeDetail_code_template.substitute(
                    person_iri=semanticPerson.get_pid()['iri'],
                    diagnosis_iri=diagnosis_iri,
                    ageAtDiagnosis_iri=ageAtDiagnosis_iri,
                    ageAtDiagnosis_value=ageAtDiagnosis_value,
                    ageAtDiagnosisQuantity_iri=ageAtDiagnosisQuantity_iri,
                    ageUnit=ageAtDiagnosisUnit 
                )
                modifiers = ['familialCase', 'multipleAneurysms']#, 'sporadicCase', 
                for mod in modifiers: 
                    if semanticPerson.row[mod] == True :
                        rdf += sphn_diagnosisAneurysmFormDetail_code_template.substitute(
                            person_iri=semanticPerson.get_pid()['iri'],
                            diagnosis_iri=diagnosis_iri,
                            diagnosisFormDetail_code=ontologyTerms[mod],
                            diagnosisFormDetail_label=ontologyTermsExplained[ontologyTerms[mod]]
                        )
                

        return rdf
    
#

def generate_semanticBiollection_rdf(row, kg):
    semanticPerson = SemanticPerson(row)

    # patient id
    person_iri, person_id = semanticPerson.get_pid().values()
    # age
    age_iri, age_value = semanticPerson.get_age().values()
    ageQuantity_iri, ageUnit = semanticPerson.get_quantity('ageAtStudyTime','ageAtStudyTime').values()
    age_determination_date = "2018-01-01T00:00:00"
    # administrative gender
    sex_iri, sex_value = semanticPerson.get_sex().values()
    # data provider 
    dataProvider_iri, dataProvider_value = semanticPerson.get_dataProvider().values()
    # biobanksample
    biobanksample_iri, biobanksample_id = semanticPerson.get_biobanksample().values()
    # consent
    consent_iri, consent_value, consent_code = semanticPerson.get_consent().values()
    # imaging procedure 
    imagingProcedure_iri, imagingProcedure_id, imagingProcedureBodySite_iri, bodySite = semanticPerson.get_imagingProcedure().values()
    # person, admistrative sex, age
    rdf = \
        sphn_person_template.substitute(
                            person_iri=person_iri, 
                            person_id=person_id, 
                            dataProvider_iri=dataProvider_iri) + \
        sphn_age_template.substitute(
                            person_iri=person_iri,
                            age_iri=age_iri,
                            age_value=f"{age_value:.1f}",
                            age_determination_date=age_determination_date, 
                            ageQuantity_iri=ageQuantity_iri, 
                            ageUnit=ageUnit) + \
        sphn_sex_template.substitute(
                            person_iri=person_iri,
                            sex_iri=sex_iri, 
                            sex_value=sex_value ) + \
        sphn_data_provider_template.substitute(
                            dataProvider_iri=dataProvider_iri) + \
        sphn_biobanksample_template.substitute(
                            biobanksample_iri=biobanksample_iri, 
                            biobanksample_id=biobanksample_id, 
                            person_iri=person_iri) + \
        sphn_consent_template.substitute(
                            consent_iri=consent_iri, 
                            consent_code=consent_code,
                            person_iri=person_iri) + \
        sphn_imagingProcedure_template.substitute(
                            imagingProcedure_iri=imagingProcedure_iri, 
                            imagingProcedure_id=imagingProcedure_id, 
                            imagingProcedureBodySite_iri=imagingProcedureBodySite_iri,
                            bodySite=bodySite,
                            person_iri=person_iri)
    # bmi if available
    # bmi
    bmi_iri, bmi_value = semanticPerson.get_bmi().values()
    bmiQuantity_iri, bmiUnit = semanticPerson.get_quantity('bmi','bmi').values()
    bmi_determination_date = "2018-01-01T00:00:00"
    if not pd.isna(bmi_value):
        rdf += \
            sphn_bmi_template.substitute(
                                person_iri=person_iri,
                                bmi_iri=bmi_iri, 
                                bmi_value=bmi_value,
                                bmiUnit=bmiUnit,
                                bmiQuantity_iri=bmiQuantity_iri,
                                bmi_determination_date=bmi_determination_date)
    # diagnosis
    ### status -> true or false, will be used to generate or not the sphn diagnosis
    ## presence of intracranial aneurysm
    rdf += semanticPerson.add_diagnosis_rdf(rdf, 'presenceOfAneurysm')
    ## carotid artery stenosis
    rdf += semanticPerson.add_diagnosis_rdf(rdf, 'carotidArteryStenosis')
    ## aortic aneurysm
    rdf += semanticPerson.add_diagnosis_rdf(rdf, 'aorticAneurysm')
    ## arterialHypertension
    rdf += semanticPerson.add_diagnosis_rdf(rdf, 'arterialHypertension')
    ## maternalHypertension
    rdf += semanticPerson.add_diagnosis_rdf(rdf, 'maternalHypertension')
    ## maternalDiabetes
    rdf += semanticPerson.add_diagnosis_rdf(rdf, 'maternalDiabetes')
    ## diabetesMellitusType1
    rdf += semanticPerson.add_diagnosis_rdf(rdf, 'diabetesMellitusType1')
    ## diabetesMellitusType2
    rdf += semanticPerson.add_diagnosis_rdf(rdf, 'diabetesMellitusType2')
    ## dyslipidemia
    rdf += semanticPerson.add_diagnosis_rdf(rdf, 'dyslipidemia')

    full_rdf = prefix + "\n" + rdf

    if verbose == False:
        print("======= RDF DATA =======")
        print(full_rdf)
        print("========================")

    kg.parse(data=full_rdf, format="turtle")


## Serialize data
kg = Dataset()
synicadf[60:61].apply(generate_semanticBiollection_rdf, axis=1, kg=kg)
print(f"Generated {len(kg)} RDF triples")
#print(kg)
#print(kg.serialize(format="turtle"))
kg.serialize(destination="deliverable/synica3.ttl", format="turtle")

KeyError: 'bodySite'