In [None]:
# import libraries
%pip install pandas rdflib scipy
import pandas as pd
from rdflib import Dataset
from string import Template
from datetime import datetime, timedelta
from IPython.display import display

# verbosity: when truthy, the loaded table preview is displayed below
verbose = True

# data: the patient table is stored as a semicolon-separated CSV file
angiodf = pd.read_csv("deliverable/structured_ANGPTL6_patients.csv", sep=";")
if verbose:
    display(angiodf.head())

# ontology terms
## ontology code -> readable label; a lookup aid while coding, because there
## are too many terms to memorize
ontologyTermsExplained = {'hpo:0007029': 'Cerebral berry aneurysm'}
## data column name -> ontology code used in the knowledge graph
ontologyTerms = {'presence of aneurysm': 'hpo:0007029'}

Note: you may need to restart the kernel to use updated packages.


Unnamed: 0,pid,sex,age,bmi,presence of aneurysm,number of aneurysms,location aneurysum 1,location aneurysum 2,location aneurysum 3,sequenced dna,...,never smoked,former smoker,current smoker,high blood pressure,high blood pressure treatment,diabetes type 1,dyslipidemia,dyslipidemia treatment,aspirin treatment,vitamin K antagonist treatment
0,A_II-2,F,92,,True,1,ACoA,,,True,...,True,False,False,True,True,False,True,statins,False,False
1,A_II-5,F,45,,True,1,MCA,,,False,...,True,False,False,True,True,False,True,,False,False
2,A_III-1,F,58,,True,3,MCA,ICA,ACA,True,...,False,False,True,True,True,False,False,,False,False
3,A_III-5,F,60,23.0,True,2,ICA,ICA,,True,...,True,False,False,True,True,False,False,,False,False
4,A_IV-1,F,40,22.0,True,1,MCA,,,True,...,False,True,False,False,False,False,False,,False,False


In [31]:
# semantic ICAN individual
class SemanticPerson:
    """Expose one patient record as IRI/value dictionaries for RDF generation.

    ``row`` only has to support lookup by column name (``row['pid']``,
    ``row['age']``, ...). Every accessor returns a dictionary whose key order
    is part of the contract: the RDF generator in this file unpacks
    ``.values()`` positionally.
    """

    # Opening part of every individual IRI (angle bracket included).
    _IRI_HEAD = "<http://ican.ressource.org/individual#"

    def __init__(self, row):
        self.row = row

    def _person_stem(self):
        """Return the unterminated IRI of the individual (no closing ``>``)."""
        return self._IRI_HEAD + str(self.row['pid'])

    def _field_iri(self, kind, column):
        """Return ``<individual#PID/kind#VALUE>`` for the value of ``column``."""
        stem = self._person_stem()
        return stem + '/' + kind + '#' + str(self.row[column]) + ">"

    def get_pid(self):
        """Return ``{'iri', 'value'}`` for the patient identifier."""
        iri = self._person_stem() + ">"
        return {'iri': iri, 'value': self.row['pid']}

    def get_age(self):
        """Return ``{'iri', 'value'}`` for the age column."""
        iri = self._field_iri('age', 'age')
        return {'iri': iri, 'value': self.row['age']}

    def get_bmi(self):
        """Return ``{'iri', 'value'}`` for the BMI column.

        The raw cell is passed through unchanged, so a missing BMI stays a
        missing value and the caller decides what to do with it.
        """
        iri = self._field_iri('bmi', 'bmi')
        return {'iri': iri, 'value': self.row['bmi']}

    def get_sex(self):
        """Return ``{'iri', 'value'}`` where ``value`` is the NCIT code.

        Raises ``KeyError`` when the sex cell is not one of 'M', 'F' or 'U'.
        """
        iri = self._field_iri('sex', 'sex')
        ncit_by_letter = {
            'M': 'ncit:C20197',
            'F': 'ncit:C16576',
            'U': 'ncit:C17998',
        }
        return {'iri': iri, 'value': ncit_by_letter[self.row['sex']]}

    def get_diagnosis(self, diagnosis_column):
        """Return ``{'status', 'iri', 'label', 'code'}`` for a diagnosis column.

        The column name doubles as the label; its ontology code is looked up
        in the module-level ``ontologyTerms`` mapping.
        """
        status = self.row[diagnosis_column]
        code = ontologyTerms[diagnosis_column]
        slug = diagnosis_column.replace(' ', '_')
        iri = self._person_stem() + '/diagnosis#' + slug + ">"
        return {
            'status': status,
            'iri': iri,
            'label': diagnosis_column,
            'code': code,
        }
#
# prefix and templates
# Turtle prefix header prepended to every generated document.
# `hpo:` expands to the OBO PURL stem for Human Phenotype Ontology terms,
# which is `.../obo/HP_` (e.g. hpo:0007029 -> .../obo/HP_0007029).
prefix = """@prefix sphn: <http://sphn.org/> .
@prefix ican: <http://ican.ressource.org/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix efo: <http://www.ebi.ac.uk/efo/EFO_> .
@prefix obi: <http://purl.obolibrary.org/obo/OBI_> .
@prefix iao: <http://purl.obolibrary.org/obo/IAO_> .
@prefix ncit: <http://purl.obolibrary.org/obo/NCIT_> .
@prefix hpo: <http://purl.obolibrary.org/obo/HP_> .
"""

# ican individual identifier
# placeholders: $person_iri, $person_id
sphn_person_template = Template(
    """
    $person_iri a ncit:C25190 .
    $person_iri rdf:type sphn:SubjectPseudoIdentifier ;
                sphn:hasIdentifier "$person_id"^^xsd:string ;
                sphn:hasDataProvider "ICAN-biocollection"^^xsd:string .
    """
)
# age of individual at study time
# placeholders: $age_iri, $age_determination_date, $age_value, $person_iri
sphn_age_template = Template(
    """
    $age_iri a sphn:Age , obi:0001167 ;
        sphn:hasDeterminationDateTime "$age_determination_date"^^xsd:dateTime ;
        sphn:hasQuantity [ rdf:type sphn:Quantity ;
                            sphn:hasValue "$age_value" ;
                            sphn:hasUnit "years" ] .
    $person_iri iao:0000221 $age_iri .
    """
)
# administrative sex (the data carries no information about gender)
# placeholders: $sex_iri, $sex_value, $person_iri
sphn_sex_template = Template(
    """
    $sex_iri a sphn:AdministrativeSex, ncit:C17357 ;
            sphn:hasCode $sex_value .
    $person_iri iao:0000221 $sex_iri .
    """
)
# body mass index
# placeholders: $bmi_iri, $bmi_value, $bmi_determination_date, $person_iri
# The date predicate is the same one the age template uses.
sphn_bmi_template = Template(
    """
    $bmi_iri a sphn:BodyMassIndex, ncit:C16358 ;
            sphn:hasQuantity $bmi_value ;
            sphn:hasDeterminationDateTime "$bmi_determination_date"^^xsd:dateTime .
    $person_iri iao:0000221 $bmi_iri .
    """
)
# diagnosis
# placeholders: $diagnosis_iri, $diagnosis_label, $diagnosis_code, $patient_iri
sphn_diagnosis_code_template = Template(
    """
    $diagnosis_iri a sphn:Diagnosis ;
        rdfs:label "$diagnosis_label"^^xsd:string ;
        sphn:hasCode $diagnosis_code .
        
    $patient_iri sphn:hasDiagnosis $diagnosis_iri .
    """
)

def generate_semanticBiollection_rdf(row, kg):
    """Turn one patient record into Turtle and parse it into ``kg``.

    The document always describes the person and the administrative sex.
    Age and BMI are added only when their value is not missing, and the
    aneurysm diagnosis only when its status equals ``True``.

    Args:
        row: one record of the patient table, indexable by column name
            ('pid', 'age', 'sex', 'bmi', 'presence of aneurysm').
        kg: object whose ``parse(data=..., format="turtle")`` receives the
            generated document; it is modified in place.

    Returns:
        None.

    Raises:
        KeyError: propagated from ``SemanticPerson`` when a column is absent
            or the sex value has no code mapping.
    """
    semantic_person = SemanticPerson(row)
    # Single determination date shared by the age and BMI measurements.
    determination_date = "2018-01-01T00:00:00"

    # Fields are read by key rather than by unpacking ``.values()``, so the
    # result does not depend on the insertion order of the dictionaries.
    person = semantic_person.get_pid()
    age = semantic_person.get_age()
    sex = semantic_person.get_sex()
    bmi = semantic_person.get_bmi()
    diagnosis = semantic_person.get_diagnosis('presence of aneurysm')
    person_iri = person['iri']

    # patient id
    fragments = [
        sphn_person_template.substitute(
            person_iri=person_iri,
            person_id=person['value'],
        )
    ]

    # age, only if available (a missing age would otherwise be written as "nan")
    if not pd.isna(age['value']):
        fragments.append(
            sphn_age_template.substitute(
                person_iri=person_iri,
                age_iri=age['iri'],
                age_value=age['value'],
                age_determination_date=determination_date,
            )
        )

    # administrative sex
    fragments.append(
        sphn_sex_template.substitute(
            person_iri=person_iri,
            sex_iri=sex['iri'],
            sex_value=sex['value'],
        )
    )

    # bmi, only if available
    if not pd.isna(bmi['value']):
        fragments.append(
            sphn_bmi_template.substitute(
                person_iri=person_iri,
                bmi_iri=bmi['iri'],
                bmi_value=bmi['value'],
                bmi_determination_date=determination_date,
            )
        )

    # diagnosis: intracranial aneurysm
    # The explicit comparison is deliberate: a missing status (NaN) is truthy
    # and must not produce a diagnosis.
    if diagnosis['status'] == True:  # noqa: E712
        fragments.append(
            sphn_diagnosis_code_template.substitute(
                patient_iri=person_iri,
                diagnosis_iri=diagnosis['iri'],
                diagnosis_label=diagnosis['label'],
                diagnosis_code=diagnosis['code'],
            )
        )

    full_rdf = prefix + "\n" + "".join(fragments)

    if verbose:
        print("======= RDF DATA =======")
        print(full_rdf)
        print("========================")

    kg.parse(data=full_rdf, format="turtle")


## Serialize data
# Build the knowledge graph row by row, then write it out as Turtle.
# NOTE(review): only the rows selected by [0:4] are converted — confirm that
# this subset is intentional before producing the final deliverable.
kg = Dataset()
for row_label, patient_row in angiodf[0:4].iterrows():
    generate_semanticBiollection_rdf(patient_row, kg=kg)
triple_count = len(kg)
print(f"Generated {triple_count} RDF triples")
kg.serialize(destination="deliverable/patients.ttl", format="turtle")

@prefix sphn: <http://sphn.org/> .
@prefix ican: <http://ican.ressource.org/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix efo: <http://www.ebi.ac.uk/efo/EFO_> .
@prefix obi: <http://purl.obolibrary.org/obo/OBI_> .
@prefix iao: <http://purl.obolibrary.org/obo/IAO_> .
@prefix ncit: <http://purl.obolibrary.org/obo/NCIT_> .
@prefix hpo: <http://purl.obolibrary.org/obo/HPO_> .


    <http://ican.ressource.org/individual#A_II-2> a ncit:C25190 .
    <http://ican.ressource.org/individual#A_II-2> rdf:type sphn:SubjectPseudoIdentifier ;
                sphn:hasIdentifier "A_II-2"^^xsd:string ;
                sphn:hasDataProvider "ICAN-biocollection"^^xsd:string .
    
    <http://ican.ressource.org/individual#A_II-2/age#92> a sphn:Age , obi:0001167 ;
        sphn:hasDeterminationDateTime "2018-01-01T00:00:00"^^xsd:dateTime ;
        sphn:hasQuantity [ rdf:type sp

<Graph identifier=N9ba8776a5f3f4e44974d80e496963f04 (<class 'rdflib.graph.Dataset'>)>