# Transformation of tabular dataset into SPHN RDF data
source: Predicting-clinical-outcomes-with-TKG-B2B9
Clinical outcomes papier :     
https://doi.org/10.48550/arXiv.2502.21138

In [57]:
%pip install pandas rdflib scipy
import pandas as pd
import numpy as np
import joblib
import uuid
from rdflib import ConjunctiveGraph, Dataset
from string import Template
from itertools import accumulate
from scipy.stats import norm
from datetime import datetime, timedelta
from IPython.display import display

Note: you may need to restart the kernel to use updated packages.


In [58]:
# syntheticican2 clinical data
df_syntheticican2 = pd.read_csv('data/syntheticican2/ucanNeurovascSimulatedData_14042025.csv', index_col=0, sep=';', quotechar='"')
df_syntheticican2 = df_syntheticican2.sample(frac=1, random_state=42).reset_index(drop=True)
# patients from rare coding variants in angptl6
df_rcvangptl6 = pd.read_csv('data/rarecodingvariantsinANGPTL6/mmc2.csv', sep=';')
#
def is_valid_int_gt_zero(val):
    try:
        intval = int(val)
        return intval > 0
    except (ValueError, TypeError):
        return False
#
df_rcvangptl6_patients = df_rcvangptl6[df_rcvangptl6['number of IA'].apply(is_valid_int_gt_zero)]
print(df_rcvangptl6_patients.reset_index(drop=True))


   family     ID sex date of birth (-death) phenotype number of IA  \
0       A   II-2   F                   1926  affected            1   
1       A   II-5   F              1929-1974  affected            1   
2       A  III-1   F                   1960  affected            3   
3       A  III-5   F                   1958  affected            2   
4       A   IV-1   F                   1978  affected            1   
5       B   II-1   F                   1951  affected            2   
6       B   II-2   F                   1947  affected            1   
7       C   II-4   M                   1956  affected            1   
8       C   II-5   M                   1957  affected            2   
9       D   II-1   F                   1960  affected            2   
10      E   II-1   M                   1948  affected            1   
11      E  III-1   M                   1970  affected            2   
12      E  III-3   M                   1975  affected            1   
13      F   II-3   M

In [59]:
# functions to transfrom rcvangptl6 df into something more structured
# pid: patient id
def get_pid(row):
    return row['family']+"_"+row['ID']
# sex
def get_sex(row):
    return row['sex']
# age
def get_age(row):
    dob = row['date of birth (-death)'].split('-')[0]
    doe = 2018 # doe = date of end, 2018 = year of publication of the patient table
    try:
        doe = int(row['date of birth (-death)'].split('-')[1])
    except:
        pass
    age = int(doe) - int(dob)
    return age
# bmi
def get_bmi(row):
    try:
        bmi = float(row['BMI'])
        if bmi <= 0:
            return pd.NA
        else:
            return bmi
    except ValueError:
        return pd.NA
# presence of aneursym
def get_presenceOfAneurysm(row):
    number_of_ia = row['number of IA']
    if pd.isna(number_of_ia):
        return False
    else:
        return True
# number of aneurysm
def get_numberOfAneurysm(row):
    number_of_ia = row['number of IA']
    if pd.isna(number_of_ia):
        return 0
    else:
        try:
            return int(number_of_ia)
        except ValueError:
            return 'unknown'
# aneurysm location 
def get_aneurysmLocation(row, nth):
    l = row['Location of IA']
    try:
        location = l.split(',')[nth-1]
        if len(location.strip()) == 0:
            return 'unknown'
        else:
            return location.strip()
    except:
        return pd.NA
# sequenced dna
def get_sequencedDNA(row):
    if row['DNA available'] == 'YES':
        return True
    else:
        return False
# serum available
def get_serumAvailable(row):
    if row['Serum available'] == 'YES':
        return True
    else:
        return False
# genomic variations
def get_genomicVariation(row):
    return row['ANGPTL6 status']
# smokig status
def get_smokingStatus(row, status):
    if status == "former smoker":
        status = "stopped smoking"
    if status in row['smoking status']:
        return True
    else:
        return False
# create health object
def create_health_object(row):
    class HealthInfo:
        def __init__(self, row):
            self.row = row
        # high blood pressure
        def get_highBloodPressure(self):
            has_disease = self.row.get('high blood pressure')
            if pd.isna(has_disease):
                status = pd.NA
            elif "HBP" in has_disease:
                status = True
            elif "unknown" in has_disease:
                status = pd.NA
            elif "no" in has_disease:
                status = False
            return status
        #
        def get_highBloodPressure_treatment(self):
            has_treatment = self.row.get('high blood pressure')
            if pd.isna(has_treatment):
                status = pd.NA
            elif "unknown" in has_treatment:
                status = pd.NA
            elif "no" in has_treatment:
                status = False
            elif "HBP treated" in has_treatment:
                status = True
            return status
        # diabetes type 1
        def get_diabetesType1(self):
            has_disease = self.row.get('Diabetis - Dyslipidemia')
            if pd.isna(has_disease):
                status = pd.NA
            elif "Insulino dependant" in has_disease:
                status = True
            elif "no" in has_disease:
                status = False
            else:
                status = False
            return status
        # dyslipidemia
        def get_dyslipidemia(self):
            has_disease = self.row.get('Diabetis - Dyslipidemia')
            if pd.isna(has_disease):
                status = pd.NA
            elif "dyslipidemia" in has_disease:
                status = True
            elif "no" in has_disease:
                status = False
            else:
                status = False
            return status
        def get_dyslipidemia_treatment(self):
            has_treatment = self.row.get('Diabetis - Dyslipidemia')
            if pd.isna(has_treatment):
                status = pd.NA
            elif "untreated" in has_treatment:
                status = "untreated"
            elif "statins" in has_treatment:
                status = "statins"
            else:
                status = pd.NA
            return status
        # aspirin
        def get_aspirinTreatment(self):
            has_treatment = self.row.get('Aspirin/VitK antagonist')
            if pd.isna(has_treatment):
                status = pd.NA
            elif "no" in has_treatment:
                status = False
            elif "unkown" in has_treatment:
                status = pd.NA
            elif "Aspirin" in has_treatment:
                status = True
            else:
                status = pd.NA
            return status
        # vitamin K antagonist
        def get_vitkantagonistTreatment(self):
            has_treatment = self.row.get('Aspirin/VitK antagonist')
            if pd.isna(has_treatment):
                status = pd.NA
            elif "no" in has_treatment:
                status = False
            elif "unkown" in has_treatment:
                status = pd.NA
            elif "VitK antagonist" in has_treatment:
                status = True
            else:
                status = pd.NA
            return status
            

    return HealthInfo(row)




In [None]:
# transfrom rcvangptl6 df into something more structured
columns = ['pid', 'sex', 'age', 'bmi', 'presence of aneurysm', 'number of aneurysms', 'location aneurysum 1', 'location aneurysum 2', 
           'location aneurysum 3', 'sequenced dna', 'sampled serum', 'genomic variation', 
           'never smoked', 'former smoker', 'current smoker', 
           'high blood pressure', 'high blood pressure treatment', 
           'diabetes type 1',
           'dyslipidemia', 'dyslipidemia treatment',
           'aspirin treatment', 'vitamin K antagonist treatment']
newdf = pd.DataFrame(columns=columns)

for index, row in df_rcvangptl6_patients.iterrows():
    new_row = {
        'pid' : get_pid(row),
        'sex' : get_sex(row),
        'age' : get_age(row),
        'bmi' : get_bmi(row),
        'presence of aneurysm' : get_presenceOfAneurysm(row),
        'number of aneurysms' : get_numberOfAneurysm(row),
        'location aneurysum 1' : get_aneurysmLocation(row, 1),
        'location aneurysum 2' : get_aneurysmLocation(row, 2),
        'location aneurysum 3' : get_aneurysmLocation(row, 3), 
        'sequenced dna' : get_sequencedDNA(row),
        'sampled serum' : get_serumAvailable(row),
        'genomic variation' : get_genomicVariation(row),
        'never smoked' : get_smokingStatus(row, 'never smoked'),
        'former smoker' : get_smokingStatus(row, 'former smoker'),
        'current smoker' : get_smokingStatus(row, 'current smoker'),
        'high blood pressure' : create_health_object(row).get_highBloodPressure(),
        'high blood pressure treatment' : create_health_object(row).get_highBloodPressure_treatment(),
        'diabetes type 1' : create_health_object(row).get_diabetesType1(),
        'dyslipidemia' : create_health_object(row).get_dyslipidemia(),
        'dyslipidemia treatment' :create_health_object(row).get_dyslipidemia_treatment(),
        'aspirin treatment' : create_health_object(row).get_aspirinTreatment(),
        'vitamin K antagonist treatment' : create_health_object(row).get_vitkantagonistTreatment()
    }

    newdf = pd.concat([newdf, pd.DataFrame([new_row])], ignore_index=True)
# display the structured dataframe
newdf.to_csv('deliverable/structured_ANGPTL6_patients.csv', index=False, sep=';')


  newdf = pd.concat([newdf, pd.DataFrame([new_row])], ignore_index=True)


Unnamed: 0,pid,sex,age,bmi,presence of aneurysm,number of aneurysms,location aneurysum 1,location aneurysum 2,location aneurysum 3,sequenced dna,...,never smoked,former smoker,current smoker,high blood pressure,high blood pressure treatment,diabetes type 1,dyslipidemia,dyslipidemia treatment,aspirin treatment,vitamin K antagonist treatment
0,A_II-2,F,92,,True,1,ACoA,,,True,...,True,False,False,True,True,False,True,statins,False,False
1,A_II-5,F,45,,True,1,MCA,,,False,...,True,False,False,True,True,False,True,,False,False
2,A_III-1,F,58,,True,3,MCA,ICA,ACA,True,...,False,False,True,True,True,False,False,,False,False
3,A_III-5,F,60,23.0,True,2,ICA,ICA,,True,...,True,False,False,True,True,False,False,,False,False
4,A_IV-1,F,40,22.0,True,1,MCA,,,True,...,False,True,False,False,False,False,False,,False,False
5,B_II-1,F,67,25.0,True,2,MCA,PCoA,,True,...,False,False,True,,,False,True,statins,False,False
6,B_II-2,F,71,22.0,True,1,PCoA,,,True,...,True,False,False,,,False,False,,False,False
7,C_II-4,M,62,20.0,True,1,ICA,,,True,...,False,False,True,False,False,False,True,statins,False,False
8,C_II-5,M,61,21.0,True,2,MCA,ACA,,True,...,False,True,False,True,True,False,False,,False,False
9,D_II-1,F,58,25.0,True,2,MCA,unknown,,True,...,True,False,False,True,True,False,False,,,True


In [97]:
# modelisation d'un patient/individu/sujet
def pandasRow2semanticPerson(row):
    class semanticPerson():
        # init
        def __init__(self, row):
            self.row = row
        # pid
        def get_pid(self):
            pid = "individual#"+self.row['pid']
            return pid
        # age
        def get_age(self):
            return self.row['age']
        # bmi 
        def get_bmi(self):
            return self.row['bmi']
        # gender
        def get_gender(self):
            return self.row['sex']

    return semanticPerson(row)


In [None]:
# verbose 
verbose = False
# prefix and templates 
prefix = """@prefix sphn: <http://sphn.org/> .
@prefix ican: <http://ican.ressource.org/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix efo: <http://www.ebi.ac.uk/efo/EFO_> .
@prefix iciperson: <http://ontology.eil.utoronto.ca/icity/> .
@prefix obi: <http://purl.obolibrary.org/obo/OBI_> .
@prefix iao: <http://purl.obolibrary.org/obo/IAO_> .
"""

# ican individual identifier
# it is a person and it is also a sphn:SubjectPseudoIdentifier
# it has a (spnh:hasIdentifier) string identifier
sphn_patient_template = Template(
    """
    ican:$patient_id a iciperson:Person .
    ican:$patient_id rdf:type sphn:SubjectPseudoIdentifier ;
                            sphn:hasIdentifier "$patient_id"^^xsd:string .
    """
)
# age is an sphn:Age and also an efo:0001610 (age) or an obi:0001167 (age measurement datum)
# to link the property age to the patient we can use is_quality_measurement_of (IAO_0000221)
sphn_age_template = Template(
    """
    ican:$age_id a sphn:Age , obi:0001167 ;
        sphn:hasDeterminationDateTime "$age_determination_date"^^xsd:dateTime ;
        sphn:hasQuantity [ rdf:type sphn:Quantity ;
                            sphn:hasValue "$age_value" ;
                            sphn:hasUnit "years" ] .
    ican:$patient_id iao:0000221 ican:$age_id .
    """
)
sphn_gender_template = Template(
    """
    ican:gender_$gender_id a sphn:AdministrativeGender ;
        sphn:hasCode ican:code_$gender_code .
        
    ican:$patient_id ican:hasGender ican:gender_$gender_id .
    """
)

def generate_semanticBiollection_rdf(row, kg):
    semanticPerson = pandasRow2semanticPerson(row)

    # parient id
    patient_id = semanticPerson.get_pid()
    # age
    age_value = semanticPerson.get_age()
    age_id = str(age_value)
    age_date = "2018-01-01T00:00:00"
    # administrative gender
    gender_value = semanticPerson.get_gender()
    gender_id = gender_value

    rdf = sphn_patient_template.substitute(patient_id=patient_id) + \
          sphn_age_template.substitute(
                            patient_id=patient_id,
                            age_id=age_id,
                            age_value=age_value,
                            age_determination_date=age_date) + \
          sphn_gender_template.substitute(
                            patient_id=patient_id,
                            gender_id=gender_id, 
                            gender_code=gender_value
          )

    full_rdf = prefix + "\n" + rdf

    if verbose == True:
        print("======= RDF DATA =======")
        print(full_rdf)
        print("========================")

    kg.parse(data=full_rdf, format="turtle")


## Serialize data
kg = Dataset()
newdf[0:2].apply(generate_semanticBiollection_rdf, axis=1, kg=kg)
print(f"Generated {len(kg)} RDF triples")
kg.serialize(destination="deliverable/patients.ttl", format="turtle")

BadSyntax: at line 14 of <>:
Bad syntax (expected '.' or '}' or ']' at end of statement) at ^ in:
"...b'Identifier ;\n                            sphn:hasIdentifier '^b'"individual#A_II-2"^^xsd:string .\n    \n    ican:92 a sphn:Ag'..."