<a href="https://colab.research.google.com/github/noctillion/12-days-of-biopython/blob/main/Convert_ichange_data_to_phenopackets_sies_nov023.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Convert ICHANGE database extracts into phenopackets format


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import json
import pandas as pd
import numpy as np
import uuid
import random
from datetime import date, datetime, time, timezone

In [3]:
timestamp = date.today()
timestamp

datetime.date(2023, 11, 29)

In [4]:
datetimex = datetime.now(timezone.utc).isoformat(timespec='seconds')
str(datetimex)

'2023-11-29T21:12:31+00:00'

In [5]:
pd.set_option('display.max_columns', None)

## Utils functions

In [6]:
def convert_age_to_iso(age, age_unit):
    """ Converts age to ISO 8601 """

    if age_unit.strip() == "years":
        return f"P{str(age)}Y"
    elif age_unit.strip() == "months":
        return f"P0Y{str(age)}M"
    elif age_unit.strip() == "weeks":
        return f"P0Y0M{str(age)}W"
    elif age_unit.strip() == "days":
        return f"P0Y0M{str(age)}D"
    else:
        return f"{age} {age_unit}"

In [7]:
def get_patient_data(df, db_column):
    """ Checks that there is only one value for patient's data and it's consistent """

    value = list(df[db_column].unique())
    if len(value) == 1:
        return value[0]

## Reference resources and administrative metadata

In [8]:
# Ontologies
# If new ontologies used, add them to the list

RESOURCES = {
    "NCBITaxon": {
        "name": "NCBI Taxonomy OBO Edition",
        "version": "2018-07-27",
        "namespace_prefix": "NCBITaxon",
        "id": "NCBITaxon:2018-07-27",
        "iri_prefix": "http://purl.obolibrary.org/obo/NCBITaxon_",
        "url": "http://purl.obolibrary.org/obo/ncbitaxon.owl"
    },
    "ICHANGE": {
        "name": "ICHANGE Controlled vocabulary",
        "version": "2020-12-11",
        "namespace_prefix": "ICHANGE",
        "id": "ICHANGE:2020-12-11",
        "iri_prefix": "http://example.org/ICHANGE/",
        "url": "http://example.org/ICHANGE/"
    },
    "SNOMED": {
        "name": "SNOMED Clinical Terms",
        "version": "2019-04-11",
        "namespace_prefix": "SNOMED",
        "id": "SNOMED:2019-04-11",
        "iri_prefix": "http://purl.bioontology.org/ontology/SNOMEDCT/",
        "url": "http://purl.bioontology.org/ontology/SNOMEDCT"
    }
}

In [9]:
# Metadata object to be assigned to every single phenopacket

METADATA = {
            "phenopacket_schema_version": "1.0.0-RC3",
            "created_by": "C3G Team",
            "submitted_by": "C3G Team",
            "resources": [
                RESOURCES["NCBITaxon"],
                RESOURCES["ICHANGE"],
                RESOURCES["SNOMED"]
            ]
        }

In [10]:
# CVs mappings between ICHANGE and  Phenopackets

SEX_TO_SEX_MAPPING = {
    "M": "MALE",
    "F": "FEMALE",
    "NaN": "UNKNOWN_SEX"
}

In [11]:
# Universal ontology terms

NCBI_TAXON_HOMO_SAPIENS = {
    "id": "NCBITaxon:9606",
    "label": "Homo sapiens"
}

NCBI_TAXON_MUS_MUSCULUS = {
    "id": "NCBITaxon:10090",
    "label": "Mus musculus"
}

PROCEDURE_CODE_NOT_ASSIGNED = {
    "code": {
        "label": "Procedure code not assigned",
        "id": "SNOMED:42630001"
    }
}

UNKNOWN_CONCEPT = {
    "id": "SNOMED:261665006",
    "label": "Unknown"
}

## Read patients and samples csv file

In [None]:
# SQL query to retrieve "patients_samples.csv" data (from table patients)

# SELECT [ichange].[dbo].[patients].[patientID]
#       ,[ichange].[dbo].[patients].[age]
#       ,[ageGroup]
#       ,[ichange].[dbo].[patients].[ageUnit]
#       ,[date_birth]
#       ,[gender]
#       ,[identifier]
#       ,[patientCode]
#       ,[sibling]
#       ,[ichange].[dbo].[patients].[sourceID]
#       ,[ichange].[dbo].[samples].[sampleID]
#       ,[ichange].[dbo].[samples].[note]
#       ,[ichange].[dbo].[samples].[CollectionMethodID]
#       ,[ichange].[dbo].[samples].[diagnosis_note]
#       ,[ichange].[dbo].[samples].[RNAseq]
#       ,[ichange].[dbo].[samples].[sourceID] as samples_sourceID
#       ,[ichange].[dbo].[samples].[sourceSampleID]
#       ,[ichange].[dbo].[samples].[barCodeNo]
#       ,[ichange].[dbo].[samples].[specieID]
#       ,[ichange].[dbo].[species].[specie_en]
#       ,[ichange].[dbo].[samples].[age] as samples_age
#       ,[ichange].[dbo].[samples].[ageUnit] as samples_ageUnit
#       ,[ichange].[dbo].[samples].[recurrence]

#   FROM [ichange].[dbo].[patients]
#   LEFT JOIN [ichange].[dbo].[samples] ON [patients].[patientID] = [ichange].[dbo].[samples].[patientID]
#   LEFT JOIN [ichange].[dbo].[species] ON [samples].[specieID] = [ichange].[dbo].[species].[SpecieID]

In [12]:
patients_samples = "/content/drive/MyDrive/genomicc3g/patients_samples.csv"

In [13]:
patients_and_samples_df = pd.read_csv(patients_samples, dtype=str)
patients_and_samples_df.head()

Unnamed: 0,patientID,age,ageGroup,ageUnit,date_birth,gender,identifier,patientCode,sibling,sourceID,sampleID,note,CollectionMethodID,diagnosis_note,RNAseq,samples_sourceID,sourceSampleID,barCodeNo,specieID,specie_en,samples_age,samples_ageUnit,recurrence
0,0,,,,,,,P-0,0,,1002,H3F3A K27M,,,0,,DIPG04T,S-1002,1,Human,,,0
1,1003,,pediatric,,,,,P-1003,0,1.0,1003,,,Diagnosis: immunodeficiency,0,1.0,AD14,S-1003,1,Human,,,0
2,1004,,,,,,,P-1004,0,1.0,1004,,,Diagnosis: immunodeficiency,0,1.0,AD141,S-1004,1,Human,,,0
3,1005,,,,,,,P-1005,0,1.0,1005,,,Diagnosis: immunodeficiency,0,1.0,AD142,S-1005,1,Human,,,0
4,1006,,pediatric,,,,,P-1006,0,1.0,1006,,,Diagnosis: immunodeficiency,0,1.0,AD150,S-1006,1,Human,,,0


In [14]:
patients_and_samples_df.shape

(10288, 23)

In [15]:
patients_and_samples_df = patients_and_samples_df.fillna("nan")
patients_and_samples_df.head()

Unnamed: 0,patientID,age,ageGroup,ageUnit,date_birth,gender,identifier,patientCode,sibling,sourceID,sampleID,note,CollectionMethodID,diagnosis_note,RNAseq,samples_sourceID,sourceSampleID,barCodeNo,specieID,specie_en,samples_age,samples_ageUnit,recurrence
0,0,,,,,,,P-0,0,,1002,H3F3A K27M,,,0,,DIPG04T,S-1002,1,Human,,,0
1,1003,,pediatric,,,,,P-1003,0,1.0,1003,,,Diagnosis: immunodeficiency,0,1.0,AD14,S-1003,1,Human,,,0
2,1004,,,,,,,P-1004,0,1.0,1004,,,Diagnosis: immunodeficiency,0,1.0,AD141,S-1004,1,Human,,,0
3,1005,,,,,,,P-1005,0,1.0,1005,,,Diagnosis: immunodeficiency,0,1.0,AD142,S-1005,1,Human,,,0
4,1006,,pediatric,,,,,P-1006,0,1.0,1006,,,Diagnosis: immunodeficiency,0,1.0,AD150,S-1006,1,Human,,,0


In [16]:
# list all columns

list(patients_and_samples_df)

['patientID',
 'age',
 'ageGroup',
 'ageUnit',
 'date_birth',
 'gender',
 'identifier',
 'patientCode',
 'sibling',
 'sourceID',
 'sampleID',
 'note',
 'CollectionMethodID',
 'diagnosis_note',
 'RNAseq',
 'samples_sourceID',
 'sourceSampleID',
 'barCodeNo',
 'specieID',
 'specie_en',
 'samples_age',
 'samples_ageUnit',
 'recurrence']

In [17]:
# count unique patients
unique_patients= len(patients_and_samples_df["patientID"].unique())

# count unique samples
unique_samples= len(patients_and_samples_df["sampleID"].unique())

# +1 NULL this number always will be +1 to what the count of all samples in the datbase because it counts sampleID value NULL as ne of the unique values
print(f"Unique patients: {unique_patients}\nUnique samples: {unique_samples}")

Unique patients: 5456
Unique samples: 10052


In [18]:
patients_id_list = list(patients_and_samples_df.patientID.unique())

## Convert to phenopackets

In [19]:
## functional
patients_id_list = list(patients_and_samples_df.patientID.unique())

phenopackets = []

for patient_id in patients_id_list:
    df_patient = patients_and_samples_df[patients_and_samples_df.patientID==patient_id]

    phenopacket = {
        "id": patient_id,
        "subject": {
            "id": patient_id,
            "extra_properties": {}
        },
        "biosamples": [],
        "diseases": [],
        "meta_data": METADATA
    }

    for dic in [{"date_birth": "date_of_birth"}]:
        for key, value in dic.items():
            # any phenotypic value
            pheno_value = get_patient_data(df_patient, key)
            if pheno_value and pheno_value != "nan":
                phenopacket["subject"][value] = pheno_value

    for dic in [{"ageGroup": "age_group"},
                {"identifier": "identifier"},
                {"patientCode": "patient_code"},
                {"sibling": "sibling"},
                {"sourceID": "source_id"}
               ]:
        for key, value in dic.items():
            # extra properties value
            extra_property_value = get_patient_data(df_patient, key)
            if extra_property_value and extra_property_value != "nan":
                phenopacket["subject"]["extra_properties"][value] = extra_property_value

    # alternate_ids
    if get_patient_data(df_patient, "identifier") and get_patient_data(df_patient, "identifier") != "nan":
        phenopacket["subject"]["alternate_ids"] = [str(get_patient_data(df_patient, "identifier"))]

    # Handling of age using TimeElement
    age = get_patient_data(df_patient, "age")
    age_unit = get_patient_data(df_patient, "ageUnit")
    if age and age != "nan":
        iso_age = convert_age_to_iso(age, age_unit if age_unit and age_unit != "nan" else "years")
        phenopacket["subject"]["time_at_last_encounter"] = {
            "age": {
                "iso8601duration": iso_age
            }
        }

    # Handling of sex
    sex = get_patient_data(df_patient, "gender")
    if (sex and sex != "nan"):
        phenopacket["subject"]["sex"] = SEX_TO_SEX_MAPPING[sex]
    else:
        phenopacket["subject"]["sex"] = SEX_TO_SEX_MAPPING["NaN"]

#     if diseases:
#         unique_diseases = [i for n, i in enumerate(diseases) if i not in diseases[n + 1:]]
#         phenopacket["diseases"] = unique_diseases


    # Handling biosamples and diagnosis
    if "sampleID" in list(df_patient.columns):
        biosamples = list(df_patient["sampleID"])
        sample_notes = list(df_patient["note"])
        diagnosis_notes = list(df_patient["diagnosis_note"])
        barcodes = list(df_patient["barCodeNo"])
        species_id = list(df_patient["specieID"])
        sample_ages = list(df_patient["samples_age"])
        sample_age_units = list(df_patient["samples_ageUnit"])
        source_sample_ids = list(df_patient["sourceSampleID"])

        for sample_id, sample_note, diagnosis_note, barcode, specie_id, sample_age, sample_age_unit, source_sample_id in zip(
            biosamples, sample_notes, diagnosis_notes, barcodes, species_id, sample_ages, sample_age_units,
            source_sample_ids
        ):
            if sample_id == "nan" or sample_id is None:
                continue

            biosample = {
                "id": barcode,
                "description": sample_id,
                "procedure": PROCEDURE_CODE_NOT_ASSIGNED
            }

            # add diagnosis_note to extra_properties
            if isinstance(diagnosis_note, str) and diagnosis_note != "nan":
                if "extra_properties" in biosample:
                    biosample["extra_properties"].update({"diagnosis_note": diagnosis_note})
                else:
                    biosample["extra_properties"] = {"diagnosis_note": diagnosis_note}
            # add sourceSampleID
            if isinstance(source_sample_id, str) and source_sample_id != "nan":
                if "extra_properties" in biosample:
                    biosample["extra_properties"].update({"source_sample_id": source_sample_id})
                else:
                    biosample["extra_properties"] = {"source_sample_id": source_sample_id}
            # add taxonomy
            if isinstance(specie_id, str) and specie_id != "nan":
                if specie_id == "1":
                    biosample["taxonomy"] = NCBI_TAXON_HOMO_SAPIENS
                elif specie_id == "2":
                    biosample["taxonomy"] = NCBI_TAXON_MUS_MUSCULUS
                else:
                    pass

            # Handling of sample age using TimeElement
            if sample_age and sample_age != "nan":
                iso_age = convert_age_to_iso(sample_age, sample_age_unit if sample_age_unit and sample_age_unit != "nan" else "years")
                biosample["time_of_collection"] = {
                    "age": {
                        "iso8601duration": iso_age
                    }
                }

            phenopacket["biosamples"].append(biosample)

    # Append the constructed phenopacket
    phenopackets.append(phenopacket)


In [None]:
# get list of all patients ids

patients_id_list = list(patients_and_samples_df.patientID.unique())

phenopackets = []

for patient_id in patients_id_list:
    df_patient = patients_and_samples_df[patients_and_samples_df.patientID==patient_id]

    phenopacket = {
        "id": patient_id,
        "subject": {
            "id": patient_id,
            "extra_properties": {}
        },
        "biosamples": [],
        "diseases": [],
        "meta_data": METADATA
    }

    for dic in [{"date_birth": "date_of_birth"}]:
        for key, value in dic.items():
            # any phenotypic value
            pheno_value = get_patient_data(df_patient, key)
            if pheno_value and pheno_value != "nan":
                phenopacket["subject"][value] = pheno_value

    for dic in [{"ageGroup": "age_group"},
                {"identifier": "identifier"},
                {"patientCode": "patient_code"},
                {"sibling": "sibling"},
                {"sourceID": "source_id"}
               ]:
        for key, value in dic.items():
            # extra properties value
            extra_property_value = get_patient_data(df_patient, key)
            if extra_property_value and extra_property_value != "nan":
                phenopacket["subject"]["extra_properties"][value] = extra_property_value

    # alternate_ids
    if get_patient_data(df_patient, "identifier") and get_patient_data(df_patient, "identifier") != "nan":
        phenopacket["subject"]["alternate_ids"] = [str(get_patient_data(df_patient, "identifier"))]

    #age
    '''     age = get_patient_data(df_patient, "age")
    age_unit = get_patient_data(df_patient, "ageUnit")
    if (age and age != "nan") and (age_unit and age_unit != "nan"):
        iso_age = convert_age_to_iso(age, age_unit)
        phenopacket["subject"]["age"] = {
            "age": iso_age
        } '''
    age = get_patient_data(df_patient, "age")
    age_unit = get_patient_data(df_patient, "ageUnit")
    if age and age != "nan":
        iso_age = convert_age_to_iso(age, age_unit if age_unit and age_unit != "nan" else "years")
        phenopacket["subject"]["time_at_last_encounter"] = {
            "age": {
                "iso8601duration": iso_age
            }
        }

    # it's probably years if age is there and units are not
    elif (age and age != "nan") and (age_unit and age_unit == "nan"):
        iso_age = convert_age_to_iso(age, "years")
        phenopacket["subject"]["age"] = {
            "age": iso_age
        }
    else:
        pass


    # sex
    sex = get_patient_data(df_patient, "gender")
    if (sex and sex != "nan"):
        phenopacket["subject"]["sex"] = SEX_TO_SEX_MAPPING[sex]
    else:
        phenopacket["subject"]["sex"] = SEX_TO_SEX_MAPPING["NaN"]

#     if diseases:
#         unique_diseases = [i for n, i in enumerate(diseases) if i not in diseases[n + 1:]]
#         phenopacket["diseases"] = unique_diseases


    # add biosamples and diagnosis
    if "sampleID" in list(df_patient.columns):
        biosamples = list(df_patient["sampleID"])
        sample_notes = list(df_patient["note"])
        diagnosis_notes = list(df_patient["diagnosis_note"])
        barcodes = list(df_patient["barCodeNo"])
        species_id = list(df_patient["specieID"])
        sample_ages = list(df_patient["samples_age"])
        sample_age_units = list(df_patient["samples_ageUnit"])
        source_sample_ids = list(df_patient["sourceSampleID"])
        '''         for sample_id, sample_note, diagnosis_note, barcode, specie_id, sample_age, sample_age_unit, source_sample_id in zip(
            biosamples, sample_notes, diagnosis_notes, barcodes, species_id, sample_ages, sample_age_units,
            source_sample_ids
        ): '''
        for sample_id, sample_note, diagnosis_note, barcode, specie_id, sample_age, sample_age_unit, source_sample_id in zip(
        biosamples, sample_notes, diagnosis_notes, barcodes, species_id, sample_ages, sample_age_units,
        source_sample_ids
            # if there is no sampleID then there is no other sample related information
            if sample_id == "nan" or sample_id is None:
                pass

            # continue with those patients that have samples
            else:
                biosample = {
                    # as we agreed we are going to use barCodeNo as sample id
                    "id": barcode,
                    # same db sampleID to description field
                    # !!!!!!!!!!!!TODO fix save as number
                    "description": sample_id,
                    "procedure": PROCEDURE_CODE_NOT_ASSIGNED
                }

                # add diagnosis_note to extra_properties
                if isinstance(diagnosis_note, str) and diagnosis_note != "nan":
                    if "extra_properties" in biosample:
                        biosample["extra_properties"].update({"diagnosis_note": diagnosis_note})
                    else:
                        biosample["extra_properties"] = {"diagnosis_note": diagnosis_note}
                # add sourceSampleID
                if isinstance(source_sample_id, str) and source_sample_id != "nan":
                    if "extra_properties" in biosample:
                        biosample["extra_properties"].update({"source_sample_id": source_sample_id})
                    else:
                        biosample["extra_properties"] = {"source_sample_id": source_sample_id}
                # add taxonomy
                if isinstance(specie_id, str) and specie_id != "nan":
                    if specie_id == "1":
                        biosample["taxonomy"] = NCBI_TAXON_HOMO_SAPIENS
                    elif specie_id == "2":
                        biosample["taxonomy"] = NCBI_TAXON_MUS_MUSCULUS
                    else:
                        pass

                # sample age
                ''' if (sample_age and sample_age != "nan") and (sample_age_unit and sample_age_unit != "nan"):
                    iso_age = convert_age_to_iso(sample_age, sample_age_unit)
                    biosample["individual_age_at_collection"] = {
                        "age": iso_age
                    }
                elif (sample_age and sample_age != "nan") and not (sample_age_unit and sample_age_unit == "nan"):
                    iso_age = convert_age_to_iso(age, "years")
                    biosample["individual_age_at_collection"] = {
                        "age": iso_age
                    }
                else:
                    pass

                phenopacket["biosamples"].append(biosample) '''
              if sample_age and sample_age != "nan":
                iso_age = convert_age_to_iso(sample_age, sample_age_unit if sample_age_unit and sample_age_unit != "nan" else "years")
                biosample["time_of_collection"] = {
                    "age": {
                        "iso8601duration": iso_age
                    }
                }

        phenopacket["biosamples"].append(biosample)

    # list datatypes inside phenopacket
    for datatype in ["biosamples"]:
        if not phenopacket[datatype]:
            del phenopacket[datatype]


    phenopackets.append(phenopacket)

In [20]:
# get list of all patients ids

patients_id_list = list(patients_and_samples_df.patientID.unique())

phenopackets = []

for patient_id in patients_id_list:
    df_patient = patients_and_samples_df[patients_and_samples_df.patientID==patient_id]

    phenopacket = {
        "id": patient_id,
        "subject": {
            "id": patient_id,
            "extra_properties": {}
        },
        "biosamples": [],
        "diseases": [],
        "meta_data": METADATA
    }

    for dic in [{"date_birth": "date_of_birth"}]:
        for key, value in dic.items():
            # any phenotypic value
            pheno_value = get_patient_data(df_patient, key)
            if pheno_value and pheno_value != "nan":
                phenopacket["subject"][value] = pheno_value

    for dic in [{"ageGroup": "age_group"},
                {"identifier": "identifier"},
                {"patientCode": "patient_code"},
                {"sibling": "sibling"},
                {"sourceID": "source_id"}
               ]:
        for key, value in dic.items():
            # extra properties value
            extra_property_value = get_patient_data(df_patient, key)
            if extra_property_value and extra_property_value != "nan":
                phenopacket["subject"]["extra_properties"][value] = extra_property_value

    # alternate_ids
    if get_patient_data(df_patient, "identifier") and get_patient_data(df_patient, "identifier") != "nan":
        phenopacket["subject"]["alternate_ids"] = [str(get_patient_data(df_patient, "identifier"))]

    #age
    age = get_patient_data(df_patient, "age")
    age_unit = get_patient_data(df_patient, "ageUnit")
    if (age and age != "nan") and (age_unit and age_unit != "nan"):
        iso_age = convert_age_to_iso(age, age_unit)
        phenopacket["subject"]["age"] = {
            "age": iso_age
        }

    # it's probably years if age is there and units are not
    elif (age and age != "nan") and (age_unit and age_unit == "nan"):
        iso_age = convert_age_to_iso(age, "years")
        phenopacket["subject"]["age"] = {
            "age": iso_age
        }
    else:
        pass


    # sex
    sex = get_patient_data(df_patient, "gender")
    if (sex and sex != "nan"):
        phenopacket["subject"]["sex"] = SEX_TO_SEX_MAPPING[sex]
    else:
        phenopacket["subject"]["sex"] = SEX_TO_SEX_MAPPING["NaN"]

#     if diseases:
#         unique_diseases = [i for n, i in enumerate(diseases) if i not in diseases[n + 1:]]
#         phenopacket["diseases"] = unique_diseases


    # add biosamples and diagnosis
    if "sampleID" in list(df_patient.columns):
        biosamples = list(df_patient["sampleID"])
        sample_notes = list(df_patient["note"])
        diagnosis_notes = list(df_patient["diagnosis_note"])
        barcodes = list(df_patient["barCodeNo"])
        species_id = list(df_patient["specieID"])
        sample_ages = list(df_patient["samples_age"])
        sample_age_units = list(df_patient["samples_ageUnit"])
        source_sample_ids = list(df_patient["sourceSampleID"])
        for sample_id, sample_note, diagnosis_note, barcode, specie_id, sample_age, sample_age_unit, source_sample_id in zip(
            biosamples, sample_notes, diagnosis_notes, barcodes, species_id, sample_ages, sample_age_units,
            source_sample_ids
        ):
            # if there is no sampleID then there is no other sample related information
            if sample_id == "nan" or sample_id is None:
                pass

            # continue with those patients that have samples
            else:
                biosample = {
                    # as we agreed we are going to use barCodeNo as sample id
                    "id": barcode,
                    # same db sampleID to description field
                    # !!!!!!!!!!!!TODO fix save as number
                    "description": sample_id,
                    "procedure": PROCEDURE_CODE_NOT_ASSIGNED
                }

                # add diagnosis_note to extra_properties
                if isinstance(diagnosis_note, str) and diagnosis_note != "nan":
                    if "extra_properties" in biosample:
                        biosample["extra_properties"].update({"diagnosis_note": diagnosis_note})
                    else:
                        biosample["extra_properties"] = {"diagnosis_note": diagnosis_note}
                # add sourceSampleID
                if isinstance(source_sample_id, str) and source_sample_id != "nan":
                    if "extra_properties" in biosample:
                        biosample["extra_properties"].update({"source_sample_id": source_sample_id})
                    else:
                        biosample["extra_properties"] = {"source_sample_id": source_sample_id}
                # add taxonomy
                if isinstance(specie_id, str) and specie_id != "nan":
                    if specie_id == "1":
                        biosample["taxonomy"] = NCBI_TAXON_HOMO_SAPIENS
                    elif specie_id == "2":
                        biosample["taxonomy"] = NCBI_TAXON_MUS_MUSCULUS
                    else:
                        pass

                # sample age
                if (sample_age and sample_age != "nan") and (sample_age_unit and sample_age_unit != "nan"):
                    iso_age = convert_age_to_iso(sample_age, sample_age_unit)
                    biosample["individual_age_at_collection"] = {
                        "age": iso_age
                    }
                elif (sample_age and sample_age != "nan") and not (sample_age_unit and sample_age_unit == "nan"):
                    iso_age = convert_age_to_iso(age, "years")
                    biosample["individual_age_at_collection"] = {
                        "age": iso_age
                    }
                else:
                    pass

                phenopacket["biosamples"].append(biosample)

    # list datatypes inside phenopacket
    for datatype in ["biosamples"]:
        if not phenopacket[datatype]:
            del phenopacket[datatype]


    phenopackets.append(phenopacket)

In [21]:
# inspect a random phenopacket

for phenopacket in phenopackets:
    if phenopacket["id"] == "5579":
        print(phenopacket)

{'id': '5579', 'subject': {'id': '5579', 'extra_properties': {'identifier': 'E180000497A', 'patient_code': 'P-5579', 'sibling': '0', 'source_id': '148'}, 'alternate_ids': ['E180000497A'], 'sex': 'MALE'}, 'biosamples': [{'id': 'S-7065', 'description': '7065', 'procedure': {'code': {'label': 'Procedure code not assigned', 'id': 'SNOMED:42630001'}}, 'extra_properties': {'source_sample_id': 'E180000497A'}, 'taxonomy': {'id': 'NCBITaxon:9606', 'label': 'Homo sapiens'}, 'individual_age_at_collection': {'age': 'P14.0Y'}}], 'diseases': [], 'meta_data': {'phenopacket_schema_version': '1.0.0-RC3', 'created_by': 'C3G Team', 'submitted_by': 'C3G Team', 'resources': [{'name': 'NCBI Taxonomy OBO Edition', 'version': '2018-07-27', 'namespace_prefix': 'NCBITaxon', 'id': 'NCBITaxon:2018-07-27', 'iri_prefix': 'http://purl.obolibrary.org/obo/NCBITaxon_', 'url': 'http://purl.obolibrary.org/obo/ncbitaxon.owl'}, {'name': 'ICHANGE Controlled vocabulary', 'version': '2020-12-11', 'namespace_prefix': 'ICHANGE'

In [22]:
print(len(phenopackets))

5456


In [23]:
# inspect a random phenopacket

print(phenopackets[206])

{'id': '1212', 'subject': {'id': '1212', 'extra_properties': {'age_group': 'adult', 'patient_code': 'P-1212', 'sibling': '0', 'source_id': '3'}, 'age': {'age': 'P33.0Y'}, 'sex': 'FEMALE'}, 'biosamples': [{'id': 'S-1212', 'description': '1212', 'procedure': {'code': {'label': 'Procedure code not assigned', 'id': 'SNOMED:42630001'}}, 'extra_properties': {'diagnosis_note': 'the Ki-67 proliferation index is 1%;\r\nFollow up:\r\nSept 2000 - subtotal resection followed by cranial radiation\r\nJuly 2007 - MRI good\r\nJuly 2008 - MRI stable\r\nJuly 2009 - MRI stable\r\nJuly 2010 - MRI stable', 'source_sample_id': 'BTTB 722'}, 'taxonomy': {'id': 'NCBITaxon:9606', 'label': 'Homo sapiens'}, 'individual_age_at_collection': {'age': 'P33.0Y'}}], 'diseases': [], 'meta_data': {'phenopacket_schema_version': '1.0.0-RC3', 'created_by': 'C3G Team', 'submitted_by': 'C3G Team', 'resources': [{'name': 'NCBI Taxonomy OBO Edition', 'version': '2018-07-27', 'namespace_prefix': 'NCBITaxon', 'id': 'NCBITaxon:2018

## Merge diagnosis into phenopackets

In [None]:
# SQL query to retrieve "samples_diagnosis.csv" data (from table samples)

# SELECT [ichange].[dbo].[samples].[sampleID]
#       ,[ichange].[dbo].[samples].[note]
#       ,[ichange].[dbo].[samples].[barCodeNo]
#       ,[ichange].[dbo].[samples].[patientID]
#       ,[ichange].[dbo].[sample_diagnosis].[diagnosisID]
#       ,[ichange].[dbo].[diagnosis].[diagnosis_en]
#       ,[ichange].[dbo].[diagnosis].[active]
#       ,[ichange].[dbo].[diagnosis].[WHO_Grade]


#   FROM [ichange].[dbo].[samples]

#   RIGHT JOIN [ichange].[dbo].[sample_diagnosis] ON [samples].[sampleID] = [ichange].[dbo].[sample_diagnosis].[sampleID]
#   LEFT JOIN [ichange].[dbo].[diagnosis] ON [sample_diagnosis].[diagnosisID] = [ichange].[dbo].[diagnosis].[diagnosisID]

In [24]:
samples_diagnosis = "/content/drive/MyDrive/genomicc3g/samples_diagnosis.csv"

In [25]:
samples_and_diagnosis_df = pd.read_csv(samples_diagnosis, dtype=str)
samples_and_diagnosis_df.head()

Unnamed: 0,sampleID,note,barCodeNo,patientID,diagnosisID,diagnosis_en,active,WHO_Grade
0,1077,,SX-1077,1077,13,ATRT,1,IV
1,1078,,SX-1078,1078,13,ATRT,1,IV
2,1079,,SX-1079,1079,13,ATRT,1,IV
3,1080,,SX-1080,1080,13,ATRT,1,IV
4,1081,,SX-1081,1081,13,ATRT,1,IV


In [26]:
samples_and_diagnosis_df.shape

(6938, 8)

In [27]:
samples_and_diagnosis_df = samples_and_diagnosis_df.fillna("nan")
samples_and_diagnosis_df.head()

Unnamed: 0,sampleID,note,barCodeNo,patientID,diagnosisID,diagnosis_en,active,WHO_Grade
0,1077,,SX-1077,1077,13,ATRT,1,IV
1,1078,,SX-1078,1078,13,ATRT,1,IV
2,1079,,SX-1079,1079,13,ATRT,1,IV
3,1080,,SX-1080,1080,13,ATRT,1,IV
4,1081,,SX-1081,1081,13,ATRT,1,IV


In [28]:
disease = []

for i, row in samples_and_diagnosis_df.iterrows():
    for phenopacket in phenopackets:
        if "biosamples" in phenopacket:
            if row["barCodeNo"] in [biosample["id"] for biosample in phenopacket["biosamples"]]:
                # first, assign diagnosis as disease to phenopacket
                disease = {
                    "term": {
                        "id": f"ICHANGE:{row['diagnosisID']}",
                        "label": row["diagnosis_en"]
                    },
                    "extra_properties": {
                        "active": row["active"],
                        "who_grade": row["WHO_Grade"] if row["WHO_Grade"] != "nan" else "None"
                    }
                }
                #if disease["term"]["id"] not in [d["term"]["id"] for d in phenopacket["diseases"]]:
                phenopacket["diseases"].append(disease)

                # second, assign diagnosis as histological diagnosis to a biosample
                for biosample in phenopacket["biosamples"]:
                    if row["barCodeNo"] == biosample["id"]:
                        biosample["histological_diagnosis"] = {
                            "id": f"ICHANGE:{row['diagnosisID']}",
                            "label": row["diagnosis_en"]
                        }
#                     else:
#                         print(f"barCodeNo: {row['barCodeNo']} sample: {biosample['id']}")

In [29]:
# remove diseases list if it's empty

for phenopacket in phenopackets:
    if not phenopacket["diseases"]:
        del phenopacket["diseases"]

In [30]:
# check if diseases are added to phenopacket or not

print(phenopackets[206])

{'id': '1212', 'subject': {'id': '1212', 'extra_properties': {'age_group': 'adult', 'patient_code': 'P-1212', 'sibling': '0', 'source_id': '3'}, 'age': {'age': 'P33.0Y'}, 'sex': 'FEMALE'}, 'biosamples': [{'id': 'S-1212', 'description': '1212', 'procedure': {'code': {'label': 'Procedure code not assigned', 'id': 'SNOMED:42630001'}}, 'extra_properties': {'diagnosis_note': 'the Ki-67 proliferation index is 1%;\r\nFollow up:\r\nSept 2000 - subtotal resection followed by cranial radiation\r\nJuly 2007 - MRI good\r\nJuly 2008 - MRI stable\r\nJuly 2009 - MRI stable\r\nJuly 2010 - MRI stable', 'source_sample_id': 'BTTB 722'}, 'taxonomy': {'id': 'NCBITaxon:9606', 'label': 'Homo sapiens'}, 'individual_age_at_collection': {'age': 'P33.0Y'}, 'histological_diagnosis': {'id': 'ICHANGE:1', 'label': 'Pilocytic Astrocytoma'}}], 'diseases': [{'term': {'id': 'ICHANGE:1', 'label': 'Pilocytic Astrocytoma'}, 'extra_properties': {'active': '1', 'who_grade': 'I'}}], 'meta_data': {'phenopacket_schema_version':

In [31]:
# remove duplicates in diseases list

for phenopacket in phenopackets:
    if "diseases" in phenopacket:
        unique_diseases_only = list({v["term"]["id"]:v for v in phenopacket["diseases"]}.values())

        phenopacket["diseases"] = unique_diseases_only

## Merge tissueLocation into biosamples

In [None]:
# SQL query to retrieve "samples_tissueLocation.csv" data (from sample_tissueLocation table)

#  SELECT [sampleTissueLocationID]
#       ,[sampleID]
#       ,[sample_tissueLocation].[tissueLocationID]
#       ,[ichange].[dbo].[tissueLocations].[tissueLocation_en]
#       ,[ichange].[dbo].[tissueLocations].[firstLevel]
#       ,[ichange].[dbo].[tissueLocations].[secondLevel]
#       ,[ichange].[dbo].[tissueLocations].[thirdLevel]
#       ,[ichange].[dbo].[tissueLocations].[hemisphere]
#       ,[ichange].[dbo].[tissueLocations].[organID]
#       ,[ichange].[dbo].[organs].[organ_en]
#   FROM [ichange].[dbo].[sample_tissueLocation]
#   LEFT JOIN [ichange].[dbo].[tissueLocations] ON [sample_tissueLocation].[tissueLocationID] = [ichange].[dbo].[tissueLocations].[tissueLocationID]
#   LEFT JOIN [ichange].[dbo].[organs] ON [ichange].[dbo].[tissueLocations].[organID] = [ichange].[dbo].[organs].[organID]

In [32]:
samples_tissueLocation = "/content/drive/MyDrive/genomicc3g/samples_tissueLocation.csv"

In [33]:
tissue_locations_df = pd.read_csv(samples_tissueLocation, dtype=str)
tissue_locations_df = tissue_locations_df.fillna("nan")
tissue_locations_df.head(20)

Unnamed: 0,sampleTissueLocationID,sampleID,tissueLocationID,tissueLocation_en,firstLevel,secondLevel,thirdLevel,hemisphere,organID,organ_en
0,2,1013,59,Frontal Lobe,Hemisphere,,,Right,1,Brain
1,3,1014,93,Ventricle,,,,,1,Brain
2,4,1015,98,Posterior Fossa,Midline,Posterior fossa,Posterior fossa,,1,Brain
3,5,1016,98,Posterior Fossa,Midline,Posterior fossa,Posterior fossa,,1,Brain
4,6,1017,93,Ventricle,,,,,1,Brain
5,7,1018,151,Cortex - Multicentric,Hemisphere,,,,1,Brain
6,8,1019,58,Frontal Lobe,Hemisphere,,,Left,1,Brain
7,9,1020,98,Posterior Fossa,Midline,Posterior fossa,Posterior fossa,,1,Brain
8,10,1022,98,Posterior Fossa,Midline,Posterior fossa,Posterior fossa,,1,Brain
9,11,1023,98,Posterior Fossa,Midline,Posterior fossa,Posterior fossa,,1,Brain


In [34]:
tissue_locations_df.shape

(9785, 10)

In [35]:
# make a copy of the main df

sub_patients_and_samples_df = patients_and_samples_df.copy()

# remove columns that are not needed in this merge

for column in ['patientCode', 'age', 'ageGroup', 'ageUnit', 'date_birth', 'gender', 'identifier', 'note', 'sibling', 'sourceID',
               'diagnosis_note','RNAseq', 'samples_sourceID', 'sourceSampleID',
               'CollectionMethodID', 'specieID', 'samples_age', 'samples_ageUnit', 'recurrence']:
    del sub_patients_and_samples_df[column]

sub_patients_and_samples_df.head()


Unnamed: 0,patientID,sampleID,barCodeNo,specie_en
0,0,1002,S-1002,Human
1,1003,1003,S-1003,Human
2,1004,1004,S-1004,Human
3,1005,1005,S-1005,Human
4,1006,1006,S-1006,Human


In [36]:
sub_patients_and_samples_df.shape

(10288, 4)

In [37]:
# merge samples df and tissue locations df

merged_tissueLocations_and_sub = pd.merge(sub_patients_and_samples_df, tissue_locations_df, on='sampleID')
merged_tissueLocations_and_sub.head()

Unnamed: 0,patientID,sampleID,barCodeNo,specie_en,sampleTissueLocationID,tissueLocationID,tissueLocation_en,firstLevel,secondLevel,thirdLevel,hemisphere,organID,organ_en
0,0,1002,S-1002,Human,1828,47,Brainstem,Midline,Posterior fossa,Brainstem,,1,Brain
1,1003,1003,S-1003,Human,2184,0,Unknown,,,,,0,Unknown
2,1004,1004,S-1004,Human,2185,0,Unknown,,,,,0,Unknown
3,1005,1005,S-1005,Human,2186,0,Unknown,,,,,0,Unknown
4,1006,1006,S-1006,Human,2187,0,Unknown,,,,,0,Unknown


In [38]:
# list all unique barcodes

barcode_list = list(merged_tissueLocations_and_sub.barCodeNo.unique())
# barcode_list

In [39]:
# annotate biosamples with their tissue locations

biosamples = {}

for barcode in barcode_list:
    df_sample = merged_tissueLocations_and_sub[merged_tissueLocations_and_sub.barCodeNo==barcode]
    biosample = {
        "id": barcode
    }
    sampled_tissue_ids = list(df_sample["tissueLocationID"])
    sample_tissue_labels = list(df_sample["tissueLocation_en"])

    # extra_properties start #
    ep_first_levels = list(df_sample["firstLevel"])
    ep_second_levels = list(df_sample["secondLevel"])
    ep_third_levels = list(df_sample["thirdLevel"])
    ep_hemispheres = list(df_sample["hemisphere"])
    ep_organs = list(df_sample["organ_en"])
    # end #

    for sample_tissue_id, sample_tissue_label, first_level, second_level, third_level, hemisphere, organ in zip(
        sampled_tissue_ids,
        sample_tissue_labels,
        ep_first_levels,
        ep_second_levels,
        ep_third_levels,
        ep_hemispheres,
        ep_organs
    ):
        sample_tissue = {
            "id": f"ICHANGE:{sample_tissue_id}",
            "label": sample_tissue_label
        }
        biosample["sampled_tissue"] = sample_tissue

        extra_properties = {
            "tissue_location_first_level": first_level,
            "tissue_location_second_level": second_level,
            "tissue_location_third_level": third_level,
            "tissue_location_hemisphere": hemisphere,
            "tissue_location_organ": organ,
        }
        for k, v in list(extra_properties.items()):
            if v == "nan":
                extra_properties.pop(k)
        if extra_properties is not {}:
            biosample["extra_properties"] = extra_properties

    #biosamples.append(biosample)
    biosamples[biosample["id"]] = biosample

# biosamples

In [40]:
# annotate phenopackets biosamples with their tissue locations

for phenopacket in phenopackets:
    if "biosamples" in phenopacket:
        for biosample in phenopacket["biosamples"]:
            external_biosample = biosamples.get(biosample["id"], None)
            #external_biosample = {b for b in biosamples if b["id"] == biosample["id"]}
            #external_biosample = biosamples.get(biosample["id"], None)
            if external_biosample:
                #extract_biosample = biosamples[external_biosample]
                biosample["sampled_tissue"] = external_biosample["sampled_tissue"]
                if "extra_properties" in external_biosample:
                    if "extra_properties" in biosample and biosample["extra_properties"] is not None:
                        biosample["extra_properties"].update(external_biosample["extra_properties"])
                    else:
                        biosample["extra_properties"] = external_biosample["extra_properties"]
            else:
                biosample["sampled_tissue"] = {
                    "id": "SNOMED:261665006",
                    "label": "Unknown"
                }

    else:
        pass


In [41]:
# inspect a random phenopacket, check if it has sampled_tissue property

print(phenopackets[1403])

{'id': '2498', 'subject': {'id': '2498', 'extra_properties': {'age_group': 'adult', 'patient_code': 'P-2498', 'sibling': '1'}, 'sex': 'MALE'}, 'biosamples': [{'id': 'S-2498', 'description': '2498', 'procedure': {'code': {'label': 'Procedure code not assigned', 'id': 'SNOMED:42630001'}}, 'extra_properties': {'diagnosis_note': 'Diagnosis: melanoma', 'source_sample_id': 'Melan_Father_RF', 'tissue_location_organ': 'Unknown'}, 'taxonomy': {'id': 'NCBITaxon:9606', 'label': 'Homo sapiens'}, 'histological_diagnosis': {'id': 'ICHANGE:23', 'label': 'Melanoma'}, 'sampled_tissue': {'id': 'ICHANGE:0', 'label': 'Unknown'}}], 'diseases': [{'term': {'id': 'ICHANGE:23', 'label': 'Melanoma'}, 'extra_properties': {'active': '1', 'who_grade': 'None'}}], 'meta_data': {'phenopacket_schema_version': '1.0.0-RC3', 'created_by': 'C3G Team', 'submitted_by': 'C3G Team', 'resources': [{'name': 'NCBI Taxonomy OBO Edition', 'version': '2018-07-27', 'namespace_prefix': 'NCBITaxon', 'id': 'NCBITaxon:2018-07-27', 'iri_

In [None]:
# save to a file, if needed

# with open(f"ichange_phenopackets_{timestamp}.json", "w") as fp:
#     json.dump(phenopackets, fp, indent=4)

## Merge mutations and genes

In [None]:
# SQL query to retrieve "sample_mutations_genes.csv" data (from extractProcessing table)

# SELECT [ichange].[dbo].[extractProcessing].[extractProcessingID]
#       ,[ichange].[dbo].[extractProcessing].[sampleID]
#       ,[ichange].[dbo].[extractProcessing_mutation].[mutationID]
#       ,[ichange].[dbo].[mutations].[mutation]
#       ,[ichange].[dbo].[mutations].[geneID]
#       ,[ichange].[dbo].[genes].[gene]
#       ,[ichange].[dbo].[genes].[accession]

#   FROM [ichange].[dbo].[extractProcessing]
#   INNER JOIN [ichange].[dbo].[extractProcessing_mutation] on [ichange].[dbo].[extractProcessing].[extractProcessingID] = [ichange].[dbo].[extractProcessing_mutation].[extractProcessingID]
#   LEFT JOIN [ichange].[dbo].[mutations] on [ichange].[dbo].[extractProcessing_mutation].[mutationID] = [ichange].[dbo].[mutations].[mutationID]
#   LEFT JOIN [ichange].[dbo].[genes] on [ichange].[dbo].[mutations].[geneID] = [ichange].[dbo].[genes].[geneID]

In [42]:
sample_mutations_genes = "/content/drive/MyDrive/genomicc3g/sample_mutations_genes.csv"

In [43]:
mutations_genes_df = pd.read_csv(sample_mutations_genes, dtype=str)
mutations_genes_df = mutations_genes_df.fillna("nan")
mutations_genes_df.head()

Unnamed: 0,extractProcessingID,sampleID,mutationID,mutation,geneID,gene,accession
0,1,1002,140,K27M,2,H3F3A,NM_002107
1,2,1188,226,WT,2,H3F3A,NM_002107
2,3,1189,226,WT,2,H3F3A,NM_002107
3,4,1190,142,G34V,2,H3F3A,NM_002107
4,5,1193,226,WT,2,H3F3A,NM_002107


In [44]:
mutations_genes_df.shape

(3121, 7)

In [45]:
# merge samples and mutations, genes df

merged_mutations_genes_and_sub = pd.merge(sub_patients_and_samples_df, mutations_genes_df, on='sampleID')
merged_mutations_genes_and_sub.head()

Unnamed: 0,patientID,sampleID,barCodeNo,specie_en,extractProcessingID,mutationID,mutation,geneID,gene,accession
0,0,1002,S-1002,Human,1,140,K27M,2,H3F3A,NM_002107
1,0,1002,S-1002,Human,1207,256,Q996fs\r\n,4,ATRX,NM_138270
2,0,1002,S-1002,Human,1916,271,V25F\r\n,5,TP53,NM_000546
3,1188,1188,S-1188,Human,2,226,WT,2,H3F3A,NM_002107
4,1189,1189,S-1189,Human,3,226,WT,2,H3F3A,NM_002107


In [46]:
# list all unique barcodes

barcode_list_2 = list(merged_mutations_genes_and_sub.barCodeNo.unique())

New code for interpretation

In [None]:
# Merge samples and mutations, genes DataFrame
merged_mutations_genes_and_sub = pd.merge(sub_patients_and_samples_df, mutations_genes_df, on='sampleID')

# Create a list of unique barcodes
barcode_list_2 = list(merged_mutations_genes_and_sub.barCodeNo.unique())

def create_interpretation(df_sample):
    """
    Create an interpretation summary for a biosample based on its variants and genes.

    :param df_sample: DataFrame containing mutation and gene data for a single biosample.
    :return: A dictionary representing the interpretation summary.
    """
    # Extracting variants and genes information from the DataFrame
    variants = []
    for _, row in df_sample.iterrows():
        variant = {
            "allele_type": "hgvsAllele",
            "allele": {
                "id": row["mutationID"],
                "hgvs": row["mutation"]
            },
            "extra_properties": {
                "gene_context": row["gene"]
            }
        }
        variants.append(variant)

    genes = df_sample[['geneID', 'gene']].drop_duplicates().to_dict(orient='records')

    if not variants and not genes:
        return None

    return {
        "variants_summary": variants,
        "genes_summary": genes
    }

# Create biosamples with interpretations
biosamples_interpretations = {}

for barcode in barcode_list_2:
    df_sample = merged_mutations_genes_and_sub[merged_mutations_genes_and_sub.barCodeNo == barcode]
    interpretation = create_interpretation(df_sample)
    biosamples_interpretations[barcode] = interpretation

# Iterate over phenopackets and add interpretations to biosamples
for phenopacket in phenopackets:
    if "biosamples" in phenopacket:
        for biosample in phenopacket["biosamples"]:
            barcode = biosample["id"]
            interpretation = biosamples_interpretations.get(barcode, None)
            if interpretation:
                print(phenopacket)
                biosample["interpretation"] = interpretation


{'id': '0', 'subject': {'id': '0', 'extra_properties': {'patient_code': 'P-0', 'sibling': '0'}, 'sex': 'UNKNOWN_SEX'}, 'biosamples': [{'id': 'S-1002', 'description': '1002', 'procedure': {'code': {'label': 'Procedure code not assigned', 'id': 'SNOMED:42630001'}}, 'extra_properties': {'source_sample_id': 'DIPG04T', 'tissue_location_first_level': 'Midline', 'tissue_location_second_level': 'Posterior fossa', 'tissue_location_third_level': 'Brainstem', 'tissue_location_organ': 'Brain'}, 'taxonomy': {'id': 'NCBITaxon:9606', 'label': 'Homo sapiens'}, 'sampled_tissue': {'id': 'ICHANGE:47', 'label': 'Brainstem'}, 'interpretation': {'variants_summary': [{'mutationID': '140', 'mutation': 'K27M', 'gene': 'H3F3A'}, {'mutationID': '256', 'mutation': 'Q996fs\r\n', 'gene': 'ATRX'}, {'mutationID': '271', 'mutation': 'V25F\r\n', 'gene': 'TP53'}], 'genes_summary': [{'geneID': '2', 'gene': 'H3F3A'}, {'geneID': '4', 'gene': 'ATRX'}, {'geneID': '5', 'gene': 'TP53'}]}}, {'id': 'S-2595', 'description': '2595

In [None]:

# Merge samples and mutations, genes DataFrame
merged_mutations_genes_and_sub = pd.merge(sub_patients_and_samples_df, mutations_genes_df, on='sampleID')

# Create a list of unique barcodes
barcode_list_2 = list(merged_mutations_genes_and_sub.barCodeNo.unique())

# Function to create interpretation from variants and genes
def create_interpretation(variants, genes):
    """
    Create an interpretation summary for a biosample based on its variants and genes.

    :param variants: A list of variant information for the biosample.
    :param genes: A list of gene information for the biosample.
    :return: A dictionary representing the interpretation summary.
    """
    # Aggregate variants and genes into an interpretation summary
    return {
        "variants_summary": variants,
        "genes_summary": genes
    }

# Create biosamples with interpretations
biosamples_interpretations = {}

for barcode in barcode_list_2:
    df_sample = merged_mutations_genes_and_sub[merged_mutations_genes_and_sub.barCodeNo == barcode]

    # Extract variant and gene data
    variants = df_sample[['mutationID', 'mutation', 'gene']].drop_duplicates().to_dict(orient='records')
    genes = df_sample[['geneID', 'gene']].drop_duplicates().to_dict(orient='records')

    interpretation = create_interpretation(variants, genes)
    biosamples_interpretations[barcode] = interpretation

# Placeholder for your phenopackets
# phenopackets = [...]

# Iterate over phenopackets and add interpretations to biosamples
for phenopacket in phenopackets:
    if "biosamples" in phenopacket:
        for biosample in phenopacket["biosamples"]:
            barcode = biosample["id"]
            interpretation = biosamples_interpretations.get(barcode, None)
            if interpretation:
                print(phenopacket)
                biosample["interpretation"] = interpretation


{'id': '0', 'subject': {'id': '0', 'extra_properties': {'patient_code': 'P-0', 'sibling': '0'}, 'sex': 'UNKNOWN_SEX'}, 'biosamples': [{'id': 'S-1002', 'description': '1002', 'procedure': {'code': {'label': 'Procedure code not assigned', 'id': 'SNOMED:42630001'}}, 'extra_properties': {'source_sample_id': 'DIPG04T', 'tissue_location_first_level': 'Midline', 'tissue_location_second_level': 'Posterior fossa', 'tissue_location_third_level': 'Brainstem', 'tissue_location_organ': 'Brain'}, 'taxonomy': {'id': 'NCBITaxon:9606', 'label': 'Homo sapiens'}, 'sampled_tissue': {'id': 'ICHANGE:47', 'label': 'Brainstem'}, 'interpretation': {'variants_summary': [{'mutationID': '140', 'mutation': 'K27M', 'gene': 'H3F3A'}, {'mutationID': '256', 'mutation': 'Q996fs\r\n', 'gene': 'ATRX'}, {'mutationID': '271', 'mutation': 'V25F\r\n', 'gene': 'TP53'}], 'genes_summary': [{'geneID': '2', 'gene': 'H3F3A'}, {'geneID': '4', 'gene': 'ATRX'}, {'geneID': '5', 'gene': 'TP53'}]}}, {'id': 'S-2595', 'description': '2595

In [None]:
print(phenopackets[1403])

{'id': '2498', 'subject': {'id': '2498', 'extra_properties': {'age_group': 'adult', 'patient_code': 'P-2498', 'sibling': '1'}, 'sex': 'MALE'}, 'biosamples': [{'id': 'S-2498', 'description': '2498', 'procedure': {'code': {'label': 'Procedure code not assigned', 'id': 'SNOMED:42630001'}}, 'extra_properties': {'diagnosis_note': 'Diagnosis: melanoma', 'source_sample_id': 'Melan_Father_RF', 'tissue_location_organ': 'Unknown'}, 'taxonomy': {'id': 'NCBITaxon:9606', 'label': 'Homo sapiens'}, 'histological_diagnosis': {'id': 'ICHANGE:23', 'label': 'Melanoma'}, 'sampled_tissue': {'id': 'ICHANGE:0', 'label': 'Unknown'}}], 'diseases': [{'term': {'id': 'ICHANGE:23', 'label': 'Melanoma'}, 'extra_properties': {'active': '1', 'who_grade': 'None'}}], 'meta_data': {'phenopacket_schema_version': '1.0.0-RC3', 'created_by': 'C3G Team', 'submitted_by': 'C3G Team', 'resources': [{'name': 'NCBI Taxonomy OBO Edition', 'version': '2018-07-27', 'namespace_prefix': 'NCBITaxon', 'id': 'NCBITaxon:2018-07-27', 'iri_

In [47]:
# create an object with biosamples and their variants (mutations)

biosamples_with_variants = {}

for barcode in barcode_list_2:
    df_sample = merged_mutations_genes_and_sub[merged_mutations_genes_and_sub.barCodeNo==barcode]
    biosample = {
        "id": barcode,
        "variants": []
    }
    mutation_ids = list(df_sample["mutationID"])
    mutation_labels = list(df_sample["mutation"])
    gene_labels = list(df_sample["gene"])
    for mutation_id, mutation_label, gene_label in zip(mutation_ids, mutation_labels, gene_labels):
        variant = {
            "allele_type": "hgvsAllele",
            "allele": {
                "id": f"{mutation_id}",
                "hgvs": f"{mutation_label}"
            },
            "extra_properties": {
                "gene_context": f"{gene_label}"
            }
        }
        biosample["variants"].append(variant)

    if biosample["variants"] == []:
        del biosample["variants"]

    biosamples_with_variants[biosample["id"]] = biosample


In [48]:
# inspect

print(biosamples_with_variants)

{'S-1002': {'id': 'S-1002', 'variants': [{'allele_type': 'hgvsAllele', 'allele': {'id': '140', 'hgvs': 'K27M'}, 'extra_properties': {'gene_context': 'H3F3A'}}, {'allele_type': 'hgvsAllele', 'allele': {'id': '256', 'hgvs': 'Q996fs\r\n'}, 'extra_properties': {'gene_context': 'ATRX'}}, {'allele_type': 'hgvsAllele', 'allele': {'id': '271', 'hgvs': 'V25F\r\n'}, 'extra_properties': {'gene_context': 'TP53'}}]}, 'S-1188': {'id': 'S-1188', 'variants': [{'allele_type': 'hgvsAllele', 'allele': {'id': '226', 'hgvs': 'WT'}, 'extra_properties': {'gene_context': 'H3F3A'}}]}, 'S-1189': {'id': 'S-1189', 'variants': [{'allele_type': 'hgvsAllele', 'allele': {'id': '226', 'hgvs': 'WT'}, 'extra_properties': {'gene_context': 'H3F3A'}}]}, 'S-1190': {'id': 'S-1190', 'variants': [{'allele_type': 'hgvsAllele', 'allele': {'id': '142', 'hgvs': 'G34V'}, 'extra_properties': {'gene_context': 'H3F3A'}}]}, 'S-1193': {'id': 'S-1193', 'variants': [{'allele_type': 'hgvsAllele', 'allele': {'id': '226', 'hgvs': 'WT'}, 'ext

In [49]:
# TODO male genes unique in the list

# create an object with biosamples and related genes

biosamples_with_genes = {}

for barcode in barcode_list_2:
    df_sample = merged_mutations_genes_and_sub[merged_mutations_genes_and_sub.barCodeNo==barcode]
    biosample = {
        "id": barcode,
        "genes": []
    }
    gene_ids = list(df_sample["geneID"])
    gene_labels = list(df_sample["gene"])
    # TODO
    gene_accessions = list(df_sample["accession"])
    for gene_id, gene_label in zip(gene_ids, gene_labels):
        gene = {
            "id": f"{gene_id}",
            "symbol": f"{gene_label}"
        }
        biosample["genes"].append(gene)

    if biosample["genes"] == []:
        del biosample["genes"]
    biosamples_with_genes[biosample["id"]] = biosample

In [None]:
# inspect

# print(biosamples_with_genes)

In [50]:
# iterate over phenopackets and annotate biosamples with variants (mutations) and phenopackets with related genes

for phenopacket in phenopackets:
    genes = []
    if "biosamples" in phenopacket:
        for biosample in phenopacket["biosamples"]:
            external_biosample = biosamples_with_variants.get(biosample["id"], None)
            #external_biosample = {b for b in biosamples if b["id"] == biosample["id"]}
            #external_biosample = biosamples.get(biosample["id"], None)
            if external_biosample:
                #extract_biosample = biosamples[external_biosample]
                biosample["variants"] = external_biosample["variants"]

            external_biosample_genes = biosamples_with_genes.get(biosample["id"], None)
            if external_biosample_genes:
                if "genes" in external_biosample_genes:
                    genes.extend(external_biosample_genes["genes"])

    else:
        pass

    if genes:
        phenopacket["genes"] = genes

## Merge material into phenopackets

In [None]:
# # SQL query to retrieve "samples_material.csv" data (from table samples)

# SELECT [ichange].[dbo].[samples].[sampleID]
#       ,[ichange].[dbo].[samples].[barCodeNo]
#       ,[ichange].[dbo].[samples].[patientID]
#       ,[ichange].[dbo].[samples].[materialAvailable]
#       ,[ichange].[dbo].[samples].[materialID]
#       ,[ichange].[dbo].[materials].[material_en]
#       ,[ichange].[dbo].[materials].[materialCode]


#   FROM [ichange].[dbo].[samples]
#   LEFT JOIN [ichange].[dbo].[materials] ON [samples].[materialID] = [ichange].[dbo].[materials].[materialID]

In [51]:
samples_material = "/content/drive/MyDrive/genomicc3g/samples_material.csv"

In [52]:
samples_and_material_df = pd.read_csv(samples_material, dtype=str)
samples_and_material_df.head()

Unnamed: 0,sampleID,barCodeNo,patientID,materialAvailable,materialID,material_en,materialCode
0,1002,S-1002,0,1,1,Tissue,TI
1,1003,S-1003,1003,1,1,Tissue,TI
2,1004,S-1004,1004,1,2,Fibroblast,FB
3,1005,S-1005,1005,1,2,Fibroblast,FB
4,1006,S-1006,1006,1,2,Fibroblast,FB


In [53]:
samples_and_material_df.shape

(10051, 7)

In [54]:
samples_and_material_df = samples_and_material_df.fillna("nan")
samples_and_material_df.head()

Unnamed: 0,sampleID,barCodeNo,patientID,materialAvailable,materialID,material_en,materialCode
0,1002,S-1002,0,1,1,Tissue,TI
1,1003,S-1003,1003,1,1,Tissue,TI
2,1004,S-1004,1004,1,2,Fibroblast,FB
3,1005,S-1005,1005,1,2,Fibroblast,FB
4,1006,S-1006,1006,1,2,Fibroblast,FB


In [55]:
# iterate over phenopackets and annotate biosamples with material information if it's available

for i, row in samples_and_material_df.iterrows():
    if row["material_en"] != "nan":
        for phenopacket in phenopackets:
            if "biosamples" in phenopacket:
                for biosample in phenopacket["biosamples"]:
                    if row["barCodeNo"] == biosample["id"]:
                        if "extra_properties" in biosample:
                            biosample["extra_properties"].update({"material": row["material_en"]})
                        else:
                            biosample["extra_properties"] = {"material": row["material_en"]}


# check that material assigned

phenopackets[:5]

[{'id': '0',
  'subject': {'id': '0',
   'extra_properties': {'patient_code': 'P-0', 'sibling': '0'},
   'sex': 'UNKNOWN_SEX'},
  'biosamples': [{'id': 'S-1002',
    'description': '1002',
    'procedure': {'code': {'label': 'Procedure code not assigned',
      'id': 'SNOMED:42630001'}},
    'extra_properties': {'source_sample_id': 'DIPG04T',
     'tissue_location_first_level': 'Midline',
     'tissue_location_second_level': 'Posterior fossa',
     'tissue_location_third_level': 'Brainstem',
     'tissue_location_organ': 'Brain',
     'material': 'Tissue'},
    'taxonomy': {'id': 'NCBITaxon:9606', 'label': 'Homo sapiens'},
    'sampled_tissue': {'id': 'ICHANGE:47', 'label': 'Brainstem'},
    'variants': [{'allele_type': 'hgvsAllele',
      'allele': {'id': '140', 'hgvs': 'K27M'},
      'extra_properties': {'gene_context': 'H3F3A'}},
     {'allele_type': 'hgvsAllele',
      'allele': {'id': '256', 'hgvs': 'Q996fs\r\n'},
      'extra_properties': {'gene_context': 'ATRX'}},
     {'allele_

## Save to a file

In [56]:
# inspect

print(phenopackets[1188])

{'id': '2280', 'subject': {'id': '2280', 'extra_properties': {'age_group': 'pediatric', 'patient_code': 'P-2280', 'sibling': '0', 'source_id': '21'}, 'age': {'age': 'P0Y4.0M'}, 'sex': 'FEMALE'}, 'biosamples': [{'id': 'S-2280', 'description': '2280', 'procedure': {'code': {'label': 'Procedure code not assigned', 'id': 'SNOMED:42630001'}}, 'extra_properties': {'diagnosis_note': 'Diagnosis: DIA', 'source_sample_id': 'PL652', 'tissue_location_first_level': 'Hemisphere', 'tissue_location_hemisphere': 'Left', 'tissue_location_organ': 'Brain', 'material': 'Tissue'}, 'taxonomy': {'id': 'NCBITaxon:9606', 'label': 'Homo sapiens'}, 'individual_age_at_collection': {'age': 'P0Y4.0M'}, 'histological_diagnosis': {'id': 'ICHANGE:86', 'label': 'Desmoplastic Infantile Astrocytoma (DIA)'}, 'sampled_tissue': {'id': 'ICHANGE:115', 'label': 'Fronto-Parietal Lobe'}}], 'diseases': [{'term': {'id': 'ICHANGE:86', 'label': 'Desmoplastic Infantile Astrocytoma (DIA)'}, 'extra_properties': {'active': '1', 'who_grad

In [57]:
# save to a file

with open(f"ichange_phenopackets_{timestamp}.json", "w") as fp:
    json.dump(phenopackets, fp, indent=4)

## QA Checks

##### This code just goes over the data and checks if all counts match

In [58]:
len(phenopackets)

5456

In [59]:
biosamples_count = 0

In [60]:
for ph in phenopackets:
    if "biosamples" in ph:
        for b in ph["biosamples"]:
            biosamples_count += 1

biosamples_count

10051

In [61]:
diagnosis_count = 0

for ph in phenopackets:
    if "biosamples" in ph:
        for b in ph["biosamples"]:
            if "histological_diagnosis" in b:
                diagnosis_count += 1

diagnosis_count

6891

In [62]:
diseases_count = 0

for ph in phenopackets:
    if "diseases" in ph:
        for d in ph["diseases"]:
            diseases_count += 1

diseases_count

3652

In [63]:
from collections import Counter

In [64]:
diseases_counter = Counter()

for ph in phenopackets:
    if "diseases" in ph:
        for d in ph["diseases"]:
            diseases_counter.update((d["term"]["label"],))

diseases_counter

Counter({'GBM': 521,
         'Immunodeficiency': 13,
         'ATRT': 110,
         'PNET': 51,
         'Normal Brain': 42,
         'Neural Stem Cell': 5,
         'Epithelioid Hemangioendothelioma': 20,
         'Pilocytic Astrocytoma': 366,
         'Pleomorphic Xanthoastrocytoma (PXA)': 15,
         'Germinoma': 11,
         'Oligodendroglioma': 2,
         'Anaplastic Ependymoma': 12,
         'Oligoastrocytoma': 28,
         'Oligodendroglioma GrII': 17,
         'Gliomastosis Cerebri': 2,
         'Angiosarcoma': 46,
         'Seminoma': 1,
         'Lymphoma': 4,
         'Ganglioglioma': 65,
         'Vasculitis': 1,
         'Medulloblastoma': 223,
         'Ependymoma': 62,
         'Ependymoma - myxopapillary': 12,
         'Low Grade Glioma (LGG)': 35,
         'Anaplastic Astrocytoma (AA)': 99,
         'Astrocytoma': 39,
         'HGG': 19,
         'Cortical Dysplasia': 9,
         'DNET': 61,
         'Diffuse Astrocytoma': 95,
         'Anaplastic Ganglioglioma': 3,

In [65]:
print(len(diseases_counter.keys()))

163


In [66]:
diagnosis_counter = Counter()

In [67]:
for ph in phenopackets:
    if "biosamples" in ph:
        for b in ph["biosamples"]:
            if "histological_diagnosis" in b and b["histological_diagnosis"]:
                #print(b["histological_diagnosis"])
                diagnosis_counter.update((b["histological_diagnosis"]["label"],))

diagnosis_counter

Counter({'GBM': 860,
         'Immunodeficiency': 15,
         'ATRT': 130,
         'PNET': 64,
         'Normal Brain': 40,
         'Neural Stem Cell': 15,
         'Epithelioid Hemangioendothelioma': 20,
         'Pilocytic Astrocytoma': 723,
         'Pleomorphic Xanthoastrocytoma (PXA)': 23,
         'Oligodendroglioma': 4,
         'Anaplastic Ependymoma': 38,
         'Oligoastrocytoma': 31,
         'Oligodendroglioma GrII': 17,
         'Gliomastosis Cerebri': 7,
         'Angiosarcoma': 46,
         'Seminoma': 1,
         'Lymphoma': 9,
         'Vasculitis': 1,
         'Medulloblastoma': 453,
         'Ependymoma': 188,
         'Ependymoma - myxopapillary': 32,
         'Low Grade Glioma (LGG)': 130,
         'Anaplastic Astrocytoma (AA)': 157,
         'HGG': 53,
         'Ganglioglioma': 133,
         'DNET': 83,
         'Cortical Dysplasia': 9,
         'Diffuse Astrocytoma': 108,
         'Anaplastic Ganglioglioma': 4,
         'Palmoplantar Keratoderma of the Vörne

In [68]:
print(len(diagnosis_counter.keys()))

162


In [69]:
for k in diseases_counter.keys():
    if k not in diagnosis_counter.keys():
        print(k)

Febrile infection-related epilepsy syndrome


In [70]:
ph_counter = Counter()

for ph in phenopackets:
    ph_counter.update((ph["id"],))

for k, v in ph_counter.items():
    if v > 1:
        print(k)


In [71]:
for k, v in ph_counter.items():
    print(k, v)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1474 1
1475 1
1476 1
1477 1
1478 1
1479 1
1480 1
1481 1
1482 1
1483 1
1484 1
1485 1
1486 1
1487 1
1488 1
1489 1
1490 1
1491 1
1492 1
1493 1
1494 1
1495 1
1496 1
1497 1
1498 1
1499 1
1500 1
1501 1
1502 1
1503 1
1504 1
1505 1
1506 1
1507 1
1508 1
1509 1
1510 1
1511 1
1512 1
1513 1
1514 1
1515 1
1516 1
1517 1
1518 1
1519 1
1521 1
1522 1
1523 1
1524 1
1525 1
1526 1
1527 1
1528 1
1529 1
1530 1
1531 1
1532 1
1533 1
1534 1
1535 1
1536 1
1537 1
1538 1
1539 1
1540 1
1541 1
1542 1
1543 1
1544 1
1545 1
1546 1
1547 1
1548 1
1549 1
1550 1
1551 1
1552 1
1553 1
1554 1
1555 1
1556 1
1557 1
1558 1
1559 1
1560 1
1561 1
1562 1
1563 1
1564 1
1565 1
1566 1
1567 1
1569 1
1570 1
1571 1
1572 1
1573 1
1574 1
1575 1
1576 1
1577 1
1578 1
1579 1
1580 1
1581 1
1582 1
1583 1
1584 1
1585 1
1586 1
1587 1
1588 1
1589 1
1590 1
1591 1
1592 1
1593 1
1594 1
1595 1
1597 1
1598 1
1599 1
1600 1
1601 1
1602 1
1603 1
1604 1
1605 1
1606 1
1607 1
1608 1
1609 1
1610

In [72]:
len(phenopackets)

5456

In [73]:
subjects =[]
for ph in phenopackets:
    subjects.append(ph["subject"]["id"])


In [74]:
len(subjects)

5456

In [75]:
subjects[:10]

['0', '1003', '1004', '1005', '1006', '1007', '1008', '1009', '1010', '1011']

In [76]:
set(subjects)

{'6022',
 '5408',
 '1817',
 '1181',
 '2321',
 '3112',
 '2661',
 '4273',
 '6758',
 '4344',
 '1784',
 '5608',
 '2967',
 '2437',
 '3441',
 '7137',
 '2902',
 '1332',
 '4795',
 '3218',
 '7132',
 '1244',
 '1859',
 '6939',
 '1302',
 '2915',
 '4008',
 '6553',
 '5874',
 '5129',
 '4362',
 '6464',
 '3091',
 '6806',
 '4547',
 '2837',
 '5617',
 '1611',
 '2247',
 '2616',
 '3938',
 '4127',
 '4015',
 '5386',
 '2651',
 '4922',
 '4497',
 '1191',
 '2181',
 '7136',
 '6680',
 '3042',
 '6100',
 '6594',
 '4623',
 '6069',
 '1998',
 '1702',
 '5058',
 '4939',
 '4970',
 '5133',
 '7144',
 '2387',
 '2328',
 '6645',
 '3138',
 '7257',
 '4507',
 '1933',
 '6147',
 '5893',
 '3906',
 '5768',
 '5104',
 '1426',
 '3797',
 '5710',
 '5836',
 '2455',
 '6505',
 '4367',
 '4368',
 '7142',
 '2586',
 '6571',
 '4004',
 '2313',
 '1125',
 '5109',
 '1523',
 '7268',
 '7164',
 '6128',
 '4412',
 '2154',
 '2383',
 '3003',
 '5030',
 '3072',
 '6507',
 '2791',
 '2693',
 '5609',
 '6212',
 '5344',
 '2705',
 '4411',
 '6365',
 '1445',
 '2046',
 

In [77]:
len(set(subjects))

5456

In [78]:
original_phenopacket_ids = []
for p in phenopackets:
    original_phenopacket_ids.append(p["id"])

len(original_phenopacket_ids)

5456