In [18]:
from ista import FlatFileDatabaseParser
from ista.util import print_onto_stats

In [19]:
import owlready2 as owl

In [20]:
# Locations of the CPO ontology file and of the flat-file source data.
# The defaults are the original machine-specific paths, so behaviour is unchanged
# when nothing is configured. Set CPO_ONTOLOGY_IRI and/or CPO_DATA_DIR in the
# environment to run the notebook on another machine without editing this cell.
import os

onto_iri = os.environ.get(
    "CPO_ONTOLOGY_IRI",
    "file://C:\\Users\\musker\\Desktop\\cs611_project\\neo4j\\import\\CPO.rdf",
)
onto = owl.get_ontology(onto_iri).load()
data_dir = os.environ.get(
    "CPO_DATA_DIR",
    "C:\\Users\\musker\\Desktop\\cs611_project\\ontologies",
)

In [21]:
# One flat-file parser per source database. Both are bound to the same ontology
# object and the same data directory; "drugbank" is constructed first.
drugbank, hetionet = (
    FlatFileDatabaseParser(db_name, onto, data_dir)
    for db_name in ("drugbank", "hetionet")
)

In [129]:
import pandas as pd
# Hetionet node table, read as tab-separated text.
nodes = pd.read_csv(
    "datasets/hetionet/hetionet-v1.0-nodes.tsv",
    sep="\t",
)

In [23]:
# Preview the first five rows of the node table.
nodes.head(n=5)

Unnamed: 0,id,name,kind
0,Anatomy::UBERON:0000002,uterine cervix,Anatomy
1,Anatomy::UBERON:0000004,nose,Anatomy
2,Anatomy::UBERON:0000006,islet of Langerhans,Anatomy
3,Anatomy::UBERON:0000007,pituitary gland,Anatomy
4,Anatomy::UBERON:0000010,peripheral nervous system,Anatomy


In [24]:
# Case-insensitive search for nodes whose name contains "sleep phase".
nodes.loc[nodes["name"].str.contains("sleep phase", case=False)]

Unnamed: 0,id,name,kind
45145,Side Effect::C0852566,Sleep phase rhythm disturbance,Side Effect


In [25]:
# Hetionet edge list, read as tab-separated text.
edges = pd.read_csv(
    "datasets/hetionet/edges.sif",
    sep="\t",
)

In [26]:
# Case-insensitive search for nodes whose name contains "whin".
nodes.loc[nodes["name"].str.contains("whin", case=False)]

Unnamed: 0,id,name,kind


In [27]:
# Case-insensitive search for nodes whose name contains "encephalopathy".
nodes.loc[nodes["name"].str.contains("encephalopathy", case=False)]

Unnamed: 0,id,name,kind
41053,Side Effect::C0006112,Metabolic encephalopathy,Side Effect
41545,Side Effect::C0019151,Hepatic encephalopathy,Side Effect
41772,Side Effect::C0023524,Progressive multifocal leukoencephalopathy,Side Effect
42555,Side Effect::C0085584,Encephalopathy,Side Effect
42627,Side Effect::C0149504,Toxic encephalopathy,Side Effect
42745,Side Effect::C0151620,Hypertensive encephalopathy,Side Effect
43928,Side Effect::C0270612,Leukoencephalopathy,Side Effect
45933,Side Effect::C1140716,Hypoxic-ischaemic encephalopathy,Side Effect
46110,Side Effect::C1306587,Encephalopathy acute,Side Effect
46538,Side Effect::C3160858,Posterior reversible encephalopathy syndrome,Side Effect


In [28]:
# Exact (case-sensitive) match on the node name "sma".
nodes.loc[nodes["name"].eq("sma")]

Unnamed: 0,id,name,kind


In [29]:
# All edges whose metaedge is "DpS" (Disease -> Symptom rows in the output).
edges.loc[edges["metaedge"].eq("DpS")]

Unnamed: 0,source,metaedge,target
720777,Disease::DOID:7693,DpS,Symptom::D007383
720778,Disease::DOID:2994,DpS,Symptom::D021501
720779,Disease::DOID:7148,DpS,Symptom::D010146
720780,Disease::DOID:1319,DpS,Symptom::D013064
720781,Disease::DOID:1324,DpS,Symptom::D003248
...,...,...,...
724129,Disease::DOID:12236,DpS,Symptom::D011537
724130,Disease::DOID:10763,DpS,Symptom::D057774
724131,Disease::DOID:1192,DpS,Symptom::D003638
724132,Disease::DOID:4989,DpS,Symptom::D010148


In [41]:
# Add cerebral palsy (CP) to the Hetionet tables: one Disease node, plus one
# "DpS" edge from CP to every row of symptoms.csv.
CP_ID = "Disease::DOID:1969"

# Add the CP node once; the guard keeps this cell safe to re-run.
# NOTE(review): `len(nodes)` is used as the new row label, which assumes `nodes`
# still has its default 0..n-1 integer index — confirm.
if not (nodes["id"] == CP_ID).any():
    nodes.loc[len(nodes)] = [CP_ID, "Cerebral Palsy", "Disease"]

# CP symptom list; only its `name` column is used below.
cp_symptoms = pd.read_csv("symptoms.csv")

# Guard on CP's "DpS" edges specifically. Checking only the source column would
# skip the symptom edges whenever CP already had an edge of any other type.
cp_has_symptom_edges = (
    (edges["source"] == CP_ID) & (edges["metaedge"] == "DpS")
).any()

if not cp_has_symptom_edges:
    cp_symptoms_rel = pd.DataFrame({
        'source': [CP_ID] * len(cp_symptoms),    # same source on every row
        'metaedge': ['DpS'] * len(cp_symptoms),  # same edge type on every row
        # NOTE(review): targets are raw symptom names (e.g. "Diplegia"), whereas
        # the existing DpS rows use node ids such as "Symptom::D007383" — confirm
        # that downstream consumers can resolve names.
        'target': cp_symptoms['name'],
    })

    edges = pd.concat([edges, cp_symptoms_rel], ignore_index=True)

edges

Unnamed: 0,source,metaedge,target
0,Gene::9021,GpBP,Biological Process::GO:0071357
1,Gene::51676,GpBP,Biological Process::GO:0098780
2,Gene::19,GpBP,Biological Process::GO:0055088
3,Gene::3176,GpBP,Biological Process::GO:0010243
4,Gene::3039,GpBP,Biological Process::GO:0006898
...,...,...,...
2250206,Disease::DOID:1969,DpS,Diplegia
2250207,Disease::DOID:1969,DpS,Hemiplegia
2250208,Disease::DOID:1969,DpS,Extrapyramidal disorder
2250209,Disease::DOID:1969,DpS,Chorea


In [47]:
# Build the treatment node table from the treatment relationship file.
treatments_rel = pd.read_csv("pedi_treatments.csv")

# One row per distinct treatment name (the first occurrence is kept), with the
# index renumbered from 0.
treatments = (
    treatments_rel.loc[:, ["treatment", "kind"]]
    .drop_duplicates(subset='treatment')
    .reset_index(drop=True)
)

# Sequential, zero-padded identifiers: "Treatment::CPO00001", "Treatment::CPO00002", ...
treatment_count = len(treatments)
treatments['id'] = [
    f"Treatment::CPO{str(i).zfill(5)}" for i in range(1, treatment_count + 1)
]

In [53]:
# Rename `treatment` to `name`, keep the node columns in id / name / kind order,
# and write the result (without the index) as the CP dataset's node file.
treatments = treatments.rename(columns={"treatment": "name"})
treatments = treatments.loc[:, ["id", "name", "kind"]]
treatments.to_csv("datasets/cp_dataset/nodes.csv", index=False)

In [98]:
# Load the CP treatment edges and a fresh copy of the Hetionet nodes, then add a
# lowercased name column so targets can be matched case-insensitively.
# NOTE(review): `treatments` held the treatment *node* table in the earlier cells
# and is rebound to the *edge* table here; the name is kept in case other cells
# rely on it.
treatments = pd.read_csv("datasets/cp_dataset/edges.csv")
hetionet_nodes = pd.read_csv("datasets/hetionet/hetionet-v1.0-nodes.tsv", sep='\t')
hetionet_nodes["name_lower"] = hetionet_nodes["name"].str.lower()

# Display the edges joined to the Hetionet nodes whose lowercased name equals the
# edge target. An assignment alone displays nothing; presumably this join is what
# produced the recorded output (source, metaedge, target, id, name, kind,
# name_lower) — confirm.
treatments.merge(hetionet_nodes, left_on="target", right_on="name_lower", how="inner")

Unnamed: 0,source,metaedge,target,id,name,kind,name_lower
0,Treatment::CPO00001,therapeutic,hydrocephalus,Side Effect::C0020255,Hydrocephalus,Side Effect,hydrocephalus
1,Treatment::CPO00002,therapeutic,hydrocephalus,Side Effect::C0020255,Hydrocephalus,Side Effect,hydrocephalus
2,Treatment::CPO00002,therapeutic,hydrocephalus,Side Effect::C0020255,Hydrocephalus,Side Effect,hydrocephalus
3,Treatment::CPO00002,adverse,irritability,Side Effect::C0022107,Irritability,Side Effect,irritability
4,Treatment::CPO00002,adverse,anorexia,Side Effect::C0003123,Anorexia,Side Effect,anorexia
...,...,...,...,...,...,...,...
111,Treatment::CPO00053,therapeutic,dysautonomia,Side Effect::C0013363,Dysautonomia,Side Effect,dysautonomia
112,Treatment::CPO00054,therapeutic,muscle atrophy,Biological Process::GO:0014889,muscle atrophy,Biological Process,muscle atrophy
113,Treatment::CPO00054,therapeutic,muscle atrophy,Side Effect::C0026846,Muscle atrophy,Side Effect,muscle atrophy
114,Treatment::CPO00020,therapeutic,ataxia,Side Effect::C0004134,Ataxia,Side Effect,ataxia


In [123]:
# Distinct treatment targets, matched case-insensitively against node names and
# restricted to nodes whose id starts with "Symptom".
treatment_targets = pd.DataFrame(treatments_rel["target"].drop_duplicates())

# Build the relabelled node table directly from `nodes`, so this cell does not
# depend on a frame (`n`) that is only created in a later cell. Ids that start
# with "Side Effect::" are rewritten to start with "Symptom::".
n_cp = nodes.copy()
n_cp["id"] = n_cp["id"].str.replace(r"^Side Effect::", "Symptom::", regex=True)
n_cp["name_lower"] = n_cp["name"].str.lower()

symptom_like_nodes = n_cp[n_cp["id"].str.startswith("Symptom")]
treatment_targets.merge(symptom_like_nodes, left_on="target", right_on="name_lower", how="inner")

Unnamed: 0,target,id,name,kind,name_lower
0,hydrocephalus,Symptom::C0020255,Hydrocephalus,Side Effect,hydrocephalus
1,irritability,Symptom::C0022107,Irritability,Side Effect,irritability
2,anorexia,Symptom::C0003123,Anorexia,Side Effect,anorexia
3,anorexia,Symptom::D000855,Anorexia,Symptom,anorexia
4,c-reactive protein increased,Symptom::C0742906,C-reactive protein increased,Side Effect,c-reactive protein increased
...,...,...,...,...,...
56,peripheral sensorimotor neuropathy,Symptom::C1112256,Peripheral sensorimotor neuropathy,Side Effect,peripheral sensorimotor neuropathy
57,dysautonomia,Symptom::C0013363,Dysautonomia,Side Effect,dysautonomia
58,muscle atrophy,Symptom::C0026846,Muscle atrophy,Side Effect,muscle atrophy
59,ataxia,Symptom::C0004134,Ataxia,Side Effect,ataxia


In [124]:
# Case-insensitive exact lookup. Compare against the lowercased name column:
# a lowercase literal can never equal a capitalised name such as "Chorea".
hetionet_nodes.loc[hetionet_nodes["name_lower"] == "diplegia"]

Unnamed: 0,id,name,kind,name_lower


In [118]:
# Independent (deep) copy of the node table, so relabelling ids leaves `nodes` untouched.
n = nodes.copy(deep=True)


def replace_prefix(id):
    """Relabel a "Side Effect" node id as a "Symptom" id.

    Parameters
    ----------
    id : str
        Node identifier of the form "<kind>::<accession>".

    Returns
    -------
    str
        "Symptom::<accession>" when `id` starts with "Side Effect::";
        otherwise `id` unchanged.
    """
    # Split on the first "::" only. This tests the kind prefix itself (not the
    # text "Side Effect" anywhere in the id), keeps the full accession even if it
    # contains "::", and cannot raise when the separator is missing.
    kind, separator, accession = id.partition("::")
    if separator and kind == "Side Effect":
        return "Symptom::" + accession
    return id

# Pass every node id through replace_prefix, then preview the first 20 rows.
n["id"] = n["id"].map(replace_prefix)
n.head(20)

Unnamed: 0,id,name,kind
0,Anatomy::UBERON:0000002,uterine cervix,Anatomy
1,Anatomy::UBERON:0000004,nose,Anatomy
2,Anatomy::UBERON:0000006,islet of Langerhans,Anatomy
3,Anatomy::UBERON:0000007,pituitary gland,Anatomy
4,Anatomy::UBERON:0000010,peripheral nervous system,Anatomy
5,Anatomy::UBERON:0000011,parasympathetic nervous system,Anatomy
6,Anatomy::UBERON:0000013,sympathetic nervous system,Anatomy
7,Anatomy::UBERON:0000020,sense organ,Anatomy
8,Anatomy::UBERON:0000026,appendage,Anatomy
9,Anatomy::UBERON:0000029,lymph node,Anatomy


In [133]:
# Case-sensitive search for nodes whose name contains "Chorea".
nodes.loc[nodes["name"].str.contains("Chorea")]


Unnamed: 0,id,name,kind
41152,Side Effect::C0008489,Chorea,Side Effect
46645,Symptom::D002819,Chorea,Symptom
46905,Symptom::D020150,Chorea Gravidarum,Symptom


In [None]:
# NODE MAPPINGS

# Drug nodes, read from the custom DrugBank links table.
# "DrugBank ID" is the IRI column; "CAS Number" is the merge column.
drug_node_config = {
    "iri_column_name": "DrugBank ID",
    "headers": True,
    "data_property_map": {
        "DrugBank ID": onto.xrefDrugbank,
        "CAS Number": onto.xrefCasRN,
        "Name": onto.commonName,
        "data_resource": onto.sourceDatabase,
    },
    "merge_column": {
        "source_column_name": "CAS Number",
        "data_property": onto.xrefCasRN,
    },
}

drugbank.parse_node_type(
    node_type="Drug",  # "Drug" as in AlzKB, instead of ComptoxAI's "Chemical"
    source_filename="CUSTOM/drug_links.tsv",
    fmt="tsv",
    parse_config=drug_node_config,
    merge=False,
    skip=False,
)


# Symptom nodes, read from the customized Hetionet node file. Rows are filtered
# on kind == "Symptom"; the id column is reduced to the part after the last "::"
# before being stored as the MeSH cross-reference.
symptom_node_config = {
    "iri_column_name": "name",
    "headers": True,
    "filter_column": "kind",
    "filter_value": "Symptom",
    "data_transforms": {
        "id": lambda node_id: node_id.split("::")[-1],
    },
    "data_property_map": {
        "id": onto.xrefMeSH,
        "name": onto.commonName,
        "sourceDB": onto.sourceDatabase,
    },
}

hetionet.parse_node_type(
    node_type="Symptom",
    source_filename="hetionet-custom-nodes.tsv",  # use customized hetionet
    fmt="tsv",
    parse_config=symptom_node_config,
    merge=False,
    skip=False,
)

In [None]:
# Relationship mappings

# DRUGTREATSDISEASE (OPTIONAL)

# Hetionet makes a messy distinction between 'treats' and 'palliates' which we ignore.
# NOTE(review): only the "CpD" metaedge is loaded by this call. If 'treats' edges
# use a different metaedge (presumably "CtD" in Hetionet — confirm), they are not
# imported here.
drug_treats_disease_config = {
    "subject_node_type": onto.Drug,
    "subject_column_name": "source",
    "subject_match_property": onto.xrefDrugbank,
    "object_node_type": onto.Disease,
    "object_column_name": "target",
    "object_match_property": onto.xrefDiseaseOntology,
    "filter_column": "metaedge",
    "filter_value": "CpD",
    "headers": True,
    "data_transforms": {
        # Keep only the part after the last "::" of the source id.
        "source": lambda raw: raw.split("::")[-1],
        # Keep only the part after the last ":", because hetionet prefixes DOIDs with 'DOID:'.
        "target": lambda raw: raw.split(":")[-1],
    },
}

hetionet.parse_relationship_type(
    relationship_type=onto.drugTreatsDisease,
    source_filename="hetionet-v1.0-edges.sif",
    fmt="tsv",
    parse_config=drug_treats_disease_config,
    merge=False,
    skip=False,
)

# DISEASE_ASSOCIATES_WITH_DISEASE

# Disease-to-disease edges ("DrD") from the customized Hetionet edge file.
# Both endpoints are matched on their Disease Ontology cross-reference.
disease_disease_config = {
    "subject_node_type": onto.Disease,
    "subject_column_name": "source",
    "subject_match_property": onto.xrefDiseaseOntology,
    "object_node_type": onto.Disease,
    "object_column_name": "target",
    "object_match_property": onto.xrefDiseaseOntology,
    "filter_column": "metaedge",
    "filter_value": "DrD",
    "headers": True,
    "data_transforms": {
        # Keep only the part after the last ":", because hetionet prefixes DOIDs with 'DOID:'.
        "source": lambda raw: raw.split(":")[-1],
        "target": lambda raw: raw.split(":")[-1],
    },
}

hetionet.parse_relationship_type(
    relationship_type=onto.diseaseAssociatesWithDisease,
    source_filename="hetionet-custom-edges.tsv",  # use customized hetionet
    fmt="tsv",
    parse_config=disease_disease_config,
    merge=False,
    skip=False,
)

# SYMPTOM_MANIFESTATION_OF_DISEASE

# "DpS" edges from the customized Hetionet edge file. The file lists
# disease -> symptom, so the columns are swapped: the symptom (`target`) is the
# subject and the disease (`source`) is the object.
# NOTE(review): a target without "::" (e.g. a bare symptom name) passes through
# its transform unchanged and is presumably then matched against xrefMeSH —
# confirm that every custom edge target is a MeSH-style id.
symptom_disease_config = {
    "subject_node_type": onto.Symptom,
    "subject_column_name": "target",  # Flip target and source
    "subject_match_property": onto.xrefMeSH,
    "object_node_type": onto.Disease,
    "object_column_name": "source",
    "object_match_property": onto.xrefDiseaseOntology,
    "filter_column": "metaedge",
    "filter_value": "DpS",
    "headers": True,
    "data_transforms": {
        # Keep only the part after 'DOID:', because hetionet prefixes DOIDs with it.
        "source": lambda raw: raw.split("DOID:")[-1],
        # Keep only the part after the last "::" of the symptom id.
        "target": lambda raw: raw.split("::")[-1],
    },
}

hetionet.parse_relationship_type(
    relationship_type=onto.symptomManifestationOfDisease,
    source_filename="hetionet-custom-edges.tsv",  # use customized hetionet
    fmt="tsv",
    parse_config=symptom_disease_config,
    merge=False,
    skip=False,
)