In [1]:
from tqdm.notebook import tqdm
import re
import os 
import shutil
import numpy as np
import pandas as pd
import igraph as ig
from scipy.sparse import lil_matrix, save_npz
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import AutoTokenizer, AutoModel, pipeline
%load_ext autoreload
%autoreload 2

In [2]:
# Original data
data_path = '/n/data1/hms/dbmi/zitnik/lab/datasets/2021-06-RxGNN/v07-12-2021/datasets/'

# New data to add
new_data_dir = '/n/data1/hms/dbmi/zitnik/lab/users/mli/PrimeKG/'
new_data_path = new_data_dir + 'Monarch_6July2022/'

save_path = '/n/data1/hms/dbmi/zitnik/lab/users/mli/PrimeKG/kg/'

# Read datasets

In [3]:
def assert_dtypes(df): 
    all_string = True
    for i, x in enumerate(df.dtypes.values): 
        if x != np.dtype('O'): 
            all_string = False
            print(df.columns[i], x)
    if not all_string: assert False

In [4]:
# Original PrimeKG Datasets

df_ppi = pd.read_csv(data_path+'ppi/protein_protein.csv', low_memory=False).dropna()
df_ppi = df_ppi.astype({'proteinA_entrezid':int}).astype({'proteinA_entrezid':str})
df_ppi = df_ppi.astype({'proteinB_entrezid':int}).astype({'proteinB_entrezid':str})
assert_dtypes(df_ppi)

df_drugbank = pd.read_csv(data_path+'drugbank/drug_protein.csv', low_memory=False)
df_drugbank = df_drugbank.get(['DrugBank', 'relation', 'NCBIGeneID','DrugBankName']).dropna()
df_drugbank = df_drugbank.astype({'NCBIGeneID':int}).astype({'NCBIGeneID':str})
assert_dtypes(df_drugbank)

df_disgenet = pd.read_csv(data_path+'disgenet/curated_gene_disease_associations.tsv', sep='\t', low_memory=False)
df_disgenet = df_disgenet.astype({'geneId':int}).astype({'geneId':str})

df_mondo_terms = pd.read_csv(data_path+'mondo/mondo_terms.csv', low_memory=False)
df_mondo_terms = df_mondo_terms.astype({'id':int}).astype({'id':str})

df_mondo_xref = pd.read_csv(data_path+'mondo/mondo_references.csv', low_memory=False)
df_mondo_xref = df_mondo_xref.astype({'mondo_id':int}).astype({'mondo_id':str})
assert_dtypes(df_mondo_xref)

df_mondo_parents = pd.read_csv(data_path+'mondo/mondo_parents.csv', low_memory=False)
df_mondo_parents = df_mondo_parents.astype({'parent':int}).astype({'parent':str})
df_mondo_parents = df_mondo_parents.astype({'child':int}).astype({'child':str})
assert_dtypes(df_mondo_parents)

df_drug_central = pd.read_csv(data_path+'drugcentral/drug_disease.csv', low_memory=False)
df_drug_central = df_drug_central.get(['cas_reg_no','relationship_name', 'umls_cui']) # 'concept_id', 'concept_name', 'snomed_conceptid'
df_drug_central = df_drug_central.query('not @df_drug_central.cas_reg_no.isna()')
df_drug_central = df_drug_central.query('not @df_drug_central.umls_cui.isna()')
assert_dtypes(df_drug_central)

df_ddi = pd.read_csv(data_path+'drugbank/drug_drug.csv', low_memory=False)
assert_dtypes(df_ddi)

df_hp_terms = pd.read_csv(data_path+'hpo/hp_terms.csv', low_memory=False)
df_hp_terms = df_hp_terms.astype({'id':int}).astype({'id':str})

df_hp_xref = pd.read_csv(data_path+'hpo/hp_references.csv', low_memory=False)
df_hp_xref = df_hp_xref.astype({'hp_id':int}).astype({'hp_id':str})

df_hp_parents = pd.read_csv(data_path+'hpo/hp_parents.csv', low_memory=False)
df_hp_parents = df_hp_parents.astype({'parent':int}).astype({'parent':str})
df_hp_parents = df_hp_parents.astype({'child':int}).astype({'child':str})
assert_dtypes(df_hp_parents)

df_hpoa_pos = pd.read_csv(data_path+'hpo/disease_phenotype_pos.csv', low_memory=False)
df_hpoa_pos = df_hpoa_pos.astype({'hp_id':int}).astype({'hp_id':str})
df_hpoa_pos = df_hpoa_pos.astype({'disease_ontology_id':int}).astype({'disease_ontology_id':str})
assert_dtypes(df_hpoa_pos)

df_hpoa_neg = pd.read_csv(data_path+'hpo/disease_phenotype_neg.csv', low_memory=False)
df_hpoa_neg = df_hpoa_neg.astype({'hp_id':int}).astype({'hp_id':str})
df_hpoa_neg = df_hpoa_neg.astype({'disease_ontology_id':int}).astype({'disease_ontology_id':str})
assert_dtypes(df_hpoa_neg)

df_sider = pd.read_csv(data_path+'sider/sider.csv', low_memory=False)
assert_dtypes(df_sider)

df_go_terms = pd.read_csv(data_path+'go/go_terms_info.csv', low_memory=False)
df_go_terms = df_go_terms.astype({'go_term_id':int}).astype({'go_term_id':str})
assert_dtypes(df_go_terms)

df_go_edges = pd.read_csv(data_path+'go/go_terms_relations.csv', low_memory=False)
df_go_edges = df_go_edges.astype({'x':int}).astype({'x':str})
df_go_edges = df_go_edges.astype({'y':int}).astype({'y':str})
assert_dtypes(df_go_edges)

df_gene2go = pd.read_csv(data_path+'ncbigene/protein_go_associations.csv', low_memory=False)
df_gene2go = df_gene2go.astype({'ncbi_gene_id':int}).astype({'ncbi_gene_id':str})
df_gene2go = df_gene2go.astype({'go_term_id':int}).astype({'go_term_id':str})
assert_dtypes(df_gene2go)

df_exposures = pd.read_csv(data_path+'ctd/exposure_data.csv', low_memory=False)
df_exposures = df_exposures.get(['exposurestressorname', 'exposurestressorid',
                  'exposuremarker', 'exposuremarkerid',
                  'diseasename', 'diseaseid',
                  'phenotypename', 'phenotypeid'])
assert_dtypes(df_exposures)

df_uberon_terms = pd.read_csv(data_path+'uberon/uberon_terms.csv', low_memory=False)
df_uberon_terms = df_uberon_terms.astype({'id':int}).astype({'id':str})
assert_dtypes(df_uberon_terms)

df_uberon_is_a = pd.read_csv(data_path+'uberon/uberon_is_a.csv', low_memory=False)
df_uberon_is_a = df_uberon_is_a.astype({'id':int}).astype({'id':str})
df_uberon_is_a = df_uberon_is_a.astype({'is_a':int}).astype({'is_a':str})
assert_dtypes(df_uberon_is_a)

df_uberon_rels = pd.read_csv(data_path+'uberon/uberon_rels.csv', low_memory=False)
df_uberon_rels = df_uberon_rels.astype({'id':int}).astype({'id':str})
df_uberon_rels = df_uberon_rels.astype({'relation_id':int}).astype({'relation_id':str})
assert_dtypes(df_uberon_rels)

df_bgee = pd.read_csv(data_path+'bgee/anatomy_gene.csv', low_memory=False)
df_bgee = df_bgee.astype({'expression_rank':int}).astype({'expression_rank':str})
df_bgee = df_bgee.astype({'anatomy_id':int}).astype({'anatomy_id':str})
assert_dtypes(df_bgee)

df_reactome_terms = pd.read_csv(data_path+'reactome/reactome_terms.csv', low_memory=False)
assert_dtypes(df_reactome_terms)

df_reactome_rels = pd.read_csv(data_path+'reactome/reactome_relations.csv', low_memory=False)
assert_dtypes(df_reactome_rels)

df_reactome_ncbi = pd.read_csv(data_path+'reactome/reactome_ncbi.csv', low_memory=False)
df_reactome_ncbi = df_reactome_ncbi[df_reactome_ncbi.ncbi_id.str.isnumeric()]
assert_dtypes(df_reactome_ncbi)

df_umls_mondo = pd.read_csv(data_path+'vocab/umls_mondo.csv', low_memory=False)
df_umls_mondo = df_umls_mondo.astype({'mondo_id':int}).astype({'mondo_id':str})
assert_dtypes(df_umls_mondo)

df_prot_names = pd.read_csv(data_path+'vocab/gene_names.csv', low_memory=False, sep='\t')
df_prot_names = df_prot_names.rename(columns={'NCBI Gene ID(supplied by NCBI)':'ncbi_id', 'NCBI Gene ID':'ncbi_id2', 'Approved symbol':'symbol', 'Approved name':'name'})
df_prot_names = df_prot_names.get(['ncbi_id', 'symbol']).dropna()
df_prot_names = df_prot_names.astype({'ncbi_id':int}).astype({'ncbi_id':str})
assert_dtypes(df_prot_names)

db_vocab = pd.read_csv(data_path+'vocab/drugbank_vocabulary.csv', low_memory=False)
assert_dtypes(db_vocab)

df_db_atc = pd.read_csv(data_path+'vocab/drugbank_atc_codes.csv', low_memory=False).get(['atc_code','parent_key'])
assert_dtypes(df_db_atc)

In [5]:
# New datasets to add to PrimeKG

df_variant_disease = pd.read_csv(new_data_path+'variant_disease.all.tsv', low_memory=False, sep='\t')
df_variant_disease = df_variant_disease[df_variant_disease["subject_taxon_label"] == "Homo sapiens"]
df_variant_disease = df_variant_disease.rename(columns={"subject": "variant_id", "subject_label": "variant_label", "object": "disease_id", "object_label": "disease_name"})
df_variant_disease[["disease_source", "disease_id"]] = df_variant_disease["disease_id"].str.split(":", expand=True)
print(df_variant_disease["disease_source"].unique())
df_variant_disease["disease_id"] = df_variant_disease["disease_id"].astype(int).astype(str)
df_variant_disease = df_variant_disease.get(["variant_id", "variant_label", "disease_id", "disease_name", "disease_source", "relation_label"]).dropna()
assert_dtypes(df_variant_disease)

df_variant_gene = pd.read_csv(new_data_path+'variant_gene.all.tsv', low_memory=False, sep='\t')
df_variant_gene = df_variant_gene[df_variant_gene["subject_taxon_label"] == "Homo sapiens"]
df_variant_gene = df_variant_gene.rename(columns={"subject": "variant_id", "subject_label": "variant_label", "object": "gene_id", "object_label": "gene_name"})
df_variant_gene[["gene_source", "gene_id"]] = df_variant_gene["gene_id"].str.split(":", expand=True)
print(df_variant_gene["gene_source"].unique())
df_variant_gene = df_variant_gene.get(["variant_id", "variant_label", "gene_id", "gene_name", "gene_source", "relation_label"]).dropna()
assert_dtypes(df_variant_gene)

df_variant_phenotype = pd.read_csv(new_data_path+'variant_phenotype.all.tsv', low_memory=False, sep='\t')
df_variant_phenotype = df_variant_phenotype[df_variant_phenotype["subject_taxon_label"] == "Homo sapiens"]
df_variant_phenotype = df_variant_phenotype.rename(columns={"subject": "variant_id", "subject_label": "variant_label", "object": "phenotype_id", "object_label": "phenotype_label"})
df_variant_phenotype[["phenotype_source", "phenotype_id"]] = df_variant_phenotype["phenotype_id"].str.split(":", expand=True)
df_variant_phenotype["phenotype_id"] = df_variant_phenotype["phenotype_id"].astype(int).astype(str)
print(df_variant_phenotype["phenotype_source"].unique())
df_variant_phenotype = df_variant_phenotype.get(["variant_id", "variant_label", "phenotype_id", "phenotype_label", "phenotype_source", "relation_label"]).dropna()
assert_dtypes(df_variant_phenotype)

df_gene_mapping = pd.read_csv(new_data_dir+'gene_mappings_6July2022.txt', low_memory=False, sep="\t")
df_gene_mapping = df_gene_mapping.rename(columns={"Approved symbol": "gene_name", "NCBI gene ID": "ncbi_id", "OMIM ID": "omim_id", "Ensembl gene ID": "ens_id"})
df_gene_mapping[["source", "hgnc_id"]] = df_gene_mapping["HGNC ID"].str.split(":", expand=True)
df_gene_mapping = df_gene_mapping.get(["hgnc_id", "ncbi_id", "omim_id", "ens_id", "gene_name"])

['MONDO' 'OMIM']
['HGNC' 'NCBIGene' 'ENSEMBL' 'OMIM']
['MONDO' 'EFO' 'HP']


In [6]:
# Overlap of variant IDs

dis_variants = set(list(df_variant_disease["variant_id"].unique()))
gen_variants = set(list(df_variant_gene["variant_id"].unique()))
phe_variants = set(list(df_variant_phenotype["variant_id"].unique()))

print("Disease associated:", len(dis_variants), "Gene associated:", len(gen_variants), "Phenotype associated:", len(phe_variants))

print("Overlap between dis and gen:", len(dis_variants.intersection(gen_variants)))
print("Overlap between dis and phe:", len(dis_variants.intersection(phe_variants)))
print("Overlap between gen and phe:", len(gen_variants.intersection(phe_variants)))

print("Overlap between gen and phe+dis:", len(gen_variants.intersection(phe_variants.union(dis_variants))))

print(gen_variants.difference(phe_variants.union(dis_variants)))

Disease associated: 148551 Gene associated: 219399 Phenotype associated: 132649
Overlap between dis and gen: 112937
Overlap between dis and phe: 9225
Overlap between gen and phe: 80903
Overlap between gen and phe+dis: 188039
{'ClinVarVariant:477594', 'OMIM:276903.0012', 'OMIM:617782.0014', 'OMIM:600924.0004', 'OMIM:147880.0003', 'OMIM:123833.0005', 'OMIM:300390.0011', 'OMIM:607008.0015', 'OMIM:300550.0004', 'OMIM:192340.0012', 'OMIM:120240.0016', 'OMIM:606463.0026', 'OMIM:609014.0007', 'OMIM:312180.0001', 'OMIM:120436.0008', 'OMIM:612309.0013', 'OMIM:118955.0003', 'OMIM:605802.0008', 'OMIM:142000.0021', 'OMIM:180901.0035', 'OMIM:109270.0005', 'OMIM:603845.0002', 'OMIM:617782.0006', 'OMIM:607614.0001', 'OMIM:607939.0005', 'OMIM:138079.0012', 'OMIM:614459.0004', 'OMIM:608473.0003', 'OMIM:608378.0006', 'OMIM:128260.0002', 'OMIM:611060.0009', 'OMIM:300841.0227', 'OMIM:300311.0002', 'ClinVarVariant:613408', 'OMIM:600509.0017', 'OMIM:141800.0133', 'OMIM:600797.0001', 'OMIM:609539.0007', 'OMI

# Converting databases into graph edges

In [7]:
def clean_edges(df): 
    df = df.get(['relation', 'display_relation', 'x_id','x_type', 'x_name', 'x_source','y_id','y_type', 'y_name', 'y_source'])
    df = df.dropna()
    df = df.drop_duplicates()
    df = df.query('not ((x_id == y_id) and (x_type == y_type) and (x_source == y_source) and (x_name == y_name))')
    return df

## Basic

### Protein protein interactions (NCBI)

In [8]:
df_prot_prot = pd.merge(df_ppi, df_prot_names, 'left', left_on='proteinA_entrezid', right_on='ncbi_id').rename(columns={'symbol':'symbolA'})
df_prot_prot = pd.merge(df_prot_prot, df_prot_names, 'left', left_on='proteinB_entrezid', right_on='ncbi_id').rename(columns={'symbol':'symbolB'})

df_prot_prot = df_prot_prot.rename(columns={'proteinA_entrezid':'x_id', 'proteinB_entrezid':'y_id', 'symbolA':'x_name', 'symbolB':'y_name'})
df_prot_prot['x_type'] = 'gene/protein'
df_prot_prot['x_source'] = 'NCBI'
df_prot_prot['y_type'] = 'gene/protein'
df_prot_prot['y_source'] = 'NCBI'
df_prot_prot['relation'] = 'protein_protein'
df_prot_prot['display_relation'] = 'ppi'
df_prot_prot = clean_edges(df_prot_prot)
df_prot_prot.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,protein_protein,ppi,9796,gene/protein,PHYHIP,NCBI,56992,gene/protein,KIF15,NCBI


### Drug protein interactions (DrugBank)

In [9]:
df_prot_drug = pd.merge(df_drugbank, df_prot_names, 'left', left_on='NCBIGeneID', right_on='ncbi_id')

df_prot_drug = df_prot_drug.rename(columns={'DrugBank':'x_id', 'NCBIGeneID':'y_id', 'DrugBankName':'x_name', 'symbol':'y_name'})
df_prot_drug['x_type'] = 'drug'
df_prot_drug['x_source'] = 'DrugBank'
df_prot_drug['y_type'] = 'gene/protein'
df_prot_drug['y_source'] = 'NCBI'
df_prot_drug['display_relation'] = df_prot_drug.get('relation').values
df_prot_drug['relation'] = 'drug_protein' # combine targets, carrier, enzyme and transporter
df_prot_drug = clean_edges(df_prot_drug)
df_prot_drug.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,drug_protein,carrier,DB09130,drug,Copper,DrugBank,2157,gene/protein,F8,NCBI


### Drug disease interactions (DiseaseCentral) –– PENDING

In [10]:
df_drug_dis = pd.merge(df_drug_central, db_vocab, 'left', left_on='cas_reg_no', right_on='CAS')
df_drug_dis = pd.merge(df_drug_dis, df_umls_mondo, 'inner', left_on='umls_cui', right_on='umls_id')
df_drug_dis = pd.merge(df_drug_dis, df_mondo_terms, 'left', left_on='mondo_id', right_on='id')

df_drug_dis = df_drug_dis.get(['relationship_name','DrugBank ID', 'Common name', 'mondo_id', 'name'])
df_drug_dis = df_drug_dis.dropna().drop_duplicates()

df_drug_dis = df_drug_dis.rename(columns={'DrugBank ID':'x_id', 'mondo_id':'y_id', 'Common name':'x_name', 'name':'y_name', 'relationship_name':'relation'})
df_drug_dis['x_type'] = 'drug'
df_drug_dis['x_source'] = 'DrugBank'
df_drug_dis['y_type'] = 'disease'
df_drug_dis['y_source'] = 'MONDO'
df_drug_dis['display_relation'] = df_drug_dis.get('relation').values
df_drug_dis = clean_edges(df_drug_dis)
df_drug_dis.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,contraindication,contraindication,DB05271,drug,Rotigotine,DrugBank,5044,disease,hypertensive disorder,MONDO


### Disease protein interactions (DisGenNet)

In [11]:
df_prot_dis1 = df_disgenet.query('diseaseType=="disease"')

df_prot_dis1 = pd.merge(df_prot_dis1, df_umls_mondo, 'inner', left_on='diseaseId', right_on='umls_id')
df_prot_dis1 = pd.merge(df_prot_dis1, df_mondo_terms, 'left', left_on='mondo_id', right_on='id')

df_prot_dis1 = df_prot_dis1.rename(columns={'geneId':'x_id', 'geneSymbol':'x_name', 'mondo_id':'y_id', 'name':'y_name'})
df_prot_dis1['x_type'] = 'gene/protein'
df_prot_dis1['x_source'] = 'NCBI'
df_prot_dis1['y_type'] = 'disease'
df_prot_dis1['y_source'] = 'MONDO'
df_prot_dis1['relation'] = 'disease_protein'
df_prot_dis1['display_relation'] = 'associated with'
df_prot_dis1 = clean_edges(df_prot_dis1)
df_prot_dis1.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,disease_protein,associated with,1,gene/protein,A1BG,NCBI,5090,disease,schizophrenia (disease),MONDO


### Disease disease interations (MONDO)

In [12]:
df_dis_dis1 = pd.merge(df_mondo_parents, df_mondo_terms, 'left', left_on='parent', right_on='id')
df_dis_dis1 = df_dis_dis1.rename(columns={'parent':'x_id', 'name':'x_name'})
df_dis_dis1 = pd.merge(df_dis_dis1, df_mondo_terms, 'left', left_on='child', right_on='id')
df_dis_dis1 = df_dis_dis1.rename(columns={'child':'y_id', 'name':'y_name'})
df_dis_dis1['x_type'] = 'disease'
df_dis_dis1['x_source'] = 'MONDO'
df_dis_dis1['y_type'] = 'disease'
df_dis_dis1['y_source'] = 'MONDO'
df_dis_dis1['relation'] = 'disease_disease'
df_dis_dis1['display_relation'] = 'parent-child'
df_dis_dis1 = clean_edges(df_dis_dis1)
df_dis_dis1.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,disease_disease,parent-child,2816,disease,adrenal cortex disease,MONDO,4,disease,adrenocortical insufficiency,MONDO


### Drug drug interactions (DrugBank)

In [13]:
df_drug_drug = pd.merge(df_ddi, db_vocab, 'inner', left_on='drug1', right_on='DrugBank ID')
df_drug_drug = df_drug_drug.rename(columns={'drug1':'x_id', 'Common name':'x_name'})
df_drug_drug = pd.merge(df_drug_drug.astype({'drug2':'str'}), db_vocab, 'inner', left_on='drug2', right_on='DrugBank ID')
df_drug_drug = df_drug_drug.rename(columns={'drug2':'y_id', 'Common name':'y_name'})
df_drug_drug['x_type'] = 'drug'
df_drug_drug['x_source'] = 'DrugBank'
df_drug_drug['y_type'] = 'drug'
df_drug_drug['y_source'] = 'DrugBank'
df_drug_drug['relation'] = 'drug_drug'
df_drug_drug['display_relation'] = 'synergistic interaction'
df_drug_drug = clean_edges(df_drug_drug)
df_drug_drug.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,drug_drug,synergistic interaction,DB00001,drug,Lepirudin,DrugBank,DB06605,drug,Apixaban,DrugBank


## Effect/Phenotype

### Effect protein interactions (DisGenNet)

In [14]:
df_prot_phe = df_disgenet.query('diseaseType=="phenotype"')

df_prot_phe = pd.merge(df_prot_phe, df_hp_xref, 'inner', left_on='diseaseId', right_on='ontology_id')
df_prot_phe = pd.merge(df_prot_phe, df_hp_terms, 'left', left_on='hp_id', right_on='id')

df_prot_phe = df_prot_phe.rename(columns={'geneId':'x_id', 'geneSymbol':'x_name', 'hp_id':'y_id', 'name':'y_name'})
df_prot_phe['x_type'] = 'gene/protein'
df_prot_phe['x_source'] = 'NCBI'
df_prot_phe['y_type'] = 'effect/phenotype'
df_prot_phe['y_source'] = 'HPO'
df_prot_phe['relation'] = 'phenotype_protein'
df_prot_phe['display_relation'] = 'associated with'
df_prot_phe = clean_edges(df_prot_phe)
df_prot_phe.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,phenotype_protein,associated with,1,gene/protein,A1BG,NCBI,2240,effect/phenotype,Hepatomegaly,HPO


### Effect effect interactions (HPO)

In [15]:
df_phe_phe = pd.merge(df_hp_parents, df_hp_terms, 'left', left_on='parent', right_on='id')
df_phe_phe = df_phe_phe.rename(columns={'name':'parent_name'})
df_phe_phe = pd.merge(df_phe_phe, df_hp_terms, 'left', left_on='child', right_on='id')
df_phe_phe = df_phe_phe.rename(columns={'name':'child_name'})
df_phe_phe = df_phe_phe.get(['parent', 'child', 'parent_name', 'child_name'])

df_phe_phe = df_phe_phe.rename(columns={'parent':'x_id', 'child':'y_id', 'parent_name':'x_name', 'child_name':'y_name'})
df_phe_phe['x_type'] = 'effect/phenotype'
df_phe_phe['x_source'] = 'HPO'
df_phe_phe['y_type'] = 'effect/phenotype'
df_phe_phe['y_source'] = 'HPO'
df_phe_phe['relation'] = 'phenotype_phenotype'
df_phe_phe['display_relation'] = 'parent-child'
df_phe_phe = clean_edges(df_phe_phe)
df_phe_phe.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,phenotype_phenotype,parent-child,1507,effect/phenotype,Growth abnormality,HPO,2,effect/phenotype,Abnormality of body height,HPO


### Disease effect interactions (HPO-A)

In [16]:
df_dis_phe_pos1 = pd.merge(df_hpoa_pos, df_mondo_xref, 'left', left_on='disease_ontology_id', right_on='ontology_id')
df_dis_phe_pos1 = df_dis_phe_pos1.query('(disease_ontology==ontology) or (disease_ontology=="ORPHA" and ontology=="Orphanet")')
df_dis_phe_pos1 = pd.merge(df_dis_phe_pos1, df_hp_terms, 'left', left_on='hp_id', right_on='id').rename(columns={'name':'hp_name'})
df_dis_phe_pos1 = pd.merge(df_dis_phe_pos1, df_mondo_terms, 'left', left_on='mondo_id', right_on='id').rename(columns={'name':'mondo_name'})
df_dis_phe_pos1 = df_dis_phe_pos1.get(['mondo_id', 'mondo_name', 'hp_id', 'hp_name'])
df_dis_phe_pos1 = df_dis_phe_pos1.rename(columns={'mondo_id':'x_id', 'mondo_name':'x_name', 'hp_id': 'y_id', 'hp_name':'y_name'})
df_dis_phe_pos1.loc[:, 'x_source'] = 'MONDO'
df_dis_phe_pos1.loc[:, 'x_type'] = 'disease'
df_dis_phe_pos1.loc[:, 'y_source'] = 'HPO'
df_dis_phe_pos1.loc[:, 'y_type'] = 'effect/phenotype'
df_dis_phe_pos1.loc[:, 'relation'] = 'disease_phenotype_positive'
df_dis_phe_pos1.loc[:, 'display_relation'] = 'phenotype present'
df_dis_phe_pos1 = clean_edges(df_dis_phe_pos1)
df_dis_phe_pos1.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,disease_phenotype_positive,phenotype present,10761,disease,retinitis pigmentosa Y-linked,MONDO,510,effect/phenotype,Rod-cone dystrophy,HPO


In [17]:
df_dis_phe_neg = pd.merge(df_hpoa_neg, df_mondo_xref, 'left', left_on='disease_ontology_id', right_on='ontology_id')
df_dis_phe_neg = df_dis_phe_neg.query('(disease_ontology==ontology) or (disease_ontology=="ORPHA" and ontology=="Orphanet")')
df_dis_phe_neg = pd.merge(df_dis_phe_neg, df_hp_terms, 'left', left_on='hp_id', right_on='id').rename(columns={'name':'hp_name'})
df_dis_phe_neg = pd.merge(df_dis_phe_neg, df_mondo_terms, 'left', left_on='mondo_id', right_on='id').rename(columns={'name':'mondo_name'})
df_dis_phe_neg = df_dis_phe_neg.get(['mondo_id', 'mondo_name', 'hp_id', 'hp_name'])
df_dis_phe_neg = df_dis_phe_neg.rename(columns={'mondo_id':'x_id', 'mondo_name':'x_name', 'hp_id': 'y_id', 'hp_name':'y_name'})
df_dis_phe_neg.loc[:, 'x_source'] = 'MONDO'
df_dis_phe_neg.loc[:, 'x_type'] = 'disease'
df_dis_phe_neg.loc[:, 'y_source'] = 'HPO'
df_dis_phe_neg.loc[:, 'y_type'] = 'effect/phenotype'
df_dis_phe_neg.loc[:, 'relation'] = 'disease_phenotype_negative'
df_dis_phe_neg.loc[:, 'display_relation'] = 'phenotype absent'
df_dis_phe_neg = clean_edges(df_dis_phe_neg)
df_dis_phe_neg.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,disease_phenotype_negative,phenotype absent,13924,disease,osteogenesis imperfecta type 13,MONDO,365,effect/phenotype,Hearing impairment,HPO


### Remove phenotype nodes if they exist in MONDO

In [18]:
# phenotypes that are actually diseases in MONDO
# avoid duplicate nodes and convert them to disease relations
mondo_xref_hp_subset = df_mondo_xref.query('ontology=="HP"')
mondo_xref_hp_subset.loc[:, 'ontology_id'] = mondo_xref_hp_subset.get('ontology_id').astype(int).astype(str).values
hp_ids_r_mondo = pd.merge(mondo_xref_hp_subset, df_hp_terms, 'inner', left_on='ontology_id', right_on='id').get('ontology_id').values

def replace_hp_data_w_mondo(df, hp_id_col, drop_cols=[]): 
    cols = list(df.columns.values)
    cols.extend(['mondo_id', 'mondo_name'])
    [cols.remove(x) for x in drop_cols]
    df = pd.merge(df, mondo_xref_hp_subset, 'left', left_on=hp_id_col, right_on='ontology_id')
    df = pd.merge(df, df_mondo_terms, 'left', left_on='mondo_id', right_on='id')
    df = df.rename(columns={'name':'mondo_name'}).get(cols)
    return df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [19]:
# HANDLE EFFECT EFFECT 

# PHE-PHE should be PHE-DIS if ONE PHE is in MONDO

df_dis_phe_x = df_phe_phe.query('x_id in @hp_ids_r_mondo and y_id not in @hp_ids_r_mondo')
df_dis_phe_x = replace_hp_data_w_mondo(df=df_dis_phe_x, hp_id_col='x_id', 
                                       drop_cols=[c for c in df_dis_phe_x.columns.values if 'x_' in c])
df_dis_phe_x = df_dis_phe_x.rename(columns={'mondo_id':'x_id', 'mondo_name':'x_name'})
df_dis_phe_x.loc[:, 'x_source'] = 'MONDO'
df_dis_phe_x.loc[:, 'x_type'] = 'disease'

df_dis_phe_y = df_phe_phe.query('y_id in @hp_ids_r_mondo and x_id not in @hp_ids_r_mondo')
df_dis_phe_y = replace_hp_data_w_mondo(df=df_dis_phe_y, hp_id_col='y_id',
                                       drop_cols=[c for c in df_dis_phe_y.columns.values if 'y_' in c])
df_dis_phe_y = df_dis_phe_y.rename(columns={'mondo_id':'y_id', 'mondo_name':'y_name'})
df_dis_phe_y.loc[:, 'y_source'] = 'MONDO'
df_dis_phe_y.loc[:, 'y_type'] = 'disease'

df_dis_phe_pos2 = pd.concat([df_dis_phe_x, df_dis_phe_y], ignore_index=True)
df_dis_phe_pos2['relation'] = 'disease_phenotype_positive'
df_dis_phe_pos2.loc[:, 'display_relation'] = 'phenotype present'
df_dis_phe_pos2 = clean_edges(df_dis_phe_pos2)


# PHE-PHE should be DIS-DIS if BOTH PHE are in MONDO

df_dis_dis2 = df_phe_phe.query('x_id in @hp_ids_r_mondo and y_id in @hp_ids_r_mondo')
df_dis_dis2 = replace_hp_data_w_mondo(df=df_dis_dis2, 
                                       hp_id_col='x_id', 
                                       drop_cols=[c for c in df_dis_dis2.columns.values if 'x_' in c])
df_dis_dis2 = df_dis_dis2.rename(columns={'mondo_id':'x_id', 'mondo_name':'x_name'})
df_dis_dis2 = replace_hp_data_w_mondo(df=df_dis_dis2, 
                                       hp_id_col='y_id', 
                                       drop_cols=[c for c in df_dis_dis2.columns.values if 'y_' in c])
df_dis_dis2 = df_dis_dis2.rename(columns={'mondo_id':'y_id', 'mondo_name':'y_name'})
df_dis_dis2.loc[:, 'x_source'] = 'MONDO'
df_dis_dis2.loc[:, 'x_type'] = 'disease'
df_dis_dis2.loc[:, 'y_source'] = 'MONDO'
df_dis_dis2.loc[:, 'y_type'] = 'disease'
df_dis_dis2.loc[:,'relation'] = 'disease_disease'
df_dis_dis2.loc[:,'display_relation'] = 'parent-child'
df_dis_dis2 = clean_edges(df_dis_dis2)

# drop relations in PHE PHE if either PHE is in MONDO
# phenotype phenotype should have no disease nodes
df_phe_phe = df_phe_phe.query('x_id not in @hp_ids_r_mondo and y_id not in @hp_ids_r_mondo')

In [20]:
# HANDLE PROTEIN EFFECT 

# if phenotype in MONDO make it protein-disease relations 
df_prot_dis2= df_prot_phe.query('y_id in @hp_ids_r_mondo')
df_prot_dis2 = replace_hp_data_w_mondo(df=df_prot_dis2, hp_id_col='y_id',
                                       drop_cols=[c for c in df_prot_dis2.columns.values if 'y_' in c])
df_prot_dis2 = df_prot_dis2.rename(columns={'mondo_id':'y_id', 'mondo_name':'y_name'})
df_prot_dis2.loc[:, 'y_source'] = 'MONDO'
df_prot_dis2.loc[:, 'y_type'] = 'disease'
df_prot_dis2.loc[:, 'relation'] = 'disease_protein'
df_prot_dis2.loc[:, 'display_relation'] = 'associated with'
df_prot_dis2 = clean_edges(df_prot_dis2)

# remove from protein-phenotype if phenotype in MONDO 
df_prot_phe = df_prot_phe.query('y_id not in @hp_ids_r_mondo')

In [21]:
# HANDLE DISEASE EFFECT 

# remove from protein-phenotype if phenotype in MONDO 
df_dis_phe_pos1 = df_dis_phe_pos1.query('y_id not in @hp_ids_r_mondo')

# NEGATIVE disease_phenotype should just be dropped because negative disease_disease doesn't make sense 
df_dis_phe_neg = df_dis_phe_neg.query('y_id not in @hp_ids_r_mondo')

In [22]:
# COMBINE DATAFRAMES 

df_prot_dis = pd.concat([df_prot_dis1, df_prot_dis2], ignore_index=True).drop_duplicates()
df_dis_dis = pd.concat([df_dis_dis1, df_dis_dis2], ignore_index=True).drop_duplicates()
df_dis_phe_pos = pd.concat([df_dis_phe_pos1, df_dis_phe_pos2], ignore_index=True).drop_duplicates()

### Drug effect interactions (SIDER)

In [23]:
df_drug_effect = pd.merge(df_sider, df_db_atc, 'left', left_on='atc', right_on='atc_code')
df_drug_effect = df_drug_effect.rename(columns={'parent_key':'DrugBank', 'UMLS_from_meddra':'UMLS'})
df_drug_effect = pd.merge(df_drug_effect, db_vocab, 'left', left_on='DrugBank', right_on='DrugBank ID')
df_drug_effect = pd.merge(df_drug_effect, df_hp_xref, 'left', left_on='UMLS' , right_on='ontology_id')
df_drug_effect = pd.merge(df_drug_effect, df_hp_terms, 'left', left_on='hp_id' , right_on='id')
df_drug_effect = df_drug_effect.get(['DrugBank ID','Common name','hp_id', 'name'])
df_drug_effect = df_drug_effect.dropna().drop_duplicates()

df_drug_effect = df_drug_effect.rename(columns={'DrugBank ID':'x_id', 'Common name':'x_name', 'hp_id':'y_id', 'name':'y_name'})
df_drug_effect['x_type'] = 'drug'
df_drug_effect['x_source'] = 'DrugBank'
df_drug_effect['y_type'] = 'effect/phenotype'
df_drug_effect['y_source'] = 'HPO'
df_drug_effect['relation'] = 'drug_effect'
df_drug_effect['display_relation'] = 'side effect'
df_drug_effect = df_drug_effect.query('y_id not in @hp_ids_r_mondo')
df_drug_effect = clean_edges(df_drug_effect)
df_drug_effect.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,drug_effect,side effect,DB00583,drug,Levocarnitine,DrugBank,2027,effect/phenotype,Abdominal pain,HPO


## GO Terms

### Go terms interactions (GO)

In [24]:
bp = df_go_terms.query('go_term_type=="biological_process"')
df_bp_bp = pd.merge(df_go_edges, bp, 'inner', left_on='x', right_on='go_term_id')
df_bp_bp = df_bp_bp.rename(columns={'go_term_id':'x_id','go_term_name':'x_name','go_term_type':'x_type'})
df_bp_bp = pd.merge(df_bp_bp, bp, 'inner', left_on='y', right_on='go_term_id')
df_bp_bp = df_bp_bp.rename(columns={'go_term_id':'y_id','go_term_name':'y_name','go_term_type':'y_type'})
df_bp_bp['relation'] = 'bioprocess_bioprocess'
df_bp_bp['x_source'] = 'GO'
df_bp_bp['y_source'] = 'GO'
df_bp_bp['display_relation'] = 'parent-child'
df_bp_bp = clean_edges(df_bp_bp)
df_bp_bp.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,bioprocess_bioprocess,parent-child,51581,biological_process,negative regulation of neurotransmitter uptake,GO,51612,biological_process,negative regulation of serotonin uptake,GO


In [25]:
mf = df_go_terms.query('go_term_type=="molecular_function"')
df_mf_mf = pd.merge(df_go_edges, mf, 'inner', left_on='x', right_on='go_term_id')
df_mf_mf = df_mf_mf.rename(columns={'go_term_id':'x_id','go_term_name':'x_name','go_term_type':'x_type'})
df_mf_mf = pd.merge(df_mf_mf, mf, 'inner', left_on='y', right_on='go_term_id')
df_mf_mf = df_mf_mf.rename(columns={'go_term_id':'y_id','go_term_name':'y_name','go_term_type':'y_type'})
df_mf_mf['relation'] = 'molfunc_molfunc'
df_mf_mf['display_relation'] = 'parent-child'
df_mf_mf['x_source'] = 'GO'
df_mf_mf['y_source'] = 'GO'
df_mf_mf = clean_edges(df_mf_mf)
df_mf_mf.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,molfunc_molfunc,parent-child,8168,molecular_function,methyltransferase activity,GO,102130,molecular_function,malonyl-CoA methyltransferase activity,GO


In [26]:
cc = df_go_terms.query('go_term_type=="cellular_component"')
df_cc_cc = pd.merge(df_go_edges, cc, 'inner', left_on='x', right_on='go_term_id')
df_cc_cc = df_cc_cc.rename(columns={'go_term_id':'x_id','go_term_name':'x_name','go_term_type':'x_type'})
df_cc_cc = pd.merge(df_cc_cc, cc, 'inner', left_on='y', right_on='go_term_id')
df_cc_cc = df_cc_cc.rename(columns={'go_term_id':'y_id','go_term_name':'y_name','go_term_type':'y_type'})
df_cc_cc['relation'] = 'cellcomp_cellcomp'
df_cc_cc['display_relation'] = 'parent-child'
df_cc_cc['x_source'] = 'GO'
df_cc_cc['y_source'] = 'GO'
df_cc_cc = clean_edges(df_cc_cc)
df_cc_cc.head(1)


Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,cellcomp_cellcomp,parent-child,110165,cellular_component,cellular anatomical entity,GO,90553,cellular_component,unicellular trichome tip,GO


### Go protein interactions (Gene2GO)

In [27]:
df_prot_path = pd.merge(df_gene2go, df_go_terms, 'inner', 'go_term_id').rename(columns={'go_term_type_x':'go_term_type'})
df_prot_path = pd.merge(df_prot_path, df_prot_names, 'left', left_on='ncbi_gene_id', right_on='ncbi_id')
df_prot_path = df_prot_path.rename(columns={'ncbi_gene_id':'x_id', 'symbol':'x_name', 
                             'go_term_id':'y_id','go_term_name':'y_name', 'go_term_type':'y_type'})
df_prot_path['x_type'] = 'gene/protein'
df_prot_path['x_source'] = 'NCBI'
df_prot_path['y_source'] = 'GO'
df_prot_path = df_prot_path.get(['x_id','x_type', 'x_name', 'x_source','y_id','y_type', 'y_name', 'y_source'])

In [28]:
df_prot_mf = df_prot_path.query('y_type=="molecular_function"').copy()
df_prot_mf['relation'] = 'molfunc_protein'
df_prot_mf['display_relation'] = 'interacts with'
df_prot_mf = clean_edges(df_prot_mf)
df_prot_mf.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,molfunc_protein,interacts with,2,gene/protein,A2M,NCBI,19966,molecular_function,interleukin-1 binding,GO


In [29]:
df_prot_cc = df_prot_path.query('y_type=="cellular_component"').copy()
df_prot_cc['relation'] = 'cellcomp_protein'
df_prot_cc['display_relation'] = 'interacts with'
df_prot_cc = clean_edges(df_prot_cc)
df_prot_cc.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
214459,cellcomp_protein,interacts with,1,gene/protein,A1BG,NCBI,1904813,cellular_component,ficolin-1-rich granule lumen,GO


In [30]:
df_prot_bp = df_prot_path.query('y_type=="biological_process"').copy()
df_prot_bp['relation'] = 'bioprocess_protein'
df_prot_bp['display_relation'] = 'interacts with'
df_prot_bp = clean_edges(df_prot_bp)
df_prot_bp.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
69588,bioprocess_protein,interacts with,1,gene/protein,A1BG,NCBI,43312,biological_process,neutrophil degranulation,GO


## Exposure

### Exposure protein interactions (CTD)

In [31]:
df_exp_prot = df_exposures.get(['exposurestressorname', 'exposurestressorid','exposuremarker', 'exposuremarkerid'])
df_exp_prot = df_exp_prot.loc[df_exp_prot.get(['exposuremarkerid']).dropna().index, :]

gene_row_index = []
for idx, data in df_exp_prot.iterrows():
    if data.exposuremarkerid.isnumeric(): 
        gene_row_index.append(idx)

df_exp_prot = df_exp_prot.loc[gene_row_index, :].astype({'exposuremarkerid': 'int'}).astype({'exposuremarkerid': 'str'})
df_exp_prot = pd.merge(df_exp_prot, df_prot_names, 'left', left_on='exposuremarkerid', right_on='ncbi_id')

df_exp_prot = df_exp_prot.rename(columns={'exposurestressorid':'x_id', 'exposurestressorname':'x_name', 'ncbi_id':'y_id', 'symbol':'y_name'})
df_exp_prot['x_type'] = 'exposure'
df_exp_prot['x_source'] = 'CTD'
df_exp_prot['y_type'] = 'gene/protein'
df_exp_prot['y_source'] = 'NCBI'
df_exp_prot['relation'] = 'exposure_protein'
df_exp_prot['display_relation'] = 'interacts with'
df_exp_prot = clean_edges(df_exp_prot)
df_exp_prot.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,exposure_protein,interacts with,C092102,exposure,1-hydroxyphenanthrene,CTD,1401,gene/protein,CRP,NCBI


### Exposure disease interactions (CTD)

In [32]:
df_exp_dis = df_exposures.get(['exposurestressorname', 'exposurestressorid','diseasename', 'diseaseid'])
df_exp_dis = df_exp_dis.loc[df_exp_dis.get(['diseaseid']).dropna().index, :]
df_exp_dis = pd.merge(df_exp_dis, df_mondo_xref.query('ontology=="MESH"'), 'left', left_on='diseaseid', right_on='ontology_id')
df_exp_dis = pd.merge(df_exp_dis, df_mondo_terms, 'left', left_on='mondo_id', right_on= 'id')

df_exp_dis = df_exp_dis.rename(columns={'exposurestressorid':'x_id', 'exposurestressorname':'x_name', 'mondo_id':'y_id', 'name':'y_name'})
df_exp_dis['x_type'] = 'exposure'
df_exp_dis['x_source'] = 'CTD'
df_exp_dis['y_type'] = 'disease'
df_exp_dis['y_source'] = 'MONDO'
df_exp_dis['relation'] = 'exposure_disease'
df_exp_dis['display_relation'] = 'linked to'
df_exp_dis = clean_edges(df_exp_dis)
df_exp_dis.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,exposure_disease,linked to,C024566,exposure,"1,1,1-trichloroethane",CTD,4976,disease,amyotrophic lateral sclerosis,MONDO


### Exposure exposure interactions (CTD)

In [33]:
exposures = np.unique(df_exposures.get('exposurestressorid').values)
df_exp_exp = df_exposures.query('exposuremarkerid in @exposures')

df_exp_exp = df_exp_exp.get(['exposurestressorname', 'exposurestressorid','exposuremarker', 'exposuremarkerid'])
df_exp_exp = df_exp_exp.loc[df_exp_exp.get(['exposuremarkerid']).dropna().index, :]
df_exp_exp = df_exp_exp.drop_duplicates()

df_exp_exp = df_exp_exp.rename(columns={'exposurestressorid':'x_id', 'exposurestressorname':'x_name', 'exposuremarker':'y_name', 'exposuremarkerid':'y_id'})
df_exp_exp['x_type'] = 'exposure'
df_exp_exp['x_source'] = 'CTD'
df_exp_exp['y_type'] = 'exposure'
df_exp_exp['y_source'] = 'CTD'
df_exp_exp['relation'] = 'exposure_exposure'
df_exp_exp['display_relation'] = 'parent-child'
df_exp_exp = clean_edges(df_exp_exp)

### Exposure pathway interactions (CTD)

In [34]:
# phenotypes are actually pathways 

df_exp_path = df_exposures.get(['exposurestressorname', 'exposurestressorid','phenotypename', 'phenotypeid'])
df_exp_path = df_exp_path.loc[df_exp_path.get(['phenotypeid']).dropna().index, :]
df_exp_path.loc[:, 'phenotypeid'] = [str(int(x.split(':')[1])) for x in df_exp_path.get(['phenotypeid']).values.reshape(-1)]
df_exp_path = df_exp_path.drop_duplicates()
df_exp_path = pd.merge(df_exp_path, df_go_terms, 'inner', left_on='phenotypeid', right_on='go_term_id')
df_exp_path = df_exp_path.rename(columns={'exposurestressorid':'x_id', 'exposurestressorname':'x_name', 
                                          'go_term_id':'y_id', 'go_term_name':'y_name', 'go_term_type':'y_type'})
df_exp_path['x_type'] = 'exposure'
df_exp_path['x_source'] = 'CTD'
df_exp_path['y_source'] = 'GO'

In [35]:
df_exp_bp = df_exp_path.query('y_type=="biological_process"').copy()
df_exp_bp['relation'] = 'exposure_bioprocess'
df_exp_bp['display_relation'] = 'interacts with'
df_exp_bp = clean_edges(df_exp_bp)
df_exp_bp.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,exposure_bioprocess,interacts with,C046839,exposure,"1,2,3,4,6,7,8-heptachlorodibenzodioxin",CTD,8217,biological_process,regulation of blood pressure,GO


In [36]:
df_exp_mf = df_exp_path.query('y_type=="molecular_function"').copy()
df_exp_mf['relation'] = 'exposure_molfunc'
df_exp_mf['display_relation'] = 'interacts with'
df_exp_mf = clean_edges(df_exp_mf)
df_exp_mf.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
527,exposure_molfunc,interacts with,C014024,exposure,"2,4,5,2',4',5'-hexachlorobiphenyl",CTD,19766,molecular_function,IgA receptor activity,GO


In [37]:
df_exp_cc = df_exp_path.query('y_type=="cellular_component"').copy()
df_exp_cc['relation'] = 'exposure_cellcomp'
df_exp_cc['display_relation'] = 'interacts with'
df_exp_cc = clean_edges(df_exp_cc)
df_exp_cc.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
833,exposure_cellcomp,interacts with,D000393,exposure,Air Pollutants,CTD,71743,cellular_component,"IgE immunoglobulin complex, circulating",GO


## Anatomy

### Anatomy anatomy interactions (UBERON) 

In [38]:
df_ana_ana = pd.merge(df_uberon_is_a, df_uberon_terms, 'left', left_on='id', right_on='id')
df_ana_ana = df_ana_ana.rename(columns={'id':'x_id', 'name':'x_name'})
df_ana_ana = pd.merge(df_ana_ana, df_uberon_terms, 'left', left_on='is_a', right_on='id')
df_ana_ana = df_ana_ana.rename(columns={'id':'y_id', 'name':'y_name'})
df_ana_ana['x_type'] = 'anatomy'
df_ana_ana['x_source'] = 'UBERON'
df_ana_ana['y_type'] = 'anatomy'
df_ana_ana['y_source'] = 'UBERON'
df_ana_ana['relation'] = 'anatomy_anatomy'
df_ana_ana['display_relation'] = 'parent-child'
df_ana_ana = clean_edges(df_ana_ana)
df_ana_ana.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,anatomy_anatomy,parent-child,2,anatomy,uterine cervix,UBERON,5156,anatomy,reproductive structure,UBERON


### Anatomy Protein (BGEE)

In [39]:
df_bgee = pd.merge(df_bgee, df_prot_names, 'inner', left_on='gene_name', right_on='symbol')
df_bgee = df_bgee.rename(columns={'ncbi_id':'x_id', 'symbol':'x_name', 
                                  'anatomy_id':'y_id', 'anatomy_name':'y_name'})
df_bgee['x_source'] = 'NCBI'
df_bgee['x_type'] = 'gene/protein'
df_bgee['y_source'] = 'UBERON'
df_bgee['y_type'] = 'anatomy'

In [40]:
df_ana_prot_pos = df_bgee.query('expression=="present"').copy()
df_ana_prot_pos['relation'] = 'anatomy_protein_present'
df_ana_prot_pos['display_relation'] = 'expression present'
df_ana_prot_pos = clean_edges(df_ana_prot_pos)
df_ana_prot_pos.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,anatomy_protein_present,expression present,7105,gene/protein,TSPAN6,NCBI,2,anatomy,uterine cervix,UBERON


In [41]:
df_ana_prot_neg = df_bgee.query('expression=="absent"').copy()
df_ana_prot_neg['relation'] = 'anatomy_protein_absent'
df_ana_prot_neg['display_relation'] = 'expression absent'
df_ana_prot_neg = clean_edges(df_ana_prot_neg)
df_ana_prot_neg.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
507,anatomy_protein_absent,expression absent,2268,gene/protein,FGR,NCBI,1476,anatomy,deltoid,UBERON


## Pathways

In [42]:
df_path_path = pd.merge(df_reactome_rels, df_reactome_terms, 'inner', left_on='reactome_id_1', right_on='reactome_id')
df_path_path = df_path_path.rename(columns={'reactome_id': 'x_id', 'reactome_name':'x_name'})
df_path_path = pd.merge(df_path_path, df_reactome_terms, 'inner', left_on='reactome_id_2', right_on='reactome_id')
df_path_path = df_path_path.rename(columns={'reactome_id': 'y_id', 'reactome_name':'y_name'})

df_path_path['x_source'] = 'REACTOME'
df_path_path['x_type'] = 'pathway'
df_path_path['y_source'] = 'REACTOME'
df_path_path['y_type'] = 'pathway'
df_path_path['relation'] = 'pathway_pathway'
df_path_path['display_relation'] = 'parent-child'
df_path_path = clean_edges(df_path_path)
df_path_path.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,pathway_pathway,parent-child,R-HSA-109581,pathway,Apoptosis,REACTOME,R-HSA-109606,pathway,Intrinsic Pathway for Apoptosis,REACTOME


### Pathway protein interactions

In [43]:
df_path_prot = pd.merge(df_reactome_ncbi, df_prot_names, 'inner', 'ncbi_id')

df_path_prot = df_path_prot.rename(columns={'ncbi_id': 'x_id', 'symbol':'x_name', 
                                            'reactome_id': 'y_id', 'reactome_name':'y_name'})
df_path_prot['x_source'] = 'NCBI'
df_path_prot['x_type'] = 'gene/protein'
df_path_prot['y_source'] = 'REACTOME'
df_path_prot['y_type'] = 'pathway'
df_path_prot['relation'] = 'pathway_protein'
df_path_prot['display_relation'] = 'interacts with'
df_path_prot = clean_edges(df_path_prot)
df_path_prot.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,pathway_protein,interacts with,1,gene/protein,A1BG,NCBI,R-HSA-114608,pathway,Platelet degranulation,REACTOME


## Variant gene interactions

In [44]:
df_variant_gene_hgnc = pd.merge(df_variant_gene[df_variant_gene["gene_source"] == "HGNC"], df_gene_mapping, 'left', left_on='gene_id', right_on="hgnc_id")
assert len(df_variant_gene_hgnc[df_variant_gene_hgnc["hgnc_id"].isna()]) == 0
df_variant_gene_hgnc = df_variant_gene_hgnc.get(["variant_id", "variant_label", "hgnc_id", "relation_label"]).drop_duplicates()
df_variant_gene_hgnc.head(1)

Unnamed: 0,variant_id,variant_label,hgnc_id,relation_label
0,OMIM:148040.0012,"KRT5, SER181PRO",6442,is_allele_of


In [45]:
df_gene_mapping_ncbi = df_gene_mapping[~df_gene_mapping["ncbi_id"].isna()].astype({"ncbi_id":int}).astype({"ncbi_id":str})
df_variant_gene_ncbi = pd.merge(df_variant_gene[df_variant_gene["gene_source"] == "NCBIGene"], df_gene_mapping_ncbi, 'inner', left_on='gene_id', right_on="ncbi_id")
df_variant_gene_ncbi = df_variant_gene_ncbi.get(["variant_id", "variant_label", "hgnc_id", "relation_label"]).drop_duplicates()
df_variant_gene_ncbi.head(1)

Unnamed: 0,variant_id,variant_label,hgnc_id,relation_label


In [46]:
df_gene_mapping_omim = df_gene_mapping[~df_gene_mapping["omim_id"].isna()].astype({"omim_id":int}).astype({"omim_id":str})
df_variant_gene_omim = pd.merge(df_variant_gene[df_variant_gene["gene_source"] == "OMIM"], df_gene_mapping_omim, 'inner', left_on='gene_id', right_on="omim_id")
df_variant_gene_omim = df_variant_gene_omim[~df_variant_gene_omim["ncbi_id"].isna()].astype({"ncbi_id":int}).astype({"ncbi_id":str})
df_variant_gene_omim = df_variant_gene_omim.get(["variant_id", "variant_label", "hgnc_id", "relation_label"]).drop_duplicates()
df_variant_gene_omim.head(1)

Unnamed: 0,variant_id,variant_label,hgnc_id,relation_label


In [47]:
df_gene_mapping_ensg = df_gene_mapping[~df_gene_mapping["ens_id"].isna()]
df_variant_gene_ensg = pd.merge(df_variant_gene[df_variant_gene["gene_source"] == "ENSEMBL"], df_gene_mapping_ensg, 'inner', left_on='gene_id', right_on="ens_id")
df_variant_gene_ensg = df_variant_gene_ensg[~df_variant_gene_ensg["ncbi_id"].isna()].astype({"ncbi_id":int}).astype({"ncbi_id":str})
df_variant_gene_ensg = df_variant_gene_ensg.get(["variant_id", "variant_label", "hgnc_id", "relation_label"]).rename(columns={"gene_id": "gene_name"}).drop_duplicates()
df_variant_gene_ensg.head(1)

Unnamed: 0,variant_id,variant_label,hgnc_id,relation_label
0,dbSNP:rs2161300,rs2161300-T,56066,has_affected_feature


In [48]:
print("All HGNC genes from Monarch Initiative:", df_variant_gene[df_variant_gene["gene_source"] == "HGNC"].shape)
print("Mapped HGNC genes to HGNC:", df_variant_gene_hgnc.shape)

print("All NCBI genes from Monarch Initiative:", df_variant_gene[df_variant_gene["gene_source"] == "NCBIGene"].shape)
print("Mapped NCBI genes to HGNC:", df_variant_gene_ncbi.shape)

print("All OMIM genes from Monarch Initiative:", df_variant_gene[df_variant_gene["gene_source"] == "OMIM"].shape)
print("Mapped OMIM genes to HGNC:", df_variant_gene_omim.shape)

print("All ENS genes from Monarch Initiative:", df_variant_gene[df_variant_gene["gene_source"] == "ENSEMBL"].shape)
print("Mapped ENS genes to HGNC:", df_variant_gene_ensg.shape)

All HGNC genes from Monarch Initiative: (226413, 6)
Mapped HGNC genes to HGNC: (226413, 4)
All NCBI genes from Monarch Initiative: (15, 6)
Mapped NCBI genes to HGNC: (0, 4)
All OMIM genes from Monarch Initiative: (1, 6)
Mapped OMIM genes to HGNC: (0, 4)
All ENS genes from Monarch Initiative: (459, 6)
Mapped ENS genes to HGNC: (10, 4)


In [49]:
df_variant_gene_hgnc = pd.concat([df_variant_gene_hgnc, df_variant_gene_ncbi, df_variant_gene_omim, df_variant_gene_ensg], ignore_index=True)
df_variant_gene_hgnc = df_variant_gene_hgnc.drop_duplicates()
print("Final variant-gene associations:", df_variant_gene_hgnc.shape)

Final variant-gene associations: (226418, 4)


In [None]:

# Need to map NCBI to HGNC before compiling KG


## Variant phenotype associations

In [50]:

# Need to map EFO to UMLS (https://github.com/EBISPOT/efo/releases)





In [51]:
# Remove HPO that already have MONDO IDs

df_variant_phenotype_MONDO = df_variant_phenotype[df_variant_phenotype["phenotype_source"] == "HP"].query('phenotype_id in @hp_ids_r_mondo')
df_variant_phenotype_MONDO = replace_hp_data_w_mondo(df=df_variant_phenotype_MONDO, hp_id_col='phenotype_id', drop_cols=[c for c in df_variant_phenotype_MONDO.columns.values if 'phenotype_' in c])
df_variant_phenotype_MONDO = df_variant_phenotype_MONDO.rename(columns={'mondo_id':'phenotype_id', 'mondo_name':'phenotype_label'})
df_variant_phenotype_MONDO.loc[:, 'phenotype_source'] = 'MONDO'
df_variant_phenotype_MONDO

Unnamed: 0,variant_id,variant_label,relation_label,phenotype_id,phenotype_label,phenotype_source
0,OMIM:107310.0002,"SLC9A1, SER464PHE",has phenotype,5881,oligohydramnios (disease),MONDO
1,dbSNP:rs2342406,rs2342406-?,contributes to condition,1384,myopia (disease),MONDO
2,dbSNP:rs115689122,rs115689122-A,contributes to condition,1156,borderline personality disorder (disease),MONDO
3,dbSNP:rs77008212,rs77008212-A,contributes to condition,11284,astigmatism (disease),MONDO
4,dbSNP:rs4357117,rs4357117-T,contributes to condition,1673,diarrheal disease,MONDO
...,...,...,...,...,...,...
712,dbSNP:rs28613963,rs28613963-T,contributes to condition,1384,myopia (disease),MONDO
713,dbSNP:rs6420484,rs6420484-G,contributes to condition,1384,myopia (disease),MONDO
714,dbSNP:rs56075542,rs56075542-T,contributes to condition,1384,myopia (disease),MONDO
715,dbSNP:rs117949737,rs117949737-G,contributes to condition,11284,astigmatism (disease),MONDO


In [52]:
# Make sure all HPOs are mapped

df_variant_phenotype_HPO = df_variant_phenotype[df_variant_phenotype["phenotype_source"] == "HP"].query('phenotype_id not in @hp_ids_r_mondo')
assert pd.merge(df_variant_phenotype_HPO, df_hp_terms, 'left', left_on='phenotype_id', right_on='id').shape == pd.merge(df_variant_phenotype_HPO, df_hp_terms, 'inner', left_on='phenotype_id', right_on='id').shape
df_variant_phenotype_HPO.head(1)

Unnamed: 0,variant_id,variant_label,phenotype_id,phenotype_label,phenotype_source,relation_label
50506,dbSNP:rs2066844,rs2066844-?,155,Oral ulcer,HP,contributes to condition


## Variant disease associations

In [59]:
df_variant_disease["disease_source"].unique()

array(['MONDO', 'OMIM'], dtype=object)

In [58]:
# Map MONDO

mapped_left = pd.merge(df_variant_disease[df_variant_disease["disease_source"] == "MONDO"], df_mondo_terms, 'left', left_on='disease_id', right_on='id')
mapped_inner = pd.merge(df_variant_disease[df_variant_disease["disease_source"] == "MONDO"], df_mondo_terms, 'inner', left_on='disease_id', right_on='id')
print(mapped_left.shape, mapped_inner.shape)

not_mapped = set(list(mapped_left["disease_id"].unique())).difference(set(list(mapped_inner["disease_id"].unique())))
print("Not mapped (but will keep for now)", not_mapped)

(192271, 10) (191848, 10)
Not mapped (but will keep for now) {'600024', '600027', '700060', '700089', '700092', '100352', '700087', '30849', '100348', '700088', '700090', '30837'}


In [54]:
# Map OMIM

list(df_variant_disease[df_variant_disease["disease_source"] == "OMIM"]["disease_id"].unique())
# OMIM:616617 has obsolete MONDO IDs (MONDO:0014709, MONDO:0100263)
# OMIM:234580 is mapped to MONDO:0024544 according to Wikidata but no other reference available
# OMIM:184840 has no MONDO ID

# Verdict: Assuming that these OMIM IDs do not have MONDO IDs

['616617', '234580', '184840']

# Compiling knowledge graph

In [110]:
kg = pd.concat([df_prot_prot, df_prot_drug, df_drug_dis, df_drug_drug, df_prot_phe,
                df_phe_phe, df_dis_phe_neg, df_dis_phe_pos, df_prot_dis, df_dis_dis, 
                df_drug_effect, df_bp_bp, df_mf_mf, df_cc_cc, df_prot_mf, 
                df_prot_cc, df_prot_bp, df_exp_prot, df_exp_dis, df_exp_exp, 
                df_exp_bp, df_exp_mf, df_exp_cc, df_path_path, df_path_prot,
                df_ana_ana, df_ana_prot_pos, df_ana_prot_neg]) #28
kg = kg.drop_duplicates()
kg_rev = kg.copy().rename(columns={'x_id':'y_id','x_type':'y_type', 'x_name':'y_name', 'x_source':'y_source',
                            'y_id':'x_id','y_type':'x_type', 'y_name':'x_name', 'y_source':'x_source' }) #add reverse edges
kg = pd.concat([kg, kg_rev])
kg = kg.drop_duplicates()
kg = kg.dropna()
# remove self loops from edges 
kg = kg.query('not ((x_id == y_id) and (x_type == y_type) and (x_source == y_source) and (x_name == y_name))')
kg.head()

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,protein_protein,ppi,9796,gene/protein,PHYHIP,NCBI,56992,gene/protein,KIF15,NCBI
1,protein_protein,ppi,7918,gene/protein,GPANK1,NCBI,9240,gene/protein,PNMA1,NCBI
2,protein_protein,ppi,8233,gene/protein,ZRSR2,NCBI,23548,gene/protein,TTC33,NCBI
3,protein_protein,ppi,4899,gene/protein,NRF1,NCBI,11253,gene/protein,MAN1B1,NCBI
4,protein_protein,ppi,5297,gene/protein,PI4KA,NCBI,8601,gene/protein,RGS20,NCBI


In [42]:
kg.to_csv(save_path+'auxillary/kg_raw.csv', index=False)

# Get giant component

In [43]:
kg = pd.read_csv(save_path+'auxillary/kg_raw.csv', low_memory=False)

In [44]:
nodes = pd.concat([kg.get(['x_id','x_type', 'x_name','x_source']).rename(columns={'x_id':'node_id', 'x_type':'node_type', 'x_name':'node_name','x_source':'node_source'}), 
                   kg.get(['y_id','y_type', 'y_name','y_source']).rename(columns={'y_id':'node_id', 'y_type':'node_type', 'y_name':'node_name','y_source':'node_source'})])
nodes = nodes.drop_duplicates().reset_index().drop('index',axis=1).reset_index().rename(columns={'index':'node_idx'})

edges = pd.merge(kg, nodes, 'left', left_on=['x_id','x_type', 'x_name','x_source'], right_on=['node_id','node_type','node_name','node_source'])
edges = edges.rename(columns={'node_idx':'x_idx'})
edges = pd.merge(edges, nodes, 'left', left_on=['y_id','y_type', 'y_name','y_source'], right_on=['node_id','node_type','node_name','node_source'])
edges = edges.rename(columns={'node_idx':'y_idx'})
edges = edges.get(['relation', 'display_relation','x_idx', 'y_idx'])
edges['combine_idx'] = edges['x_idx'].astype(str) + '-' + edges['y_idx'].astype(str)

edge_index = edges.get(['x_idx', 'y_idx']).values.T

graph = ig.Graph()
graph.add_vertices(list(range(nodes.shape[0])))
graph.add_edges([tuple(x) for x in edge_index.T])

graph = graph.as_undirected(mode='collapse')

c = graph.components(mode='strong')
giant = c.giant()

#print('Nodes: %d' % giant.vcount())
#print('Edges: %d' % giant.ecount())

assert not giant.is_directed()
assert giant.is_connected()

giant_nodes = giant.vs['name']
new_nodes = nodes.query('node_idx in @giant_nodes')
assert new_nodes.shape[0] == giant.vcount()

new_edges = edges.query('x_idx in @giant_nodes and y_idx in @giant_nodes').copy()
assert new_edges.shape[0] == giant.ecount()

new_kg = pd.merge(new_edges, new_nodes, 'left', left_on='x_idx', right_on='node_idx')
new_kg = new_kg.rename(columns={'node_id':'x_id', 'node_type':'x_type', 'node_name':'x_name','node_source':'x_source'}) 
new_kg = pd.merge(new_kg, new_nodes, 'left', left_on='y_idx', right_on='node_idx')
new_kg = new_kg.rename(columns={'node_id':'y_id', 'node_type':'y_type', 'node_name':'y_name','node_source':'y_source'}) 
new_kg = clean_edges(new_kg)

In [45]:
kg = new_kg.copy()
kg.to_csv(save_path+'auxillary/kg_giant.csv', index=False)

# Collapse similar diseases

In [46]:
kg = pd.read_csv(save_path+'auxillary/kg_giant.csv', low_memory=False)

## Find Groups

### Automated grouping

In [47]:
'''
disease_nodes = pd.concat([kg.get(['x_id','x_type', 'x_name','x_source']).rename(columns={'x_id':'node_id', 'x_type':'node_type', 'x_name':'node_name','x_source':'node_source'}), 
                   kg.get(['y_id','y_type', 'y_name','y_source']).rename(columns={'y_id':'node_id', 'y_type':'node_type', 'y_name':'node_name','y_source':'node_source'})])
disease_nodes = disease_nodes.query('node_type=="disease"')
disease_nodes = disease_nodes.drop_duplicates().reset_index().drop('index',axis=1).reset_index().rename(columns={'index':'node_idx'})
'''

'\ndisease_nodes = pd.concat([kg.get([\'x_id\',\'x_type\', \'x_name\',\'x_source\']).rename(columns={\'x_id\':\'node_id\', \'x_type\':\'node_type\', \'x_name\':\'node_name\',\'x_source\':\'node_source\'}), \n                   kg.get([\'y_id\',\'y_type\', \'y_name\',\'y_source\']).rename(columns={\'y_id\':\'node_id\', \'y_type\':\'node_type\', \'y_name\':\'node_name\',\'y_source\':\'node_source\'})])\ndisease_nodes = disease_nodes.query(\'node_type=="disease"\')\ndisease_nodes = disease_nodes.drop_duplicates().reset_index().drop(\'index\',axis=1).reset_index().rename(columns={\'index\':\'node_idx\'})\n'

In [48]:
'''
groups = []
seen = set()
idx2group = {}
no = set()

def isroman(s):
    return bool(re.search(r"^M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$",s))

def issingleletter(s): 
    if len(s)>1: return False

def same_words(s1, s2): 
    for word in s1.lower().split(' '): 
        word = word.split(',')[0]
        if word!='type' and word!='(disease)' and word not in s2.lower(): 
            return False 
    for word in s2.lower().split(' '): 
        word = word.split(',')[0]
        if word!='type' and word!='(disease)' and word not in s1.lower(): 
            return False
    return True

for i in range(disease_nodes.shape[0]):
    i_name = disease_nodes.loc[i, 'node_name']
    i_idx = disease_nodes.loc[i, 'node_idx']
    for w in ['monosomy','disomy', 'trisomy', 'trisomy/tetrasomy', 'chromosome']: 
        if w in i_name: 
            no.add(i_idx)

for i in range(disease_nodes.shape[0]):
    i_idx = disease_nodes.loc[i, 'node_idx']
    if i_idx in seen: continue 
    if i_idx in no: continue 
    i_name = disease_nodes.loc[i, 'node_name']
    i_split = i_name.split(' ')
    end = i_split[-1]
    if len(end)<=2 or end.isnumeric() or isroman(end):  
        main_text = ' '.join(i_split[:-1])
        matches = [i_name]
        matches_idx = [i_idx]
        match_found = False
        numeric = True
        for j in range(disease_nodes.shape[0]):
            j_idx = disease_nodes.loc[j, 'node_idx']
            j_name = disease_nodes.loc[j, 'node_name']
            m = ' '.join(j_name.split(' ')[:-1])
            if m.lower() == main_text.lower() or same_words(m, main_text): 
                matches.append(j_name)
                matches_idx.append(j_idx)
                match_found = True
        if match_found:
            matches_idx = list(set(matches_idx))
            matches = list(set(matches))
            if len(matches) <= 1: continue 
            if main_text.endswith('type'): 
                main_text = main_text[:-4]
            if main_text.endswith(','): 
                main_text = main_text[:-1]
            if main_text.endswith(' '): 
                main_text = main_text[:-1]
            print(main_text)
            for x in sorted(matches): 
                print('-  ',x)
            for x in matches_idx: 
                seen.add(x)
                idx2group[x] = main_text
            groups.append((main_text, matches_idx))

'''

'\ngroups = []\nseen = set()\nidx2group = {}\nno = set()\n\ndef isroman(s):\n    return bool(re.search(r"^M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$",s))\n\ndef issingleletter(s): \n    if len(s)>1: return False\n\ndef same_words(s1, s2): \n    for word in s1.lower().split(\' \'): \n        word = word.split(\',\')[0]\n        if word!=\'type\' and word!=\'(disease)\' and word not in s2.lower(): \n            return False \n    for word in s2.lower().split(\' \'): \n        word = word.split(\',\')[0]\n        if word!=\'type\' and word!=\'(disease)\' and word not in s1.lower(): \n            return False\n    return True\n\nfor i in range(disease_nodes.shape[0]):\n    i_name = disease_nodes.loc[i, \'node_name\']\n    i_idx = disease_nodes.loc[i, \'node_idx\']\n    for w in [\'monosomy\',\'disomy\', \'trisomy\', \'trisomy/tetrasomy\', \'chromosome\']: \n        if w in i_name: \n            no.add(i_idx)\n\nfor i in range(disease_nodes.shape[0]):\n    i_idx = disease_nodes

In [49]:
'''
disease_nodes.loc[:, 'group_name'] = ''
for data in disease_nodes.itertuples():
    if data.node_idx in idx2group.keys(): 
        disease_nodes.loc[data.Index, 'group_name'] = idx2group[data.node_idx]
    else: 
        disease_nodes.loc[data.Index, 'group_name'] = data.node_name
        
disease_group_1 = disease_nodes.get(['group_name']).drop_duplicates().reset_index().rename(columns={'index':'group_idx'})
disease_nodes = pd.merge(disease_nodes, disease_group_1, 'left', 'group_name')
'''

"\ndisease_nodes.loc[:, 'group_name'] = ''\nfor data in disease_nodes.itertuples():\n    if data.node_idx in idx2group.keys(): \n        disease_nodes.loc[data.Index, 'group_name'] = idx2group[data.node_idx]\n    else: \n        disease_nodes.loc[data.Index, 'group_name'] = data.node_name\n        \ndisease_group_1 = disease_nodes.get(['group_name']).drop_duplicates().reset_index().rename(columns={'index':'group_idx'})\ndisease_nodes = pd.merge(disease_nodes, disease_group_1, 'left', 'group_name')\n"

### Grouping with BERT

In [50]:
# generate embeddings 
'''
input_text = list(disease_group_1.get('group_name').values)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#model_name='dmis-lab/biobert-large-cased-v1.1'
model_name = 'emilyalsentzer/Bio_ClinicalBERT'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model = model.to(device)
model.eval()

def batch(iterable, batch_size=4, return_idx=True):
    l = len(iterable)
    for ndx in range(0, l, batch_size):
        if return_idx: 
            yield (ndx, min(ndx + batch_size, l))
        else:
            yield iterable[ndx:min(ndx + batch_size, l)]
            
tmp_dir = 'tmp/'
if os.path.isdir(tmp_dir): 
    shutil.rmtree(tmp_dir)
os.mkdir(tmp_dir)

batch_size=32
input_tokens = tokenizer(input_text, padding=True, return_tensors='pt', truncation=True, max_length=512)
for i, (start, end) in tqdm(enumerate(batch(input_text, batch_size))):
    input_ids = input_tokens['input_ids'][start:end, :].to(device)
    attention_mask = input_tokens['attention_mask'][start:end, :].to(device)
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        embeds = torch.mean(outputs[0], dim=1)
    np.save(tmp_dir+str(i)+'.npy', embeds.numpy())
    
embeds = []
for i, _ in enumerate(batch(input_text, batch_size)):
    x = np.load(tmp_dir+str(i)+'.npy')
    embeds.append(x)
embeds = np.concatenate(embeds)

np.save(save_path+'auxillary/kg_disease_bert_embeds.npy', embeds)
if os.path.isdir(tmp_dir): 
    shutil.rmtree(tmp_dir)'''

'\ninput_text = list(disease_group_1.get(\'group_name\').values)\n\ndevice = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")\n#model_name=\'dmis-lab/biobert-large-cased-v1.1\'\nmodel_name = \'emilyalsentzer/Bio_ClinicalBERT\'\ntokenizer = AutoTokenizer.from_pretrained(model_name)\nmodel = AutoModel.from_pretrained(model_name)\nmodel = model.to(device)\nmodel.eval()\n\ndef batch(iterable, batch_size=4, return_idx=True):\n    l = len(iterable)\n    for ndx in range(0, l, batch_size):\n        if return_idx: \n            yield (ndx, min(ndx + batch_size, l))\n        else:\n            yield iterable[ndx:min(ndx + batch_size, l)]\n            \ntmp_dir = \'tmp/\'\nif os.path.isdir(tmp_dir): \n    shutil.rmtree(tmp_dir)\nos.mkdir(tmp_dir)\n\nbatch_size=32\ninput_tokens = tokenizer(input_text, padding=True, return_tensors=\'pt\', truncation=True, max_length=512)\nfor i, (start, end) in tqdm(enumerate(batch(input_text, batch_size))):\n    input_ids = input_tokens[\'input_ids\

In [51]:
'''
embeds = np.load(save_path+'auxillary/kg_disease_bert_embeds.npy')
cos_sim = cosine_similarity(embeds, embeds)

seen = set()
groups = []
idx2group = {}
no = set()

for i in range(disease_group_1.shape[0]):
    i_name = disease_group_1.loc[i, 'group_name']
    i_idx = disease_group_1.loc[i, 'group_idx']
    for w in ['cardiomyopathy', 'syndrome', 'combined', 'complement', 'deficiency', 
              'factor', 'immunodeficiency', 'monosomy','disomy', 'trisomy', 
              'trisomy/tetrasomy', 'chromosome', 'neuroendocrine tumor', 
              'neuroendocrine neoplasm', 'cancer', 'tumor', 'neoplasm','carcinoma',
              'lymphoma', 'lipoma']: 
        if w in i_name: 
            no.add(i_idx)
            continue
    for w in ['CDG']: 
        if i_name.endswith(w): 
            no.add(i_idx)
            continue
    for w in ['neurodevelopmental disorder', 'glycogen storage disease', 
              'congenital disorder of glycosylation', 'qualitative or quantitative defects']: 
        if i_name.startswith(w): 
            no.add(i_idx)
            continue
            
cutoff = 0.98
for i in range(disease_group_1.shape[0]):
    i_name = disease_group_1.loc[i, 'group_name']
    i_idx = disease_group_1.loc[i, 'group_idx']
    if i_idx in no or i_idx in seen: continue
    x = disease_group_1[cos_sim[i]>cutoff]
    if x.shape[0]>1: 
        for v in x.get('group_name').values: 
            print(v)
        main_text = input(' Ok? ')
        if main_text not in ['','on','no', 'No', 'NO']: 
            for v in x.get('group_idx').values: 
                seen.add(v)
                idx2group[v] = main_text
            g = list(x.get('group_idx').values.reshape(-1))
            groups.append((main_text, g)) # main_text contains group name
        else: 
            no.add(i_idx)
            print('Not added')
'''

"\nembeds = np.load(save_path+'auxillary/kg_disease_bert_embeds.npy')\ncos_sim = cosine_similarity(embeds, embeds)\n\nseen = set()\ngroups = []\nidx2group = {}\nno = set()\n\nfor i in range(disease_group_1.shape[0]):\n    i_name = disease_group_1.loc[i, 'group_name']\n    i_idx = disease_group_1.loc[i, 'group_idx']\n    for w in ['cardiomyopathy', 'syndrome', 'combined', 'complement', 'deficiency', \n              'factor', 'immunodeficiency', 'monosomy','disomy', 'trisomy', \n              'trisomy/tetrasomy', 'chromosome', 'neuroendocrine tumor', \n              'neuroendocrine neoplasm', 'cancer', 'tumor', 'neoplasm','carcinoma',\n              'lymphoma', 'lipoma']: \n        if w in i_name: \n            no.add(i_idx)\n            continue\n    for w in ['CDG']: \n        if i_name.endswith(w): \n            no.add(i_idx)\n            continue\n    for w in ['neurodevelopmental disorder', 'glycogen storage disease', \n              'congenital disorder of glycosylation', 'qualitat

In [52]:
'''
disease_group_1.loc[:, 'group_name_2'] = ''
for data in disease_group_1.itertuples(): 
    if data.group_idx in idx2group.keys():
        disease_group_1.loc[data.Index, 'group_name_2'] = idx2group[data.group_idx]
    else: 
        disease_group_1.loc[data.Index, 'group_name_2'] = data.group_name
        
disease_group_2 = disease_group_1.get(['group_name_2']).drop_duplicates().reset_index().rename(columns={'index':'group_idx_2'})
'''

"\ndisease_group_1.loc[:, 'group_name_2'] = ''\nfor data in disease_group_1.itertuples(): \n    if data.group_idx in idx2group.keys():\n        disease_group_1.loc[data.Index, 'group_name_2'] = idx2group[data.group_idx]\n    else: \n        disease_group_1.loc[data.Index, 'group_name_2'] = data.group_name\n        \ndisease_group_2 = disease_group_1.get(['group_name_2']).drop_duplicates().reset_index().rename(columns={'index':'group_idx_2'})\n"

In [53]:
'''
df_disease_group = pd.merge(disease_nodes, disease_group_1, 'left', 'group_name')
df_disease_group = df_disease_group.get(['node_id', 'node_type', 'node_name', 'node_source',
       'group_name', 'group_name_2'])
df_disease_group = df_disease_group.rename(columns={'group_name':'group_name_auto',
        'group_name_2':'group_name_bert'}).astype({'node_id':str})
df_disease_group.to_csv(save_path+'auxillary/kg_grouped_diseases.csv')
'''

"\ndf_disease_group = pd.merge(disease_nodes, disease_group_1, 'left', 'group_name')\ndf_disease_group = df_disease_group.get(['node_id', 'node_type', 'node_name', 'node_source',\n       'group_name', 'group_name_2'])\ndf_disease_group = df_disease_group.rename(columns={'group_name':'group_name_auto',\n        'group_name_2':'group_name_bert'}).astype({'node_id':str})\ndf_disease_group.to_csv(save_path+'auxillary/kg_grouped_diseases.csv')\n"

## Apply Groups

In [54]:
grouped_diseases = pd.read_csv(save_path+'auxillary/kg_grouped_diseases.csv').astype({'node_id':str})
group_col = 'group_name_bert'

groups = grouped_diseases.groupby(group_col).count().query('node_id>1').index.values
set_groups = set(groups)

id_col = group_col.replace('name','id')
group_map = pd.DataFrame(columns=[id_col, group_col])
group_map.loc[:, group_col] = groups

grouped_diseases = grouped_diseases.query('{} in @set_groups'.format(group_col))

for g, data in grouped_diseases.groupby(group_col): 
    if g in set_groups:
        x = '_'.join(list(data.get('node_id').values))
        i = group_map.query('{}==@g'.format(group_col)).index[0]
        group_map.loc[i, id_col] = x
        
grouped_diseases = pd.merge(grouped_diseases, group_map)
grouped_diseases.to_csv(save_path+'auxillary/kg_grouped_diseases_bert_map.csv', index=False)

In [55]:
kg_x_dis = kg.query('x_type=="disease" and x_source=="MONDO"')
kg_y_dis = kg.query('y_type=="disease" and y_source=="MONDO"')

for idx, data in tqdm(grouped_diseases.iterrows(), total=grouped_diseases.shape[0]): 
    x_index = kg_x_dis.query('x_id==@data.node_id and x_name==@data.node_name').index.values
    kg.loc[x_index, 'x_id'] = data.get(id_col)
    kg.loc[x_index, 'x_name'] = data.get(group_col)
    kg.loc[x_index, 'x_source'] = 'MONDO_grouped'

    y_index = kg_y_dis.query('y_id==@data.node_id and y_name==@data.node_name').index.values
    kg.loc[y_index, 'y_id'] = data.get(id_col)
    kg.loc[y_index, 'y_name'] = data.get(group_col)
    kg.loc[y_index, 'y_source'] = 'MONDO_grouped'

HBox(children=(FloatProgress(value=0.0, max=6392.0), HTML(value='')))




In [56]:
kg = kg.drop_duplicates()
kg_rev = kg.copy().rename(columns={'x_id':'y_id','x_type':'y_type', 'x_name':'y_name', 'x_source':'y_source',
                                   'y_id':'x_id','y_type':'x_type', 'y_name':'x_name', 'y_source':'x_source' }) #add reverse edges
kg = pd.concat([kg, kg_rev])
kg = kg.drop_duplicates()
kg = kg.dropna()
# remove self loops from edges 
kg = kg.query('not ((x_id == y_id) and (x_type == y_type) and (x_source == y_source) and (x_name == y_name))')

In [57]:
kg.to_csv(save_path+'auxillary/kg_grouped.csv', index=False)

# Knowledge graph description

In [58]:
kg = pd.read_csv(save_path+'auxillary/kg_grouped.csv', low_memory=False)

In [59]:
# nodes file 
nodes = pd.concat([kg.get(['x_id','x_type', 'x_name','x_source']).rename(columns={'x_id':'node_id', 'x_type':'node_type', 'x_name':'node_name', 'x_source':'node_source'}), 
                   kg.get(['y_id','y_type', 'y_name','y_source']).rename(columns={'y_id':'node_id', 'y_type':'node_type', 'y_name':'node_name', 'y_source':'node_source'})])
nodes = nodes.drop_duplicates().reset_index().drop('index',axis=1).reset_index().rename(columns={'index':'node_index'})

# assign index 
kg = pd.merge(kg, nodes.rename(columns={'node_index':'x_index',
                                        'node_id':'x_id',
                                        'node_type':'x_type',
                                        'node_name':'x_name',
                                        'node_source':'x_source'}), 'left').dropna()
kg = pd.merge(kg, nodes.rename(columns={'node_index':'y_index',
                                        'node_id':'y_id',
                                        'node_type':'y_type',
                                        'node_name':'y_name',
                                        'node_source':'y_source'}), 'left').dropna()
kg = kg.get(['relation', 'display_relation', 'x_index', 'x_id', 'x_type', 'x_name', 'x_source',
       'y_index', 'y_id', 'y_type', 'y_name', 'y_source'])

# edges file 
edges = kg.get(['relation', 'display_relation', 'x_index', 'y_index']).copy()

In [60]:
kg.to_csv(save_path+'kg.csv', index=False)
nodes.to_csv(save_path+'nodes.csv', index=False)
edges.to_csv(save_path+'edges.csv', index=False)

In [61]:
def kg_describe(df, by, count_col): 
    df = df.groupby(by).count().sort_values(by=count_col, ascending=False).rename(columns={count_col:'count'}).get(['count'])
    total = np.sum(df.get('count').values)
    df = df.eval('percent = 100*count/@total')
    df = df.append(df.sum(0).rename('total'))
    df['count'] = df.get(['count']).astype('int')
    df['percent'] = df.get(['percent']).round(1)
    return df

In [62]:
kg_describe(nodes,'node_type','node_index')

Unnamed: 0_level_0,count,percent
node_type,Unnamed: 1_level_1,Unnamed: 2_level_1
biological_process,28642,22.1
gene/protein,27671,21.4
disease,17080,13.2
effect/phenotype,15311,11.8
anatomy,14035,10.8
molecular_function,11169,8.6
drug,7957,6.2
cellular_component,4176,3.2
pathway,2516,1.9
exposure,818,0.6


In [63]:
kg_describe(edges,'relation','x_index')

Unnamed: 0_level_0,count,percent
relation,Unnamed: 1_level_1,Unnamed: 2_level_1
anatomy_protein_present,3036406,37.5
drug_drug,2672628,33.0
protein_protein,642150,7.9
disease_phenotype_positive,300634,3.7
bioprocess_protein,289610,3.6
cellcomp_protein,166804,2.1
disease_protein,160822,2.0
molfunc_protein,139060,1.7
drug_effect,129568,1.6
bioprocess_bioprocess,105772,1.3
