In [2]:
from tqdm.notebook import tqdm
import re
import os 
import shutil
import numpy as np
import pandas as pd
import igraph as ig
from scipy.sparse import lil_matrix, save_npz
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import AutoTokenizer, AutoModel, pipeline
%load_ext autoreload
%autoreload 2

# Alternative relative-path settings (currently disabled):
#data_path = '../../datasets/'
#save_path = data_path +'kg/'
# Absolute locations used by this notebook: the raw source tables are read from
# below data_path, and generated files are written below save_path.
data_path = '/n/data1/hms/dbmi/zaklab/emily/rare_disease_dx/data/8.9.21_kg/raw/sources/'
save_path = '/n/data1/hms/dbmi/zaklab/emily/rare_disease_dx/data/8.9.21_kg/our_kg/'

# Read datasets

In [131]:
def assert_dtypes(df):
    """Check that every column of ``df`` has the NumPy object dtype.

    Each column whose dtype differs from ``np.dtype('O')`` is printed as
    ``<column> <dtype>``. If at least one such column exists, an
    ``AssertionError`` naming all of them is raised.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column dtypes are inspected.

    Raises
    ------
    AssertionError
        If any column has a dtype other than object.
    """
    offending = [
        (col, dtype)
        for col, dtype in zip(df.columns, df.dtypes.values)
        if dtype != np.dtype('O')
    ]
    for col, dtype in offending:
        print(col, dtype)
    if offending:
        # Raise explicitly: a bare `assert False` has no message and is
        # removed entirely when Python runs with -O.
        raise AssertionError(
            'non-object dtype columns: '
            + ', '.join(f'{col} ({dtype})' for col, dtype in offending)
        )

In [132]:
def _read_source(rel_path, **read_csv_kwargs):
    """Read the raw source table at ``data_path + rel_path`` with ``low_memory=False``."""
    return pd.read_csv(data_path + rel_path, low_memory=False, **read_csv_kwargs)


def _ids_as_str(df, *cols):
    """Return ``df`` with each column in ``cols`` cast to int and then to str.

    Going through int first means a value such as 9796.0 ends up as '9796'.
    Columns not listed are left untouched.
    """
    return df.astype({c: int for c in cols}).astype({c: str for c in cols})


# Every table below is read from data_path. Identifier columns are normalised
# to strings, and assert_dtypes() is called on the tables that are expected to
# contain only object-dtype columns.

# --- PPI ---
df_ppi = _ids_as_str(_read_source('ppi/protein_protein.csv').dropna(),
                     'proteinA_entrezid', 'proteinB_entrezid')
assert_dtypes(df_ppi)

# --- DrugBank drug-protein ---
df_drugbank = _read_source('drugbank/drug_protein.csv')
df_drugbank = df_drugbank.get(['DrugBank', 'relation', 'NCBIGeneID','DrugBankName']).dropna()
df_drugbank = _ids_as_str(df_drugbank, 'NCBIGeneID')
assert_dtypes(df_drugbank)

# --- DisGeNET (no dtype assertion is run on this table) ---
df_disgenet = _ids_as_str(
    _read_source('disgenet/curated_gene_disease_associations.tsv', sep='\t'),
    'geneId')

# --- MONDO (no dtype assertion is run on the terms table) ---
df_mondo_terms = _ids_as_str(_read_source('mondo/mondo_terms.csv'), 'id')

df_mondo_xref = _ids_as_str(_read_source('mondo/mondo_references.csv'), 'mondo_id')
assert_dtypes(df_mondo_xref)

df_mondo_parents = _ids_as_str(_read_source('mondo/mondo_parents.csv'), 'parent', 'child')
assert_dtypes(df_mondo_parents)

# --- DrugCentral drug-disease ---
df_drug_central = _read_source('drugcentral/drug_disease.csv')
# columns not selected here: 'concept_id', 'concept_name', 'snomed_conceptid'
df_drug_central = df_drug_central.get(['cas_reg_no','relationship_name', 'umls_cui'])
# keep only rows that have both a CAS number and a UMLS CUI
df_drug_central = df_drug_central.dropna(subset=['cas_reg_no', 'umls_cui'])
assert_dtypes(df_drug_central)

# --- DrugBank drug-drug ---
df_ddi = _read_source('drugbank/drug_drug.csv')
assert_dtypes(df_ddi)

# --- HPO (no dtype assertion is run on the terms and xref tables) ---
df_hp_terms = _ids_as_str(_read_source('hpo/hp_terms.csv'), 'id')

df_hp_xref = _ids_as_str(_read_source('hpo/hp_references.csv'), 'hp_id')

df_hp_parents = _ids_as_str(_read_source('hpo/hp_parents.csv'), 'parent', 'child')
assert_dtypes(df_hp_parents)

df_hpoa_pos = _ids_as_str(_read_source('hpo/disease_phenotype_pos.csv'),
                          'hp_id', 'disease_ontology_id')
assert_dtypes(df_hpoa_pos)

df_hpoa_neg = _ids_as_str(_read_source('hpo/disease_phenotype_neg.csv'),
                          'hp_id', 'disease_ontology_id')
assert_dtypes(df_hpoa_neg)

# --- SIDER ---
df_sider = _read_source('sider/sider.csv')
assert_dtypes(df_sider)

# --- GO ---
df_go_terms = _ids_as_str(_read_source('go/go_terms_info.csv'), 'go_term_id')
assert_dtypes(df_go_terms)

df_go_edges = _ids_as_str(_read_source('go/go_terms_relations.csv'), 'x', 'y')
assert_dtypes(df_go_edges)

df_gene2go = _ids_as_str(_read_source('ncbigene/protein_go_associations.csv'),
                         'ncbi_gene_id', 'go_term_id')
assert_dtypes(df_gene2go)

# --- CTD exposures ---
df_exposures = _read_source('ctd/exposure_data.csv')
df_exposures = df_exposures.get(['exposurestressorname', 'exposurestressorid',
                  'exposuremarker', 'exposuremarkerid',
                  'diseasename', 'diseaseid',
                  'phenotypename', 'phenotypeid'])
assert_dtypes(df_exposures)

# --- UBERON ---
df_uberon_terms = _ids_as_str(_read_source('uberon/uberon_terms.csv'), 'id')
assert_dtypes(df_uberon_terms)

df_uberon_is_a = _ids_as_str(_read_source('uberon/uberon_is_a.csv'), 'id', 'is_a')
assert_dtypes(df_uberon_is_a)

df_uberon_rels = _ids_as_str(_read_source('uberon/uberon_rels.csv'), 'id', 'relation_id')
assert_dtypes(df_uberon_rels)

# --- Bgee ---
df_bgee = _ids_as_str(_read_source('bgee/anatomy_gene.csv'), 'expression_rank', 'anatomy_id')
assert_dtypes(df_bgee)

# --- Reactome ---
df_reactome_terms = _read_source('reactome/reactome_terms.csv')
assert_dtypes(df_reactome_terms)

df_reactome_rels = _read_source('reactome/reactome_relations.csv')
assert_dtypes(df_reactome_rels)

df_reactome_ncbi = _read_source('reactome/reactome_ncbi.csv')
# keep only rows whose ncbi_id consists of digits
df_reactome_ncbi = df_reactome_ncbi[df_reactome_ncbi.ncbi_id.str.isnumeric()]
assert_dtypes(df_reactome_ncbi)

# --- Vocabularies / id mappings ---
df_umls_mondo = _ids_as_str(_read_source('vocab/umls_mondo.csv'), 'mondo_id')
assert_dtypes(df_umls_mondo)

df_prot_names = _read_source('vocab/gene_names.csv', sep='\t')
df_prot_names = df_prot_names.rename(columns={'NCBI Gene ID(supplied by NCBI)':'ncbi_id', 'NCBI Gene ID':'ncbi_id2', 'Approved symbol':'symbol', 'Approved name':'name'})
df_prot_names = df_prot_names.get(['ncbi_id', 'symbol']).dropna()
df_prot_names = _ids_as_str(df_prot_names, 'ncbi_id')
assert_dtypes(df_prot_names)

db_vocab = _read_source('vocab/drugbank_vocabulary.csv')
assert_dtypes(db_vocab)

df_db_atc = _read_source('vocab/drugbank_atc_codes.csv').get(['atc_code','parent_key'])
assert_dtypes(df_db_atc)

# Converting databases into graph edges

In [133]:
def clean_edges(df):
    """Reduce ``df`` to the standard edge columns and drop unusable rows.

    Keeps only the ten edge columns, in a fixed order, then removes rows
    containing any missing value, exact duplicate rows, and self-loops
    (rows whose x and y endpoints agree on id, type, source and name).
    The index of the surviving rows is not reset.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ten edge columns.

    Returns
    -------
    pandas.DataFrame
        The cleaned edge table.

    Raises
    ------
    KeyError
        If ``df`` lacks one of the edge columns.
    """
    edge_cols = ['relation', 'display_relation',
                 'x_id', 'x_type', 'x_name', 'x_source',
                 'y_id', 'y_type', 'y_name', 'y_source']
    missing = [c for c in edge_cols if c not in df.columns]
    if missing:
        # DataFrame.get() would return None here, and the problem would only
        # surface as an unrelated AttributeError on the next call.
        raise KeyError(f'edge columns missing from dataframe: {missing}')

    edges = df[edge_cols].dropna().drop_duplicates()
    is_self_loop = (
        (edges['x_id'] == edges['y_id'])
        & (edges['x_type'] == edges['y_type'])
        & (edges['x_source'] == edges['y_source'])
        & (edges['x_name'] == edges['y_name'])
    )
    return edges[~is_self_loop]

## Basic

### Protein protein interactions (NCBI)

In [134]:
# Protein-protein edges: look up the gene symbol of each interaction partner,
# then label both endpoints as NCBI gene/protein nodes.
df_prot_prot = (
    df_ppi
    .merge(df_prot_names, how='left', left_on='proteinA_entrezid', right_on='ncbi_id')
    .rename(columns={'symbol': 'symbolA'})
    .merge(df_prot_names, how='left', left_on='proteinB_entrezid', right_on='ncbi_id')
    .rename(columns={'symbol': 'symbolB'})
    .rename(columns={
        'proteinA_entrezid': 'x_id',
        'proteinB_entrezid': 'y_id',
        'symbolA': 'x_name',
        'symbolB': 'y_name',
    })
    .assign(
        x_type='gene/protein',
        x_source='NCBI',
        y_type='gene/protein',
        y_source='NCBI',
        relation='protein_protein',
        display_relation='ppi',
    )
)
df_prot_prot = clean_edges(df_prot_prot)
df_prot_prot.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,protein_protein,ppi,9796,gene/protein,PHYHIP,NCBI,56992,gene/protein,KIF15,NCBI


### Drug protein interactions (DrugBank)

In [135]:
# Drug-protein edges: DrugBank drugs (x) linked to NCBI genes/proteins (y).
df_prot_drug = (
    df_drugbank
    .merge(df_prot_names, how='left', left_on='NCBIGeneID', right_on='ncbi_id')
    .rename(columns={
        'DrugBank': 'x_id',
        'NCBIGeneID': 'y_id',
        'DrugBankName': 'x_name',
        'symbol': 'y_name',
    })
)
# Keep the source relation for display before 'relation' is overwritten with
# the single combined label (targets, carrier, enzyme and transporter).
df_prot_drug = df_prot_drug.assign(display_relation=df_prot_drug['relation'].to_numpy())
df_prot_drug = df_prot_drug.assign(
    relation='drug_protein',
    x_type='drug',
    x_source='DrugBank',
    y_type='gene/protein',
    y_source='NCBI',
)
df_prot_drug = clean_edges(df_prot_drug)
df_prot_drug.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,drug_protein,carrier,DB09130,drug,Copper,DrugBank,2157,gene/protein,F8,NCBI


### Drug disease interactions (DrugCentral) –– PENDING

In [136]:
# Drug-disease edges: map CAS numbers to DrugBank entries and UMLS CUIs to
# MONDO terms, then keep only the id/name columns needed for the edge table.
df_drug_dis = (
    df_drug_central
    .merge(db_vocab, how='left', left_on='cas_reg_no', right_on='CAS')
    .merge(df_umls_mondo, how='inner', left_on='umls_cui', right_on='umls_id')
    .merge(df_mondo_terms, how='left', left_on='mondo_id', right_on='id')
    .get(['relationship_name','DrugBank ID', 'Common name', 'mondo_id', 'name'])
    .dropna()
    .drop_duplicates()
    .rename(columns={
        'DrugBank ID': 'x_id',
        'mondo_id': 'y_id',
        'Common name': 'x_name',
        'name': 'y_name',
        'relationship_name': 'relation',
    })
)
# The display label is a copy of the relation value.
df_drug_dis = df_drug_dis.assign(
    x_type='drug',
    x_source='DrugBank',
    y_type='disease',
    y_source='MONDO',
    display_relation=df_drug_dis['relation'].to_numpy(),
)
df_drug_dis = clean_edges(df_drug_dis)
df_drug_dis.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,contraindication,contraindication,DB05271,drug,Rotigotine,DrugBank,5044,disease,hypertensive disorder,MONDO


### Disease protein interactions (DisGeNET)

In [137]:
# Disease-protein edges: rows typed "disease" are mapped from UMLS ids to
# MONDO terms; MONDO diseases are x and NCBI genes/proteins are y.
df_dis_prot1 = (
    df_disgenet
    .query('diseaseType=="disease"')
    .merge(df_umls_mondo, how='inner', left_on='diseaseId', right_on='umls_id')
    .merge(df_mondo_terms, how='left', left_on='mondo_id', right_on='id')
    .rename(columns={
        'geneId': 'y_id',
        'geneSymbol': 'y_name',
        'mondo_id': 'x_id',
        'name': 'x_name',
    })
    .assign(
        x_type='disease',
        x_source='MONDO',
        y_type='gene/protein',
        y_source='NCBI',
        relation='disease_protein',
        display_relation='associated with',
    )
)
df_dis_prot1 = clean_edges(df_dis_prot1)
df_dis_prot1.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,disease_protein,associated with,5090,disease,schizophrenia (disease),MONDO,1,gene/protein,A1BG,NCBI


### Disease disease interactions (MONDO)

In [138]:
# Disease-disease edges: MONDO parent (x) / child (y) pairs, each endpoint
# named through a lookup in the MONDO terms table.
df_dis_dis1 = (
    df_mondo_parents
    .merge(df_mondo_terms, how='left', left_on='parent', right_on='id')
    .rename(columns={'parent': 'x_id', 'name': 'x_name'})
    .merge(df_mondo_terms, how='left', left_on='child', right_on='id')
    .rename(columns={'child': 'y_id', 'name': 'y_name'})
    .assign(
        x_type='disease',
        x_source='MONDO',
        y_type='disease',
        y_source='MONDO',
        relation='disease_disease',
        display_relation='parent-child',
    )
)
df_dis_dis1 = clean_edges(df_dis_dis1)
df_dis_dis1.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,disease_disease,parent-child,2816,disease,adrenal cortex disease,MONDO,4,disease,adrenocortical insufficiency,MONDO


### Drug drug interactions (DrugBank)

In [139]:
# Drug-drug edges: both endpoints are named through the DrugBank vocabulary;
# the inner joins drop pairs with an id missing from that vocabulary.
df_drug_drug = (
    df_ddi
    .merge(db_vocab, how='inner', left_on='drug1', right_on='DrugBank ID')
    .rename(columns={'drug1': 'x_id', 'Common name': 'x_name'})
    .astype({'drug2': 'str'})
    .merge(db_vocab, how='inner', left_on='drug2', right_on='DrugBank ID')
    .rename(columns={'drug2': 'y_id', 'Common name': 'y_name'})
    .assign(
        x_type='drug',
        x_source='DrugBank',
        y_type='drug',
        y_source='DrugBank',
        relation='drug_drug',
        display_relation='synergistic interaction',
    )
)
df_drug_drug = clean_edges(df_drug_drug)
df_drug_drug.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,drug_drug,synergistic interaction,DB00001,drug,Lepirudin,DrugBank,DB06605,drug,Apixaban,DrugBank


## Effect/Phenotype

### Effect protein interactions (DisGeNET)

In [140]:
# Phenotype-protein edges: rows typed "phenotype" are mapped to HPO terms via
# the HPO cross-reference table; HPO terms are x, NCBI genes/proteins are y.
df_phe_prot = (
    df_disgenet
    .query('diseaseType=="phenotype"')
    .merge(df_hp_xref, how='inner', left_on='diseaseId', right_on='ontology_id')
    .merge(df_hp_terms, how='left', left_on='hp_id', right_on='id')
    .rename(columns={
        'geneId': 'y_id',
        'geneSymbol': 'y_name',
        'hp_id': 'x_id',
        'name': 'x_name',
    })
    .assign(
        x_type='effect/phenotype',
        x_source='HPO',
        y_type='gene/protein',
        y_source='NCBI',
        relation='phenotype_protein',
        display_relation='associated with',
    )
)
df_phe_prot = clean_edges(df_phe_prot)
df_phe_prot.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,phenotype_protein,associated with,2240,effect/phenotype,Hepatomegaly,HPO,1,gene/protein,A1BG,NCBI


### Effect effect interactions (HPO)

In [141]:
# Phenotype-phenotype edges: HPO parent (x) / child (y) pairs with both names
# looked up in the HPO terms table.
df_phe_phe = (
    df_hp_parents
    .merge(df_hp_terms, how='left', left_on='parent', right_on='id')
    .rename(columns={'name': 'parent_name'})
    .merge(df_hp_terms, how='left', left_on='child', right_on='id')
    .rename(columns={'name': 'child_name'})
    .get(['parent', 'child', 'parent_name', 'child_name'])
    .rename(columns={
        'parent': 'x_id',
        'child': 'y_id',
        'parent_name': 'x_name',
        'child_name': 'y_name',
    })
    .assign(
        x_type='effect/phenotype',
        x_source='HPO',
        y_type='effect/phenotype',
        y_source='HPO',
        relation='phenotype_phenotype',
        display_relation='parent-child',
    )
)
df_phe_phe = clean_edges(df_phe_phe)
df_phe_phe.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,phenotype_phenotype,parent-child,1507,effect/phenotype,Growth abnormality,HPO,2,effect/phenotype,Abnormality of body height,HPO


### Disease effect interactions (HPO-A)

In [142]:
# Positive disease-phenotype edges: annotations are joined to MONDO through the
# cross-reference table, keeping rows whose ontology labels match (with "ORPHA"
# treated as "Orphanet"), then named via the HPO and MONDO terms tables.
df_dis_phe_pos1 = (
    df_hpoa_pos
    .merge(df_mondo_xref, how='left', left_on='disease_ontology_id', right_on='ontology_id')
    .query('(disease_ontology==ontology) or (disease_ontology=="ORPHA" and ontology=="Orphanet")')
    .merge(df_hp_terms, how='left', left_on='hp_id', right_on='id')
    .rename(columns={'name': 'hp_name'})
    .merge(df_mondo_terms, how='left', left_on='mondo_id', right_on='id')
    .rename(columns={'name': 'mondo_name'})
    .get(['mondo_id', 'mondo_name', 'hp_id', 'hp_name'])
    .rename(columns={
        'mondo_id': 'x_id',
        'mondo_name': 'x_name',
        'hp_id': 'y_id',
        'hp_name': 'y_name',
    })
    .assign(
        x_source='MONDO',
        x_type='disease',
        y_source='HPO',
        y_type='effect/phenotype',
        relation='disease_phenotype_positive',
        display_relation='phenotype present',
    )
)
df_dis_phe_pos1 = clean_edges(df_dis_phe_pos1)
df_dis_phe_pos1.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,disease_phenotype_positive,phenotype present,10761,disease,retinitis pigmentosa Y-linked,MONDO,510,effect/phenotype,Rod-cone dystrophy,HPO


In [143]:
# Negative disease-phenotype edges: same joins as for the positive
# annotations, labelled as "phenotype absent".
df_dis_phe_neg = (
    df_hpoa_neg
    .merge(df_mondo_xref, how='left', left_on='disease_ontology_id', right_on='ontology_id')
    .query('(disease_ontology==ontology) or (disease_ontology=="ORPHA" and ontology=="Orphanet")')
    .merge(df_hp_terms, how='left', left_on='hp_id', right_on='id')
    .rename(columns={'name': 'hp_name'})
    .merge(df_mondo_terms, how='left', left_on='mondo_id', right_on='id')
    .rename(columns={'name': 'mondo_name'})
    .get(['mondo_id', 'mondo_name', 'hp_id', 'hp_name'])
    .rename(columns={
        'mondo_id': 'x_id',
        'mondo_name': 'x_name',
        'hp_id': 'y_id',
        'hp_name': 'y_name',
    })
    .assign(
        x_source='MONDO',
        x_type='disease',
        y_source='HPO',
        y_type='effect/phenotype',
        relation='disease_phenotype_negative',
        display_relation='phenotype absent',
    )
)
df_dis_phe_neg = clean_edges(df_dis_phe_neg)
df_dis_phe_neg.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,disease_phenotype_negative,phenotype absent,13924,disease,osteogenesis imperfecta type 13,MONDO,365,effect/phenotype,Hearing impairment,HPO


### Remove MONDO nodes if they exist in HPO (Modified)

In [144]:
# phenotypes that are actually diseases in MONDO
# avoid duplicate nodes and convert disease nodes to phenotype nodes

# Take an explicit copy: assigning into the result of .query() directly is
# what triggers pandas' SettingWithCopyWarning.
mondo_xref_hp_subset = df_mondo_xref.query('ontology=="HP"').copy()
# Normalise the HPO ids the same way as df_hp_terms.id (int, then str) so the
# join below compares equal strings.
mondo_xref_hp_subset['ontology_id'] = mondo_xref_hp_subset['ontology_id'].astype(int).astype(str)
merged_mondo_hpo = pd.merge(mondo_xref_hp_subset, df_hp_terms, 'inner', left_on='ontology_id', right_on='id')

# Persist the HPO-id <-> MONDO-id mapping; create the target folder if needed
# so that to_csv does not fail on a fresh output directory.
os.makedirs(save_path+'auxillary', exist_ok=True)
merged_mondo_hpo[['ontology_id', 'mondo_id']].to_csv(save_path+'auxillary/mondo2hpo.csv', index=False)
# MONDO ids that have an HPO counterpart present in df_hp_terms
mondo_r_hp_ids = merged_mondo_hpo.get('mondo_id').values

def replace_mondo_w_hpo(df, mondo_id_col, drop_cols=()):
    """Attach the HPO id and name that correspond to a MONDO id column.

    ``df`` is left-joined to the module-level ``mondo_xref_hp_subset`` on
    ``mondo_id_col`` and then to ``df_hp_terms`` on the resulting
    ``ontology_id``. The result keeps the original columns of ``df`` (minus
    ``drop_cols``) followed by ``ontology_id`` and ``ontology_name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Edge table containing ``mondo_id_col``.
    mondo_id_col : str
        Name of the column holding MONDO ids.
    drop_cols : iterable of str, optional
        Column names to leave out of the result. Names that are not present
        are ignored.

    Returns
    -------
    pandas.DataFrame
        Rows without an HPO match carry missing values in the two added
        columns, because both joins are left joins.
    """
    dropped = set(drop_cols)
    cols = [
        c for c in [*df.columns.values, 'ontology_id', 'ontology_name']
        if c not in dropped
    ]
    df = pd.merge(df, mondo_xref_hp_subset, 'left', left_on=mondo_id_col, right_on='mondo_id')
    df = pd.merge(df, df_hp_terms, 'left', left_on='ontology_id', right_on='id')
    df = df.rename(columns={'name':'ontology_name'}).get(cols)
    return df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [145]:
# HANDLE DISEASE DISEASE --> EFFECT EFFECT

# MONDO parent/child pairs in which both endpoints have an HPO counterpart are
# re-expressed as HPO phenotype-phenotype edges.
df_phe_phe2 = df_dis_dis1.query('x_id in @mondo_r_hp_ids and y_id in @mondo_r_hp_ids')
# Endpoint columns are selected by prefix: a substring test such as
# `'y_' in c` also matches 'display_relation', which is not an endpoint column.
df_phe_phe2 = replace_mondo_w_hpo(df=df_phe_phe2, mondo_id_col='x_id', drop_cols=[c for c in df_phe_phe2.columns.values if c.startswith('x_')])
df_phe_phe2 = df_phe_phe2.rename(columns={'ontology_id':'x_id', 'ontology_name':'x_name'})
df_phe_phe2 = replace_mondo_w_hpo(df=df_phe_phe2, mondo_id_col='y_id', drop_cols=[c for c in df_phe_phe2.columns.values if c.startswith('y_')])
df_phe_phe2 = df_phe_phe2.rename(columns={'ontology_id':'y_id', 'ontology_name':'y_name'})
df_phe_phe2.loc[:, 'x_source'] = 'HPO'
df_phe_phe2.loc[:, 'x_type'] = 'effect/phenotype'
df_phe_phe2.loc[:, 'y_source'] = 'HPO'
df_phe_phe2.loc[:, 'y_type'] = 'effect/phenotype'
df_phe_phe2.loc[:,'relation'] = 'phenotype_phenotype'
df_phe_phe2.loc[:,'display_relation'] = 'parent-child'
df_phe_phe2 = clean_edges(df_phe_phe2)

# drop relations in DIS DIS if either DIS is in HPO
# disease disease should have no phenotype nodes
df_dis_dis = df_dis_dis1.query('x_id not in @mondo_r_hp_ids and y_id not in @mondo_r_hp_ids')

# ensure that none of the disease nodes (source or target) are hpo nodes
assert len(df_dis_dis.query('x_id in @mondo_r_hp_ids')) == 0
assert len(df_dis_dis.query('y_id in @mondo_r_hp_ids')) == 0

In [146]:
# HANDLE DISEASE EFFECT NEGATIVE --> EFFECT EFFECT
# Rows whose disease (x) has an HPO counterpart get that HPO term as x and are
# relabelled as phenotype-phenotype edges.
# NOTE(review): these rows come from "phenotype absent" annotations yet are
# relabelled as parent-child phenotype edges -- confirm this is intended.
df_phe_phe3 = df_dis_phe_neg.query('x_id in @mondo_r_hp_ids')
df_phe_phe3 = (
    replace_mondo_w_hpo(
        df=df_phe_phe3,
        mondo_id_col='x_id',
        drop_cols=[c for c in df_phe_phe3.columns.values if 'x_' in c],
    )
    .rename(columns={'ontology_id': 'x_id', 'ontology_name': 'x_name'})
    .assign(
        x_source='HPO',
        x_type='effect/phenotype',
        relation='phenotype_phenotype',
        display_relation='parent-child',
    )
)
df_phe_phe3 = clean_edges(df_phe_phe3)

# drop relations in DIS PHE if DIS is in HPO
df_dis_phe_neg = df_dis_phe_neg.query('x_id not in @mondo_r_hp_ids')

# HANDLE DISEASE EFFECT POSITIVE --> EFFECT EFFECT
# Same conversion for the positive annotations.
df_phe_phe4 = df_dis_phe_pos1.query('x_id in @mondo_r_hp_ids')
df_phe_phe4 = (
    replace_mondo_w_hpo(
        df=df_phe_phe4,
        mondo_id_col='x_id',
        drop_cols=[c for c in df_phe_phe4.columns.values if 'x_' in c],
    )
    .rename(columns={'ontology_id': 'x_id', 'ontology_name': 'x_name'})
    .assign(
        x_source='HPO',
        x_type='effect/phenotype',
        relation='phenotype_phenotype',
        display_relation='parent-child',
    )
)
df_phe_phe4 = clean_edges(df_phe_phe4)

# drop relations in DIS PHE if DIS is in HPO
df_dis_phe_pos = df_dis_phe_pos1.query('x_id not in @mondo_r_hp_ids')

In [147]:
# HANDLE DISEASE PROTEIN --> EFFECT PROTEIN
# Disease-protein rows whose disease has an HPO counterpart become
# phenotype-protein edges with that HPO term as x.
df_phe_prot2 = df_dis_prot1.query('x_id in @mondo_r_hp_ids')
df_phe_prot2 = (
    replace_mondo_w_hpo(
        df=df_phe_prot2,
        mondo_id_col='x_id',
        drop_cols=[c for c in df_phe_prot2.columns.values if 'x_' in c],
    )
    .rename(columns={'ontology_id': 'x_id', 'ontology_name': 'x_name'})
    .assign(
        x_source='HPO',
        x_type='effect/phenotype',
        relation='phenotype_protein',
        display_relation='associated with',
    )
)
df_phe_prot2 = clean_edges(df_phe_prot2)

# drop relations in DIS GENE if DIS is in HPO
df_dis_prot = df_dis_prot1.query('x_id not in @mondo_r_hp_ids')

In [148]:
# HANDLE DISEASE DRUG --> Remove (does not make sense to have EFFECT DRUG)
# Keep only drug-disease rows whose disease (y) has no HPO counterpart.
_disease_without_hpo = 'y_id not in @mondo_r_hp_ids'
df_drug_dis = df_drug_dis.query(_disease_without_hpo)

In [149]:
# COMBINE DATAFRAMES
# Append the converted MONDO-derived edges to the HPO-derived ones and drop
# exact duplicate rows.
df_phe_phe = pd.concat(
    [df_phe_phe, df_phe_phe2, df_phe_phe3, df_phe_phe4],
    ignore_index=True,
).drop_duplicates()
df_phe_prot = pd.concat(
    [df_phe_prot, df_phe_prot2],
    ignore_index=True,
).drop_duplicates()

# Print any rows that still reference a MONDO node with an HPO counterpart.
for _frame, _leftover in (
    (df_phe_phe, 'x_source == "MONDO" and y_source == "MONDO" and x_id in @mondo_r_hp_ids and y_id in @mondo_r_hp_ids'),
    (df_phe_prot, 'x_source == "MONDO" and x_id in @mondo_r_hp_ids'),
):
    print(_frame.query(_leftover))

Empty DataFrame
Columns: [relation, display_relation, x_id, x_type, x_name, x_source, y_id, y_type, y_name, y_source]
Index: []
Empty DataFrame
Columns: [relation, display_relation, x_id, x_type, x_name, x_source, y_id, y_type, y_name, y_source]
Index: []


### Drug effect interactions (SIDER)

In [150]:
# Drug side-effect edges: ATC codes are mapped to DrugBank entries and UMLS
# ids to HPO terms; rows lacking any of the four id/name fields are dropped.
df_drug_effect = (
    df_sider
    .merge(df_db_atc, how='left', left_on='atc', right_on='atc_code')
    .rename(columns={'parent_key': 'DrugBank', 'UMLS_from_meddra': 'UMLS'})
    .merge(db_vocab, how='left', left_on='DrugBank', right_on='DrugBank ID')
    .merge(df_hp_xref, how='left', left_on='UMLS', right_on='ontology_id')
    .merge(df_hp_terms, how='left', left_on='hp_id', right_on='id')
    .get(['DrugBank ID','Common name','hp_id', 'name'])
    .dropna()
    .drop_duplicates()
    .rename(columns={
        'DrugBank ID': 'x_id',
        'Common name': 'x_name',
        'hp_id': 'y_id',
        'name': 'y_name',
    })
    .assign(
        x_type='drug',
        x_source='DrugBank',
        y_type='effect/phenotype',
        y_source='HPO',
        relation='drug_effect',
        display_relation='side effect',
    )
)
# A filter on the phenotype side remains commented out:
#df_drug_effect = df_drug_effect.query('y_id not in @hp_ids_r_mondo')
df_drug_effect = clean_edges(df_drug_effect)
df_drug_effect.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,drug_effect,side effect,DB00583,drug,Levocarnitine,DrugBank,2027,effect/phenotype,Abdominal pain,HPO


## GO Terms

### GO term interactions (GO)

In [151]:
# Biological-process hierarchy: keep the GO edges whose two endpoints (x and y)
# are both biological_process terms, attaching each endpoint's term columns.
bp = df_go_terms[df_go_terms['go_term_type'] == 'biological_process']
df_bp_bp = (
    df_go_edges
    .merge(bp, how='inner', left_on='x', right_on='go_term_id')
    .rename(columns={'go_term_id':'x_id','go_term_name':'x_name','go_term_type':'x_type'})
    .merge(bp, how='inner', left_on='y', right_on='go_term_id')
    .rename(columns={'go_term_id':'y_id','go_term_name':'y_name','go_term_type':'y_type'})
    .assign(
        relation='bioprocess_bioprocess',
        x_source='GO',
        y_source='GO',
        display_relation='parent-child',
    )
)
df_bp_bp = clean_edges(df_bp_bp)
df_bp_bp.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,bioprocess_bioprocess,parent-child,51581,biological_process,negative regulation of neurotransmitter uptake,GO,51612,biological_process,negative regulation of serotonin uptake,GO


In [152]:
# Molecular-function hierarchy: keep the GO edges whose two endpoints (x and y)
# are both molecular_function terms, attaching each endpoint's term columns.
mf = df_go_terms[df_go_terms['go_term_type'] == 'molecular_function']
df_mf_mf = (
    df_go_edges
    .merge(mf, how='inner', left_on='x', right_on='go_term_id')
    .rename(columns={'go_term_id':'x_id','go_term_name':'x_name','go_term_type':'x_type'})
    .merge(mf, how='inner', left_on='y', right_on='go_term_id')
    .rename(columns={'go_term_id':'y_id','go_term_name':'y_name','go_term_type':'y_type'})
    .assign(
        relation='molfunc_molfunc',
        display_relation='parent-child',
        x_source='GO',
        y_source='GO',
    )
)
df_mf_mf = clean_edges(df_mf_mf)
df_mf_mf.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,molfunc_molfunc,parent-child,8168,molecular_function,methyltransferase activity,GO,102130,molecular_function,malonyl-CoA methyltransferase activity,GO


In [153]:
# Cellular-component hierarchy: keep the GO edges whose two endpoints (x and y)
# are both cellular_component terms, attaching each endpoint's term columns.
cc = df_go_terms[df_go_terms['go_term_type'] == 'cellular_component']
df_cc_cc = (
    df_go_edges
    .merge(cc, how='inner', left_on='x', right_on='go_term_id')
    .rename(columns={'go_term_id':'x_id','go_term_name':'x_name','go_term_type':'x_type'})
    .merge(cc, how='inner', left_on='y', right_on='go_term_id')
    .rename(columns={'go_term_id':'y_id','go_term_name':'y_name','go_term_type':'y_type'})
    .assign(
        relation='cellcomp_cellcomp',
        display_relation='parent-child',
        x_source='GO',
        y_source='GO',
    )
)
df_cc_cc = clean_edges(df_cc_cc)
df_cc_cc.head(1)


Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,cellcomp_cellcomp,parent-child,110165,cellular_component,cellular anatomical entity,GO,90553,cellular_component,unicellular trichome tip,GO


### GO protein interactions (Gene2GO)

In [154]:
# Gene -> GO term table shared by the three protein/GO cells below.
# After the first merge the left frame's go_term_type carries pandas' default
# "_x" suffix; that column is the one kept as the term type.
df_prot_path = (
    df_gene2go
    .merge(df_go_terms, how='inner', on='go_term_id')
    .rename(columns={'go_term_type_x':'go_term_type'})
    .merge(df_prot_names, how='left', left_on='ncbi_gene_id', right_on='ncbi_id')
    .rename(columns={'ncbi_gene_id':'x_id', 'symbol':'x_name',
                     'go_term_id':'y_id','go_term_name':'y_name', 'go_term_type':'y_type'})
    .assign(x_type='gene/protein', x_source='NCBI', y_source='GO')
    .get(['x_id','x_type', 'x_name', 'x_source','y_id','y_type', 'y_name', 'y_source'])
)

In [155]:
# Protein -> molecular_function edges: the molecular_function slice of df_prot_path.
is_molfunc = df_prot_path['y_type'] == 'molecular_function'
df_prot_mf = df_prot_path[is_molfunc].assign(
    relation='protein_molfunc',
    display_relation='interacts with',
)
df_prot_mf = clean_edges(df_prot_mf)
df_prot_mf.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,protein_molfunc,interacts with,2,gene/protein,A2M,NCBI,19966,molecular_function,interleukin-1 binding,GO


In [156]:
# Protein -> cellular_component edges: the cellular_component slice of df_prot_path.
is_cellcomp = df_prot_path['y_type'] == 'cellular_component'
df_prot_cc = df_prot_path[is_cellcomp].assign(
    relation='protein_cellcomp',
    display_relation='interacts with',
)
df_prot_cc = clean_edges(df_prot_cc)
df_prot_cc.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
214459,protein_cellcomp,interacts with,1,gene/protein,A1BG,NCBI,1904813,cellular_component,ficolin-1-rich granule lumen,GO


In [157]:
# Protein -> biological_process edges: the biological_process slice of df_prot_path.
is_bioprocess = df_prot_path['y_type'] == 'biological_process'
df_prot_bp = df_prot_path[is_bioprocess].assign(
    relation='protein_bioprocess',
    display_relation='interacts with',
)
df_prot_bp = clean_edges(df_prot_bp)
df_prot_bp.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
69588,protein_bioprocess,interacts with,1,gene/protein,A1BG,NCBI,43312,biological_process,neutrophil degranulation,GO


## Exposure

### Exposure protein interactions (CTD)

In [158]:
# Exposure -> gene/protein edges from the CTD exposure table.
df_exp_prot = df_exposures.get(['exposurestressorname', 'exposurestressorid','exposuremarker', 'exposuremarkerid'])
# Boolean mask instead of a .loc[<labels>] round-trip: selecting by index labels
# would repeat rows if the index ever contained duplicate labels.
df_exp_prot = df_exp_prot.loc[df_exp_prot['exposuremarkerid'].notna()]

# Only markers whose id is purely numeric are kept; they are matched against
# df_prot_names.ncbi_id below. Vectorised replacement for the former
# row-by-row iterrows() loop.
is_gene_marker = df_exp_prot['exposuremarkerid'].str.isnumeric()
# Kept for backward compatibility: index labels of the retained rows.
gene_row_index = df_exp_prot.index[is_gene_marker].tolist()

# int -> str round-trip normalises the id text (e.g. strips leading zeros).
df_exp_prot = df_exp_prot.loc[is_gene_marker].astype({'exposuremarkerid': 'int'}).astype({'exposuremarkerid': 'str'})
df_exp_prot = pd.merge(df_exp_prot, df_prot_names, 'left', left_on='exposuremarkerid', right_on='ncbi_id')

df_exp_prot = df_exp_prot.rename(columns={'exposurestressorid':'x_id', 'exposurestressorname':'x_name', 'ncbi_id':'y_id', 'symbol':'y_name'})
df_exp_prot['x_type'] = 'exposure'
df_exp_prot['x_source'] = 'CTD'
df_exp_prot['y_type'] = 'gene/protein'
df_exp_prot['y_source'] = 'NCBI'
df_exp_prot['relation'] = 'exposure_protein'
df_exp_prot['display_relation'] = 'interacts with'
df_exp_prot = clean_edges(df_exp_prot)
df_exp_prot.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,exposure_protein,interacts with,C092102,exposure,1-hydroxyphenanthrene,CTD,1401,gene/protein,CRP,NCBI


### Exposure disease interactions (CTD)

In [159]:
# Exposure -> disease edges: CTD disease ids are mapped to MONDO through the
# MESH rows of df_mondo_xref, then named via df_mondo_terms.
df_exp_dis = (
    df_exposures
    .get(['exposurestressorname', 'exposurestressorid','diseasename', 'diseaseid'])
    # keep the rows that have a disease id
    .loc[lambda frame: frame.get(['diseaseid']).dropna().index]
    .merge(df_mondo_xref.query('ontology=="MESH"'), how='left', left_on='diseaseid', right_on='ontology_id')
    .merge(df_mondo_terms, how='left', left_on='mondo_id', right_on='id')
    .rename(columns={'exposurestressorid':'x_id', 'exposurestressorname':'x_name', 'mondo_id':'y_id', 'name':'y_name'})
    .assign(
        x_type='exposure',
        x_source='CTD',
        y_type='disease',
        y_source='MONDO',
        relation='exposure_disease',
        display_relation='linked to',
    )
    # Michelle added: drop diseases whose id is listed in mondo_r_hp_ids
    .query('y_id not in @mondo_r_hp_ids')
)
df_exp_dis = clean_edges(df_exp_dis)
df_exp_dis.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,exposure_disease,linked to,C024566,exposure,"1,1,1-trichloroethane",CTD,4976,disease,amyotrophic lateral sclerosis,MONDO


### Exposure exposure interactions (CTD)

In [160]:
# Exposure -> exposure edges: rows whose marker id is itself one of the
# stressor ids found in the exposure table.
exposures = np.unique(df_exposures['exposurestressorid'].to_numpy())
df_exp_exp = (
    df_exposures
    .query('exposuremarkerid in @exposures')
    .get(['exposurestressorname', 'exposurestressorid','exposuremarker', 'exposuremarkerid'])
    # keep the rows that have a marker id
    .loc[lambda frame: frame.get(['exposuremarkerid']).dropna().index]
    .drop_duplicates()
    .rename(columns={'exposurestressorid':'x_id', 'exposurestressorname':'x_name', 'exposuremarker':'y_name', 'exposuremarkerid':'y_id'})
    .assign(
        x_type='exposure',
        x_source='CTD',
        y_type='exposure',
        y_source='CTD',
        relation='exposure_exposure',
        display_relation='parent-child',
    )
)
df_exp_exp = clean_edges(df_exp_exp)

### Exposure pathway interactions (CTD)

In [161]:
# phenotypes are actually pathways 

# Exposure -> GO term table shared by the three exposure/GO cells below.
# (Original note: phenotypes are actually pathways.) The second ':'-separated
# field of phenotypeid is kept as an integer string and joined to go_term_id.
df_exp_path = (
    df_exposures
    .get(['exposurestressorname', 'exposurestressorid','phenotypename', 'phenotypeid'])
    # keep the rows that have a phenotype id
    .loc[lambda frame: frame.get(['phenotypeid']).dropna().index]
    .assign(phenotypeid=lambda frame: [str(int(raw_id.split(':')[1])) for raw_id in frame['phenotypeid']])
    .drop_duplicates()
    .merge(df_go_terms, how='inner', left_on='phenotypeid', right_on='go_term_id')
    .rename(columns={'exposurestressorid':'x_id', 'exposurestressorname':'x_name',
                     'go_term_id':'y_id', 'go_term_name':'y_name', 'go_term_type':'y_type'})
    .assign(x_type='exposure', x_source='CTD', y_source='GO')
)

In [162]:
# Exposure -> biological_process edges: the biological_process slice of df_exp_path.
exp_is_bioprocess = df_exp_path['y_type'] == 'biological_process'
df_exp_bp = df_exp_path[exp_is_bioprocess].assign(
    relation='exposure_bioprocess',
    display_relation='interacts with',
)
df_exp_bp = clean_edges(df_exp_bp)
df_exp_bp.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,exposure_bioprocess,interacts with,C046839,exposure,"1,2,3,4,6,7,8-heptachlorodibenzodioxin",CTD,8217,biological_process,regulation of blood pressure,GO


In [163]:
# Exposure -> molecular_function edges: the molecular_function slice of df_exp_path.
exp_is_molfunc = df_exp_path['y_type'] == 'molecular_function'
df_exp_mf = df_exp_path[exp_is_molfunc].assign(
    relation='exposure_molfunc',
    display_relation='interacts with',
)
df_exp_mf = clean_edges(df_exp_mf)
df_exp_mf.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
527,exposure_molfunc,interacts with,C014024,exposure,"2,4,5,2',4',5'-hexachlorobiphenyl",CTD,19766,molecular_function,IgA receptor activity,GO


In [164]:
# Exposure -> cellular_component edges: the cellular_component slice of df_exp_path.
exp_is_cellcomp = df_exp_path['y_type'] == 'cellular_component'
df_exp_cc = df_exp_path[exp_is_cellcomp].assign(
    relation='exposure_cellcomp',
    display_relation='interacts with',
)
df_exp_cc = clean_edges(df_exp_cc)
df_exp_cc.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
833,exposure_cellcomp,interacts with,D000393,exposure,Air Pollutants,CTD,71743,cellular_component,"IgE immunoglobulin complex, circulating",GO


## Anatomy

### Anatomy anatomy interactions (UBERON) 

In [165]:
# Anatomy hierarchy: each is_a row is named on both ends from df_uberon_terms
# (x is the term itself, y is the term referenced by its is_a column).
df_ana_ana = (
    df_uberon_is_a
    .merge(df_uberon_terms, how='left', left_on='id', right_on='id')
    .rename(columns={'id':'x_id', 'name':'x_name'})
    .merge(df_uberon_terms, how='left', left_on='is_a', right_on='id')
    .rename(columns={'id':'y_id', 'name':'y_name'})
    .assign(
        x_type='anatomy',
        x_source='UBERON',
        y_type='anatomy',
        y_source='UBERON',
        relation='anatomy_anatomy',
        display_relation='parent-child',
    )
)
df_ana_ana = clean_edges(df_ana_ana)
df_ana_ana.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,anatomy_anatomy,parent-child,2,anatomy,uterine cervix,UBERON,5156,anatomy,reproductive structure,UBERON


### Anatomy Protein (BGEE)

In [166]:
# Attach NCBI ids to the BGEE rows by gene symbol and label both endpoints.
# NOTE: this overwrites df_bgee with the merged result, so the cell is not
# meant to be re-run without reloading df_bgee first.
df_bgee = (
    df_bgee
    .merge(df_prot_names, how='inner', left_on='gene_name', right_on='symbol')
    .rename(columns={'ncbi_id':'x_id', 'symbol':'x_name',
                     'anatomy_id':'y_id', 'anatomy_name':'y_name'})
    .assign(
        x_source='NCBI',
        x_type='gene/protein',
        y_source='UBERON',
        y_type='anatomy',
    )
)

In [167]:
# Protein -> anatomy edges for the BGEE rows with expression == "present".
is_expressed = df_bgee['expression'] == 'present'
df_ana_prot_pos = df_bgee[is_expressed].assign(
    relation='protein_present_anatomy',
    display_relation='expression present',
)
df_ana_prot_pos = clean_edges(df_ana_prot_pos)
df_ana_prot_pos.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,protein_present_anatomy,expression present,7105,gene/protein,TSPAN6,NCBI,2,anatomy,uterine cervix,UBERON


In [168]:
# Protein -> anatomy edges for the BGEE rows with expression == "absent".
is_not_expressed = df_bgee['expression'] == 'absent'
df_ana_prot_neg = df_bgee[is_not_expressed].assign(
    relation='protein_absent_anatomy',
    display_relation='expression absent',
)
df_ana_prot_neg = clean_edges(df_ana_prot_neg)
df_ana_prot_neg.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
507,protein_absent_anatomy,expression absent,2268,gene/protein,FGR,NCBI,1476,anatomy,deltoid,UBERON


## Pathways

In [169]:
# Pathway hierarchy: each Reactome relation is kept only when both of its ids
# are present in df_reactome_terms, which also supplies the names.
df_path_path = (
    df_reactome_rels
    .merge(df_reactome_terms, how='inner', left_on='reactome_id_1', right_on='reactome_id')
    .rename(columns={'reactome_id': 'x_id', 'reactome_name':'x_name'})
    .merge(df_reactome_terms, how='inner', left_on='reactome_id_2', right_on='reactome_id')
    .rename(columns={'reactome_id': 'y_id', 'reactome_name':'y_name'})
    .assign(
        x_source='REACTOME',
        x_type='pathway',
        y_source='REACTOME',
        y_type='pathway',
        relation='pathway_pathway',
        display_relation='parent-child',
    )
)
df_path_path = clean_edges(df_path_path)
df_path_path.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,pathway_pathway,parent-child,R-HSA-109581,pathway,Apoptosis,REACTOME,R-HSA-109606,pathway,Intrinsic Pathway for Apoptosis,REACTOME


### Pathway protein interactions

In [170]:
# Protein -> pathway edges: Reactome/NCBI rows joined to gene symbols on ncbi_id.
df_path_prot = (
    df_reactome_ncbi
    .merge(df_prot_names, how='inner', on='ncbi_id')
    .rename(columns={'ncbi_id': 'x_id', 'symbol':'x_name',
                     'reactome_id': 'y_id', 'reactome_name':'y_name'})
    .assign(
        x_source='NCBI',
        x_type='gene/protein',
        y_source='REACTOME',
        y_type='pathway',
        relation='protein_pathway',
        display_relation='interacts with',
    )
)
df_path_prot = clean_edges(df_path_prot)
df_path_prot.head(1)

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
0,protein_pathway,interacts with,1,gene/protein,A1BG,NCBI,R-HSA-114608,pathway,Platelet degranulation,REACTOME


# Compiling knowledge graph

In [171]:
# Sanity check before compiling: for every edge table, print its name followed
# by the distinct values of x_type, y_type, relation and display_relation.
# The tables are listed once here (in print order) instead of repeating the
# same five print calls for each of them.
edge_tables = {
    'df_prot_prot': df_prot_prot,
    'df_prot_drug': df_prot_drug,
    'df_drug_dis': df_drug_dis,
    'df_drug_drug': df_drug_drug,
    'df_phe_prot': df_phe_prot,
    'df_phe_phe': df_phe_phe,
    'df_dis_phe_neg': df_dis_phe_neg,
    'df_dis_phe_pos': df_dis_phe_pos,
    'df_dis_prot': df_dis_prot,
    'df_dis_dis': df_dis_dis,
    'df_drug_effect': df_drug_effect,
    'df_bp_bp': df_bp_bp,
    'df_mf_mf': df_mf_mf,
    'df_cc_cc': df_cc_cc,
    'df_prot_mf': df_prot_mf,
    'df_prot_cc': df_prot_cc,
    'df_prot_bp': df_prot_bp,
    'df_exp_prot': df_exp_prot,
    'df_exp_dis': df_exp_dis,
    'df_exp_exp': df_exp_exp,
    'df_exp_bp': df_exp_bp,
    'df_exp_mf': df_exp_mf,
    'df_exp_cc': df_exp_cc,
    'df_path_path': df_path_path,
    'df_path_prot': df_path_prot,
    'df_ana_ana': df_ana_ana,
    'df_ana_prot_pos': df_ana_prot_pos,
    'df_ana_prot_neg': df_ana_prot_neg,
}
summary_columns = ('x_type', 'y_type', 'relation', 'display_relation')

for position, (table_name, table) in enumerate(edge_tables.items()):
    # A blank line separates consecutive tables; none is printed before the first.
    print(table_name if position == 0 else '\n' + table_name)
    for column in summary_columns:
        print(table[column].unique())



df_prot_prot
['gene/protein']
['gene/protein']
['protein_protein']
['ppi']

df_prot_drug
['drug']
['gene/protein']
['drug_protein']
['carrier' 'enzyme' 'target' 'transporter']

df_drug_dis
['drug']
['disease']
['contraindication' 'indication' 'off-label use']
['contraindication' 'indication' 'off-label use']

df_drug_drug
['drug']
['drug']
['drug_drug']
['synergistic interaction']

df_phe_prot
['effect/phenotype']
['gene/protein']
['phenotype_protein']
['associated with']

df_phe_phe
['effect/phenotype']
['effect/phenotype']
['phenotype_phenotype']
['parent-child']

df_dis_phe_neg
['disease']
['effect/phenotype']
['disease_phenotype_negative']
['phenotype absent']

df_dis_phe_pos
['disease']
['effect/phenotype']
['disease_phenotype_positive']
['phenotype present']

df_dis_prot
['disease']
['gene/protein']
['disease_protein']
['associated with']

df_dis_dis
['disease']
['disease']
['disease_disease']
['parent-child']

df_drug_effect
['drug']
['effect/phenotype']
['drug_effect']
['side e

In [172]:
# Stack every edge table into a single frame and drop exact duplicate rows.
edge_frames = [
    df_prot_prot, df_prot_drug, df_drug_dis, df_drug_drug, df_phe_prot,
    df_phe_phe, df_dis_phe_neg, df_dis_phe_pos, df_dis_prot, df_dis_dis,
    df_drug_effect, df_bp_bp, df_mf_mf, df_cc_cc, df_prot_mf,
    df_prot_cc, df_prot_bp, df_exp_prot, df_exp_dis, df_exp_exp,
    df_exp_bp, df_exp_mf, df_exp_cc, df_path_path, df_path_prot,
    df_ana_ana, df_ana_prot_pos, df_ana_prot_neg,
]  #28
kg = pd.concat(edge_frames).drop_duplicates()

# Reverse edges are intentionally not added here; the disabled code is kept for reference.
#kg_rev = kg.copy().rename(columns={'x_id':'y_id','x_type':'y_type', 'x_name':'y_name', 'x_source':'y_source', 'y_id':'x_id','y_type':'x_type', 'y_name':'x_name', 'y_source':'x_source' }) #add reverse edges
#kg_rev['relation'] = kg_rev['relation'] + "_rev"
#print(len(kg), len(kg_rev))

#kg = pd.concat([kg, kg_rev])
#kg = kg.drop_duplicates()

# Drop rows with any missing value.
kg = kg.dropna()

# remove self loops from edges: a row is a self loop when id, type, source and
# name all agree between its x and y endpoints
is_self_loop = (
    (kg['x_id'] == kg['y_id'])
    & (kg['x_type'] == kg['y_type'])
    & (kg['x_source'] == kg['y_source'])
    & (kg['x_name'] == kg['y_name'])
)
# positional mask, because the concatenated index labels are not unique
kg = kg[~is_self_loop.to_numpy()]
kg.tail()

Unnamed: 0,relation,display_relation,x_id,x_type,x_name,x_source,y_id,y_type,y_name,y_source
1539160,protein_absent_anatomy,expression absent,140,gene/protein,ADORA3,NCBI,4720,anatomy,cerebellar vermis,UBERON
1539470,protein_absent_anatomy,expression absent,105378952,gene/protein,KLF18,NCBI,1377,anatomy,quadriceps femoris,UBERON
1539471,protein_absent_anatomy,expression absent,105378952,gene/protein,KLF18,NCBI,1379,anatomy,vastus lateralis,UBERON
1539472,protein_absent_anatomy,expression absent,105378952,gene/protein,KLF18,NCBI,2084,anatomy,heart left ventricle,UBERON
1539473,protein_absent_anatomy,expression absent,105378952,gene/protein,KLF18,NCBI,5384,anatomy,nasal cavity epithelium,UBERON


In [173]:
# Distinct relation labels, distinct display labels, then the total edge count.
for label_column in ('relation', 'display_relation'):
    print(kg[label_column].unique())
print(len(kg))

['protein_protein' 'drug_protein' 'contraindication' 'indication'
 'off-label use' 'drug_drug' 'phenotype_protein' 'phenotype_phenotype'
 'disease_phenotype_negative' 'disease_phenotype_positive'
 'disease_protein' 'disease_disease' 'drug_effect' 'bioprocess_bioprocess'
 'molfunc_molfunc' 'cellcomp_cellcomp' 'protein_molfunc'
 'protein_cellcomp' 'protein_bioprocess' 'exposure_protein'
 'exposure_disease' 'exposure_exposure' 'exposure_bioprocess'
 'exposure_molfunc' 'exposure_cellcomp' 'pathway_pathway'
 'protein_pathway' 'anatomy_anatomy' 'protein_present_anatomy'
 'protein_absent_anatomy']
['ppi' 'carrier' 'enzyme' 'target' 'transporter' 'contraindication'
 'indication' 'off-label use' 'synergistic interaction' 'associated with'
 'parent-child' 'phenotype absent' 'phenotype present' 'side effect'
 'interacts with' 'linked to' 'expression present' 'expression absent']
5463048


In [119]:
# Write the compiled graph to disk as-is (no reverse edges).
kg_raw_path = save_path + 'auxillary/kg_raw.csv'
kg.to_csv(kg_raw_path, index=False)

In [174]:
# Double check that none of the MONDO terms are still in HPO
print(kg.query('x_source == "MONDO" and x_id in @mondo_r_hp_ids'))
print(kg.query('y_source == "MONDO" and y_id in @mondo_r_hp_ids'))

Empty DataFrame
Columns: [relation, display_relation, x_id, x_type, x_name, x_source, y_id, y_type, y_name, y_source]
Index: []
Empty DataFrame
Columns: [relation, display_relation, x_id, x_type, x_name, x_source, y_id, y_type, y_name, y_source]
Index: []


# Get giant component

In [121]:
# Reload the raw graph from disk, replacing the in-memory kg.
# NOTE(review): the save cell earlier in this notebook writes kg_raw.csv, not
# kg_raw_orphanet.csv; this file is presumably produced elsewhere - confirm.
kg_raw_orphanet_path = save_path + 'auxillary/kg_raw_orphanet.csv'
kg = pd.read_csv(kg_raw_orphanet_path, low_memory=False)

In [122]:
# Plain-text preview of the reloaded graph (pandas truncates long frames when printing).
print(kg)

                           relation   display_relation   x_id  \
0                   protein_protein                ppi   9796   
1                   protein_protein                ppi   7918   
2                   protein_protein                ppi   8233   
3                   protein_protein                ppi   4899   
4                   protein_protein                ppi   5297   
...                             ...                ...    ...   
5471989  disease_phenotype_positive  phenotype present  15942   
5471990  disease_phenotype_positive  phenotype present  16355   
5471991  disease_phenotype_positive  phenotype present   8294   
5471992  disease_phenotype_positive  phenotype present  14412   
5471993         phenotype_phenotype       parent-child   8255   

                   x_type                                x_name x_source  \
0            gene/protein                                PHYHIP     NCBI   
1            gene/protein                                GPANK1    

In [123]:
# Build a node table and an integer-indexed edge list from kg, load them into
# igraph, and keep only the nodes and edges of the giant component.
node_cols = ['node_id', 'node_type', 'node_name', 'node_source']
x_cols = ['x_id', 'x_type', 'x_name', 'x_source']
y_cols = ['y_id', 'y_type', 'y_name', 'y_source']

# Every distinct (id, type, name, source) seen on either end of an edge is a
# node; node_idx is its row position after deduplication.
nodes = pd.concat([
    kg.get(x_cols).rename(columns=dict(zip(x_cols, node_cols))),
    kg.get(y_cols).rename(columns=dict(zip(y_cols, node_cols))),
])
nodes = (
    nodes
    .drop_duplicates()
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index':'node_idx'})
)

# Translate both endpoints of every edge into node_idx values.
edges = (
    kg
    .merge(nodes, how='left', left_on=x_cols, right_on=node_cols)
    .rename(columns={'node_idx':'x_idx'})
    .merge(nodes, how='left', left_on=y_cols, right_on=node_cols)
    .rename(columns={'node_idx':'y_idx'})
    .get(['relation', 'display_relation','x_idx', 'y_idx'])
    .assign(combine_idx=lambda frame: frame['x_idx'].astype(str) + '-' + frame['y_idx'].astype(str))
)

edge_index = edges[['x_idx', 'y_idx']].to_numpy().T

# Vertices are added from the list 0..n-1; giant.vs['name'] is read back below
# and matched against node_idx.
graph = ig.Graph()
graph.add_vertices(list(range(nodes.shape[0])))
graph.add_edges(list(map(tuple, edge_index.T)))

graph = graph.as_undirected(mode='collapse')

c = graph.components(mode='strong')
giant = c.giant()

#print('Nodes: %d' % giant.vcount())
#print('Edges: %d' % giant.ecount())

assert not giant.is_directed()
assert giant.is_connected()

# Restrict the node and edge tables to the giant component and cross-check the
# row counts against igraph's vertex and edge counts.
giant_nodes = giant.vs['name']
new_nodes = nodes.query('node_idx in @giant_nodes')
assert new_nodes.shape[0] == giant.vcount()

new_edges = edges.query('x_idx in @giant_nodes and y_idx in @giant_nodes').copy()
assert new_edges.shape[0] == giant.ecount()

# Expand the integer endpoints back into the id/type/name/source columns.
new_kg = (
    new_edges
    .merge(new_nodes, how='left', left_on='x_idx', right_on='node_idx')
    .rename(columns=dict(zip(node_cols, x_cols)))
    .merge(new_nodes, how='left', left_on='y_idx', right_on='node_idx')
    .rename(columns=dict(zip(node_cols, y_cols)))
)
new_kg = clean_edges(new_kg)

In [124]:
# Make the giant-component graph the working kg and write it to disk.
kg = new_kg.copy()
kg_giant_path = save_path + 'auxillary/kg_giant_orphanet.csv'
kg.to_csv(kg_giant_path, index=False)

# Collapse similar diseases

In [3]:
# Disease groupings are independent of the KG (requires only MONDO terms)

# Reload the giant-component graph saved above so this section can start from disk.
kg_giant_csv = save_path + 'auxillary/kg_giant_orphanet.csv'
kg = pd.read_csv(kg_giant_csv, low_memory=False)

## Find Groups

### Automated grouping

In [None]:
'''
disease_nodes = pd.concat([kg.get(['x_id','x_type', 'x_name','x_source']).rename(columns={'x_id':'node_id', 'x_type':'node_type', 'x_name':'node_name','x_source':'node_source'}), 
                   kg.get(['y_id','y_type', 'y_name','y_source']).rename(columns={'y_id':'node_id', 'y_type':'node_type', 'y_name':'node_name','y_source':'node_source'})])
disease_nodes = disease_nodes.query('node_type=="disease"')
disease_nodes = disease_nodes.drop_duplicates().reset_index().drop('index',axis=1).reset_index().rename(columns={'index':'node_idx'})
'''

In [None]:
# Disabled cell: the code is wrapped in a string literal, so nothing below runs.
# If enabled it would group disease names that differ only in a short, numeric
# or roman-numeral final word (e.g. a trailing type number), skipping names that
# mention monosomy/disomy/trisomy/chromosome, and record the result in
# `groups` and `idx2group`.
# NOTE(review) before re-enabling:
#   - issingleletter() has no return for len(s) <= 1 (so it yields None) and is
#     not called anywhere in this cell.
#   - `numeric = True` is assigned but never read.
#   - the inner loop rescans every disease node for each candidate name.
'''
groups = []
seen = set()
idx2group = {}
no = set()

def isroman(s):
    return bool(re.search(r"^M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$",s))

def issingleletter(s): 
    if len(s)>1: return False

def same_words(s1, s2): 
    for word in s1.lower().split(' '): 
        word = word.split(',')[0]
        if word!='type' and word!='(disease)' and word not in s2.lower(): 
            return False 
    for word in s2.lower().split(' '): 
        word = word.split(',')[0]
        if word!='type' and word!='(disease)' and word not in s1.lower(): 
            return False
    return True

for i in range(disease_nodes.shape[0]):
    i_name = disease_nodes.loc[i, 'node_name']
    i_idx = disease_nodes.loc[i, 'node_idx']
    for w in ['monosomy','disomy', 'trisomy', 'trisomy/tetrasomy', 'chromosome']: 
        if w in i_name: 
            no.add(i_idx)

for i in range(disease_nodes.shape[0]):
    i_idx = disease_nodes.loc[i, 'node_idx']
    if i_idx in seen: continue 
    if i_idx in no: continue 
    i_name = disease_nodes.loc[i, 'node_name']
    i_split = i_name.split(' ')
    end = i_split[-1]
    if len(end)<=2 or end.isnumeric() or isroman(end):  
        main_text = ' '.join(i_split[:-1])
        matches = [i_name]
        matches_idx = [i_idx]
        match_found = False
        numeric = True
        for j in range(disease_nodes.shape[0]):
            j_idx = disease_nodes.loc[j, 'node_idx']
            j_name = disease_nodes.loc[j, 'node_name']
            m = ' '.join(j_name.split(' ')[:-1])
            if m.lower() == main_text.lower() or same_words(m, main_text): 
                matches.append(j_name)
                matches_idx.append(j_idx)
                match_found = True
        if match_found:
            matches_idx = list(set(matches_idx))
            matches = list(set(matches))
            if len(matches) <= 1: continue 
            if main_text.endswith('type'): 
                main_text = main_text[:-4]
            if main_text.endswith(','): 
                main_text = main_text[:-1]
            if main_text.endswith(' '): 
                main_text = main_text[:-1]
            print(main_text)
            for x in sorted(matches): 
                print('-  ',x)
            for x in matches_idx: 
                seen.add(x)
                idx2group[x] = main_text
            groups.append((main_text, matches_idx))

'''

In [None]:
# Disabled cell: the code is wrapped in a string literal, so nothing below runs.
# If enabled it would give every disease node a group_name (the idx2group entry
# when one exists, otherwise the node's own name), build disease_group_1 with
# one row per distinct group_name, and merge it back onto disease_nodes.
'''
disease_nodes.loc[:, 'group_name'] = ''
for data in disease_nodes.itertuples():
    if data.node_idx in idx2group.keys(): 
        disease_nodes.loc[data.Index, 'group_name'] = idx2group[data.node_idx]
    else: 
        disease_nodes.loc[data.Index, 'group_name'] = data.node_name
        
disease_group_1 = disease_nodes.get(['group_name']).drop_duplicates().reset_index().rename(columns={'index':'group_idx'})
disease_nodes = pd.merge(disease_nodes, disease_group_1, 'left', 'group_name')
'''

### Grouping with BERT

In [None]:
# generate embeddings 
'''
input_text = list(disease_group_1.get('group_name').values)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#model_name='dmis-lab/biobert-large-cased-v1.1'
model_name = 'emilyalsentzer/Bio_ClinicalBERT'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model = model.to(device)
model.eval()

def batch(iterable, batch_size=4, return_idx=True):
    l = len(iterable)
    for ndx in range(0, l, batch_size):
        if return_idx: 
            yield (ndx, min(ndx + batch_size, l))
        else:
            yield iterable[ndx:min(ndx + batch_size, l)]
            
tmp_dir = 'tmp/'
if os.path.isdir(tmp_dir): 
    shutil.rmtree(tmp_dir)
os.mkdir(tmp_dir)

batch_size=32
input_tokens = tokenizer(input_text, padding=True, return_tensors='pt', truncation=True, max_length=512)
for i, (start, end) in tqdm(enumerate(batch(input_text, batch_size))):
    input_ids = input_tokens['input_ids'][start:end, :].to(device)
    attention_mask = input_tokens['attention_mask'][start:end, :].to(device)
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        embeds = torch.mean(outputs[0], dim=1)
    np.save(tmp_dir+str(i)+'.npy', embeds.numpy())
    
embeds = []
for i, _ in enumerate(batch(input_text, batch_size)):
    x = np.load(tmp_dir+str(i)+'.npy')
    embeds.append(x)
embeds = np.concatenate(embeds)

np.save(save_path+'auxillary/kg_disease_bert_embeds.npy', embeds)
if os.path.isdir(tmp_dir): 
    shutil.rmtree(tmp_dir)'''

In [None]:
# NOTE: this cell is disabled (bare string literal; nothing below executes).
# If re-enabled it would:
#   1. load the saved embeddings and compute their pairwise cosine similarity;
#   2. fill the exclusion set `no` with the group_idx of every group whose name
#      contains one of the listed keywords, ends with 'CDG', or starts with one
#      of the listed prefixes;
#   3. for each remaining group, print the groups whose similarity to it exceeds
#      `cutoff` (0.98) and prompt through input() for a merged name. Any answer
#      other than '', 'on', 'no', 'No' or 'NO' is stored as the new group name
#      for all printed groups; otherwise the group is added to `no`.
# NOTE(review): the `continue` statements inside the `for w in ...` loops only
# advance to the next `w`; they do not skip the rest of the outer `i` iteration.
# NOTE(review): this step depends on interactive input, so its result cannot be
# reproduced from code alone; the saved CSV is the record of the answers.
'''
embeds = np.load(save_path+'auxillary/kg_disease_bert_embeds.npy')
cos_sim = cosine_similarity(embeds, embeds)

seen = set()
groups = []
idx2group = {}
no = set()

for i in range(disease_group_1.shape[0]):
    i_name = disease_group_1.loc[i, 'group_name']
    i_idx = disease_group_1.loc[i, 'group_idx']
    for w in ['cardiomyopathy', 'syndrome', 'combined', 'complement', 'deficiency', 
              'factor', 'immunodeficiency', 'monosomy','disomy', 'trisomy', 
              'trisomy/tetrasomy', 'chromosome', 'neuroendocrine tumor', 
              'neuroendocrine neoplasm', 'cancer', 'tumor', 'neoplasm','carcinoma',
              'lymphoma', 'lipoma']: 
        if w in i_name: 
            no.add(i_idx)
            continue
    for w in ['CDG']: 
        if i_name.endswith(w): 
            no.add(i_idx)
            continue
    for w in ['neurodevelopmental disorder', 'glycogen storage disease', 
              'congenital disorder of glycosylation', 'qualitative or quantitative defects']: 
        if i_name.startswith(w): 
            no.add(i_idx)
            continue
            
cutoff = 0.98
for i in range(disease_group_1.shape[0]):
    i_name = disease_group_1.loc[i, 'group_name']
    i_idx = disease_group_1.loc[i, 'group_idx']
    if i_idx in no or i_idx in seen: continue
    x = disease_group_1[cos_sim[i]>cutoff]
    if x.shape[0]>1: 
        for v in x.get('group_name').values: 
            print(v)
        main_text = input(' Ok? ')
        if main_text not in ['','on','no', 'No', 'NO']: 
            for v in x.get('group_idx').values: 
                seen.add(v)
                idx2group[v] = main_text
            g = list(x.get('group_idx').values.reshape(-1))
            groups.append((main_text, g)) # main_text contains group name
        else: 
            no.add(i_idx)
            print('Not added')
'''

In [None]:
# NOTE: this cell is disabled (bare string literal; nothing below executes).
# If re-enabled it would add a 'group_name_2' column to disease_group_1: the
# idx2group entry for the row's group_idx when one exists, otherwise the row's
# existing group_name. It would then build disease_group_2 with one row per
# distinct group_name_2, the original row position becoming 'group_idx_2'.
'''
disease_group_1.loc[:, 'group_name_2'] = ''
for data in disease_group_1.itertuples(): 
    if data.group_idx in idx2group.keys():
        disease_group_1.loc[data.Index, 'group_name_2'] = idx2group[data.group_idx]
    else: 
        disease_group_1.loc[data.Index, 'group_name_2'] = data.group_name
        
disease_group_2 = disease_group_1.get(['group_name_2']).drop_duplicates().reset_index().rename(columns={'index':'group_idx_2'})
'''

In [None]:
# NOTE: this cell is disabled (bare string literal; nothing below executes).
# If re-enabled it would left-merge disease_group_1 onto disease_nodes by
# 'group_name', keep the node columns plus both group-name columns, rename the
# latter to 'group_name_auto' / 'group_name_bert', and write the result to
# save_path+'auxillary/kg_grouped_diseases.csv'.
# NOTE(review): to_csv is called without index=False, so the file presumably
# carries an extra unnamed index column when it is read back - confirm.
'''
df_disease_group = pd.merge(disease_nodes, disease_group_1, 'left', 'group_name')
df_disease_group = df_disease_group.get(['node_id', 'node_type', 'node_name', 'node_source',
       'group_name', 'group_name_2'])
df_disease_group = df_disease_group.rename(columns={'group_name':'group_name_auto',
        'group_name_2':'group_name_bert'}).astype({'node_id':str})
df_disease_group.to_csv(save_path+'auxillary/kg_grouped_diseases.csv')
'''

## Apply Groups

In [4]:
# Load the saved disease grouping and build, for every group name shared by
# more than one disease node, a single grouped id made of the member node_ids.
grouped_diseases = pd.read_csv(save_path+'auxillary/kg_grouped_diseases.csv').astype({'node_id':str})
group_col = 'group_name_bert'

# Only group names that cover more than one node need to be collapsed.
groups = grouped_diseases.groupby(group_col).count().query('node_id>1').index.values
set_groups = set(groups)

id_col = group_col.replace('name','id')

# Keep only the diseases that belong to one of those multi-member groups.
grouped_diseases = grouped_diseases.query('{} in @set_groups'.format(group_col))

# Grouped id = the members' node_ids joined with '_' in file order.
# One groupby/agg pass replaces a per-group DataFrame.query lookup (which
# rescanned group_map for every group) and avoids filling an empty frame
# through .loc enlargement. Result: one row per group, columns [id_col, group_col].
group_map = (
    grouped_diseases
    .groupby(group_col)['node_id']
    .agg('_'.join)
    .rename(id_col)
    .reset_index()
    .get([id_col, group_col])
)

# Attach the grouped id to every member row (joined on the shared group_col).
grouped_diseases = pd.merge(grouped_diseases, group_map)
grouped_diseases.to_csv(save_path+'auxillary/kg_grouped_diseases_bert_map.csv', index=False)

In [5]:
# Rewrite every MONDO disease endpoint that belongs to a group so that it
# carries the grouped id / grouped name and the source 'MONDO_grouped'.
# Both snapshots are taken before any row is modified, so matching always
# uses the original ids and names.
kg_x_dis = kg.query('x_type=="disease" and x_source=="MONDO"')
kg_y_dis = kg.query('y_type=="disease" and y_source=="MONDO"')

# Index the snapshots once: (id, name) -> row labels in kg. This replaces two
# full-frame query() scans per grouped disease with a dictionary lookup.
x_rows_by_node = kg_x_dis.groupby(['x_id', 'x_name']).groups
y_rows_by_node = kg_y_dis.groupby(['y_id', 'y_name']).groups

for idx, data in tqdm(grouped_diseases.iterrows(), total=grouped_diseases.shape[0]):
    node_key = (data.node_id, data.node_name)
    for side, rows_by_node in (('x', x_rows_by_node), ('y', y_rows_by_node)):
        rows = rows_by_node.get(node_key)
        if rows is None:
            # This disease never appears on this side of an edge.
            continue
        kg.loc[rows, side + '_id'] = data.get(id_col)
        kg.loc[rows, side + '_name'] = data.get(group_col)
        kg.loc[rows, side + '_source'] = 'MONDO_grouped'

  0%|          | 0/6392 [00:00<?, ?it/s]

In [6]:
# The reverse-edge step is commented out, so no "_rev" relations are added in
# this cell. The disabled snippet is kept here for reference:
#kg_rev = kg.copy().rename(columns={'x_id':'y_id','x_type':'y_type', 'x_name':'y_name', 'x_source':'y_source', 'y_id':'x_id','y_type':'x_type', 'y_name':'x_name', 'y_source':'x_source' }) #add reverse edges
#kg_rev['relation'] = kg_rev['relation'] + "_rev"
#print(kg_rev)

#kg = pd.concat([kg, kg_rev])
#kg = kg.drop_duplicates()

# In order: drop duplicated edges, drop rows with any missing value, then
# remove self loops (edges whose endpoints agree on id, type, source and name).
kg = (
    kg.drop_duplicates()
      .dropna()
      .query('not ((x_id == y_id) and (x_type == y_type) and (x_source == y_source) and (x_name == y_name))')
)

In [7]:
# Checkpoint the grouped, de-duplicated graph to disk (row index not written).
kg.to_csv(save_path+'auxillary/kg_grouped_orphanet.csv', index=False)

# Knowledge graph description

In [8]:
# Reload the checkpoint so the description section below can start from the
# saved file. NOTE(review): column dtypes are re-inferred by read_csv here, so
# they may differ from the in-memory frame that was written - confirm if
# id columns must stay strings.
kg = pd.read_csv(save_path+'auxillary/kg_grouped_orphanet.csv', low_memory=False)

In [9]:
# nodes file: stack the x-side and y-side endpoint descriptions, keep each
# distinct (id, type, name, source) once and number the survivors 0..n-1.
node_attrs = ['id', 'type', 'name', 'source']
endpoint_frames = []
for side in ('x', 'y'):
    side_cols = [side + '_' + attr for attr in node_attrs]
    to_node_cols = {side + '_' + attr: 'node_' + attr for attr in node_attrs}
    endpoint_frames.append(kg.get(side_cols).rename(columns=to_node_cols))
nodes = pd.concat(endpoint_frames)
nodes = nodes.drop_duplicates().reset_index().drop(columns='index')
nodes = nodes.reset_index().rename(columns={'index': 'node_index'})

# assign index: left-join the node table onto each side of every edge (the
# join keys are the four endpoint columns shared by both frames), then drop
# any row that still has a missing value.
for side in ('x', 'y'):
    to_side_cols = {'node_index': side + '_index'}
    to_side_cols.update({'node_' + attr: side + '_' + attr for attr in node_attrs})
    side_nodes = nodes.rename(columns=to_side_cols)
    kg = pd.merge(kg, side_nodes, 'left').dropna()

kg_columns = ['relation', 'display_relation',
              'x_index', 'x_id', 'x_type', 'x_name', 'x_source',
              'y_index', 'y_id', 'y_type', 'y_name', 'y_source']
kg = kg.get(kg_columns)

# edges file: relation labels plus the two endpoint indices only.
edges = kg.get(['relation', 'display_relation', 'x_index', 'y_index']).copy()

In [10]:
# Write the three final artifacts to save_path, none with a row-index column:
#   kg.csv    - every edge with full endpoint descriptions and indices
#   nodes.csv - the node table built above (node_index + node attributes)
#   edges.csv - relation labels with x_index / y_index only
kg.to_csv(save_path+'kg.csv', index=False)
nodes.to_csv(save_path+'nodes.csv', index=False)
edges.to_csv(save_path+'edges.csv', index=False)

In [11]:
def kg_describe(df, by, count_col):
    """Summarise how many rows of ``df`` fall into each value of ``by``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to summarise.
    by : str
        Column to group on.
    count_col : str
        Column whose non-null entries are counted within each group.

    Returns
    -------
    pandas.DataFrame
        Indexed by the values of ``by`` (largest count first) with an integer
        ``count`` column and a ``percent`` column rounded to one decimal,
        followed by a final row labelled ``'total'`` holding the column sums.
    """
    summary = (
        df.groupby(by)
          .count()
          .sort_values(by=count_col, ascending=False)
          .rename(columns={count_col: 'count'})
          .get(['count'])
    )
    total = summary['count'].sum()
    summary = summary.assign(percent=100 * summary['count'] / total)

    # DataFrame.append was removed in pandas 2.0; build the totals row as a
    # one-row frame and concatenate it instead. rename_axis keeps the index
    # name (the `by` label) that append used to preserve.
    total_row = (
        summary.sum(axis=0)
               .rename('total')
               .to_frame()
               .T
               .rename_axis(summary.index.name)
    )
    summary = pd.concat([summary, total_row])

    # The concatenation upcasts counts to float; restore integer counts.
    summary['count'] = summary['count'].astype('int')
    summary['percent'] = summary['percent'].round(1)
    return summary

In [12]:
# Number of nodes per node type and each type's share of all nodes.
kg_describe(nodes,'node_type','node_index')

Unnamed: 0_level_0,count,percent
node_type,Unnamed: 1_level_1,Unnamed: 2_level_1
biological_process,28642,22.2
gene/protein,27671,21.4
disease,16305,12.6
effect/phenotype,15874,12.3
anatomy,14035,10.9
molecular_function,11169,8.6
drug,7949,6.2
cellular_component,4176,3.2
pathway,2516,1.9
exposure,802,0.6


In [13]:
# Number of edges per relation and each relation's share of all edges.
kg_describe(edges,'relation','x_index')

Unnamed: 0_level_0,count,percent
relation,Unnamed: 1_level_1,Unnamed: 2_level_1
drug_drug,2672628,49.3
protein_present_anatomy,1518203,28.0
protein_protein,321075,5.9
disease_phenotype_positive,172469,3.2
protein_bioprocess,144805,2.7
protein_cellcomp,83402,1.5
drug_effect,79137,1.5
disease_protein,74752,1.4
protein_molfunc,69530,1.3
bioprocess_bioprocess,52886,1.0
