In [4]:
import pandas as pd

####################
# Load the data
####################
# Pin the ID columns to str: they hold a mix of numeric-looking IDs and
# alphanumeric ones (e.g. "DB05271"), and the later steps treat them as
# strings (suffix stripping, dict lookup). Without an explicit dtype pandas
# infers the type per chunk, which can yield mixed int/float/str values.
SPLIT_ID_DTYPES = {"x_id": str, "y_id": str}

train_df = pd.read_csv("complex_disease_train.csv", dtype=SPLIT_ID_DTYPES)
valid_df = pd.read_csv("complex_disease_valid.csv", dtype=SPLIT_ID_DTYPES)
test_df = pd.read_csv("complex_disease_test.csv", dtype=SPLIT_ID_DTYPES)

####################
# Node information
####################
# node_id is read as str so that the lookup keys built below always have the
# same type as the string IDs used to query them.
nodes_df = pd.read_csv("data/node.csv", delimiter="\t", dtype={"node_id": str})
nodes_df = nodes_df.set_index("node_index")
print("--- Nodes")
display(nodes_df.head())
display(nodes_df.shape)

# Lookup table: (node_id, node_type) -> node_name.
# The key is the pair, not the node index; if the same pair occurs more than
# once in the table, the last occurrence wins (plain dict semantics).
node_id2name = nodes_df.set_index(["node_id", "node_type"])["node_name"].to_dict()

####################
# Convert node ID to node name
####################
# Only rows whose relation is listed here are kept; the "rev_" entries are
# the reversed-direction counterparts of the first three.
relations = [
    "contraindication",
    "indication",
    "off-label use",
    "rev_contraindication",
    "rev_indication",
    "rev_off-label use",
]

def validate_id(s):
    """Return a node ID as a string with a trailing ".0" removed.

    Args:
        s: The raw ID. Usually a string, but ints/floats are accepted too
            (they are converted with ``str()`` first), so values coming from
            a column with mixed dtypes do not raise ``AttributeError``.

    Returns:
        str: ``str(s)`` without a final ".0" if it had one; otherwise
        ``str(s)`` unchanged.
    """
    # Coerce before calling str methods: 5044.0 -> "5044.0", 5044 -> "5044".
    s = str(s)
    if s.endswith(".0"):
        s = s[:-2]
    return s

def add_node_names(split_df):
    """Filter a split to the selected relations and attach node names.

    Args:
        split_df: DataFrame with at least the columns ``relation``, ``x_id``,
            ``x_type``, ``y_id`` and ``y_type``.

    Returns:
        A new DataFrame (fresh 0..n-1 index) containing only rows whose
        ``relation`` is in ``relations``, with ``x_id``/``y_id`` passed
        through ``validate_id`` and with ``x_name``/``y_name`` columns added.

    Raises:
        KeyError: if an (id, type) pair is not present in ``node_id2name``.
    """
    named_df = split_df[split_df["relation"].isin(relations)].reset_index(drop=True)

    # Normalise both ID columns first so the column order matches
    # x_id, y_id, ..., x_name, y_name.
    for side in ("x", "y"):
        named_df[f"{side}_id"] = named_df[f"{side}_id"].apply(validate_id)

    # Direct dict indexing over zipped columns: avoids the slow row-wise
    # DataFrame.apply(axis=1) while still failing loudly on unknown keys.
    for side in ("x", "y"):
        named_df[f"{side}_name"] = [
            node_id2name[key]
            for key in zip(named_df[f"{side}_id"], named_df[f"{side}_type"])
        ]
    return named_df


train_df = add_node_names(train_df)
valid_df = add_node_names(valid_df)
test_df = add_node_names(test_df)

for split_label, split_df in (("Train", train_df), ("Valid", valid_df), ("Test", test_df)):
    print(f"--- {split_label} data")
    display(split_df.head())
    display(split_df.shape)

  train_df = pd.read_csv("complex_disease_train.csv")
  valid_df = pd.read_csv("complex_disease_valid.csv")
  test_df = pd.read_csv("complex_disease_test.csv")


--- Nodes


Unnamed: 0_level_0,node_id,node_type,node_name,node_source
node_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,9796,gene/protein,PHYHIP,NCBI
1,7918,gene/protein,GPANK1,NCBI
2,8233,gene/protein,ZRSR2,NCBI
3,4899,gene/protein,NRF1,NCBI
4,5297,gene/protein,PI4KA,NCBI


(129375, 4)

--- Train data


Unnamed: 0,x_type,x_id,relation,y_type,y_id,x_idx,y_idx,x_name,y_name
0,drug,DB05271,contraindication,disease,5044,3448.0,12675.0,Rotigotine,hypertensive disorder
1,drug,DB00492,indication,disease,5044,478.0,12675.0,Fosinopril,hypertensive disorder
2,drug,DB13956,contraindication,disease,5044,7534.0,12675.0,Estradiol valerate,hypertensive disorder
3,drug,DB01438,contraindication,disease,5044,1370.0,12675.0,Phenazopyridine,hypertensive disorder
4,drug,DB09203,contraindication,disease,5044,5854.0,12675.0,Synephrine,hypertensive disorder


(73060, 9)

--- Valid data


Unnamed: 0,x_type,x_id,relation,y_type,y_id,x_idx,y_idx,x_name,y_name
0,drug,DB05271,contraindication,disease,1200_1134_15512_5080_100078,3448.0,1569.0,Rotigotine,hypertension
1,drug,DB00492,indication,disease,1200_1134_15512_5080_100078,478.0,1569.0,Fosinopril,hypertension
2,drug,DB13956,contraindication,disease,1200_1134_15512_5080_100078,7534.0,1569.0,Estradiol valerate,hypertension
3,drug,DB01438,contraindication,disease,1200_1134_15512_5080_100078,1370.0,1569.0,Phenazopyridine,hypertension
4,drug,DB09203,contraindication,disease,1200_1134_15512_5080_100078,5854.0,1569.0,Synephrine,hypertension


(7522, 9)

--- Test data


Unnamed: 0,x_type,x_id,relation,y_type,y_id,x_idx,y_idx,x_name,y_name
0,drug,DB05271,contraindication,disease,20751_5469_15914_21272,3448.0,7701.0,Rotigotine,orthostatic hypotension
1,drug,DB00268,contraindication,disease,20751_5469_15914_21272,256.0,7701.0,Ropinirole,orthostatic hypotension
2,drug,DB00370,contraindication,disease,20751_5469_15914_21272,357.0,7701.0,Mirtazapine,orthostatic hypotension
3,drug,DB12710,contraindication,disease,20751_5469_15914_21272,6841.0,7701.0,Perazine,orthostatic hypotension
4,drug,DB01246,contraindication,disease,20751_5469_15914_21272,1222.0,7701.0,Alimemazine,orthostatic hypotension


(4680, 9)

In [3]:
# Persist each name-annotated split next to its source file (no index column).
named_split_files = {
    "complex_disease_train_w_name.csv": train_df,
    "complex_disease_valid_w_name.csv": valid_df,
    "complex_disease_test_w_name.csv": test_df,
}
for out_path, named_split in named_split_files.items():
    named_split.to_csv(out_path, index=False)

In [5]:
def split_entity_names(split_df):
    """Collect the chemical and disease names that occur in one split.

    Rows whose relation starts with "rev_" are treated as reversed: their
    ``y_name`` is counted as the chemical and their ``x_name`` as the disease.
    For all other rows ``x_name`` is the chemical and ``y_name`` the disease.

    Args:
        split_df: DataFrame with ``relation``, ``x_name`` and ``y_name`` columns.

    Returns:
        tuple[set, set]: ``(chemicals, diseases)``.
    """
    # Build the mask once, vectorised; na=False keeps it strictly boolean.
    is_reversed = split_df["relation"].str.startswith("rev_", na=False)
    forward_rows = split_df[~is_reversed]
    reversed_rows = split_df[is_reversed]

    chemicals = set(forward_rows["x_name"]) | set(reversed_rows["y_name"])
    diseases = set(forward_rows["y_name"]) | set(reversed_rows["x_name"])
    return chemicals, diseases


train_chemicals, train_diseases = split_entity_names(train_df)
valid_chemicals, valid_diseases = split_entity_names(valid_df)
test_chemicals, test_diseases = split_entity_names(test_df)

In [6]:
# Union of the per-split name sets, then one summary line per entity kind.
all_chemicals = set().union(train_chemicals, valid_chemicals, test_chemicals)
all_diseases = set().union(train_diseases, valid_diseases, test_diseases)

print(
    f"Number of unique chemicals: {len(all_chemicals)} "
    f"(Train | Valid | Test: {len(train_chemicals)} | {len(valid_chemicals)} | {len(test_chemicals)})"
)
print(
    f"Number of unique diseases: {len(all_diseases)} "
    f"(Train | Valid | Test: {len(train_diseases)} | {len(valid_diseases)} | {len(test_diseases)})"
)

Number of unique chemicals: 2074 (Train | Valid | Test: 2033 | 1251 | 983)
Number of unique diseases: 2054 (Train | Valid | Test: 1707 | 244 | 103)


In [6]:
# Write one disease name per line.
# - sorted(): iterating a set of strings gives a different order from run to
#   run, so sorting keeps the file stable and diff-friendly. key=str guards
#   against a stray non-string entry making the sort raise.
# - encoding="utf-8": names may contain non-ASCII characters and the default
#   encoding is platform-dependent.
with open("all_diseases.txt", "w", encoding="utf-8") as f:
    for disease in sorted(all_diseases, key=str):
        f.write(f"{disease}\n")

In [9]:
# Stack the three splits and keep one row per (x_name, y_name) combination.
df = (
    pd.concat([train_df, valid_df, test_df], axis=0)
    .reset_index(drop=True)
    .drop_duplicates(subset=["x_name", "y_name"])
)

# Order-independent pair key, so a pair and its reversed ("rev_") row
# collapse into a single interaction.
all_interactions = {
    "--".join(sorted((x_name, y_name)))
    for x_name, y_name in zip(df["x_name"], df["y_name"])
}

# Share of all possible chemical-disease pairs that are actually observed.
n_possible_pairs = len(all_chemicals) * len(all_diseases)
density = len(all_interactions) / n_possible_pairs

print(f"All interactions: {len(all_interactions)}, Density: {density:.4f}")

All interactions: 42383, Density: 0.0099
