In [26]:
import numpy as np
import pandas as pd
import csv
import time
import scipy.io
from matplotlib import pyplot as plt

### 1. Load the Data and create index dictionary

In [27]:
with open("./DiseaseName", "r") as f:
    DiseaseName = f.readlines()
    f.close()
DiseaseName = [x.strip() for x in DiseaseName]
DiseaseName = np.array(DiseaseName[1:])

with open("./Drug-Disease Interactions", "r") as f:
    Drug_Disease = f.readlines()
    f.close()
Drug_Disease = [x.strip() for x in Drug_Disease[1:]]
Drug_Disease = np.array([x.split('\t') for x in Drug_Disease])

with open("./DrugDomain1", "r") as f:
    DrugDomain1 = f.readlines()
    f.close()
DrugDomain1 = np.array([x.strip() for x in DrugDomain1[1:]])

with open("./disease_similarity", "r") as f:
    disease_similarity = f.readlines()
    f.close()
disease_similarity = [x.strip().split('\t') for x in disease_similarity]

In [28]:
Drug_Dict = dict(zip(DrugDomain1, range(0, len(DrugDomain1))))
Disease_Dict = dict(zip(DiseaseName, range(0, len(DiseaseName))))

### 2. Create Similarity Matrix

In [29]:
# Create Drug-Disease Similarity Matrix

Drug_Disease_Matrix = np.zeros((len(DrugDomain1), len(DiseaseName)))

# Check running time
start_time = time.time()

for disease in DiseaseName:
    drug_index = [Drug_Dict[drug_name] for drug_name in Drug_Disease[Drug_Disease[:, 1] == disease, 0]]
    disease_index = Disease_Dict[disease]
    
    Drug_Disease_Matrix[drug_index, disease_index] = 1

# Check Correctness of the sum of interaction
print("Shape of matrix: " , Drug_Disease_Matrix.shape)
print("Total number of drug-disease interaction in similarity matrix: " , int(Drug_Disease_Matrix.sum()))
print("Total number of drug-disease interaction in drug-disease interaction list: " , len(Drug_Disease))
print("--- %s seconds ---" % (time.time() - start_time))
scipy.io.savemat('./processeddata/DrugDisease', mdict={'DrugDisease': Drug_Disease_Matrix})

Shape of matrix:  (1321, 4627)
Total number of drug-disease interaction in similarity matrix:  111481
Total number of drug-disease interaction in drug-disease interaction list:  111481
--- 13.910539150238037 seconds ---


In [30]:
# Create Disease-Disease Similarity Matrix

DiseaseSimMat = np.zeros((len(DiseaseName), len(DiseaseName)))

# Check running time
start_time = time.time()

i = 0
for d1, d2, sim in disease_similarity:
    if (i%1000000 == 0):
        print(i, " / ", len(disease_similarity))
    i += 1
    try: 
        DiseaseSimMat[Disease_Dict[d1], Disease_Dict[d2]] = sim
        DiseaseSimMat[Disease_Dict[d2], Disease_Dict[d1]] = sim
    except:
        pass
print("--- %s seconds ---" % (time.time() - start_time))

# Find disease that has 0 similiarity score to all other diseases
NullList = [] 
for i, disease in enumerate(DiseaseSimMat):
    if (disease.sum() == 0):
        NullList.append(i)

# Remove diseases in the null list
DiseaseSimMat = np.delete(DiseaseSimMat, NullList, 1)
DiseaseSimMat = np.delete(DiseaseSimMat, NullList, 0)
scipy.io.savemat('./processeddata/DiseaseSimMat', mdict={'DiseaseSimMat': DiseaseSimMat})

0  /  28649246
1000000  /  28649246
2000000  /  28649246
3000000  /  28649246
4000000  /  28649246
5000000  /  28649246
6000000  /  28649246
7000000  /  28649246
8000000  /  28649246
9000000  /  28649246
10000000  /  28649246
11000000  /  28649246
12000000  /  28649246
13000000  /  28649246
14000000  /  28649246
15000000  /  28649246
16000000  /  28649246
17000000  /  28649246
18000000  /  28649246
19000000  /  28649246
20000000  /  28649246
21000000  /  28649246
22000000  /  28649246
23000000  /  28649246
24000000  /  28649246
25000000  /  28649246
26000000  /  28649246
27000000  /  28649246
28000000  /  28649246
--- 42.37141680717468 seconds ---


### 3. Save Dictionary Data to .CSV

In [31]:
def write_csv(Dict, Directory):
    w = csv.writer(open(Directory, "w"))
    # loop over dictionary keys and values
    
    for key, val in Drug_Dict.items():
    # write every key and value to file
        w.writerow([key, val])

In [33]:
# Remove diseases in drug name dictionary
DiseaseName = [DiseaseName[i] for i in range(0, len(DiseaseName)) if i not in NullList]

with open(r'./processeddata/DiseaseName', 'w') as fp:
    for disease in DiseaseName:
        # write each item on a new line
        fp.write("%s\n" % disease)
    print('Done')

Disease_Dict = dict(zip(DiseaseName, range(0, len(DiseaseName))))

write_csv(Drug_Dict, Directory = "./processeddata/Drug_Dict.csv")
write_csv(Drug_Dict, Directory = "./processeddata/Disease_Dict.csv")

Done
