## Get seed cids, ppi cids

In [10]:
import sqlite3
import tqdm

In [None]:
# Database information
# db_path is handed to sqlite3.connect; the two table names are spliced into
# the SQL statements of the following cells. All three are blank here and
# must be filled in before the notebook is run.
db_path = ""
stitch_table_name = ""
string_table_name = ""

# Quantmap parameters
# Minimum sc_all a chemical-protein row needs to be used as a seed.
chem_score = 700
# LIMIT applied to the seed-protein query of each CID.
chem_max    = 10
# LIMIT applied to the protein-protein interaction query of each CID.
prot_max    = 150
# Minimum sc_all a protein-protein row needs to be kept.
prot_score  = 700
# NOTE(review): not referenced by any visible cell — presumably consumed by
# the quantmap script itself; confirm.
ppi_max     = 200
# Number of processes used for the multiprocessing pools below.
number_of_workers = 8

## Data preprocessing

In [11]:
# One connection and one cursor are shared by the cells that follow
conn = sqlite3.connect(db_path)
c = conn.cursor()

# Get all cids from stitch
# Rows come back as 1-tuples; passing them through a set drops duplicates
# (the resulting order is arbitrary).
c.execute(f"select cid from {stitch_table_name};")
all_cids = list(set(c.fetchall()))
data = all_cids

('stitch_protchem_man',)
('string_protlink_man',)
('akshai_chem_hubs',)


In [15]:
# Get seed CIDs for quantmap
# A CID becomes a seed when at least one protein row of the stitch table
# links to it with sc_all >= chem_score.
# The table name cannot be bound as a parameter, so it is spliced in once;
# every value goes through a "?" placeholder instead of string concatenation.
seed_query = (
    "select distinct protein from " + stitch_table_name
    + " where cid = ? and sc_all >= ?"
    + " order by sc_all desc, sc_exp desc limit ?"
)

seed_cid = []
loop = tqdm.tqdm(all_cids, total=len(all_cids), leave=False)
for cid in loop:
    # all_cids holds 1-tuples as returned by fetchall()
    c.execute(seed_query, (cid[0], chem_score, chem_max))
    # Only the existence of a matching row matters here
    if c.fetchone() is not None:
        seed_cid.append(cid[0])

seed_cid = sorted(seed_cid)
# The context manager closes the file even if a write fails
with open("seed_cids.txt", "w") as outfile:
    outfile.writelines(str(entry) + "\n" for entry in seed_cid)

print (len(seed_cid))

                                                          

130348




In [17]:
# Get seed CIDs with defined PPI cutoff for quantmap
# Keeps the seed CIDs whose seed proteins have at least one protein-protein
# row with sc_all >= prot_score in the string table.
with open("seed_cids.txt", "r") as infile:
    seed_cid_file = list(map(int, infile))

# Table names cannot be bound as parameters, so they are spliced in;
# every value is passed through "?" placeholders.
seed_query = (
    "select distinct protein from " + stitch_table_name
    + " where cid = ? and sc_all >= ?"
    + " order by sc_all desc, sc_exp desc limit ?"
)

ppi_cid = []

loop = tqdm.tqdm(seed_cid_file, total=len(seed_cid_file), leave=False)

for cid in loop:
    c.execute(seed_query, (cid, chem_score, chem_max))
    seeds = [row[0] for row in c.fetchall()]
    if not seeds:
        # Without seed proteins the interaction query cannot match anything
        continue

    # One placeholder per seed protein; the list is needed twice in the query
    placeholders = ",".join("?" * len(seeds))
    ppi_query = (
        "select pro1,pro2,sc_all/1000.0 weight from " + string_table_name
        + " where pro1 in (" + placeholders + ")"
        + " and (pro1 < pro2 or pro2 not in (" + placeholders + "))"
        + " and sc_all >= ?"
        + " order by sc_all desc, sc_exp desc, pro1, pro2 limit ?"
    )

    c.execute(ppi_query, (*seeds, *seeds, prot_score, prot_max))

    # Only the existence of an interaction row matters here
    if c.fetchone() is not None:
        ppi_cid.append(cid)

ppi_cid = sorted(ppi_cid)
with open("seed_cids_with_ppi.txt", "w") as outfile:
    outfile.writelines(str(cid) + "\n" for cid in ppi_cid)

                                                         

In [18]:
# Summary of how many CIDs survive each filtering step
print(f"Number of unique CIDs = {len(all_cids)}")
print(f"Number of CIDs with seed protein = {len(seed_cid_file)}")
print(f"Number of CIDs with seed protein and have protein-protein network= {len(ppi_cid)}")

### Get SMILES for each CID using the API

In [None]:
# sys has to be imported before its path list can be extended; without this
# the cell fails with a NameError when it is run on its own.
import sys

sys.path.append('//') # Supp scripts path
import supp_utils as su

In [None]:
# Request SMILES for every CID that has a protein-protein network.
# NOTE(review): presumably returns a mapping of CID -> SMILES string (the
# fallback below indexes the result by CID and calls .update on it) —
# confirm against supp_utils.get_smiles_from_cid.
cid_smiles = su.get_smiles_from_cid(ppi_cid,type_smiles="isomeric",get_from="SDF",remove_sdf=True)
# If SMILES were not found for every CID, uncomment the fallback below: it
# collects the CIDs missing from cid_smiles, requests them again with
# get_from="canonical" and merges the result into cid_smiles.
# rest_cids = []
# for cid in ppi_cid:
#    try:
#        cid_smiles[cid]
#    except:
#        rest_cids.append(cid)
# rest_cid_smiles = su.get_smiles_from_cid(rest_cids,type_smiles="isomeric",get_from="canonical",save_output=False,remove_sdf=True)
# cid_smiles.update(rest_cid_smiles)

In [None]:
# Write one "<cid> <smiles>" line per compound.
# The SMILES is read from cid_smiles, the mapping that is being iterated;
# no other SMILES mapping is defined in this notebook.
with open("cid_smiles.txt","w") as f:
    for cid, smiles in cid_smiles.items():
        f.write(str(cid) + " " + smiles + "\n")

## Batched quantmap

In [20]:
import sys
from multiprocessing import Pool
import os
import glob
import time
import tqdm
import sqlite3
import json
#import yaml
from functools import partial
import numpy as np
import qmap_ppi_out as qmap
import subprocess

In [21]:
# PATH of the environment that provides quantmap; blank here and meant to
# be filled in before running.
path_cur = '' # environment path for quantmap
# Sanity check: ask `which` for the python executable found on that PATH.
subprocess.run(['which', 'python'], env={'PATH': path_cur})

CompletedProcess(args=['which', 'python'], returncode=0)

In [22]:
# Reload the CIDs that have a protein-protein network, in ascending order
with open("seed_cids_with_ppi.txt", "r") as infile:
    seed_cid_file = sorted(map(int, infile))
# exist_ok keeps a re-run of this cell from failing on existing folders
os.makedirs("cid_list_splits", exist_ok=True)
os.makedirs("ppi_results", exist_ok=True)

0

In [23]:
# Multiprocessing splits of quantmap runs
# The CIDs are written to cid_list_splits/split_<n>.txt in batches of at
# most batch_size lines, each line being "<cid>\tdummy_text".
batch_size = 1000
split_number = 0
for start in range(0, len(seed_cid_file), batch_size):
    batch = seed_cid_file[start:start + batch_size]
    with open("cid_list_splits/split_" + str(split_number) + ".txt", "w") as outfile:
        outfile.writelines(str(entry) + "\tdummy_text\n" for entry in batch)
    split_number += 1

# NOTE(review): the glob also returns split files left over from an earlier
# run with more batches — clear the folder first if that can happen.
all_cid_files = glob.glob("cid_list_splits/*")

# Read the CIDs back (first whitespace-separated field of every line)
cids_list = []
for filename in all_cid_files:
    with open(filename, "r") as infile:
        cids_list.extend(entry.split()[0] for entry in infile)

In [25]:
# Run quantmap
start_time = time.time()

def run_r_script(filename):
    """Run qmap_ppi_out.py on one CID split file inside the qmap_data conda env.

    Despite its name, this launches a Python script through bash.

    filename: path of a split file; the text between the last "_" and the
        file extension (e.g. "3" for "cid_list_splits/split_3.txt") is
        passed to the script as its second argument.
    """
    stem = filename.split(".")[-2]
    file_count = stem.split("_")[-1]
    code = f"source ~/.bashrc && conda activate qmap_data && python qmap_ppi_out.py {filename} {file_count}"
    subprocess.run(f"bash -c '{code}'", shell=True)

# One task per split file, spread over the worker processes
if __name__ == '__main__':
    with Pool(number_of_workers) as p:
        p.map(run_r_script, all_cid_files)

print (time.time() - start_time)

225.81683087348938


## Spearman's footrule

In [29]:
import glob
import time
import tqdm
import os
from multiprocessing import Pool
import pandas as pd

In [None]:
## Load generated data to dict
def csv_to_dict(input_file):
    """Read one result CSV into {column name: {first-column value: value}}.

    The first CSV column supplies the inner keys; every other column becomes
    one outer entry. Missing cells are treated as 0 and only cells whose
    value is greater than 0 are kept.
    """
    columns = pd.read_csv(input_file).fillna(0).to_dict(orient="list")
    enzyme_key, *cid_keys = columns
    enzyme_ids = columns[enzyme_key]
    return {
        cid: {
            enzyme: value
            for enzyme, value in zip(enzyme_ids, columns[cid])
            if float(value) > 0
        }
        for cid in cid_keys
    }

start_time = time.time()
# Result files are addressed by index: ppi_results/0.csv, ppi_results/1.csv, ...
result_count = len(glob.glob("ppi_results/*"))
ppi_files = [f"ppi_results/{index}.csv" for index in range(result_count)]
cids = []
#loop = tqdm.tqdm(ppi_files,total=len(ppi_files),leave=False)

if __name__ == '__main__':
    with Pool(20) as p:
        output_dicts = p.map(csv_to_dict, ppi_files)

# Merge the per-file dictionaries into one
data_dict = {}
for dicts in output_dicts:
    data_dict.update(dicts)

# Re-key by integer CID, in ascending CID order
sorted_dict_keys = sorted(int(key) for key in data_dict)
sorted_data_dict = {key: data_dict[str(key)] for key in sorted_dict_keys}
data_dict = sorted_data_dict

print (time.time() - start_time)

In [30]:
# chunk size for input for multiprocessing pool
chunk_size = 1000
calculated_cid = sorted(data_dict)
all_cid_list = calculated_cid
# Consecutive slices of at most chunk_size CIDs, one slice per pool task
chunked_cid_list = [
    calculated_cid[start:start + chunk_size]
    for start in range(0, len(calculated_cid), chunk_size)
]
current_list = []

In [32]:
# Receives two dicts, one per compound, mapping enzyme id to its ranking value
def spearman_footrule(cmpd1,cmpd2):
    """Return the footrule-style distance between two enzyme rankings.

    Enzymes present in both dicts contribute the absolute difference of
    their two values. Every enzyme present in only one dict contributes
    (shared + unmatched + 1), where `shared` is the number of enzymes found
    in both dicts and `unmatched` the number found in exactly one.
    """
    rank_gap_total = 0
    shared = 0
    for enzyme_id, rank_in_first in cmpd1.items():
        if enzyme_id not in cmpd2:
            continue
        rank_gap_total += abs(rank_in_first - cmpd2[enzyme_id])
        shared += 1

    unmatched = (len(cmpd1) - shared) + (len(cmpd2) - shared)
    # Unmatched enzymes are charged one position past the combined list
    penalty_rank = shared + 1 + unmatched
    return rank_gap_total + unmatched * penalty_rank


# get spearman value for batch of cid
def process_spearman_footrule_data(input_cid_list):
    """Compute the footrule values of one CID chunk against every CID.

    For each CID of input_cid_list a row is built with one value per entry
    of the global all_cid_list (0 where a CID meets itself, otherwise
    spearman_footrule of the two data_dict entries). The rows are written,
    one JSON object per line, to spearman_value_db/db_file_<n>.txt, where
    <n> is the position of the chunk's first CID in all_cid_list divided by
    chunk_size.

    Returns [minimum, maximum] over the computed footrule values; the zeros
    written for self-comparisons are not included.
    NOTE(review): minimum starts at 100000, so it stays at that sentinel if
    every value of the chunk is larger — confirm this cannot happen.
    """
    file_number = all_cid_list.index(input_cid_list[0])//chunk_size

    output_dict = {cid:[] for cid in input_cid_list}

    maximum = 0
    minimum = 100000
    for cid1 in input_cid_list:
        cmpd1 = data_dict[cid1]
        row = output_dict[cid1]
        for cid2 in all_cid_list:
            if cid1 == cid2:
                row.append(0)
                continue
            spearman_number = spearman_footrule(cmpd1,data_dict[cid2])
            row.append(spearman_number)
            if spearman_number < minimum:
                minimum = spearman_number
            if spearman_number > maximum:
                maximum = spearman_number

    # The context manager closes the file even if a write fails
    with open("spearman_value_db/db_file_" + str(file_number) + ".txt","w") as output_file:
        for entry, row in output_dict.items():
            if len(row) > 0:
                output_file.write('{"' + str(entry) + '":' + str(row) + '}\n')

    return ([minimum,maximum])

In [33]:
# exist_ok keeps a re-run of this cell from failing on an existing folder
os.makedirs("spearman_value_db", exist_ok=True)

start_time = time.time()
input_list = chunked_cid_list
# The with-block already shuts the pool down, so no explicit close follows
if __name__ == '__main__':
    with Pool(number_of_workers) as p:
        min_max_list = p.map(process_spearman_footrule_data, input_list)
print (time.time() - start_time)

# Get minimum and maximum spearman value for distance matrix creation
minimum_list = [entry[0] for entry in min_max_list]
maximum_list = [entry[1] for entry in min_max_list]
minimum = min(minimum_list)
maximum = max(maximum_list)
print (minimum,maximum)

38026.794129133224


### Calculating distance matrix

In [1]:
import glob
import json
import time
import tqdm
import os
from multiprocessing import Pool

In [7]:
def read_json(filename):
    """Parse a file holding one JSON object per line.

    Returns a list with one dict per line, with keys converted to int and
    the list values converted element-wise to float. The filename is
    printed before reading.
    """
    print (filename)
    with open(filename,"r") as handle:
        parsed_lines = [json.loads(line) for line in handle]
    return [
        {int(cid): [float(number) for number in row] for cid, row in record.items()}
        for record in parsed_lines
    ]

def calculate_distance_matrix(filename):
    """Rescale one footrule file and write it to distance_matrix/.

    Every row read from filename is converted to float32 and transformed
    with (value - minimum) / maximum, using the global minimum and maximum.
    The output keeps the input's file name and its one-JSON-object-per-line
    layout.

    NOTE(review): the divisor is maximum, not (maximum - minimum), and the
    zeros stored for self-comparisons become negative whenever minimum > 0
    — confirm both are intended.
    """
    dict_list = read_json(filename)
    with open("distance_matrix/" + os.path.basename(filename),"w") as out_file:
        for each_dict in dict_list:
            key, row = next(iter(each_dict.items()))
            scaled = (np.array(row).astype('float32') - minimum) / maximum
            # str() of each scalar gives plain digits on every NumPy version;
            # str() of a list of scalars yields "np.float32(...)" on NumPy 2,
            # which json.loads cannot read back.
            value = "[" + ", ".join(str(number) for number in scaled) + "]"
            out_file.write('{"' + str(int(key)) + '":' + value + '}\n')

In [8]:
# exist_ok keeps a re-run of this cell from failing on an existing folder
os.makedirs("distance_matrix", exist_ok=True)

# The footrule files written to spearman_value_db/ are the inputs here
spearman_files = glob.glob("spearman_value_db/*")

# The with-block already shuts the pool down, so no explicit close follows
with Pool(number_of_workers) as p:
    output = p.map(calculate_distance_matrix, spearman_files)

In [9]:
# File names are rebuilt from their index so the list follows chunk order
# (glob returns the files in arbitrary order).
dm_files = glob.glob("distance_matrix/*")
ordered_file_list = [
    "distance_matrix/db_file_" + str(index) + ".txt"
    for index in range(len(dm_files))
]

In [10]:
def get_cids_from_file(filename):
    """Return the first key of every JSON line in filename, as int, in file order."""
    return [int(list(record)[0]) for record in read_json(filename)]

In [1]:
# CIDs of all distance-matrix files, concatenated in file order
cid_list = [
    cid
    for files in ordered_file_list
    for cid in get_cids_from_file(files)
]

In [12]:
# CID order in distance matrix (for reference)
# The context manager closes the file even if a write fails
with open("cid_order_file.txt","w") as cid_order_out:
    cid_order_out.writelines(str(entry) + "\n" for entry in cid_list)

## Compress data (delete the distance matrix folder once compression has succeeded, or if you are running out of storage)

In [None]:
import  tarfile
from multiprocessing import Pool
import os
import glob
import time
import tqdm
import sqlite3
import json
from functools import partial
import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering

In [None]:
def compress_files(filename):
    """Pack one file into a gzip-compressed tar under <parent folder>_compressed/.

    filename: relative path of the form "<folder>/<name>". The archive is
        written to "<folder>_compressed/<name>.tar" and stores the file
        under the path given in filename. The archive is gzip-compressed
        although its name ends in ".tar".
    """
    path_parts = filename.split("/")
    target_dir = str(path_parts[-2]) + "_compressed"
    # exist_ok: every worker calls this for the same folder
    os.makedirs(target_dir, exist_ok=True)
    archive_path = target_dir + "/" + str(path_parts[-1]) + ".tar"
    with tarfile.open(archive_path, "w:gz") as archive:
        archive.add(filename)
    
# Compress every file found in distance_matrix/, one compress_files call
# per file, spread over number_of_workers processes.
dm_files = glob.glob("distance_matrix/*")
with Pool(number_of_workers) as p:
    p.map(compress_files, dm_files)

## Reading compressed files (Memory intensive process)

In [None]:
# Archive names are rebuilt from their index so the list follows chunk order
archive_count = len(glob.glob("distance_matrix_compressed/*"))
ordered_file_list = [
    "distance_matrix_compressed/db_file_" + str(i) + ".txt.tar"
    for i in range(archive_count)
]
# The context manager closes the file once the CIDs have been read
with open("cid_order_file.txt","r") as cid_order_in:
    cid_order = [int(line) for line in cid_order_in]

In [None]:
def read_from_tar(filename):
    """Return the lines of the single file stored in a compressed archive.

    filename: path of the form "<folder>_compressed/<name>.tar"; the member
        read from the archive is "<folder>/<name>".
    Returns the member's UTF-8 text split on newlines (a trailing newline
    therefore produces a final empty string).
    """
    print ("\n\n\nUnzipping file = " + str(filename))
    path_parts = filename.split("/")
    # Strip the "_compressed" folder suffix and the ".tar" file suffix
    inside_filename = path_parts[-2][:-len("_compressed")] + "/" + path_parts[-1][:-len(".tar")]
    # The context manager releases the archive handle after reading
    with tarfile.open(filename) as tar:
        member = tar.getmember(inside_filename)
        content = tar.extractfile(member).read()
    str_content_split = content.decode("utf-8").split("\n")
    print ("Unzipped file = " + str(filename))
    return (str_content_split)

def read_json_from_tar(filename):
    """Parse the JSON lines stored in a compressed archive.

    Empty lines are skipped. Returns one dict per line, with keys converted
    to int and the list values converted element-wise to float.
    """
    parsed_lines = [
        json.loads(line)
        for line in read_from_tar(filename)
        if len(line) > 0
    ]
    return [
        {int(cid): [float(number) for number in row] for cid, row in record.items()}
        for record in parsed_lines
    ]

def get_distance_matrix(filename):
    """Load the rows of one compressed distance-matrix file.

    Returns (file_number, rows): file_number is the text between the last
    "_" and the first "." of the archive's file name, rows is a numpy array
    built from the first value of every JSON line.
    """
    rows = [list(record.values())[0] for record in read_json_from_tar(filename)]
    archive_name = filename.split("/")[-1]
    file_number = archive_name.split("_")[-1].split(".")[0]
    return (file_number,np.array(rows))

print ("Loaded functions")

In [None]:
# Archive names are rebuilt from their index so the list follows chunk order
archive_count = len(glob.glob("distance_matrix_compressed/*"))
ordered_file_list = [
    "distance_matrix_compressed/db_file_" + str(i) + ".txt.tar"
    for i in range(archive_count)
]
with open("cid_order_file.txt","r") as cid_order_in:
    cid_order = [int(line) for line in cid_order_in]

print ("Filenames created")

print ("Loading distance matrix")
if __name__ == '__main__':
    with Pool(number_of_workers) as p:
        output = p.map(get_distance_matrix, ordered_file_list)

# Stack the row blocks in file-number order with a single concatenate call;
# growing the matrix block by block would copy it again for every file.
blocks_by_number = {int(file_number): rows for file_number, rows in output}
if output:
    dm = np.concatenate([blocks_by_number[i] for i in range(len(output))], axis=0)
del blocks_by_number

print ("Loaded distance matrix")
del output
print ("Deleted distance matrix copy (SAVED MEMORY)")

## Agglomerative (hierarchical) clustering

In [None]:
def do_clustering(distance_threshold):
    """Cluster the global distance matrix dm and save the results.

    Runs average-linkage agglomerative clustering on dm as a precomputed
    distance matrix, cutting the tree at distance_threshold. Two files are
    written:
      clustering_details_<threshold>.csv - one line per merge with
          node_id,left,right,distance; node ids start at dm.shape[0].
      cid_cluster_<threshold>.txt - "<cid> <cluster label>" for every entry
          of the global cid_order.
    """
    print ("\n\n\nClustering started for " + str(distance_threshold) )
    settings = dict(n_clusters=None, linkage='average', distance_threshold=distance_threshold)
    try:
        # scikit-learn >= 1.2 names the parameter "metric"
        cluster = AgglomerativeClustering(metric='precomputed', **settings)
    except TypeError:
        # Older scikit-learn releases only know "affinity"
        cluster = AgglomerativeClustering(affinity='precomputed', **settings)
    clusters_list = cluster.fit_predict(dm)

    with open("clustering_details_" + str(distance_threshold) + ".csv","w") as of:
        of.write("node_id,left,right,distance\n")
        first_node_id = dm.shape[0]
        merges = zip(cluster.children_, cluster.distances_)
        for offset, (children, distance) in enumerate(merges):
            node_id = first_node_id + offset
            of.write(str(node_id) + "," + str(children[0]) + "," + str(children[1]) + "," + str(distance) + "\n")

    print ("Clustering finished for " + str(distance_threshold) )
    with open("cid_cluster_" + str(distance_threshold) + ".txt","w") as of:
        for i,cid in enumerate(cid_order):
            of.write(str(cid) + " " + str(clusters_list[i]) + "\n")
    print ("Saved CID-cluster data for " + str(distance_threshold) )

In [None]:
# Cluster once per distance threshold, from the tightest cut to the loosest
for threshold in (0.001, 0.005, 0.01, 0.05, 0.1, 1):
    do_clustering(threshold)