In [None]:
import glob
import os
import sqlite3
from collections import Counter
from multiprocessing import Pool

import pandas as pd
import requests
import tqdm
import yaml

In [None]:
! wget https://stringdb-static.org/download/protein.info.v11.5/9606.protein.info.v11.5.txt.gz
! gunzip -k 9606.protein.info.v11.5.txt.gz

output_folder = "../data/databases"
os.system("mkdir " + output_folder)
os.system("mv 9606.protein.info.v11.5.txt " + output_folder)
os.system("mv 9606.protein.info.v11.5.txt.gz " + output_folder)

In [None]:
# --- Settings -------------------------------------------------------------
# Worker count; not referenced by any cell visible in this notebook.
number_of_workers = 12

# SQLite database file and the table queried by get_protein_for_cids.
db_path = "../data/qm_db.sqlite"
stitch_table_name = "stitch_protchem_man"

# --- Shared handles -------------------------------------------------------
# One connection and one cursor, reused as globals by the query helper.
conn = sqlite3.connect(db_path)
c = conn.cursor()

# Every cid/cluster assignment file to process.
cid_cluster_files = glob.glob("../data/preprocessed_data/cid_cluster_*.txt")

# STRING protein function information for Homo sapiens (tab-separated).
df_string = pd.read_csv(
    "../data/databases/9606.protein.info.v11.5.txt",
    sep="\t",
)

In [None]:
def get_item_count_list(input_list):
    """Count how many times each distinct item occurs in ``input_list``.

    Parameters
    ----------
    input_list : iterable of hashable items
        Items to tally; may be empty.

    Returns
    -------
    dict
        Maps each distinct item to its number of occurrences. Keys appear in
        order of first occurrence, so the result is reproducible between runs.
    """
    # Counter tallies in a single pass; calling list.count() once per distinct
    # item rescans the whole list every time. Convert back to a plain dict so
    # callers keep receiving the same type as before.
    return dict(Counter(input_list))

def get_annotation(df,protein):
    """Return the rows of ``df`` describing ``protein``.

    The identifier column is looked up under either of two names,
    ``protein_external_id`` or ``#string_protein_id``; the first one present
    in ``df`` is used.

    Parameters
    ----------
    df : pandas.DataFrame
        Protein information table with one of the identifier columns above.
    protein : object
        Protein identifier without the taxon prefix; ``"9606."`` is prepended
        before matching.

    Returns
    -------
    pandas.DataFrame
        The matching rows (empty when the protein is not listed).

    Raises
    ------
    KeyError
        If ``df`` has neither identifier column.
    """
    string_id = "9606." + str(protein)
    # Pick the column explicitly instead of relying on a bare ``except`` to
    # detect the missing one, which would also hide unrelated errors.
    for id_column in ("protein_external_id", "#string_protein_id"):
        if id_column in df.columns:
            return df.loc[df[id_column] == string_id]
    raise KeyError("#string_protein_id")
        

def open_cid_cluster(input_file):
    """Read a cid/cluster assignment file and build both lookup directions.

    Each non-blank line holds whitespace-separated fields; the first is
    parsed as the integer cid and the second as the integer cluster id.

    Parameters
    ----------
    input_file : str
        Path of the text file to read.

    Returns
    -------
    tuple of (dict, dict)
        ``cid_cluster`` maps cid -> cluster (a repeated cid keeps its last
        assignment); ``cluster_cid`` maps cluster -> list of its cids.
    """
    cid_cluster = {}
    # The context manager closes the file even when a line fails to parse.
    with open(input_file,"r") as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no assignment.
                continue
            cid_cluster[int(fields[0])] = int(fields[1])

    # Invert the mapping: gather the cids belonging to each cluster.
    cluster_cid = {}
    for cid, cluster in cid_cluster.items():
        cluster_cid.setdefault(cluster, []).append(cid)
    return (cid_cluster,cluster_cid)

def get_protein_for_cids(cid_cluster):
    """Look up the proteins recorded for every cid in ``cid_cluster``.

    Runs one query per cid against the table named by the module-level
    ``stitch_table_name`` using the module-level cursor ``c``. Rows are kept
    when ``sc_all`` is at least the module-level ``chem_score``, ordered by
    ``sc_all`` then ``sc_exp`` (both descending), and limited to the
    module-level ``chem_max`` rows. Those globals are read at call time.

    Parameters
    ----------
    cid_cluster : dict
        Mapping whose keys are cids (converted with ``int``).

    Returns
    -------
    dict
        Maps each integer cid to the list of proteins returned for it
        (an empty list when the query yields no rows).
    """
    input_cids = list(map(int, cid_cluster.keys()))
    # A table name cannot be bound as a parameter, so only it is concatenated;
    # all values go through placeholders rather than into the SQL text.
    query = ("select distinct protein from " + stitch_table_name +
             " where cid = ? and sc_all >= ? order by sc_all desc, sc_exp desc limit ?")
    cid_proteins = {}
    for cid in tqdm.tqdm(input_cids,total=len(input_cids),leave=False):
        c.execute(query, (cid, chem_score, chem_max))
        cid_proteins[cid] = [row[0] for row in c.fetchall()]
    return (cid_proteins)

def _fetch_annotation_from_string_api(protein, timeout=30):
    """Query the STRING web API for one protein; return its annotation or "Not found"."""
    URL = "https://string-db.org/api/json/get_string_ids?identifiers=" + str(protein)
    # Without a timeout an unresponsive server would stall the run forever.
    r = requests.get(URL, timeout=timeout)
    result = r.json()
    if result != []:
        return result[0]['annotation']
    return "Not found"


def get_protein_function(cluster_cid,cid_cluster,cid_proteins,distance_threshold,lcutoff=False):
    """Summarise the proteins found in each cluster and write them to YAML.

    For every cluster, the proteins of all its cids are pooled and counted.
    Each distinct protein gets a record with its name, annotation, count and
    frequency (count divided by the pooled total for that cluster). The
    annotation comes from the module-level ``df_string`` table via
    ``get_annotation``; when the table has no entry, the STRING web API is
    queried and "Not found" is stored if the reply is empty. Records are
    sorted by descending frequency within each cluster.

    Parameters
    ----------
    cluster_cid : dict
        Maps cluster id -> list of cids in that cluster.
    cid_cluster : dict
        Accepted for call compatibility; not used by this function.
    cid_proteins : dict
        Maps cid -> list of proteins; must contain every cid in ``cluster_cid``.
    distance_threshold : object
        Inserted (via ``str``) into the output file name.
    lcutoff : bool, optional
        Selects the ``_lcutoff.yaml`` suffix when true, ``_qcutoff.yaml`` otherwise.

    Returns
    -------
    None
        The result is written to
        ``../data/preprocessed_data/protein_function_<threshold>_<suffix>``.

    Raises
    ------
    KeyError
        If a cid listed in ``cluster_cid`` is missing from ``cid_proteins``.
    """
    # Pool the proteins of every cid in a cluster; duplicates are kept on
    # purpose because they are what the frequency is computed from.
    cluster_proteins = {}
    for cluster, cid_in_cluster in cluster_cid.items():
        pooled = cluster_proteins.setdefault(cluster, [])
        for cid in cid_in_cluster:
            pooled.extend(cid_proteins[cid])

    # Get frequency of found protein
    cluster_proteins_frequency = {
        cluster: get_item_count_list(proteins)
        for cluster, proteins in cluster_proteins.items()
    }

    # Annotations fetched from the web API, remembered so a protein that
    # appears in several clusters triggers only one network request.
    api_annotations = {}

    final_results_sorted = {}
    for cluster in sorted(cluster_proteins_frequency):
        protein_counts = cluster_proteins_frequency[cluster]
        all_protein = list(protein_counts.keys())
        total_count_of_proteins = sum(protein_counts.values())

        cluster_results = []
        loop = tqdm.tqdm(all_protein,total=len(all_protein),leave=False)
        # The label is the same for the whole cluster, so set it once.
        loop.set_description("Cluster = " + str(cluster))
        for protein in loop:
            count = int(protein_counts[protein])

            annotation = get_annotation(df_string,protein)["annotation"].tolist()
            if annotation != []:
                function = annotation[0]
            else:
                # Not in the local table: fall back to the STRING web API.
                if protein not in api_annotations:
                    api_annotations[protein] = _fetch_annotation_from_string_api(protein)
                function = api_annotations[protein]

            cluster_results.append({
                "protein": protein,
                "function": function,
                "count": count,
                "frequency": float(count/total_count_of_proteins),
            })

        # Most frequent proteins first; sorted() is stable, so ties keep
        # their counting order.
        final_results_sorted[cluster] = sorted(cluster_results, key = lambda i: i['frequency'],reverse=True)

    # Writing final output; only the file-name suffix depends on the cutoff mode.
    suffix = '_lcutoff.yaml' if lcutoff else '_qcutoff.yaml'
    output_path = '../data/preprocessed_data/protein_function_' + str(distance_threshold) + suffix
    with open(output_path, 'w') as outfile:
        yaml.dump(final_results_sorted, outfile, default_flow_style=False)

In [None]:
# QuantMap parameters for the qcutoff protein function data.
# chem_score / chem_max are read as globals by get_protein_for_cids (minimum
# sc_all and row limit); the prot_* and ppi_max values are not referenced by
# the cells visible in this notebook.
lcutoff = False
chem_score, chem_max = 700, 10
prot_score, prot_max = 700, 150
ppi_max = 200
def run_main(files):
    """Run the full protein-function pipeline for one cid-cluster file.

    The distance threshold is taken from the file path: the text after the
    last underscore, minus its final four characters. The module-level
    ``lcutoff`` flag is read at call time and forwarded to
    ``get_protein_function``.

    Parameters
    ----------
    files : str
        Path to a single cid-cluster file.
    """
    tail = files.split("_")[-1]
    distance_threshold = tail[:-4]

    mappings = open_cid_cluster(files)
    cid_cluster = mappings[0]
    cluster_cid = mappings[1]

    get_protein_function(
        cluster_cid,
        cid_cluster,
        get_protein_for_cids(cid_cluster),
        distance_threshold,
        lcutoff,
    )
    
# Process every cid-cluster file, echoing its path before the run starts.
for files in cid_cluster_files:
    print(files)
    run_main(files)

In [None]:
# lcutoff protein function data: rebind the module-level parameters that
# run_main and get_protein_for_cids read at call time, then process every file.
lcutoff = True
chem_score, chem_max = 500, 10
prot_score, prot_max = 500, 150
ppi_max = 200

# Process every cid-cluster file, echoing its path before the run starts.
for files in cid_cluster_files:
    print(files)
    run_main(files)