In [118]:
import sqlite3
import tqdm
import pandas as pd
import yaml
import requests

In [2]:
# --- Data sources ------------------------------------------------------------
# Path passed to sqlite3.connect(); empty placeholder here — presumably set to
# the real database file before running (TODO confirm).
db_path = ""
# Table that the chemical -> protein query selects from.
stitch_table_name = "stitch_protchem_man"
# Value embedded in the "cid_cluster_<...>.txt" and "protein_function_<...>.yaml" file names.
distance_threshold = 0.005

# --- Query limits ------------------------------------------------------------
# Minimum sc_all a chemical-protein row must have to be selected.
chem_score = 700
# Maximum number of proteins fetched per cid (SQL LIMIT).
chem_max = 10

# The three settings below are not referenced by the cells visible in this notebook.
prot_score = 700
prot_max = 150
ppi_max = 200

# String protein function information for homo sapiens
df_string = pd.read_csv("9606.protein.info.v11.0.txt", sep="\t")

('stitch_protchem_man',)
('string_protlink_man',)
('akshai_chem_hubs',)


In [None]:
def get_item_count_list(input_list):
    """Count how many times each distinct item occurs in ``input_list``.

    Parameters
    ----------
    input_list : iterable of hashable items
        The items to tally. Any iterable works (list, tuple, generator, ...).

    Returns
    -------
    dict
        Maps each distinct item to its number of occurrences. Keys appear in
        the order the items were first seen, so the result is deterministic.
    """
    count_dict = {}
    # Single pass: O(n) instead of calling list.count() once per distinct item.
    for entry in input_list:
        count_dict[entry] = count_dict.get(entry, 0) + 1
    return count_dict

In [None]:
def get_annotation(df,protein):
    """Select the rows of ``df`` that belong to one protein.

    The lookup key is the string "9606." followed by ``str(protein)``; rows are
    kept when their 'protein_external_id' column equals that key.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'protein_external_id' column.
    protein : object
        Protein identifier without the "9606." prefix.

    Returns
    -------
    pandas.DataFrame
        The matching rows (empty when nothing matches).
    """
    external_id = "9606." + str(protein)
    is_match = df['protein_external_id'] == external_id
    return df.loc[is_match]

In [3]:
# cid-cluster information for the cids in the file
conn = sqlite3.connect(db_path)
c = conn.cursor()

# Parse the cluster file into {cid: cluster}. The first two whitespace-separated
# fields of every line are read as integers (cid, then cluster id).
# The `with` block guarantees the file handle is closed; blank lines are skipped
# instead of raising IndexError.
cid_cluster = {}
with open("cid_cluster_" + str(distance_threshold) + ".txt", "r") as cluster_file:
    for line in cluster_file:
        fields = line.split()
        if not fields:
            continue
        cid_cluster[int(fields[0])] = int(fields[1])

# Reverse mapping {cluster: [cid, ...]}, preserving the order cids were read in.
cluster_cid = {}
for cid, cluster in cid_cluster.items():
    cluster_cid.setdefault(cluster, []).append(cid)

In [40]:
# Get the top-scoring proteins linked to each cid
input_cids = list(map(int, cid_cluster.keys()))

# Not used by the cells visible in this notebook; kept in case other cells rely on it.
ppi_cid = []

# Build the query text once. A table name cannot be a bound parameter, so it is
# still concatenated; every value is passed as a `?` placeholder instead of
# being spliced into the SQL string.
seed_query = (
    "select distinct protein from " + stitch_table_name
    + " where cid = ? and sc_all >= ?"
    + " order by sc_all desc, sc_exp desc limit ?"
)

loop = tqdm.tqdm(input_cids, total=len(input_cids), leave=False)

# {cid: [protein, ...]} — at most `chem_max` proteins with sc_all >= `chem_score`.
cid_proteins = {}
for cid in loop:
    c.execute(seed_query, (cid, chem_score, chem_max))
    cid_proteins[cid] = [row[0] for row in c.fetchall()]

                                                        

In [51]:
# Pool, per cluster, the proteins of every cid that belongs to it
# (duplicates are kept so that they can be counted below).
cluster_proteins = {}
for cluster, member_cids in cluster_cid.items():
    pooled = cluster_proteins.setdefault(cluster, [])
    for cid in member_cids:
        pooled.extend(cid_proteins[cid])

# Per cluster: {protein: number of times it was pooled}
cluster_proteins_frequency = {}
for cluster, pooled in cluster_proteins.items():
    cluster_proteins_frequency[cluster] = get_item_count_list(pooled)

In [None]:
# Collect, per cluster, one record per protein: its function text, how often it
# was seen in the cluster, and that count as a fraction of the cluster total.
# The function text comes from `df_string`; if the protein is not listed there,
# the STRING web API is queried instead.
STRING_API_URL = "https://string-db.org/api/json/get_string_ids?identifiers="
# Seconds to wait for the API; without a timeout a stalled connection would
# block the notebook indefinitely.
STRING_API_TIMEOUT = 30

final_results = {}

for cluster in sorted(cluster_proteins_frequency):
    protein_counts = cluster_proteins_frequency[cluster]
    total_count_of_proteins = sum(protein_counts.values())
    final_results[cluster] = []

    all_protein = list(protein_counts)
    loop = tqdm.tqdm(all_protein, total=len(all_protein), leave=False)
    # The label is the same for every protein of the cluster, so set it once.
    loop.set_description("Cluster = " + str(cluster))

    for protein in loop:
        count = int(protein_counts[protein])

        annotation = get_annotation(df_string, protein)["annotation"].tolist()
        if annotation:
            function = annotation[0]
        else:
            r = requests.get(STRING_API_URL + str(protein), timeout=STRING_API_TIMEOUT)
            result = r.json()
            # Only trust the payload when it is a non-empty list whose first
            # item actually carries an 'annotation'; anything else (empty list,
            # error object, ...) is reported as "Not found" instead of raising.
            first_hit = result[0] if isinstance(result, list) and result else None
            if isinstance(first_hit, dict) and "annotation" in first_hit:
                function = first_hit["annotation"]
            else:
                function = "Not found"

        final_results[cluster].append({
            "protein": protein,
            "function": function,
            "count": count,
            "frequency": count / total_count_of_proteins,
        })

In [124]:
# Within every cluster, order the protein records from most to least frequent
final_results_sorted = {
    cluster: sorted(records, key=lambda record: record['frequency'], reverse=True)
    for cluster, records in final_results.items()
}

In [125]:
# Dump the sorted results to "protein_function_<distance_threshold>.yaml" (block style)
output_path = 'protein_function_' + str(distance_threshold) + '.yaml'
with open(output_path, 'w') as outfile:
    yaml.dump(final_results_sorted, outfile, default_flow_style=False)

In [129]:
# Load the results back from "protein_function_<distance_threshold>.yaml";
# this replaces the in-memory `final_results_sorted` with the file contents.
results_path = "protein_function_" + str(distance_threshold) + ".yaml"
with open(results_path, 'r') as stream:
    final_results_sorted = yaml.safe_load(stream)