In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sys
import re
sys.path.append('../')
from common import *
from tqdm import tqdm
import multiprocessing
from functools import partial
tqdm.pandas()
import numpy as np

In [2]:
### input
# Every path below is derived from `cut_off` and `num`, so changing either
# setting keeps the input and output locations consistent with each other.
cut_off = 0.3
num = 'all'
cut_off_path = f'../../../Results/lipid/top{num}_{cut_off}_re/'
cut_off_remain_path = f'../../../Results/lipid/top{num}_{cut_off}_remain_re/'
rxndb_to_model_path = cut_off_path + f'rxndb_to_model_top{num}_{cut_off}.csv'
rxndb_to_model_add_no_ec_path = cut_off_remain_path + f'rxndb_to_model_top{num}_{cut_off}_remain.csv'
ymdb_output_final = '../../../Data/ymdb/ymdb_final_result.xlsx'
yeast_model_path = '../../../Data/model/yeast-GEM-final.csv'
uptake_met_path = '../../../Data/ymdb/ymdb_uptake.csv'
rxndb_met_max_score_file = cut_off_path + f'rxndb_met_max_score_pd_top{num}_{cut_off}.csv'
# NOTE: the name is misspelled ("remian"); it is kept because a later cell reads it.
lipid_YMDB_success_met_smile_file_remian = cut_off_remain_path + f'YMDB_success_met_smile_top{num}_{cut_off}_remain.pickle'
lipid_YMDB_success_met_smile_file = cut_off_path + f'YMDB_success_met_smile_top{num}_{cut_off}.pickle'
### output
rxndb_met_max_score_output_file = cut_off_path + f'rxndb_met_max_score_pd_top{num}_{cut_off}_merge.csv'
rxndb_to_model_with_target_path = cut_off_path + f'rxndb_to_model_with_target_top{num}_{cut_off}.csv'
rxndb_to_model_with_target_success_path = cut_off_path + f'rxndb_to_model_with_target_top{num}_{cut_off}_success.csv'

In [3]:
def get_score_from_smiles(input_smiles, smile_max_score_pd):
    """Return the highest score stored for a molecule, or 0 if it is unknown.

    ``input_smiles`` is converted with ``smiles2inchikey0`` and compared
    against the ``'inchikey0'`` column of ``smile_max_score_pd``.

    Args:
        input_smiles: SMILES string to look up.
        smile_max_score_pd: DataFrame with at least ``'inchikey0'`` and
            ``'score'`` columns.

    Returns:
        The maximum ``'score'`` among the matching rows, or ``0`` when no row
        matches.
    """
    query_key = smiles2inchikey0(input_smiles)
    matches = smile_max_score_pd[smile_max_score_pd['inchikey0'] == query_key]
    if matches.empty:
        return 0
    return matches['score'].max()
def process_reaction_product(index_row, smile_max_score_pd):
    """Check one reaction from the product side.

    The ``'rxn_smiles_basic'`` field is split at ``'>>'`` into a reactant
    side and a product side, and each side is split at ``'.'``. If every
    product has a score equal to 1 in ``smile_max_score_pd``, the reactant
    SMILES are returned together with the row's ``'NO'`` value.

    Args:
        index_row: ``(index, row)`` pair as produced by ``DataFrame.iterrows``.
        smile_max_score_pd: score table passed through to
            ``get_score_from_smiles``.

    Returns:
        ``(reactant_smiles, row['NO'])`` when all products score 1, otherwise
        ``([], None)``.
    """
    _, reaction = index_row
    sides = reaction['rxn_smiles_basic'].split('>>')
    reactant_smiles = sides[0].split('.')
    product_smiles = sides[1].split('.')

    # Score every product before testing, so a failing lookup always surfaces.
    product_scores = [get_score_from_smiles(smi, smile_max_score_pd) for smi in product_smiles]
    every_product_known = all(score == 1 for score in product_scores)

    # str.split always yields at least one element, so the length test below
    # never fails; it is kept to preserve the original condition.
    if not (every_product_known and len(reactant_smiles) > 0):
        return [], None
    return reactant_smiles, reaction['NO']
def process_reaction_reactant(index_row, smile_max_score_pd):
    """Check one reaction from the reactant side.

    The ``'rxn_smiles_basic'`` field is split at ``'>>'`` into a reactant
    side and a product side, and each side is split at ``'.'``. If every
    reactant has a score equal to 1 in ``smile_max_score_pd``, the product
    SMILES are returned together with the row's ``'NO'`` value.

    Args:
        index_row: ``(index, row)`` pair as produced by ``DataFrame.iterrows``.
        smile_max_score_pd: score table passed through to
            ``get_score_from_smiles``.

    Returns:
        ``(product_smiles, row['NO'])`` when all reactants score 1, otherwise
        ``([], None)``.
    """
    _, reaction = index_row
    sides = reaction['rxn_smiles_basic'].split('>>')
    reactant_smiles = sides[0].split('.')
    product_smiles = sides[1].split('.')

    # Score every reactant before testing, so a failing lookup always surfaces.
    reactant_scores = [get_score_from_smiles(smi, smile_max_score_pd) for smi in reactant_smiles]
    every_reactant_known = all(score == 1 for score in reactant_scores)

    # str.split always yields at least one element, so the length test below
    # never fails; it is kept to preserve the original condition.
    if not (every_reactant_known and len(product_smiles) > 0):
        return [], None
    return product_smiles, reaction['NO']
def process_chunk_reactant(chunk_df, smile_max_score_pd):
    """Run ``process_reaction_reactant`` over every row of a chunk.

    Args:
        chunk_df: DataFrame slice whose rows are iterated with ``iterrows``.
        smile_max_score_pd: score table forwarded to the per-reaction check.

    Returns:
        ``(smiles_success, success_rxndbid)``: the concatenated non-empty
        SMILES lists and the non-``None`` reaction identifiers, in row order.
    """
    smiles_success, success_rxndbid = [], []
    for index_row in chunk_df.iterrows():
        found_smiles, rxn_id = process_reaction_reactant(
            index_row, smile_max_score_pd=smile_max_score_pd
        )
        if found_smiles:  # skip reactions that yielded nothing
            smiles_success += found_smiles
        if rxn_id is not None:
            success_rxndbid.append(rxn_id)
    return smiles_success, success_rxndbid
def process_chunk_product(chunk_df, smile_max_score_pd):
    """Run ``process_reaction_product`` over every row of a chunk.

    Args:
        chunk_df: DataFrame slice whose rows are iterated with ``iterrows``.
        smile_max_score_pd: score table forwarded to the per-reaction check.

    Returns:
        ``(smiles_success, success_rxndbid)``: the concatenated non-empty
        SMILES lists and the non-``None`` reaction identifiers, in row order.
    """
    smiles_success, success_rxndbid = [], []
    for index_row in chunk_df.iterrows():
        found_smiles, rxn_id = process_reaction_product(
            index_row, smile_max_score_pd=smile_max_score_pd
        )
        if found_smiles:  # skip reactions that yielded nothing
            smiles_success += found_smiles
        if rxn_id is not None:
            success_rxndbid.append(rxn_id)
    return smiles_success, success_rxndbid
def process_reactions_in_parallel_reactant(rxndb,origin_smile_max_score_pd, num_processes=5, num_iterations=1):
    """Repeatedly scan the reactions in parallel and grow the score table.

    ``rxndb`` is split into ``num_processes`` row chunks. In each iteration
    the chunks are run through ``process_chunk_reactant`` and then
    ``process_chunk_product`` in a worker pool. The SMILES reported by the
    workers are de-duplicated by the key ``smiles2inchikey0`` returns for
    them. Every SMILES whose key does not yet have a score >= 1 in the table
    is appended with ``score=1`` and ``sim_smile='sys'``, so the next
    iteration treats it as available.

    Args:
        rxndb: DataFrame of reactions; the chunk workers read its rows.
        origin_smile_max_score_pd: starting score table with ``'smile'``,
            ``'score'``, ``'sim_smile'`` and ``'inchikey0'`` columns. It is
            not modified; an extended copy is returned.
        num_processes: size of the worker pool and number of row chunks.
        num_iterations: how many scan-and-extend rounds to run.

    Returns:
        ``(smiles_success, success_rxndbid, score_table)`` from the last
        iteration: the successful SMILES (one per distinct key), the distinct
        identifiers of the reactions that succeeded, and the extended table.
        Both lists are empty when ``num_iterations`` is 0 or less.
    """
    tmp_smile_max_score_pd = origin_smile_max_score_pd
    # Defined up front so the summary below still works with zero iterations.
    smiles_success = []
    success_rxndbid = []

    # Split by row position rather than passing the DataFrame to
    # np.array_split, and size the split to the pool instead of a fixed 5.
    chunk_positions = np.array_split(np.arange(len(rxndb)), num_processes)
    chunks = [rxndb.iloc[positions] for positions in chunk_positions]

    for iteration in range(1, num_iterations + 1):
        smile_by_key = {}  # key from smiles2inchikey0 -> first SMILES seen with it
        rxn_ids = set()
        workers = (
            partial(process_chunk_reactant, smile_max_score_pd=tmp_smile_max_score_pd),
            partial(process_chunk_product, smile_max_score_pd=tmp_smile_max_score_pd),
        )
        with multiprocessing.Pool(num_processes) as pool:
            for worker in workers:
                for chunk_smiles, chunk_rxn_ids in tqdm(pool.imap(worker, chunks), total=len(chunks)):
                    # Keep the SMILES itself: the table rows and the caller
                    # both convert these values with smiles2inchikey0 again.
                    for smile in chunk_smiles:
                        smile_by_key.setdefault(smiles2inchikey0(smile), smile)
                    # Reaction ids are recorded unconditionally: whether a
                    # chunk contributed a new metabolite says nothing about
                    # whether its reactions succeeded.
                    rxn_ids.update(chunk_rxn_ids)

        smiles_success = list(smile_by_key.values())
        success_rxndbid = list(rxn_ids)

        # Add every newly reached metabolite to the table in one concat.
        known_keys = set(
            tmp_smile_max_score_pd.loc[tmp_smile_max_score_pd['score'] >= 1, 'inchikey0']
        )
        new_rows = [
            {'smile': smile, 'score': 1, 'sim_smile': 'sys', 'inchikey0': key}
            for key, smile in smile_by_key.items()
            if key not in known_keys
        ]
        if new_rows:
            tmp_smile_max_score_pd = pd.concat(
                [tmp_smile_max_score_pd, pd.DataFrame(new_rows)], ignore_index=True
            )

        print(f'Iteration {iteration} - Current success count: {len(smiles_success)}')
        print(f'Iteration {iteration} - Current success_rxndbid count: {len(success_rxndbid)}')
        print('============================================================================')

    print('final success', len(smiles_success))
    print('final success_rxndbid', len(success_rxndbid))
    return smiles_success, success_rxndbid,tmp_smile_max_score_pd

In [None]:
# Pool the SMILES from the three sources (model, uptake table, earlier lipid
# results) and de-duplicate them, first as SMILES and then by inchikey0.
yeast8_total_smiles = get_all_smiles_in_model(yeast_model_path, ymdb_output_final)
lipid_success_smiles = load_pickle(lipid_YMDB_success_met_smile_file)
uptake_met = pd.read_csv(uptake_met_path)
uptake_smiles = uptake_met['SMILES'].tolist()
sink_smiles = list(set(yeast8_total_smiles + uptake_smiles + lipid_success_smiles))
sink0 = list({smiles2inchikey0(smile) for smile in sink_smiles})
print(len(sink0))


In [None]:
# Load the "remain" SMILES list, convert each entry with smiles2inchikey0,
# then read the matching reaction table.
lipid_success_smiles_remain = load_pickle(lipid_YMDB_success_met_smile_file_remian)
differenct_metabolites_inchikey0 = list(map(smiles2inchikey0, lipid_success_smiles_remain))

rxndb_to_model_add_no_ec = pd.read_csv(rxndb_to_model_add_no_ec_path)
rxndb_to_model_add_no_ec.head(n=5)

In [9]:
def find_reaction_containing_metabolite(rxndb_to_model_add_no_ec,inchikey0):
    """Select the reactions whose product side contains a target metabolite.

    Two columns are written onto ``rxndb_to_model_add_no_ec`` itself:
    ``'product_inchikey0'`` (the ``smiles2inchikey0`` value of each product
    SMILES) and ``'target_num'`` (how many of those values are targets).

    Args:
        rxndb_to_model_add_no_ec: DataFrame with an ``'rxn_smiles_basic'``
            column of the form ``reactants>>products``, species separated by
            ``'.'``. It is modified in place as described above.
        inchikey0: iterable of hashable target keys.

    Returns:
        The rows with ``target_num > 0``, including the two added columns.
    """
    # Materialise the targets once: membership tests become O(1) and a
    # one-shot iterable is not consumed by the first lookup.
    target_keys = set(inchikey0)
    rxndb_to_model_add_no_ec['product_inchikey0'] = rxndb_to_model_add_no_ec['rxn_smiles_basic'].progress_apply(
        lambda rxn: [smiles2inchikey0(smi) for smi in rxn.split('>>')[1].split('.')]
    )
    rxndb_to_model_add_no_ec['target_num'] = rxndb_to_model_add_no_ec['product_inchikey0'].progress_apply(
        lambda keys: sum(key in target_keys for key in keys)
    )
    return rxndb_to_model_add_no_ec[rxndb_to_model_add_no_ec['target_num'] > 0]
def find_reaction_containing_metabolite_( rxndb_to_model_add_no_ec,inchikey0):
    """Select the reactions whose reactant side contains a target metabolite.

    Two columns are written onto ``rxndb_to_model_add_no_ec`` itself:
    ``'reactant_inchikey0'`` (the ``smiles2inchikey0`` value of each reactant
    SMILES) and ``'target_num'`` (how many of those values are targets).

    Args:
        rxndb_to_model_add_no_ec: DataFrame with an ``'rxn_smiles_basic'``
            column of the form ``reactants>>products``, species separated by
            ``'.'``. It is modified in place as described above.
        inchikey0: iterable of hashable target keys.

    Returns:
        The rows with ``target_num > 0``, including the two added columns.
    """
    # Materialise the targets once: membership tests become O(1) and a
    # one-shot iterable is not consumed by the first lookup.
    target_keys = set(inchikey0)
    rxndb_to_model_add_no_ec['reactant_inchikey0'] = rxndb_to_model_add_no_ec['rxn_smiles_basic'].progress_apply(
        lambda rxn: [smiles2inchikey0(smi) for smi in rxn.split('>>')[0].split('.')]
    )
    rxndb_to_model_add_no_ec['target_num'] = rxndb_to_model_add_no_ec['reactant_inchikey0'].progress_apply(
        lambda keys: sum(key in target_keys for key in keys)
    )
    return rxndb_to_model_add_no_ec[rxndb_to_model_add_no_ec['target_num'] > 0]

In [None]:

# Keep only the reactions that have at least one target metabolite among
# their products.
rxndb_to_model_add_no_ec_with_target = find_reaction_containing_metabolite(
    rxndb_to_model_add_no_ec,
    differenct_metabolites_inchikey0,
)

In [11]:
# Remove the two helper columns added while selecting the reactions.
rxndb_to_model_add_no_ec_with_target = rxndb_to_model_add_no_ec_with_target.drop(
    columns=['product_inchikey0', 'target_num']
)


In [14]:
# Write the selected reactions to disk without the index column.
rxndb_to_model_add_no_ec_with_target.to_csv(
    rxndb_to_model_with_target_path,
    index=False,
)

In [10]:
# Seed the score table: every sink SMILES starts with score 1 and an empty
# 'sim_smile'; the table is then written to disk.
rxndb_met_max_score_pd = pd.DataFrame({'smile': sink_smiles}).assign(
    score=1,
    sim_smile='',
    inchikey0=lambda frame: frame['smile'].apply(smiles2inchikey0),
)
rxndb_met_max_score_pd.to_csv(rxndb_met_max_score_output_file, index=False)

In [None]:
# First expansion pass, starting from the seeded score table.
(
    smiles_success,
    success_rxndbid,
    smiles_max_score_tmp_pd,
) = process_reactions_in_parallel_reactant(
    rxndb_to_model_add_no_ec_with_target,
    rxndb_met_max_score_pd,
    num_processes=5,
    num_iterations=2,
)


In [None]:
# Second expansion pass, seeded with the table returned by the previous call.
(
    smiles_success,
    success_rxndbid,
    smiles_max_score_tmp_pd,
) = process_reactions_in_parallel_reactant(
    rxndb_to_model_add_no_ec_with_target,
    smiles_max_score_tmp_pd,
    num_processes=5,
    num_iterations=2,
)


In [None]:
# Keep the reactions whose 'NO' value is among the successful ids.
rxndb_to_model_success = rxndb_to_model_add_no_ec_with_target.loc[
    rxndb_to_model_add_no_ec_with_target['NO'].isin(success_rxndbid)
]
rxndb_to_model_success.shape

In [14]:
# Write the successful reactions to disk without the index column.
rxndb_to_model_success.to_csv(
    rxndb_to_model_with_target_success_path,
    index=False,
)

In [None]:
# Preview the first rows of the successful reactions.
rxndb_to_model_success.head(n=5)

In [None]:
# Count how many target keys also appear among the converted successes.
success_inchikey0 = list(map(smiles2inchikey0, smiles_success))
intesect = set(success_inchikey0) & set(differenct_metabolites_inchikey0)
print(len(intesect))

In [None]:
# Number of distinct target keys, for comparison with the count above.
len(set(differenct_metabolites_inchikey0))