In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import sys
import re
sys.path.append('../')
from common import *
from tqdm import tqdm
import multiprocessing
from functools import partial
tqdm.pandas()

In [16]:
## input
YMDB_total_smiles_path = '../../../Data/ymdb/ymdb_final_result.xlsx'
yeast_model_path = '../../../Data/model/yeast-GEM-final.csv'
path_enzyme_path = '../../../Data/pathway_enzyme.list'
mnxmeta_smile_file_path = '../../../Data/database/mnx_chem_prop.tsv'
ymdb_output_final = '../../../Data/ymdb/ymdb_final_result.xlsx'

## output
mnxmeta_smile_file_carbon_path = '../../../Data/database/mnxmeta_smile_carbon.csv'
mnxmeta_smile_inchikey_dict_path = '../../../Data/database/mnxmeta_smile_inchikey_dict.json'
not_lipid_target_smiles_path = '../../../Results/not_lipid_filter20percent/target_smiles_not_lipid.pickle'
not_lipid_target_smiles_filter20percent_path = '../../../Results/not_lipid_filter20percent/target_smiles_not_lipid_filter20percent.pickle'
not_lipid_target_smiles_filter20percent_else_path = '../../../Results/not_lipid_filter20percent/target_smiles_not_lipid_filter20percent_else.pickle'
not_lipid_target_smiles_complex_path = '../../../Results/not_lipid_filter20percent/target_smiles_not_lipid_complex.pickle'
not_lipid_yeast_met_file = '../../../Results/not_lipid_filter20percent/yeast_met_not_lipid.pickle'
total_met_inchikey0_file = '../../../Results/analysis/total_met_inchikey0_filter.pickle'
not_lipid_smiles2metnetx_path = '../../../Results/not_lipid_filter20percent/not_lipid_smiles2metnetx.pickle'

# function

In [17]:
def get_total_inchikey0_filter(model_path, YMDB_total_smiles_path,drop_smiles,output_file):
    model = pd.read_csv(model_path)
    model = model.dropna(subset=['standard_smiles']).reset_index(drop=True)
    def filter_smiles(x, drop_smiles):
        return not any(compare_smiles_inchikey(x, met) for met in drop_smiles)
    print(model.shape)
    model = model[model['standard_smiles'].progress_apply(lambda x: filter_smiles(x, drop_smiles))]
    print(model.shape)

    YMDB_Data = pd.read_excel(YMDB_total_smiles_path)
    YMDB_Data = YMDB_Data.dropna(subset=['standard_smiles']).reset_index(drop=True)
    print(YMDB_Data.shape)
    YMDB_Data = YMDB_Data[YMDB_Data['standard_smiles'].progress_apply(lambda x: filter_smiles(x, drop_smiles))]
    print(YMDB_Data.shape)

    model['inchikey0'] = model['standard_smiles'].apply(smiles2inchikey0)
    inchikey0_model = model['inchikey0'].values
    inchikey0= list(inchikey0_model) + list(YMDB_Data['inchikey0'].values)
    inchikey0 = list(set(inchikey0))
    print(len(inchikey0))
    dump_file(inchikey0, output_file)

def get_mnxmeta_smiles_carbon(mnxmeta_smile_file_path,mnxmeta_smile_file_carbon_path):
    mnxmeta_smile = pd.read_csv(mnxmeta_smile_file_path,sep='\t')
    print(mnxmeta_smile.shape)
    mnxmeta_smile = mnxmeta_smile.loc[1:,:]
    mnxmeta_smile = mnxmeta_smile.dropna(subset=['SMILES'])
    mnxmeta_smile['has_carbon'] = mnxmeta_smile['SMILES'].progress_apply(smiles_has_carbon)
    mnxmeta_smile_filtered = mnxmeta_smile[mnxmeta_smile['has_carbon'] == 1]
    mnxmeta_smile_filtered['inchikey0'] = mnxmeta_smile_filtered['SMILES'].apply(smiles2inchikey0)
    mnxmeta_smile_filtered = mnxmeta_smile_filtered.groupby('inchikey0')['SMILES'].agg(list).reset_index()
    mnxmeta_smile_filtered.to_csv(mnxmeta_smile_file_carbon_path, index=False)
    print(mnxmeta_smile_filtered.shape)
    return mnxmeta_smile_filtered

def compare_smiles_mnxmeta(smiles, compare_total_smiles):
    matched_smiles = []
    for compare_smiles in compare_total_smiles:
        if compare_smiles_inchikey(smiles, compare_smiles):
            matched_smiles.append(compare_smiles)
    return smiles, matched_smiles

def process_complex_smiles(not_lipid_target_smiles,not_lipid_target_smiles_complex_path):
    not_lipid_target_smiles_complex = [x for x in not_lipid_target_smiles if '.' in x]
    not_lipid_target_smiles_complex_with_carbon = []
    for smiles in tqdm(not_lipid_target_smiles_complex):
        not_lipid_target_smiles_complex_with_carbon.extend(smiles.split('.'))
    not_lipid_target_smiles_complex_with_carbon = [x for x in not_lipid_target_smiles_complex_with_carbon if smiles_has_carbon(x)]
    not_lipid_target_smiles = [x for x in not_lipid_target_smiles if x not in not_lipid_target_smiles_complex]
    not_lipid_target_smiles = not_lipid_target_smiles + not_lipid_target_smiles_complex_with_carbon
    dump_file(not_lipid_target_smiles,not_lipid_target_smiles_complex_path)
    return not_lipid_target_smiles


# yeast8 metabolites, ymdb metabolites

In [18]:
YMDB_Data = pd.read_excel(YMDB_total_smiles_path)
YMDB_Data.loc[YMDB_Data['ID']=='YMDB00074',['in_model']] = 1 #cyanide
YMDB_Data.loc[YMDB_Data['ID']=='YMDB00108',['in_model']] = 1 #fructose 1,6-bisphosphate
YMDB_Data.loc[YMDB_Data['ID']=='YMDB00121',['in_model']] = 1 #sulfide
YMDB_Data.loc[YMDB_Data['ID']=='YMDB00253',['in_model']] = 1 #d-ribose
YMDB_Data.loc[YMDB_Data['ID']=='YMDB00657',['in_model']] = 1 #D-fructose
YMDB_Data.loc[YMDB_Data['ID']=='YMDB00789',['in_model']] = 1 #Galactose
YMDB_Data.loc[YMDB_Data['ID']=='YMDB00797',['in_model']] = 1 #udp-galactose
YMDB_Data.loc[YMDB_Data['ID']=='YMDB01490',['in_model']] = 1 #Ammonium phosphate
YMDB_Data.loc[YMDB_Data['ID']=='YMDB01491',['in_model']] = 1 #Ammonium chloride
YMDB_Data.loc[YMDB_Data['ID']=='YMDB00032',['in_model']] = 1 #purine
YMDB_Data.loc[YMDB_Data['ID']=='YMDB00020',['in_model']] = 1 #Sodium
YMDB_Data.loc[YMDB_Data['ID']=='YMDB00207',['in_model']] = 1 #Copper(2+)
YMDB_Data.loc[YMDB_Data['ID']=='YMDB01517',['in_model']] = 1 #chloride
YMDB_Data.loc[YMDB_Data['ID']=='YMDB00358',['in_model']] = 1 #Calcium(2+)
YMDB_Data.loc[YMDB_Data['ID']=='YMDB16117',['in_model']] = 1 #Potassium
YMDB_Data.loc[YMDB_Data['ID']=='YMDB01523',['in_model']] = 1 #Phosphorus
YMDB_Data['has_carbon'] = YMDB_Data['SMILES'].progress_apply(smiles_has_carbon)
YMDB_Data.loc[YMDB_Data['has_carbon']==0,'in_model'] = 1
YMDB_Data.to_excel(YMDB_total_smiles_path,index=False)

print('YMDB metabolites in model',len(YMDB_Data[YMDB_Data['in_model']==1]))

not_lipid_target_smiles = YMDB_Data[(YMDB_Data['super_class']!='Lipids and lipid-like molecules')&(YMDB_Data['in_model']==0)]['SMILES'].to_list()
not_lipid_target_smiles = get_target_smiles(not_lipid_target_smiles,not_lipid_target_smiles_path)

100%|██████████| 16042/16042 [00:07<00:00, 2133.49it/s]


YMDB metabolites in model 1160
target_smiles: 572


## sample 0.8

In [19]:
random.seed(42)  # You can use any integer value as the seed
sample_size = int(len(not_lipid_target_smiles) * 0)
not_lipid_target_filter20percent_else = [x for x in not_lipid_target_smiles if x not in random.sample(not_lipid_target_smiles, sample_size)]
not_lipid_target_smiles = random.sample(not_lipid_target_smiles, sample_size)

In [20]:
dump_file(not_lipid_target_smiles,not_lipid_target_smiles_filter20percent_path)
dump_file(not_lipid_target_smiles,not_lipid_target_smiles_path)
dump_file(not_lipid_target_filter20percent_else,not_lipid_target_smiles_filter20percent_else_path)

## continue

In [21]:
not_lipid_target_smiles = process_complex_smiles(not_lipid_target_smiles,not_lipid_target_smiles_complex_path)

0it [00:00, ?it/s]


In [22]:
#combine model smiles with target smiles
# get_total_smiles(ymdb_output_final,yeast_model_path, not_lipid_target_smiles, not_lipid_yeast_met_file)
get_total_smiles(yeast_model_path, not_lipid_target_smiles, not_lipid_yeast_met_file)

total_smiles 945


In [23]:
# get total inchikey0
# get_total_inchikey0(yeast_model_path, YMDB_total_smiles_path, total_met_inchikey0_file)
get_total_inchikey0_filter(yeast_model_path, YMDB_total_smiles_path, not_lipid_target_filter20percent_else,total_met_inchikey0_file)

(2124, 17)


  0%|          | 0/2124 [00:00<?, ?it/s]

100%|██████████| 2124/2124 [19:04<00:00,  1.86it/s]


(2124, 17)
(16036, 10)


100%|██████████| 16036/16036 [3:54:52<00:00,  1.14it/s]  


(15463, 10)
15318


In [24]:
mnxmeta_smile_filtered = get_mnxmeta_smiles_carbon(mnxmeta_smile_file_path,mnxmeta_smile_file_carbon_path)

(1292154, 9)


  8%|▊         | 97291/1248094 [00:48<05:05, 3767.98it/s]

100%|██████████| 1248094/1248094 [06:36<00:00, 3150.38it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


(906669, 2)


In [25]:
mnxmeta_smile_inchikey_dict = dict(zip(mnxmeta_smile_filtered['inchikey0'], mnxmeta_smile_filtered['SMILES']))
with open(mnxmeta_smile_inchikey_dict_path, 'w') as f:
    json.dump(mnxmeta_smile_inchikey_dict, f,indent=4)

with open(mnxmeta_smile_inchikey_dict_path, 'r') as f:
    mnxmeta_smile_inchikey_dict = json.load(f)

not_lipid_yeast_total_smiles = process_yeast_smiles(not_lipid_yeast_met_file)
print(len(not_lipid_yeast_total_smiles))
not_lipid_total_inchikey0 = [smiles2inchikey0(x) for x in not_lipid_yeast_total_smiles]

total_smiles 945
total_smiles 940
total_smiles 915
total_smiles 915
915


In [26]:
get_smiles2metnetx(list(mnxmeta_smile_inchikey_dict.keys()), not_lipid_total_inchikey0,not_lipid_smiles2metnetx_path,num_processes=60)

100%|██████████| 906669/906669 [00:03<00:00, 244732.43it/s]


In [27]:
smiles_metnetx_reverse(not_lipid_smiles2metnetx_path,not_lipid_total_inchikey0,mnxmeta_smile_inchikey_dict)

100%|██████████| 906669/906669 [00:00<00:00, 4449203.44it/s]
100%|██████████| 915/915 [00:30<00:00, 30.21it/s]
100%|██████████| 912/912 [00:00<00:00, 98636.07it/s]


In [28]:
merge_metnetx_smiles(not_lipid_yeast_total_smiles,not_lipid_smiles2metnetx_path,not_lipid_yeast_met_file)

915


100%|██████████| 912/912 [00:00<00:00, 2518239.14it/s]

2709



