In [1]:
import numpy as np
import pandas as pd
import torch
from scipy.sparse import save_npz, load_npz
import os
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from melloddy_tuner.utils.single_row_prep2pred import SingleRowPreparator
from  melloddy_predictor.predictor_single import PredictorSingle

from melloddy_tuner.tunercli import do_prepare_prediction_online

from melloddy_tuner.utils.standardizer import Standardizer
from melloddy_tuner.utils.descriptor_calculator import DescriptorCalculator

Specify environment and paths

In [3]:
# Conda environment to activate when running the shell script that produces the reference output.
# NOTE(review): the default keeps the original leading space and the spelling "peipeline" verbatim,
# because it has to match an existing local environment name — confirm before correcting it.
conda_env = os.environ.get("MELLODDY_CONDA_ENV", " melloddy_peipeline_dev")

# Repository locations default to the original author's checkout. Set the environment variables
# named below to run this notebook on another machine without editing the cell.
root_path = os.environ.get("MELLODDY_PREDICTOR_ROOT", "/home/schufan1/projects/Melloddy/git_repos/melloddy_predictor")
test_path = os.path.join(root_path,"tests/begin_to_end_test")
model_path = os.path.join(root_path,"inputs/models")
config_path = os.path.join(root_path,"inputs/config")

# Parameter and key files handed to the MELLODDY-TUNER preparation steps.
params_file = os.path.join(config_path,"example_parameters.json")
key_file = os.path.join(config_path,"example_key.json")

# Checkouts of melloddy_tuner and sparsechem; predict_py_path is the directory the
# reference script changes into before calling predict.py.
tunercli_path = os.environ.get("MELLODDY_TUNER_PATH", "/home/schufan1/projects/Melloddy/git_repos/data_prep/melloddy_tuner")
predict_py_path = os.environ.get("SPARSECHEM_PATH", "/home/schufan1/projects/Melloddy/git_repos/sparsechem/sparsechem")

read input data

In [4]:
# Load the example compounds (identifiers plus SMILES) that drive the whole test.
input_csv = os.path.join(test_path, "T2_100samples.csv")
smiles_df = pd.read_csv(input_csv)
smiles_df.head()

Unnamed: 0.1,Unnamed: 0,input_compound_id,smiles
0,80540,836392,Cc1ccc2ncccc2c1
1,319232,1835024,CCNC(=O)c1cc(C(C)Nc2cc(F)cc(F)c2)c3OC(=CC(=O)c...
2,89421,829337,CCOc1ccc(cc1)C(=O)c2cc(ccc2N3CCOCC3)[N+](=O)[O-]
3,353802,1020194,Oc1nc(CSc2nnc(Cc3ccccc3)n2CC=C)nc4ccccc14
4,466566,1520639,Cc1cc(nn1c2cccc(Oc3ccc(cc3C#N)S(=O)(=O)Nc4nccs...


Prepare the x-matrix with the conventional functions

In [5]:
# Run the conventional online preparation on the complete input frame.
x_matrix, df_failed, mapping_table = do_prepare_prediction_online(
    input_structure=smiles_df,
    key_path=key_file,
    config_file=params_file,
    num_cpu=1,
)

# Persist the feature matrix and the mapping table under <test_path>/mt_output.
mt_out_folder = os.path.join(test_path, "mt_output")
os.makedirs(mt_out_folder, exist_ok=True)
pred_x_file = os.path.join(mt_out_folder, "pred_x.npz")
mapping_table_file = os.path.join(mt_out_folder, "mapping_table.csv")
save_npz(pred_x_file, x_matrix)
mapping_table.to_csv(mapping_table_file, index=False)

Read new config file.
Read new key file.


create script to generate reference output

In [6]:
# Shell template for the reference run: activate the environment, change into the
# sparsechem directory and call predict.py once per example model.
script = """
conda activate {cenv}

cd {sp_path}
mkdir -p {test_path}/sc_output
python predict.py --x {test_path}/mt_output/pred_x.npz --dev cpu --conf {model_path}/example_cls_model/hyperparameters.json --model {model_path}/example_cls_model/model.pth --outprefix {test_path}/sc_output/cls_model
python predict.py --x {test_path}/mt_output/pred_x.npz --dev cpu --conf {model_path}/example_clsaux_model/hyperparameters.json --model {model_path}/example_clsaux_model/model.pth --outprefix {test_path}/sc_output/clsaux_model
python predict.py --x {test_path}/mt_output/pred_x.npz --dev cpu --conf {model_path}/example_reg_model/hyperparameters.json --model {model_path}/example_reg_model/model.pth --outprefix {test_path}/sc_output/reg_model --inverse_normalization 1
python predict.py --x {test_path}/mt_output/pred_x.npz --dev cpu --conf {model_path}/example_hyb_model/hyperparameters.json --model {model_path}/example_hyb_model/model.pth --outprefix {test_path}/sc_output/hyb_model --inverse_normalization 1
"""

# Values for the template placeholders. tcpath, pfile and kfile have no placeholder
# in the template above; str.format silently ignores unused keyword arguments.
template_values = dict(
    tcpath=tunercli_path,
    test_path=test_path,
    pfile=params_file,
    kfile=key_file,
    sp_path=predict_py_path,
    model_path=model_path,
    cenv=conda_env,
)
ref_gen_sh = script.format(**template_values)
ref_gen_sh

'\nconda activate  melloddy_peipeline_dev\n\ncd /home/schufan1/projects/Melloddy/git_repos/sparsechem/sparsechem\nmkdir -p /home/schufan1/projects/Melloddy/git_repos/melloddy_predictor/tests/begin_to_end_test/sc_output\npython predict.py --x /home/schufan1/projects/Melloddy/git_repos/melloddy_predictor/tests/begin_to_end_test/mt_output/pred_x.npz --dev cpu --conf /home/schufan1/projects/Melloddy/git_repos/melloddy_predictor/inputs/models/example_cls_model/hyperparameters.json --model /home/schufan1/projects/Melloddy/git_repos/melloddy_predictor/inputs/models/example_cls_model/model.pth --outprefix /home/schufan1/projects/Melloddy/git_repos/melloddy_predictor/tests/begin_to_end_test/sc_output/cls_model\npython predict.py --x /home/schufan1/projects/Melloddy/git_repos/melloddy_predictor/tests/begin_to_end_test/mt_output/pred_x.npz --dev cpu --conf /home/schufan1/projects/Melloddy/git_repos/melloddy_predictor/inputs/models/example_clsaux_model/hyperparameters.json --model /home/schufan1/p

In [7]:
# Store the generated commands next to the test data so they can be sourced from a shell.
script_file_name = os.path.join(test_path, "make_reference_output.sh")
with open(script_file_name, mode="w") as script_handle:
    script_handle.write(ref_gen_sh)

**You now need to generate your reference output with the command-line tools.** Launch the command printed by the cell below in a shell.

In [8]:
# Show the exact command to paste into a shell.
print("source", script_file_name)

source /home/schufan1/projects/Melloddy/git_repos/melloddy_predictor/tests/begin_to_end_test/make_reference_output.sh


Read the reference input

generate X-input from smiles

In [11]:
# Single-row preparator built from the same parameter and key files as the batch preparation.
my_srprep = SingleRowPreparator(params = params_file, secret = key_file)

In [12]:
# Convert every SMILES individually; results are keyed by input_compound_id.
smiles_by_compound = smiles_df.set_index("input_compound_id")["smiles"]
x_tensors_test = {}
for compound_id, smi in smiles_by_compound.items():
    x_tensors_test[compound_id] = my_srprep.process_smiles(smi)

In [13]:
# Densify each tensor and take its first row as one Series per compound
# (presumably every tensor holds exactly one row — TODO confirm).
dense_rows = {
    compound_id: pd.Series(x_tens.to_dense().numpy()[0])
    for compound_id, x_tens in x_tensors_test.items()
}
# Stack the per-compound rows into one frame: compounds as rows, feature positions as columns.
x_dataframe_test = pd.concat(dense_rows).unstack().astype("int8")

In [14]:
# Display the single-row feature frame (index: input_compound_id).
x_dataframe_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31990,31991,31992,31993,31994,31995,31996,31997,31998,31999
836392,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1835024,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
829337,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1020194,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1520639,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
919493,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
957133,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
900391,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1017936,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


load the reference x_csr matrix

In [15]:
# Reload the sparse matrix written by the conventional preparation step.
reference_x_file = os.path.join(test_path, "mt_output/pred_x.npz")
x_csr_ref = load_npz(reference_x_file)

In [16]:
# Densify the sparse reference matrix so it can be wrapped in a DataFrame.
x_ndarray_ref = x_csr_ref.toarray()

In [17]:
# Relabel the positional row index with compound ids via the mapping table,
# so both frames can be aligned by sort_index().
row_to_compound = mapping_table.set_index("cont_descriptor_vector_id")["input_compound_id"]
x_dataframe_ref = pd.DataFrame(x_ndarray_ref).rename(index=row_to_compound)
x_dataframe_ref

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31990,31991,31992,31993,31994,31995,31996,31997,31998,31999
3049,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9191,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
66045,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
114698,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
232954,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2191976,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2202594,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2219702,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2247898,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


compare x-matrices

In [18]:
# Align both frames by index, then compare the raw values element-wise.
# Only values are compared; index labels themselves are not checked here.
x_values_test = x_dataframe_test.sort_index().values
x_values_ref = x_dataframe_ref.sort_index().values
x_mat_test = np.equal(x_values_test, x_values_ref).all()
x_mat_test

True

### Test the predictions

Define some task slices for testing

In [19]:
# Label -> task column index, for a handful of classification and regression tasks.
class_task_map = dict([
    ('class_570', 570),
    ('class_581', 581),
    ('class_2276', 2276),
])
regr_task_map = dict([
    ("regr_633", 633),
    ("regr_740", 740),
    ("regr_2", 2),
])

In [20]:
# One predictor per example model. Each gets only the task maps passed here:
# classification for cls/clsaux, regression for reg, both for hyb.
my_preds = {}
my_preds['cls'] = PredictorSingle(
    model=os.path.join(model_path, "example_cls_model/model.pth"),
    conf=os.path.join(model_path, "example_cls_model/hyperparameters.json"),
    class_task_map=class_task_map,
)
my_preds['clsaux'] = PredictorSingle(
    model=os.path.join(model_path, "example_clsaux_model/model.pth"),
    conf=os.path.join(model_path, "example_clsaux_model/hyperparameters.json"),
    class_task_map=class_task_map,
)
my_preds['reg'] = PredictorSingle(
    model=os.path.join(model_path, "example_reg_model/model.pth"),
    conf=os.path.join(model_path, "example_reg_model/hyperparameters.json"),
    regr_task_map=regr_task_map,
)
my_preds['hyb'] = PredictorSingle(
    model=os.path.join(model_path, "example_hyb_model/model.pth"),
    conf=os.path.join(model_path, "example_hyb_model/hyperparameters.json"),
    class_task_map=class_task_map,
    regr_task_map=regr_task_map,
)

In [21]:
# Predict every compound with every model. A model type only gets an entry in
# cls_res / reg_res if the corresponding output array has at least one column.
cls_res = {}
reg_res = {}
for mtype, predictor in my_preds.items():
    for compound_id, x_tens in x_tensors_test.items():
        y_cls, y_reg = predictor.predict_from_tensor(x_tens)
        if y_cls.shape[1] > 0:
            cls_res.setdefault(mtype, {})[compound_id] = y_cls
        if y_reg.shape[1] > 0:
            reg_res.setdefault(mtype, {})[compound_id] = y_reg
    

In [22]:
# Model types that produced classification output.
cls_res.keys()

dict_keys(['cls', 'clsaux', 'hyb'])

In [23]:
def _first_rows_to_frame(arrays_by_compound):
    """Stack the first row of each per-compound array into one frame, one row per compound."""
    first_rows = {cid: pd.Series(arr[0]) for cid, arr in arrays_by_compound.items()}
    return pd.concat(first_rows).unstack()


cls_res_df_test = {mtype: _first_rows_to_frame(per_compound) for mtype, per_compound in cls_res.items()}
reg_res_df_test = {mtype: _first_rows_to_frame(per_compound) for mtype, per_compound in reg_res.items()}

Read in reference prediction output

In [24]:
# Reference predictions written by predict.py, keyed by model type.
y_refs_class = {
    mtype: np.load(os.path.join(test_path, rel_path))
    for mtype, rel_path in (
        ("cls", "sc_output/cls_model-class.npy"),
        ("clsaux", "sc_output/clsaux_model-class.npy"),
        ("hyb", "sc_output/hyb_model-class.npy"),
    )
}
y_refs_regr = {
    mtype: np.load(os.path.join(test_path, rel_path))
    for mtype, rel_path in (
        ("reg", "sc_output/reg_model-regr.npy"),
        ("hyb", "sc_output/hyb_model-regr.npy"),
    )
}

In [25]:
# Relabel the positional rows of each reference array with compound ids via the mapping table.
row_id_to_compound_id = mapping_table.set_index("cont_descriptor_vector_id")["input_compound_id"]
y_refs_class_dfs = {
    mtype: pd.DataFrame(y).rename(index=row_id_to_compound_id)
    for mtype, y in y_refs_class.items()
}
y_refs_regr_dfs = {
    mtype: pd.DataFrame(y).rename(index=row_id_to_compound_id)
    for mtype, y in y_refs_regr.items()
}

In [26]:
# Per model type: do test and reference classification values agree within np.allclose tolerance?
res_class = {}
for mtype, y_df_test in cls_res_df_test.items():
    y_test = y_df_test.sort_index().values
    y_ref = y_refs_class_dfs[mtype].sort_index().values
    res_class[mtype] = np.allclose(y_test, y_ref)
res_class

{'cls': True, 'clsaux': True, 'hyb': True}

We reproduce the classification predictions (within the `np.allclose` tolerance), as the values above are all True.

In [27]:
# Per model type: do test and reference regression values agree within np.allclose tolerance?
res_regr = {}
for mtype, y_df_test in reg_res_df_test.items():
    y_test = y_df_test.sort_index().values
    y_ref = y_refs_regr_dfs[mtype].sort_index().values
    res_regr[mtype] = np.allclose(y_test, y_ref)
res_regr

{'reg': False, 'hyb': False}

We fail, however, to reproduce the regression results, as the values above are False. This could be due to SparseChem issue https://github.com/melloddy/SparseChem/issues/5, meaning that the dense predictions from predict.py are not inversely normalized. Let's inverse-normalize those reference predictions ourselves and then retry.

In [28]:
# Apply y * stddev + mean to the reference frames, using each predictor's own statistics.
y_refs_regr_inorm_dfs = {}
for mtype, y_ref_df in y_refs_regr_dfs.items():
    predictor = my_preds[mtype]
    y_refs_regr_inorm_dfs[mtype] = y_ref_df*predictor.reg_stddev + predictor.reg_mean

In [29]:
# Same y * stddev + mean transform, applied to the raw reference arrays.
y_refs_regr_inorm = {}
for mtype, y_ref in y_refs_regr.items():
    predictor = my_preds[mtype]
    y_refs_regr_inorm[mtype] = y_ref*predictor.reg_stddev + predictor.reg_mean

In [30]:
# Retry the regression comparison against the inverse-normalized reference frames.
res_regr_inorm = {}
for mtype, y_df_test in reg_res_df_test.items():
    y_test = y_df_test.sort_index().values
    y_ref = y_refs_regr_inorm_dfs[mtype].sort_index().values
    res_regr_inorm[mtype] = np.allclose(y_test, y_ref)
res_regr_inorm

{'reg': False, 'hyb': False}

Still not working!!

In [32]:
# Keep only the task columns in which no 'reg' prediction is above 10 or below 3.
# NOTE(review): the thresholds 3 and 10 are undocumented — confirm what range they represent.
# NOTE(review): the mask is derived from the 'reg' model only but is also applied to 'hyb'
# in the comparison that follows — confirm this is intended.
reg_predictions = reg_res_df_test['reg']
out_of_range = (reg_predictions > 10) | (reg_predictions < 3)
colmask = out_of_range.sum(axis=0) == 0

In [33]:
# Repeat the inverse-normalized comparison, restricted to the columns selected by colmask.
res_regr_inorm = {}
for mtype, y_df_test in reg_res_df_test.items():
    y_test = y_df_test.sort_index().values[:,colmask]
    y_ref = y_refs_regr_inorm_dfs[mtype].sort_index().values[:,colmask]
    res_regr_inorm[mtype] = np.allclose(y_test, y_ref)
res_regr_inorm

{'reg': True, 'hyb': True}

### Test single row predictor object

In [None]:
%%time
# Predict the selected hybrid-model tasks compound by compound, straight from SMILES.
# A failure for one compound is reported and skipped so the remaining compounds still run.
y_res_slice = {}
for k,smi in smiles_df.set_index("input_compound_id")["smiles"].items():
    try: 
        x = my_srprep.process_smiles(smi)
        y = my_preds["hyb"].predict_decorated_series_from_tensor(x)
        y_res_slice[k] = y
    # `Error` is not a defined name and exceptions have no .str() method, so the original
    # handler would itself have raised; catch Exception and format the exception directly.
    except Exception as e:
        print("prediction for {0} raised and error {1}".format(k, e))


In [None]:
# One row per compound, one column per entry of the decorated prediction Series.
stacked_slices = pd.concat(y_res_slice)
res_slice_df = stacked_slices.unstack()
res_slice_df.index.names = ['input_compound_id']

# Write the table out with the compound id as the index column.
selected_tasks_file = os.path.join(test_path, "selected_hyb_tasks.csv")
res_slice_df.to_csv(selected_tasks_file, index=True)


In [None]:
# Display the per-compound predictions for the selected hybrid-model tasks.
res_slice_df

In [None]:
# Read the CSV written above back in, using the compound id column as index.
pd.read_csv(os.path.join(test_path,"selected_hyb_tasks.csv"),index_col = "input_compound_id")

In [None]:
# Load the second input file; judging by its name it contains compounds expected to fail.
failing_csv = os.path.join(test_path, "T2_100samples_failing.csv")
smiles_failing_df = pd.read_csv(failing_csv)
smiles_failing_df.head()

In [None]:
%%time
# Same single-row prediction loop, run on the failing input file.
# Errors are reported with their type and message; the loop continues with the next compound.
y_failing_res_slice = {}
failing_smiles = smiles_failing_df.set_index("input_compound_id")["smiles"]
for compound_id, smi in failing_smiles.items():
    try:
        features = my_srprep.process_smiles(smi)
        prediction = my_preds["hyb"].predict_decorated_series_from_tensor(features)
        y_failing_res_slice[compound_id] = prediction
    except Exception as err:
        print("prediction for {0} raised an {1}: {2}".format(compound_id, type(err), err))


In [None]:
# Tabulate the predictions of the compounds from the failing file that did not raise.
pd.concat(y_failing_res_slice).unstack()

In [None]:
# Second preparator with trust_standardization switched on.
# The files are passed by keyword, as in the first SingleRowPreparator call of this notebook,
# so the parameter file and key file cannot be swapped by positional order.
my_srprep_trusting = SingleRowPreparator(params = params_file, secret = key_file, trust_standardization = True)

In [None]:
%%time
# Repeat the single-row prediction loop with the trust_standardization preparator.
y_trusting_res_slice = {}
smiles_by_compound_id = smiles_df.set_index("input_compound_id")["smiles"]
for compound_id, smi in smiles_by_compound_id.items():
    try:
        features = my_srprep_trusting.process_smiles(smi)
        prediction = my_preds["hyb"].predict_decorated_series_from_tensor(features)
        y_trusting_res_slice[compound_id] = prediction
    except Exception as err:
        print("prediction for {0} raised and error {1}".format(compound_id, err))


In [None]:
# Element-wise difference between the predictions from the two preparators.
trusting_table = pd.concat(y_trusting_res_slice).unstack()
default_table = pd.concat(y_res_slice).unstack()
trusting_table - default_table

In [None]:
# Inspect the mapping table as it was written to disk by the preparation step.
pd.read_csv(os.path.join(mt_out_folder,"mapping_table.csv"))

In [None]:
# Pick the reference columns of the selected tasks from the hybrid model output
# (regression columns come from the inverse-normalized arrays).
class_task_columns = np.array(list(class_task_map.values()))
regr_task_columns = np.array(list(regr_task_map.values()))
y_refs_selected_class_tasks = y_refs_class["hyb"][:, class_task_columns]
y_refs_selected_regr_tasks = y_refs_regr_inorm["hyb"][:, regr_task_columns]

In [None]:
# Label columns with the task names and rows with compound ids from the mapping table.
class_row_labels = mapping_table.set_index("cont_descriptor_vector_id")["input_compound_id"]
class_column_labels = list(class_task_map.keys())
y_refs_select_class_df = pd.DataFrame(
    y_refs_selected_class_tasks,
    columns=class_column_labels,
).rename(index=class_row_labels)
y_refs_select_class_df

In [None]:
# Label columns with the task names and rows with compound ids from the mapping table.
regr_row_labels = mapping_table.set_index("cont_descriptor_vector_id")["input_compound_id"]
regr_column_labels = list(regr_task_map.keys())
y_refs_select_regr_df = pd.DataFrame(
    y_refs_selected_regr_tasks,
    columns=regr_column_labels,
).rename(index=regr_row_labels)
y_refs_select_regr_df

In [None]:
# Place the classification and regression reference columns side by side.
reference_parts = [y_refs_select_class_df, y_refs_select_regr_df]
res_slice_ref_df = pd.concat(reference_parts, axis="columns")
res_slice_ref_df.sort_index()

In [None]:
# Import from the public pandas.testing module; pandas._testing is private API
# and may change without notice.
from pandas.testing import assert_frame_equal

In [None]:
# Final check: single-row predictions must match the reference for the selected tasks.
# Both sides are sorted by index and cast to float32 before comparison.
# NOTE(review): assert_frame_equal also compares index names and column order by default —
# confirm both frames agree on those.
single_row_result = res_slice_df.sort_index().astype("float32")
reference_result = res_slice_ref_df.sort_index().astype("float32")
assert_frame_equal(single_row_result, reference_result)