## For each ChEMBL target, create a pickle data file ready for modelling
### NB: Run this in a location the training step can read from, or transfer the output there afterwards

#### Notebook meant to be exported as .py

In [2]:
import pandas as pd

from sys import argv

import pickle

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import matthews_corrcoef as mcc, confusion_matrix, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV

import warnings
warnings.filterwarnings('ignore')

from pathlib import Path
import os

In [None]:
if __name__ == '__main__':
    # Script entry point.
    #
    # Expected command line: <script> <activity_file> <fp_file> <default_dir>
    #   activity_file : Excel file with the activities retained for modelling;
    #                   must provide the 'target_chemblid' and 'usmiles' columns.
    #   fp_file       : pickled DataFrame with the 'usmiles', 'FP' and
    #                   physico-chemical descriptor columns of the molecules.
    #   default_dir   : output root; one sub-directory is created per target.
    #
    # For every target <t> two files are written:
    #   <default_dir>/<t>/<t>.pkl         merged data with descriptors scaled to [0, 1]
    #   <default_dir>/<t>/<t>_scaler.pkl  the fitted MinMaxScaler for that target
    if len(argv) != 4:
        # Fail with a readable message instead of an opaque unpacking ValueError.
        raise SystemExit('usage: <script> <activity_file> <fp_file> <default_dir>')

    # argv[0] (the script path) is unpacked into `path` but not used below.
    path, activity_file, fp_file, default_dir = argv

    # Physico-chemical descriptors that get min-max scaled per target.
    descriptor_cols = ['logp', 'mwt', 'hbd', 'hba', 'rtb', 'tpsa']

    # exist_ok avoids the race between an existence check and the creation.
    os.makedirs(default_dir, exist_ok=True)

    # activity_file contains the activities that have been retained for modeling.
    df_dataset = pd.read_excel(activity_file)

    # fp_file contains the fingerprints (and descriptors) for the mols in activity_file.
    # NOTE(security): unpickling executes arbitrary code; only load trusted files.
    with open(fp_file, 'rb') as fp_handle:
        df_mols = pickle.load(fp_handle)

    # Loop-invariant: the molecule columns that are joined onto every target subset.
    mol_features = df_mols[['usmiles', 'FP'] + descriptor_cols]

    for t in df_dataset.target_chemblid.unique():

        # Isolate only the data related to the target t and attach the
        # fingerprint/descriptors of each molecule (inner join on 'usmiles').
        df_to_scale = df_dataset.loc[df_dataset.target_chemblid == t].merge(mol_features, on='usmiles')

        if df_to_scale.empty:
            # MinMaxScaler cannot be fitted on zero rows; report and move on
            # instead of aborting the remaining targets.
            print(t)
            print('skipped: no molecule of this target was found in the fingerprint file')
            continue

        ### Scale the physico-chemical descriptors
        # Scale the physico-chemical descriptors to be between 0 and 1.
        scaler = MinMaxScaler()

        # Reuse the index of df_to_scale so the column-wise concat below is
        # guaranteed to line up row by row.
        df_scaled = pd.DataFrame(
            scaler.fit_transform(df_to_scale[descriptor_cols]),
            columns=descriptor_cols,
            index=df_to_scale.index,
        )

        # Replace the raw descriptor columns with their scaled counterparts.
        df_prot = pd.concat([df_to_scale.drop(descriptor_cols, axis=1), df_scaled], axis=1)

        print(t)
        print(df_prot.shape)

        target_dir = os.path.join(default_dir, t)
        os.makedirs(target_dir, exist_ok=True)

        # Context managers make sure every output file is flushed and closed.
        df_prot_filename = os.path.join(target_dir, t + '.pkl')
        with open(df_prot_filename, 'wb') as out_handle:
            pickle.dump(df_prot, out_handle)

        # Export the scaler to scale new compounds to test.
        with open(os.path.join(target_dir, t + '_scaler.pkl'), 'wb') as out_handle:
            pickle.dump(scaler, out_handle)
            
    