## Run the model training
- ### For efficiency, the model training is run on our CPU cluster
- ### For each target, two models are trained: QSAR and Mondrian CP. Both use the same test set
- ### Random forest (RF) is used in all cases, with a fixed set of parameters chosen from a previous model optimisation
- ### 100 models are trained for each case, each with a different internal split; the same splits are used for both approaches
- ### The test-set predictions of each model are written to file, and the mean values per compound are gathered in a summary file.
- ### Models are also exported

#### Notebook meant to be exported as .py

In [23]:
import pandas as pd
import numpy as np

from sys import argv

import pickle
import dill, joblib

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef as mcc, confusion_matrix
from sklearn.model_selection import train_test_split, KFold

from nonconformist.cp import IcpClassifier
from nonconformist.nc import NcFactory

import warnings
warnings.filterwarnings('ignore')

from pathlib import Path
import os

In [3]:
def split_train_test_sets(dat, seed=1234):
    """Build the feature matrix and split it 80%/20% into train and test sets.

    ``dat`` must provide:
      * a column named ``FP`` whose items expose the fingerprint as ``.fp``;
      * the descriptor columns ``logp``, ``mwt``, ``hbd``, ``hba``, ``rtb``
        and ``tpsa``;
      * a column named ``activity_class`` holding the label to predict.

    The split is stratified on the label and seeded with ``seed``.

    Returns the tuple
    ``(x_train, x_test, y_train, y_test, id_train, id_test)``, where the
    ``id_*`` entries are the ``dat.index`` values of each subset.
    """
    # Descriptors are picked by name; selecting them by position proved fragile.
    descriptor_columns = ['logp', 'mwt', 'hbd', 'hba', 'rtb', 'tpsa']

    fingerprints = [entry.fp for entry in dat['FP']]

    # Descriptors first, fingerprint values after them, one row per compound.
    features = np.column_stack((dat[descriptor_columns], fingerprints))
    labels = dat.activity_class

    parts = train_test_split(
        features,
        labels,
        dat.index,
        test_size=0.2,
        random_state=seed,
        stratify=labels,
    )
    return tuple(parts)

In [42]:
def export(dat_file, model, prediction, actual, id_test, model_type, method, outputfile, iteration):
    """Save one iteration's model and its test-set predictions.

    The model is dumped with joblib (compression level 3) and the
    predictions are written as CSV; both file names are built from
    ``outputfile``, ``method``, ``model_type`` and ``iteration``.

    ``prediction`` is transposed and stored in the ``p0``/``p1`` columns,
    ``actual`` in ``real_value``; ``target_chemblid`` and ``usmiles`` are
    looked up in ``dat_file`` for the rows labelled ``id_test``.

    Returns the DataFrame that was written to the CSV file.
    """
    run_id = str(iteration)

    # Keep the fitted model for later reuse.
    model_path = outputfile + '_{}_{}_model_{}'.format(method, model_type, run_id)
    joblib.dump(model, model_path, compress=3)

    # Stack identifiers, predictions and true labels row-wise, then transpose
    # so that every test compound ends up on its own row.
    test_rows = dat_file.loc[id_test]
    stacked = np.vstack([
        test_rows['target_chemblid'],
        test_rows['usmiles'],
        prediction.T,
        actual,
    ])
    column_names = ['target_chemblid', 'usmiles', 'p0', 'p1', 'real_value']
    iteration_results = pd.DataFrame(stacked.T, columns=column_names)

    # Save the probabilities for any further analysis.
    csv_path = outputfile + '_{}_{}_model_proba_{}.csv'.format(method, model_type, run_id)
    iteration_results.to_csv(csv_path, index=False)

    return iteration_results

In [1]:
def _write_mean_results(iteration_frames, csv_path):
    """Average p0/p1/real_value per (target, compound) and write them as CSV."""
    results = pd.concat(iteration_frames)

    # The numeric columns come back as non-numeric when the per-iteration
    # DataFrames are created, so convert them before averaging.
    results['p0'] = results['p0'].map(np.float64)
    results['p1'] = results['p1'].map(np.float64)
    # Builtin int: the ``np.int`` alias no longer exists in recent NumPy.
    results['real_value'] = results['real_value'].map(int)

    means = results.groupby(['target_chemblid', 'usmiles']).mean().reset_index()
    means.to_csv(csv_path, index=False)


def modelling(dat_file, default_dir, outputfile, method, t):
    """Train, evaluate and export 100 QSAR and 100 Mondrian CP models.

    Parameters
    ----------
    dat_file : path of a pickled DataFrame; its ``activity_class`` column is
        recoded to 1 for ``'active'`` and 0 for anything else.
    default_dir : root output directory.
    outputfile : path prefix used for every file written.
    method : label inserted in the output file names.
    t : target name; ``default_dir/t`` is created if missing.

    For each seed in ``range(100)`` the data are split into train/test sets,
    a random forest is trained and exported (QSAR), then an inductive
    conformal classifier is fitted on 70% of the training set, calibrated on
    the remaining 30%, applied to the same test set and exported.
    Finally the per-compound means over all iterations are written to one
    summary CSV per approach.
    """
    # NOTE: pickle can execute arbitrary code on load; only use trusted files.
    with open(dat_file, 'rb') as handle:
        data = pickle.load(handle)
    data['activity_class'] = np.where(data['activity_class'] == 'active', 1, 0)

    # exist_ok avoids a race when several jobs create the same directory.
    os.makedirs(default_dir + '/' + t, exist_ok=True)

    l_qsar, l_cp = [], []

    for i in range(100):

        x_train, x_test, y_train, y_test, id_train, id_test = split_train_test_sets(data, seed=i)

        # QSAR

        ## Train
        model = RandomForestClassifier(n_estimators=300, max_depth=20, random_state=12345, class_weight='balanced')
        model.fit(x_train, y_train)

        ## Predict
        pred = model.predict_proba(x_test)

        ## Export
        l_qsar.append(export(data, model, pred, y_test, id_test, 'QSAR', method, outputfile, i))

        # Mondrian conformal prediction

        ## The calibration set is taken from the training set (30%);
        ## the test set is the same as for QSAR.
        x_real_train, x_cal, y_real_train, y_cal = train_test_split(
            x_train, y_train, test_size=0.3, random_state=i)

        # Default nonconformity function built around the random forest.
        # The QSAR model was already exported above, so refitting it here
        # does not affect the saved file.
        nc = NcFactory.create_nc(model)
        # Mondrian ICP: the condition selects the second element (x[1]) of
        # each example passed to it.
        # NOTE(review): this lambda is pickled by export(); presumably that
        # relies on dill being imported at file level - confirm.
        icp = IcpClassifier(nc, condition=lambda x: x[1])

        # Fit the ICP using the proper training set
        icp.fit(x_real_train, y_real_train.values)
        # Calibrate the ICP using the calibration set
        icp.calibrate(x_cal, y_cal.values)
        # No significance level is passed here; the output is exported as the
        # p0/p1 columns and averaged later as p-values.
        pred = icp.predict(x_test)

        # Clear the model caches before exporting the model
        icp.nc_function.model.last_x = None
        icp.nc_function.model.last_y = None

        ## Export
        l_cp.append(export(data, icp, pred, y_test, id_test, 'CP_same_test', method, outputfile, i))

    ## Calculate the average p-values (CP) / probabilities (QSAR)
    _write_mean_results(
        l_qsar,
        outputfile + '_{}_{}_model_proba_mean_results.csv'.format(method, 'QSAR'))
    _write_mean_results(
        l_cp,
        outputfile + '_{}_{}_model_proba_mean_results.csv'.format(method, 'CP_same_test'))

In [None]:
if __name__ == '__main__':
    # Expected command line: <script> <df_prot> <default_dir> <target> <method>
    # Fail with a usage message instead of an opaque unpacking error.
    if len(argv) != 5:
        raise SystemExit(
            'usage: {} <df_prot> <default_dir> <target> <method>'.format(
                argv[0] if argv else 'script'))
    path, df_prot, default_dir, target, method = argv

    # Subdirectory for the target; exist_ok avoids a race between
    # concurrent jobs creating the same directory.
    os.makedirs(default_dir + '/' + target, exist_ok=True)

    # Prefix of every result file: <default_dir>/<target>/<target>
    outputfile = '{}/{}/{}'.format(default_dir, target, target)

    modelling(df_prot, default_dir, outputfile, method, target)