In [33]:
import warnings
from itertools import product

import numpy as np
import pandas as pd
from mordred import Calculator, descriptors
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import (StratifiedKFold, KFold,
                                     cross_val_score, train_test_split)
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.svm import SVC

# Notebook-wide settings; the names suggest a CV fold count and a random seed.
RANDOM_STATE = 148260
N_SPLITS = 2

# Silence every warning: `Warning` is the base class of all warning categories.
warnings.filterwarnings(action="ignore", category=Warning)

In [34]:
def LoadCSV(path):
    """Read the CSV at ``path`` into a DataFrame with pandas' default parsing options."""
    return pd.read_csv(path)

def LoadCSV_BACE(path, regression = False):
    """Load the BACE CSV and attach a ``Target`` column for the chosen task.

    Processing order: read the CSV, keep only the first row for each distinct
    'mol' value, drop every row that contains a missing value, remove the
    'CID' column and the column belonging to the other task, then copy the
    task's own column into a new 'Target' column appended at the end.

    Parameters
    ----------
    path : str or file-like
        Anything accepted by ``pandas.read_csv``.
    regression : bool, default False
        If True, 'Class' is removed and 'Target' is a copy of 'pIC50'.
        If False, 'pIC50' is removed and 'Target' is a copy of 'Class'.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. The original row index is kept (not reset).

    Raises
    ------
    KeyError
        If 'mol', 'CID', 'Class' or 'pIC50' is missing from the CSV.
    """
    df = pd.read_csv(path)
    # drop_duplicates returns a new frame; without the assignment the
    # de-duplication is silently discarded.
    df = df.drop_duplicates('mol')
    df = df.dropna()

    if regression:
        source_column, unused_column = 'pIC50', 'Class'
    else:
        source_column, unused_column = 'Class', 'pIC50'

    # Non-mutating drop keeps the function safe to re-run on the same input.
    df = df.drop(columns=['CID', unused_column])

    # NOTE(review): the source column is copied, not renamed, so it stays in
    # the frame next to 'Target'. Any feature matrix built by dropping only
    # 'Target' still contains the label -- exclude it before modelling.
    df['Target'] = df[source_column]
    return df

def split_data_BACE(df):
    """Partition a frame into train/test/validation parts using its 'Model' column.

    Features are every column except 'Target' (so 'Model' stays among them);
    labels are the two columns 'Target' and 'Model'. Rows are routed by the
    'Model' values 'Train', 'Test' and 'Valid'; rows with any other value
    appear in none of the outputs.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, y_train, X_test, y_test, X_val, y_val)``
    """
    features = df.drop(columns='Target')
    labels = df[['Target', 'Model']]

    def _rows_for(frame, split_name):
        # Keep only the rows whose 'Model' entry equals the requested split.
        return frame[frame['Model'] == split_name]

    return (
        _rows_for(features, 'Train'), _rows_for(labels, 'Train'),
        _rows_for(features, 'Test'), _rows_for(labels, 'Test'),
        _rows_for(features, 'Valid'), _rows_for(labels, 'Valid'),
    )

In [35]:
# Both task variants are read from the same BACE CSV file.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
BACE_CSV_PATH = r"C:\Users\wojci\Documents\GitHub\czasteczkowa-inzynierka\experiments\BACE\bace.csv"

df_regression = LoadCSV_BACE(BACE_CSV_PATH, regression=True)
df_classification = LoadCSV_BACE(BACE_CSV_PATH)

In [36]:
# Partition the classification frame by its 'Model' labels.
(
    X_train_class, y_train_class,
    X_test_class, y_test_class,
    X_val_class, y_val_class,
) = split_data_BACE(df_classification)

In [37]:
# Partition the regression frame by its 'Model' labels.
(
    X_train_regre, y_train_regre,
    X_test_regre, y_test_regre,
    X_val_regre, y_val_regre,
) = split_data_BACE(df_regression)

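# WORRYING: the predefined 'Train' split is far smaller than the 'Test' split (203 vs. 1265 rows; 'Valid' has only 45)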

In [38]:
# Shapes of the classification feature frames, one per line: train, test, validation.
print(X_train_class.shape, X_test_class.shape, X_val_class.shape, sep="\n")

(203, 593)
(1265, 593)
(45, 593)


In [39]:
# Preview the first rows of the regression training features.
# NOTE(review): the displayed output still lists a 'pIC50' column, i.e. the
# regression target is present among the features -- confirm it is removed
# before any model is fitted.
X_train_regre.head()

Unnamed: 0,mol,Model,pIC50,MW,AlogP,HBA,HBD,RB,HeavyAtomCount,ChiralCenterCount,...,PEOE6 (PEOE6),PEOE7 (PEOE7),PEOE8 (PEOE8),PEOE9 (PEOE9),PEOE10 (PEOE10),PEOE11 (PEOE11),PEOE12 (PEOE12),PEOE13 (PEOE13),PEOE14 (PEOE14),canvasUID
0,O1CC[C@@H](NC(=O)[C@@H](Cc2cc3cc(ccc3nc2N)-c2c...,Train,9.154901,431.56979,4.4014,3,2,5,32,2,...,53.205711,78.640335,226.85541,107.43491,37.133846,0.0,7.98017,0.0,0.0,1
1,Fc1cc(cc(F)c1)C[C@H](NC(=O)[C@@H](N1CC[C@](NC(...,Train,8.853872,657.81073,2.6412,5,4,16,47,6,...,73.817162,47.1716,365.67694,174.07675,34.923889,7.98017,24.148668,0.0,24.663788,2
2,S1(=O)(=O)N(c2cc(cc3c2n(cc3CC)CC1)C(=O)N[C@H](...,Train,8.69897,591.74091,2.5499,4,3,11,42,2,...,70.365707,47.941147,192.40652,255.75255,23.654478,0.230159,15.87979,0.0,24.663788,3
3,S1(=O)(=O)C[C@@H](Cc2cc(O[C@H](COCC)C(F)(F)F)c...,Train,8.69897,591.67828,3.168,4,3,12,40,4,...,56.657166,37.954151,194.35304,202.76335,36.498634,0.980913,8.188327,0.0,26.385181,4
4,S1(=O)(=O)N(c2cc(cc3c2n(cc3CC)CC1)C(=O)N[C@H](...,Train,8.69897,629.71283,3.5086,3,3,11,44,2,...,78.945702,39.361153,179.71288,220.4613,23.654478,0.230159,15.87979,0.0,26.100143,5


In [41]:
# Preview the first rows of the regression test-split features.
X_test_regre.head()

Unnamed: 0,mol,Model,pIC50,MW,AlogP,HBA,HBD,RB,HeavyAtomCount,ChiralCenterCount,...,PEOE6 (PEOE6),PEOE7 (PEOE7),PEOE8 (PEOE8),PEOE9 (PEOE9),PEOE10 (PEOE10),PEOE11 (PEOE11),PEOE12 (PEOE12),PEOE13 (PEOE13),PEOE14 (PEOE14),canvasUID
248,O=C(NCCC(C)(C)C)C(Cc1cc2cc(ccc2nc1N)-c1ccccc1C)C,Test,9.187087,403.55969,5.7644,2,2,7,30,0,...,84.122887,46.316166,247.78938,90.395477,37.133846,0.0,7.98017,0.0,0.0,276
249,Fc1cc(cc(F)c1)CC(NC(=O)C(N1CCC(NC(=O)C)(C(CC)C...,Test,9.05061,615.73102,1.4277,5,5,13,44,0,...,67.9608,38.272877,304.05246,152.16188,34.923889,7.98017,32.336994,0.0,24.663788,277
250,Fc1cc(ccc1C)CC(NC(=O)C)C(O)C[NH2+]C1CC2(Oc3ncc...,Test,9.004365,498.6525,3.387,4,3,9,36,0,...,48.077168,49.532818,332.80533,84.453911,34.435734,15.387257,8.188327,0.0,24.663788,278
251,Fc1cc(ccc1)CC(NC(=O)C)C(O)C[NH2+]C1CC2(Oc3ncc(...,Test,9.0,484.62601,2.9008,4,3,9,35,0,...,48.077168,45.445873,299.93298,95.216072,34.435734,15.387257,8.188327,0.0,24.663788,279
252,Fc1cc(cc(F)c1)CC(NC(=O)c1cc(cc(c1)/C(=N\OC)/C)...,Test,9.0,639.75238,3.8163,6,3,17,46,0,...,37.771442,88.147522,261.31158,250.92554,35.014828,0.0,23.571255,0.0,24.663788,280


In [40]:
# Summary statistics for the numeric columns of the classification frame.
# NOTE(review): the output shows both 'Class' and 'Target' with identical
# statistics, so the label is still present as an ordinary column.
df_classification.describe()

Unnamed: 0,Class,MW,AlogP,HBA,HBD,RB,HeavyAtomCount,ChiralCenterCount,ChiralCenterCountAllPossible,RingCount,...,PEOE7 (PEOE7),PEOE8 (PEOE8),PEOE9 (PEOE9),PEOE10 (PEOE10),PEOE11 (PEOE11),PEOE12 (PEOE12),PEOE13 (PEOE13),PEOE14 (PEOE14),canvasUID,Target
count,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0,...,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0
mean,0.456709,479.661988,3.17708,3.732981,2.001322,8.04957,34.089227,0.522802,2.31725,3.769993,...,48.76374,181.83558,148.442348,30.371697,3.48865,11.74056,1.239762,14.387597,783.779907,0.456709
std,0.498287,122.083053,1.396633,1.444778,1.629343,4.741135,8.520088,1.162539,1.612558,0.87739,...,18.201519,99.717702,60.548833,12.162452,5.148336,9.073406,3.293804,13.32989,444.468806,0.498287
min,0.0,138.187,-4.3611,0.0,0.0,0.0,10.0,0.0,0.0,0.0,...,-3.551821,1.91697,-5.536391,-2.216191,-7.286308,-6.106466,-7.379991,-1.273524,1.0,0.0
25%,0.0,389.3313,2.3355,3.0,0.0,4.0,28.0,0.0,1.0,3.0,...,36.54715,102.23377,102.51045,20.13299,0.0,7.98017,0.0,0.0,407.0,0.0
50%,0.0,463.6283,3.1713,4.0,2.0,7.0,33.0,0.0,2.0,4.0,...,47.624382,171.91722,140.68362,30.107586,0.55013,8.188327,0.0,21.710098,788.0,0.0
75%,1.0,564.63953,4.0155,4.0,3.0,11.0,40.0,1.0,3.0,4.0,...,58.844093,253.67908,185.65926,37.133846,7.98017,15.87979,0.0,24.663788,1167.0,1.0
max,1.0,1350.4733,7.6174,12.0,15.0,40.0,97.0,10.0,12.0,7.0,...,124.27273,865.47333,378.51627,121.6719,29.823961,80.218018,16.681131,61.65947,1547.0,1.0
