# 1. Importing modules and functions

In [66]:
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
import chembl_structure_pipeline
from molvs import standardize_smiles
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import permutation_test_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.metrics import pairwise_distances
import joblib
import pickle
from numpy import savetxt
from padelpy import from_sdf
from IPython.display import HTML
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.ensemble import GradientBoostingRegressor
import warnings
warnings.filterwarnings('ignore')

# 2. Data entry and curation of the work set

In [67]:
# Read the work-set SDF with sanitization disabled so that every record is kept
# and structures that fail sanitization can be reported explicitly.
uploaded_file_ws="datasets/HDAC3_work.sdf"
supplier_ws = Chem.ForwardSDMolSupplier(uploaded_file_ws,sanitize=False)
failed_mols_ws = []      # molecules that failed Chem.SanitizeMol
all_mols_ws = []         # a copy of every molecule read from the file
wrong_structure_ws = []  # 1-based record numbers (as strings) of failed molecules
wrong_smiles_ws = []     # SMILES written from the failed molecules
y_tr = []                # "pchembl_value_mean" property of every record
y_bad_index = []         # 0-based positions of failed molecules

for i, m in enumerate(supplier_ws):
    structure = Chem.Mol(m)
    all_mols_ws.append(structure)
    y_tr.append(m.GetProp("pchembl_value_mean"))
    try:
        Chem.SanitizeMol(structure)
    except Exception:
        # Catch Exception instead of using a bare except so that
        # KeyboardInterrupt/SystemExit still interrupt the loop.
        failed_mols_ws.append(m)
        wrong_smiles_ws.append(Chem.MolToSmiles(m))
        wrong_structure_ws.append(str(i+1))
        y_bad_index.append(i)
print('Original data: ', len(all_mols_ws), 'molecules')
print('Failed data: ', len(failed_mols_ws), 'molecules')

# Running 1-based counter used as the index of the report table.
number_ws = [str(n) for n in range(1, len(failed_mols_ws) + 1)]
bad_molecules_ws = pd.DataFrame({'No. failed molecule in original set': wrong_structure_ws, 'SMILES of wrong structure: ': wrong_smiles_ws, 'No.': number_ws}, index=None)
bad_molecules_ws = bad_molecules_ws.set_index('No.')
bad_molecules_ws

Original data:  1400 molecules
Failed data:  0 molecules


Unnamed: 0_level_0,No. failed molecule in original set,SMILES of wrong structure:
No.,Unnamed: 1_level_1,Unnamed: 2_level_1


Deleting activity values for substances with an incorrect structure

In [68]:
# Keep only the activity values whose molecules passed sanitization; the
# positions listed in y_bad_index are removed from y_tr in place.
y_tr[:] = [value for position, value in enumerate(y_tr) if position not in y_bad_index]

In [69]:
len(y_tr)

1400

# 3. Standardization of the SDF file for the work set

In [70]:
# Drop the molecules that failed sanitization, using the positions collected in
# y_bad_index by the work-set curation cell.
# NOTE: the list is filtered in place by position, so this cell must be run
# once only, and before the test-set curation cell rebinds y_bad_index.
all_mols_ws[:] = [x for i,x in enumerate(all_mols_ws) if i not in y_bad_index]
records = [Chem.MolToSmiles(mol) for mol in all_mols_ws]

# Standardize every SMILES and rebuild a molecule from the standardized string.
moldf_ws = []
for i, record in enumerate(records):
    standard_record = standardize_smiles(record)
    m = Chem.MolFromSmiles(standard_record)
    if m is None:
        # MolFromSmiles signals a parse failure by returning None. Fail here
        # rather than storing None, which would break the row alignment with
        # y_tr and only surface later during fingerprint calculation.
        raise ValueError(
            'Standardized SMILES could not be parsed for work-set molecule '
            + str(i + 1) + ': ' + str(standard_record)
        )
    moldf_ws.append(m)

print('Kept data: ', len(moldf_ws), 'molecules')

Kept data:  1400 molecules


In [71]:
# Wrap the standardized molecules in a single-column DataFrame ('Mol').
# Note: this rebinds moldf_ws from a list to a DataFrame.
moldf_ws=pd.DataFrame(moldf_ws, columns=['Mol'])
moldf_ws

Unnamed: 0,Mol
0,<rdkit.Chem.rdchem.Mol object at 0x00000202D21...
1,<rdkit.Chem.rdchem.Mol object at 0x00000202CB3...
2,<rdkit.Chem.rdchem.Mol object at 0x00000202D21...
3,<rdkit.Chem.rdchem.Mol object at 0x00000202D21...
4,<rdkit.Chem.rdchem.Mol object at 0x00000202D21...
...,...
1395,<rdkit.Chem.rdchem.Mol object at 0x00000202D21...
1396,<rdkit.Chem.rdchem.Mol object at 0x00000202D21...
1397,<rdkit.Chem.rdchem.Mol object at 0x00000202D21...
1398,<rdkit.Chem.rdchem.Mol object at 0x00000202D21...


# 4. Data entry and curation of the test set

In [72]:
# Read the test-set SDF with sanitization disabled so that every record is kept
# and structures that fail sanitization can be reported explicitly.
uploaded_file_ts="datasets/HDAC3_test.sdf"
supplier_ts = Chem.ForwardSDMolSupplier(uploaded_file_ts,sanitize=False)
failed_mols_ts = []      # molecules that failed Chem.SanitizeMol
all_mols_ts = []         # a copy of every molecule read from the file
wrong_structure_ts = []  # 1-based record numbers (as strings) of failed molecules
wrong_smiles_ts = []     # SMILES written from the failed molecules
y_ts = []                # "pchembl_value_mean" property of every record
y_bad_index = []         # 0-based positions of failed molecules (rebinds the work-set list)
for i, m in enumerate(supplier_ts):
    structure = Chem.Mol(m)
    all_mols_ts.append(structure)
    y_ts.append(m.GetProp("pchembl_value_mean"))
    try:
        Chem.SanitizeMol(structure)
    except Exception:
        # Catch Exception instead of using a bare except so that
        # KeyboardInterrupt/SystemExit still interrupt the loop.
        failed_mols_ts.append(m)
        wrong_smiles_ts.append(Chem.MolToSmiles(m))
        wrong_structure_ts.append(str(i+1))
        y_bad_index.append(i)
print('Original data: ', len(all_mols_ts), 'molecules')
print('Failed data: ', len(failed_mols_ts), 'molecules')

# Running 1-based counter used as the index of the report table.
number_ts = [str(n) for n in range(1, len(failed_mols_ts) + 1)]
bad_molecules_ts = pd.DataFrame({'No. failed molecule in original set': wrong_structure_ts, 'SMILES of wrong structure: ': wrong_smiles_ts, 'No.': number_ts}, index=None)
bad_molecules_ts = bad_molecules_ts.set_index('No.')
bad_molecules_ts

Original data:  351 molecules
Failed data:  0 molecules


Unnamed: 0_level_0,No. failed molecule in original set,SMILES of wrong structure:
No.,Unnamed: 1_level_1,Unnamed: 2_level_1


Deleting activity values for substances with an incorrect structure

In [73]:
# Keep only the activity values whose molecules passed sanitization; the
# positions listed in y_bad_index are removed from y_ts in place.
y_ts[:] = [value for position, value in enumerate(y_ts) if position not in y_bad_index]

In [74]:
len(y_ts)

351

# 5. Standardization of the SDF file for the test set

In [75]:
# Drop the molecules that failed sanitization, using the positions collected in
# y_bad_index by the test-set curation cell.
# NOTE: the list is filtered in place by position, so this cell must be run
# once only per execution of the test-set curation cell.
all_mols_ts[:] = [x for i,x in enumerate(all_mols_ts) if i not in y_bad_index]
records = [Chem.MolToSmiles(mol) for mol in all_mols_ts]

# Standardize every SMILES and rebuild a molecule from the standardized string.
moldf_ts = []
for i, record in enumerate(records):
    standard_record = standardize_smiles(record)
    m = Chem.MolFromSmiles(standard_record)
    if m is None:
        # MolFromSmiles signals a parse failure by returning None. Fail here
        # rather than storing None, which would break the row alignment with
        # y_ts and only surface later during fingerprint calculation.
        raise ValueError(
            'Standardized SMILES could not be parsed for test-set molecule '
            + str(i + 1) + ': ' + str(standard_record)
        )
    moldf_ts.append(m)

print('Kept data: ', len(moldf_ts), 'molecules')

Kept data:  351 molecules


In [76]:
# Wrap the standardized molecules in a single-column DataFrame ('Mol').
# Note: this rebinds moldf_ts from a list to a DataFrame.
moldf_ts=pd.DataFrame(moldf_ts, columns=['Mol'])
moldf_ts

Unnamed: 0,Mol
0,<rdkit.Chem.rdchem.Mol object at 0x00000202CB4...
1,<rdkit.Chem.rdchem.Mol object at 0x00000202D13...
2,<rdkit.Chem.rdchem.Mol object at 0x00000202D13...
3,<rdkit.Chem.rdchem.Mol object at 0x00000202D13...
4,<rdkit.Chem.rdchem.Mol object at 0x00000202D13...
...,...
346,<rdkit.Chem.rdchem.Mol object at 0x00000202D21...
347,<rdkit.Chem.rdchem.Mol object at 0x00000202D21...
348,<rdkit.Chem.rdchem.Mol object at 0x00000202D21...
349,<rdkit.Chem.rdchem.Mol object at 0x00000202D21...


# 6. Calculation of Morgan fingerprints for the work set

In [77]:
def calcfp(mol,funcFPInfo=dict(radius=2, nBits=1024, useFeatures=False, useChirality=False)):
    """Compute a Morgan bit-vector fingerprint for one molecule.

    Parameters
    ----------
    mol : RDKit molecule
        Molecule passed to ``AllChem.GetMorganFingerprintAsBitVect``.
    funcFPInfo : dict
        Keyword arguments forwarded unchanged to the fingerprint function.
        The default dict is shared between calls and must not be mutated.

    Returns
    -------
    pandas.Series
        One entry per fingerprint bit, labelled ``Bit_0``, ``Bit_1``, ...
    """
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(mol, **funcFPInfo)
    return pd.Series(np.asarray(bit_vector)).add_prefix('Bit_')

# Work (training) set: one row per molecule, one column per fingerprint bit.
desc_ws = moldf_ws['Mol'].apply(calcfp)
desc_ws

Unnamed: 0,Bit_0,Bit_1,Bit_2,Bit_3,Bit_4,Bit_5,Bit_6,Bit_7,Bit_8,Bit_9,...,Bit_1014,Bit_1015,Bit_1016,Bit_1017,Bit_1018,Bit_1019,Bit_1020,Bit_1021,Bit_1022,Bit_1023
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1395,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1396,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1397,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1398,0,1,0,0,1,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [78]:
# Convert the activity values collected from the SDF properties into a
# float32 array; the length is shown as a sanity check against desc_ws.
y_tr = np.array(y_tr, dtype=np.float32)
len(y_tr)

1400

# 7. Calculation of Morgan fingerprints for the test set

In [79]:
# Test set: same fingerprint settings as the work set (calcfp defaults).
desc_ts = moldf_ts['Mol'].apply(calcfp)
desc_ts

Unnamed: 0,Bit_0,Bit_1,Bit_2,Bit_3,Bit_4,Bit_5,Bit_6,Bit_7,Bit_8,Bit_9,...,Bit_1014,Bit_1015,Bit_1016,Bit_1017,Bit_1018,Bit_1019,Bit_1020,Bit_1021,Bit_1022,Bit_1023
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
347,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
348,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
349,0,0,1,0,1,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0


In [80]:
# Convert the test-set activity values into a float32 array.
y_ts = np.array(y_ts, dtype=np.float32)

# BASELINE

 ## GradientBoostingRegressor model building and validation

In [81]:
# Single random seed shared by the CV splitter and the estimator.
seed = 42

In [82]:
# Shuffled 5-fold splitter; the fixed seed makes the folds reproducible.
cv=KFold(n_splits=5, random_state=seed, shuffle=True)

In [83]:
# Gradient boosting regressor with hand-set hyperparameters. This single
# `estimator` object is reused (and refit) by every later section.
estimator = GradientBoostingRegressor(learning_rate=0.01,subsample= 0.5, n_estimators=1000, max_depth= 10,  random_state=seed)

In [84]:
# Baseline: fit on the full 1024-bit fingerprint table of the work set.
estimator.fit(desc_ws, y_tr)

In [85]:
# Cross-validated predictions for the work set using the 5-fold splitter `cv`.
y_pred_CV_GBR = cross_val_predict(estimator, desc_ws, y_tr, cv=cv, n_jobs=-1)

In [86]:
# Cross-validated R2 (Q2) of the baseline, rounded to two decimals.
Q2_CV = round(r2_score(y_tr, y_pred_CV_GBR), 2)
Q2_CV

0.69

In [87]:
# Cross-validated RMSE of the baseline, rounded to two decimals.
RMSE_CV=round(np.sqrt(mean_squared_error(y_tr, y_pred_CV_GBR)), 2)
RMSE_CV

0.68

# VarianceThreshold

In [88]:
from sklearn.feature_selection import VarianceThreshold

In [89]:
def variance_threshold_selector(data, threshold=0.5):
    """Return the columns of ``data`` selected by a fitted ``VarianceThreshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table; the selector is fitted on it and columns are picked
        from it by position.
    threshold : float, default 0.5
        Value passed to ``VarianceThreshold``.

    Returns
    -------
    pandas.DataFrame
        ``data`` restricted to the columns the selector supports, with the
        original column labels preserved.
    """
    kept_positions = VarianceThreshold(threshold).fit(data).get_support(indices=True)
    return data[data.columns[kept_positions]]

In [90]:
# Apply the variance filter with a 0.05 threshold to the work-set fingerprints.
# Despite its name, this variable holds the features that REMAIN after the
# constant and near-constant bits have been removed.
x_tr_const_and_nearcont=variance_threshold_selector(desc_ws, 0.05)
x_tr_const_and_nearcont

Unnamed: 0,Bit_1,Bit_4,Bit_8,Bit_14,Bit_15,Bit_25,Bit_31,Bit_33,Bit_36,Bit_41,...,Bit_967,Bit_971,Bit_980,Bit_997,Bit_999,Bit_1009,Bit_1010,Bit_1016,Bit_1017,Bit_1019
0,1,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1395,1,0,0,0,0,0,0,1,1,0,...,0,1,0,1,0,0,1,1,0,1
1396,1,0,0,0,0,0,0,1,1,0,...,0,1,0,1,0,0,1,1,0,1
1397,1,0,0,0,0,0,0,1,1,0,...,0,1,0,1,0,0,1,1,0,1
1398,1,1,1,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1


In [91]:
# Refit the shared estimator on the variance-filtered features (replaces the baseline fit).
estimator.fit(x_tr_const_and_nearcont, y_tr)

In [92]:
# Cross-validated predictions on the variance-filtered features (same folds as the baseline).
y_pred_CV_GBR = cross_val_predict(estimator, x_tr_const_and_nearcont, y_tr, cv=cv, n_jobs=-1)

In [93]:
# Q2 after variance filtering (overwrites the baseline value of Q2_CV).
Q2_CV = round(r2_score(y_tr, y_pred_CV_GBR), 2)
Q2_CV

0.7

In [94]:
# Cross-validated RMSE after variance filtering (overwrites the baseline value of RMSE_CV).
RMSE_CV=round(np.sqrt(mean_squared_error(y_tr, y_pred_CV_GBR)), 2)
RMSE_CV

0.67

# Finding and removing duplicate features

In [95]:
def get_duplicates(X):
    """Group the columns of ``X`` that exactly duplicate an earlier column.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table whose columns are compared pairwise with
        ``Series.equals``.

    Returns
    -------
    dict
        Maps the first column of each duplicate group to the list of later
        columns equal to it. Columns without duplicates are omitted, so an
        empty dict means that no duplicated columns were found.
    """
    pairs = {}
    dups = set()  # columns already assigned to a group; never reused as a key
    for i in range(X.shape[1]):
        feat_outer = X.columns[i]
        if feat_outer in dups:
            continue
        for feat_inner in X.columns[i + 1:]:
            if X[feat_outer].equals(X[feat_inner]):
                # setdefault creates the list on the first hit; indexing the
                # empty dict directly raised KeyError as soon as any
                # duplicate column was found.
                pairs.setdefault(feat_outer, []).append(feat_inner)
                dups.add(feat_inner)
    return pairs

In [96]:
get_duplicates(x_tr_const_and_nearcont)

{}

# Removing highly correlated features

In [97]:
# Absolute pairwise correlation matrix of the variance-filtered bits.
cor_df = x_tr_const_and_nearcont.corr().abs()
cor_df

Unnamed: 0,Bit_1,Bit_4,Bit_8,Bit_14,Bit_15,Bit_25,Bit_31,Bit_33,Bit_36,Bit_41,...,Bit_967,Bit_971,Bit_980,Bit_997,Bit_999,Bit_1009,Bit_1010,Bit_1016,Bit_1017,Bit_1019
Bit_1,1.000000,0.023343,0.060250,0.009891,0.011247,0.051526,0.056199,0.309418,0.204014,0.181086,...,0.111018,0.466243,0.044719,0.087555,0.194876,0.077911,0.311692,0.403763,0.021016,0.351924
Bit_4,0.023343,1.000000,0.027675,0.014926,0.037057,0.026979,0.016688,0.129453,0.123931,0.068172,...,0.079457,0.007767,0.014474,0.081022,0.017261,0.042160,0.065133,0.001191,0.028612,0.185722
Bit_8,0.060250,0.027675,1.000000,0.221732,0.042426,0.127507,0.114649,0.072877,0.168621,0.345448,...,0.158420,0.014074,0.089367,0.061572,0.364084,0.142961,0.153835,0.123936,0.095435,0.185521
Bit_14,0.009891,0.014926,0.221732,1.000000,0.070198,0.036017,0.057341,0.065951,0.071944,0.050997,...,0.077259,0.068002,0.088052,0.069185,0.086880,0.029150,0.072931,0.069101,0.029323,0.024528
Bit_15,0.011247,0.037057,0.042426,0.070198,1.000000,0.026488,0.101954,0.021461,0.008593,0.030356,...,0.075230,0.048196,0.158765,0.048643,0.004722,0.053132,0.059986,0.067215,0.083452,0.009351
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Bit_1009,0.077911,0.042160,0.142961,0.029150,0.053132,0.669497,0.724743,0.031466,0.005697,0.145736,...,0.003805,0.090313,0.006980,0.007684,0.077589,1.000000,0.021332,0.079413,0.382081,0.104191
Bit_1010,0.311692,0.065133,0.153835,0.072931,0.059986,0.003054,0.042885,0.226246,0.460361,0.145991,...,0.077332,0.486556,0.045322,0.187867,0.144834,0.021332,1.000000,0.750003,0.003429,0.272967
Bit_1016,0.403763,0.001191,0.123936,0.069101,0.067215,0.069502,0.025609,0.222045,0.515797,0.216998,...,0.075580,0.642871,0.047347,0.196146,0.128637,0.079413,0.750003,1.000000,0.015121,0.416002
Bit_1017,0.021016,0.028612,0.095435,0.029323,0.083452,0.202189,0.361635,0.070850,0.023592,0.096186,...,0.010623,0.018075,0.021782,0.012167,0.033495,0.382081,0.003429,0.015121,1.000000,0.027988


In [98]:
# NOTE(review): exact repeat of the previous cell; it recomputes the same
# matrix and can be removed without affecting later cells.
cor_df = x_tr_const_and_nearcont.corr().abs()
cor_df

Unnamed: 0,Bit_1,Bit_4,Bit_8,Bit_14,Bit_15,Bit_25,Bit_31,Bit_33,Bit_36,Bit_41,...,Bit_967,Bit_971,Bit_980,Bit_997,Bit_999,Bit_1009,Bit_1010,Bit_1016,Bit_1017,Bit_1019
Bit_1,1.000000,0.023343,0.060250,0.009891,0.011247,0.051526,0.056199,0.309418,0.204014,0.181086,...,0.111018,0.466243,0.044719,0.087555,0.194876,0.077911,0.311692,0.403763,0.021016,0.351924
Bit_4,0.023343,1.000000,0.027675,0.014926,0.037057,0.026979,0.016688,0.129453,0.123931,0.068172,...,0.079457,0.007767,0.014474,0.081022,0.017261,0.042160,0.065133,0.001191,0.028612,0.185722
Bit_8,0.060250,0.027675,1.000000,0.221732,0.042426,0.127507,0.114649,0.072877,0.168621,0.345448,...,0.158420,0.014074,0.089367,0.061572,0.364084,0.142961,0.153835,0.123936,0.095435,0.185521
Bit_14,0.009891,0.014926,0.221732,1.000000,0.070198,0.036017,0.057341,0.065951,0.071944,0.050997,...,0.077259,0.068002,0.088052,0.069185,0.086880,0.029150,0.072931,0.069101,0.029323,0.024528
Bit_15,0.011247,0.037057,0.042426,0.070198,1.000000,0.026488,0.101954,0.021461,0.008593,0.030356,...,0.075230,0.048196,0.158765,0.048643,0.004722,0.053132,0.059986,0.067215,0.083452,0.009351
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Bit_1009,0.077911,0.042160,0.142961,0.029150,0.053132,0.669497,0.724743,0.031466,0.005697,0.145736,...,0.003805,0.090313,0.006980,0.007684,0.077589,1.000000,0.021332,0.079413,0.382081,0.104191
Bit_1010,0.311692,0.065133,0.153835,0.072931,0.059986,0.003054,0.042885,0.226246,0.460361,0.145991,...,0.077332,0.486556,0.045322,0.187867,0.144834,0.021332,1.000000,0.750003,0.003429,0.272967
Bit_1016,0.403763,0.001191,0.123936,0.069101,0.067215,0.069502,0.025609,0.222045,0.515797,0.216998,...,0.075580,0.642871,0.047347,0.196146,0.128637,0.079413,0.750003,1.000000,0.015121,0.416002
Bit_1017,0.021016,0.028612,0.095435,0.029323,0.083452,0.202189,0.361635,0.070850,0.023592,0.096186,...,0.010623,0.018075,0.021782,0.012167,0.033495,0.382081,0.003429,0.015121,1.000000,0.027988


In [99]:
# Boolean mask that is True on and above the main diagonal of the correlation matrix.
mask=np.triu(np.ones_like(cor_df, dtype=bool))

In [100]:
# Replace the masked cells with NaN so that every feature pair appears once
# (strictly lower triangle) and self-correlations are excluded.
tri_df=cor_df.mask(mask)
tri_df

Unnamed: 0,Bit_1,Bit_4,Bit_8,Bit_14,Bit_15,Bit_25,Bit_31,Bit_33,Bit_36,Bit_41,...,Bit_967,Bit_971,Bit_980,Bit_997,Bit_999,Bit_1009,Bit_1010,Bit_1016,Bit_1017,Bit_1019
Bit_1,,,,,,,,,,,...,,,,,,,,,,
Bit_4,0.023343,,,,,,,,,,...,,,,,,,,,,
Bit_8,0.060250,0.027675,,,,,,,,,...,,,,,,,,,,
Bit_14,0.009891,0.014926,0.221732,,,,,,,,...,,,,,,,,,,
Bit_15,0.011247,0.037057,0.042426,0.070198,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Bit_1009,0.077911,0.042160,0.142961,0.029150,0.053132,0.669497,0.724743,0.031466,0.005697,0.145736,...,0.003805,0.090313,0.006980,0.007684,0.077589,,,,,
Bit_1010,0.311692,0.065133,0.153835,0.072931,0.059986,0.003054,0.042885,0.226246,0.460361,0.145991,...,0.077332,0.486556,0.045322,0.187867,0.144834,0.021332,,,,
Bit_1016,0.403763,0.001191,0.123936,0.069101,0.067215,0.069502,0.025609,0.222045,0.515797,0.216998,...,0.075580,0.642871,0.047347,0.196146,0.128637,0.079413,0.750003,,,
Bit_1017,0.021016,0.028612,0.095435,0.029323,0.083452,0.202189,0.361635,0.070850,0.023592,0.096186,...,0.010623,0.018075,0.021782,0.012167,0.033495,0.382081,0.003429,0.015121,,


In [101]:
# A column is marked for removal when its absolute correlation with at least
# one other feature in the lower triangle exceeds 0.95.
highly_correlated = (tri_df > 0.95).any(axis=0)
to_drop = tri_df.columns[highly_correlated].tolist()
to_drop

['Bit_109', 'Bit_238', 'Bit_331', 'Bit_607']

In [102]:
# Remove the highly correlated bits listed in to_drop from the variance-filtered features.
reduced_df_cor=x_tr_const_and_nearcont.drop(to_drop, axis=1)

In [103]:
reduced_df_cor

Unnamed: 0,Bit_1,Bit_4,Bit_8,Bit_14,Bit_15,Bit_25,Bit_31,Bit_33,Bit_36,Bit_41,...,Bit_967,Bit_971,Bit_980,Bit_997,Bit_999,Bit_1009,Bit_1010,Bit_1016,Bit_1017,Bit_1019
0,1,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1395,1,0,0,0,0,0,0,1,1,0,...,0,1,0,1,0,0,1,1,0,1
1396,1,0,0,0,0,0,0,1,1,0,...,0,1,0,1,0,0,1,1,0,1
1397,1,0,0,0,0,0,0,1,1,0,...,0,1,0,1,0,0,1,1,0,1
1398,1,1,1,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1


In [104]:
# Refit the shared estimator on the correlation-filtered features; this is the
# fit that the test-set prediction section relies on.
estimator.fit(reduced_df_cor, y_tr)

In [105]:
# Cross-validated predictions on the correlation-filtered features.
y_pred_CV_GBR = cross_val_predict(estimator, reduced_df_cor, y_tr, cv=cv, n_jobs=-1)

In [106]:
# Q2 after removing highly correlated features (overwrites Q2_CV).
Q2_CV = round(r2_score(y_tr, y_pred_CV_GBR), 2)
Q2_CV

0.7

In [107]:
# Cross-validated RMSE after removing highly correlated features (overwrites RMSE_CV).
RMSE_CV=round(np.sqrt(mean_squared_error(y_tr, y_pred_CV_GBR)), 2)
RMSE_CV

0.67

In [108]:
# Column names of the final feature subset; used below to select the same bits
# from the test-set fingerprints.
feature_name_reduced_df_cor = reduced_df_cor.columns.tolist()
len(feature_name_reduced_df_cor)

205

# Prediction for the test set molecules

In [49]:
# Restrict the test-set fingerprints to the bits kept for the work set, in the same order.
x_ts=desc_ts[feature_name_reduced_df_cor]

In [50]:
# NOTE(review): y_ts was already converted to float32 right after the test-set
# fingerprints were computed; this repeat is harmless but redundant.
y_ts = np.array(y_ts, dtype=np.float32)

In [51]:
# Predict the test set with the shared estimator. It must have been fit last on
# reduced_df_cor; the PCA section further down refits the same object on
# different inputs, so this cell is only valid when the notebook runs top to bottom.
y_pred_GBR = estimator.predict(x_ts)

In [52]:
# R2 on the external test set, rounded to two decimals.
Q2_TS = round(r2_score(y_ts, y_pred_GBR), 2)
Q2_TS

0.71

In [53]:
# RMSE on the external test set, rounded to two decimals.
RMSE_TS=round(np.sqrt(mean_squared_error(y_ts, y_pred_GBR)), 2)
RMSE_TS

0.66

# Using PCA

The number of components is chosen so that 95% of the variance is retained.

## All features

In [124]:
from sklearn.decomposition import PCA

# PCA(0.95): keep as many components as needed to retain 95% of the variance.
# Here it is fitted on the full fingerprint table of the work set.
pca = PCA(0.95)
X_pca = pca.fit_transform(desc_ws)
X_pca.shape

(1400, 319)

In [125]:
pca.n_components_

319

In [126]:
# Refit the shared estimator on the PCA components (replaces the fit used for
# the test-set prediction above).
estimator.fit(X_pca, y_tr)

In [127]:
# Cross-validated predictions on the PCA components; the PCA itself was fitted
# on the whole work set before the folds were split.
y_pred_CV_GBR = cross_val_predict(estimator, X_pca, y_tr, cv=cv, n_jobs=-1)

In [128]:
# Q2 for PCA on all features (overwrites Q2_CV).
Q2_CV = round(r2_score(y_tr, y_pred_CV_GBR), 2)
Q2_CV

0.58

In [129]:
# Cross-validated RMSE for PCA on all features (overwrites RMSE_CV).
RMSE_CV=round(np.sqrt(mean_squared_error(y_tr, y_pred_CV_GBR)), 2)
RMSE_CV

0.79

## Subset of features after removing highly correlated features

In [130]:
# Refit the same pca object on the correlation-filtered subset; this overwrites
# both the fitted PCA and X_pca from the previous section.
X_pca = pca.fit_transform(reduced_df_cor)
X_pca.shape

(1400, 107)

In [131]:
pca.n_components_

107

In [133]:
# Refit the shared estimator on the PCA components of the reduced feature subset.
estimator.fit(X_pca, y_tr)

In [134]:
# Cross-validated predictions on the PCA components of the reduced feature subset.
y_pred_CV_GBR = cross_val_predict(estimator, X_pca, y_tr, cv=cv, n_jobs=-1)

In [135]:
# Q2 for PCA on the reduced feature subset (overwrites Q2_CV).
Q2_CV = round(r2_score(y_tr, y_pred_CV_GBR), 2)
Q2_CV

0.61

In [136]:
# Cross-validated RMSE for PCA on the reduced feature subset (overwrites RMSE_CV).
RMSE_CV=round(np.sqrt(mean_squared_error(y_tr, y_pred_CV_GBR)), 2)
RMSE_CV

0.76