# 1. Importing modules and functions

In [1]:
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
import chembl_structure_pipeline
from molvs import standardize_smiles
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import permutation_test_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.metrics import pairwise_distances
import joblib
import pickle
from numpy import savetxt
from padelpy import from_sdf
from IPython.display import HTML
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.svm import SVR

# 2.Data entry and curation work set

In [2]:
uploaded_file_ws="datasets/HDAC2_work.sdf"
supplier_ws = Chem.ForwardSDMolSupplier(uploaded_file_ws,sanitize=False)
failed_mols_ws = []
all_mols_ws =[]
wrong_structure_ws=[]
wrong_smiles_ws=[]
y_tr = []
y_bad_index=[]

for i, m in enumerate(supplier_ws):
    structure = Chem.Mol(m)
    all_mols_ws.append(structure)
    y_tr.append(m.GetProp("pchembl_value_mean"))
    try:
        Chem.SanitizeMol(structure)
    except:
        failed_mols_ws.append(m)
        wrong_smiles_ws.append(Chem.MolToSmiles(m))
        wrong_structure_ws.append(str(i+1))
        y_bad_index.append(i)
print('Original data: ', len(all_mols_ws), 'molecules')
print('Failed data: ', len(failed_mols_ws), 'molecules')
number_ws =[]
for i in range(len(failed_mols_ws)):
        number_ws.append(str(i+1))
bad_molecules_ws = pd.DataFrame({'No. failed molecule in original set': wrong_structure_ws, 'SMILES of wrong structure: ': wrong_smiles_ws, 'No.': number_ws}, index=None)
bad_molecules_ws = bad_molecules_ws.set_index('No.')
bad_molecules_ws

Original data:  1288 molecules
Failed data:  0 molecules


Unnamed: 0_level_0,No. failed molecule in original set,SMILES of wrong structure:
No.,Unnamed: 1_level_1,Unnamed: 2_level_1


deleting activity values for substances with incorrect structure

In [3]:
y_tr[:] = [x for i,x in enumerate(y_tr) if i not in y_bad_index]

In [4]:
len(y_tr)

1288

# 3.Standardization SDF file for work set

In [5]:
records_ws = []
for i in range(len(all_mols_ws)):
    record = Chem.MolToMolBlock(all_mols_ws[i])
    records_ws.append(record)
            
mols_ws = []
for i,record in enumerate(records_ws):
    standard_record = chembl_structure_pipeline.standardize_molblock(record)
    m = Chem.MolFromMolBlock(standard_record)
    mols_ws.append(m)
           
moldf_ws = []
for val in mols_ws:
    if val != None:
        moldf_ws.append(val)
print('Kept data: ', len(moldf_ws), 'molecules')

Kept data:  1288 molecules


# 4.Data entry and curation test set

In [6]:
uploaded_file_ts="datasets/HDAC2_test.sdf"
supplier_ts = Chem.ForwardSDMolSupplier(uploaded_file_ts,sanitize=False)
failed_mols_ts = []
all_mols_ts =[]
wrong_structure_ts=[]
wrong_smiles_ts=[]
y_ts = []
y_bad_index=[]
for i, m in enumerate(supplier_ts):
    structure = Chem.Mol(m)
    all_mols_ts.append(structure)
    y_ts.append(m.GetProp("pchembl_value_mean"))
    try:
        Chem.SanitizeMol(structure)
    except:
        failed_mols_ts.append(m)
        wrong_smiles_ts.append(Chem.MolToSmiles(m))
        wrong_structure_ts.append(str(i+1))
        y_bad_index.append(i)
print('Original data: ', len(all_mols_ts), 'molecules')
print('Failed data: ', len(failed_mols_ts), 'molecules')
number_ts =[]
for i in range(len(failed_mols_ts)):
        number_ts.append(str(i+1))
bad_molecules_ts = pd.DataFrame({'No. failed molecule in original set': wrong_structure_ts, 'SMILES of wrong structure: ': wrong_smiles_ts, 'No.': number_ts}, index=None)
bad_molecules_ts = bad_molecules_ts.set_index('No.')
bad_molecules_ts

Original data:  323 molecules
Failed data:  0 molecules


Unnamed: 0_level_0,No. failed molecule in original set,SMILES of wrong structure:
No.,Unnamed: 1_level_1,Unnamed: 2_level_1


deleting activity values for substances with incorrect structure

In [7]:
y_ts[:] = [x for i,x in enumerate(y_ts) if i not in y_bad_index]

In [8]:
len(y_ts)

323

# 5.Standardization SDF file for test set

In [9]:
records_ts = []
for i in range(len(all_mols_ts)):
    record = Chem.MolToMolBlock(all_mols_ts[i])
    records_ts.append(record)
            
mols_ts = []
for i,record in enumerate(records_ts):
    standard_record = chembl_structure_pipeline.standardize_molblock(record)
    m = Chem.MolFromMolBlock(standard_record)
    mols_ts.append(m)
           
moldf_ts = []
for val in mols_ts:
    if val != None:
        moldf_ts.append(val)
print('Kept data: ', len(moldf_ts), 'molecules')

Kept data:  323 molecules


# 6.Descriptor calculation for work set

In [10]:
fp_tr = [Chem.RDKFingerprint(m) for m in moldf_ws]

In [11]:
def rdkit_numpy_convert(fp_tr):
    output = []
    for f in fp_tr:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)

In [12]:
from numpy import savetxt
x_tr = rdkit_numpy_convert(fp_tr)

In [13]:
x_tr = pd.DataFrame(np.array(x_tr)) 

In [14]:
x_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
1,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1283,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0
1284,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
1285,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
1286,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [15]:
x_tr.to_csv('Models/Topological_fingerprints/x_ts_TP.csv', index=True)

In [16]:
x_tr.shape

(1288, 2048)

# 7.Descriptor calculation for test set

In [17]:
fp_ts = [Chem.RDKFingerprint(m) for m in moldf_ts]

In [18]:
def rdkit_numpy_convert(fp_ts):
    output = []
    for f in fp_ts:
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(f, arr)
        output.append(arr)
    return np.asarray(output)

In [19]:
x_ts = rdkit_numpy_convert(fp_ts)

In [20]:
x_ts = pd.DataFrame(np.array(x_ts)) 

In [21]:
x_ts

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
3,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
319,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
320,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0
321,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [22]:
x_ts.to_csv('Models/Topological_fingerprints/x_ts_TP.csv', index=True)

In [23]:
x_ts.shape

(323, 2048)

In [24]:
x_tr = np.array(x_tr, dtype=np.float32)
y_tr = np.array(y_tr, dtype=np.float32)

# 8. Random forest model building and validation

In [25]:
seed = 42

In [26]:
cv=KFold(n_splits=5, random_state=seed, shuffle=True)

In [27]:
param_grid = {"max_features": [x_tr.shape[1] // 10, x_tr.shape[1] // 7, x_tr.shape[1] // 5, x_tr.shape[1] // 3, x_tr.shape[1] // 2],
              "n_estimators": [100, 250, 500, 1000]}

In [28]:
m = GridSearchCV(RandomForestRegressor(), param_grid, n_jobs=2, cv=cv, verbose=1)

In [29]:
m.fit(x_tr, y_tr)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=RandomForestRegressor(), n_jobs=2,
             param_grid={'max_features': [204, 292, 409, 682, 1024],
                         'n_estimators': [100, 250, 500, 1000]},
             verbose=1)

In [30]:
m.best_params_
best_RF = m.best_estimator_

In [31]:
y_pred_CV_RF = cross_val_predict(best_RF, x_tr, y_tr, cv=cv)

In [32]:
y_pred_CV_RF

array([4.81018917, 6.41538472, 6.34974898, ..., 7.23449827, 8.02977996,
       7.79158994])

In [33]:
Q2_CV = round(r2_score(y_tr, y_pred_CV_RF), 2)
Q2_CV

0.54

In [34]:
RMSE_CV=round(np.sqrt(mean_absolute_error(y_tr, y_pred_CV_RF)), 2)
RMSE_CV

0.74

# 9. Prediction for test set's molecules

In [45]:
x_ts = np.array(x_ts, dtype=np.float32)
y_ts = np.array(y_ts, dtype=np.float32)

In [46]:
y_pred_rf = best_RF.predict(x_ts)

In [47]:
Q2_TS = round(r2_score(y_ts, y_pred_rf), 2)
Q2_TS

0.58

In [48]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts, y_pred_rf)), 2)
RMSE_TS

0.72

# save the model to disk

In [39]:
pickle.dump(best_RF, open('Models/Topological_fingerprints/HDAC2_RF_TF.pkl', 'wb'))

# load the model from disk

In [27]:
best_RF = pickle.load(open('Models/Topological_fingerprints/HDAC2_RF_TF.pkl', 'rb'))

# 10. Y-randomization RF model

In [28]:
permutations = 100
score, permutation_scores, pvalue = permutation_test_score(best_RF, x_tr, y_tr,
                                                           cv=cv, scoring='r2',
                                                           n_permutations=permutations,
                                                           n_jobs=-1,
                                                           verbose=1,
                                                           random_state=24)
print('True score = ', score.round(2),
      '\nY-randomization = ', np.mean(permutation_scores).round(2),
      '\np-value = ', pvalue.round(4))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 74.4min


True score =  0.54 
Y-randomization =  -0.17 
p-value =  0.0099


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 156.3min finished


# Estimating applicability domain. Method - Euclidian distances, K=1

In [49]:
neighbors_k= pairwise_distances(x_tr, n_jobs=-1)
neighbors_k.sort(0)

In [50]:
df_tr=pd.DataFrame(neighbors_k)
df_tr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1278,1279,1280,1281,1282,1283,1284,1285,1286,1287
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,9.273619,23.853722,19.924858,11.747340,7.937254,12.961481,21.095022,15.716233,22.248596,19.131126,...,6.082763,20.396078,7.348469,6.708204,2.000000,11.532562,4.582576,7.348469,7.416198,12.649111
2,10.630146,24.228083,20.024984,19.183327,10.099504,14.832397,21.377558,20.880613,22.671568,20.832666,...,14.866069,21.863211,15.066519,6.855655,3.162278,22.956480,5.656854,13.152946,7.681146,14.177447
3,11.269427,24.576412,20.124611,19.287302,12.369317,15.297058,22.248596,22.068077,22.912878,21.071308,...,16.911535,22.271057,17.691807,15.748015,3.605551,24.979992,6.480741,16.093477,8.831760,14.798649
4,12.767145,24.617067,20.297783,19.442223,13.527749,15.811388,22.627417,22.978251,23.086792,21.563858,...,20.174240,22.449944,19.364916,15.779734,4.242640,25.787594,6.633250,18.357559,8.888194,15.066519
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1283,33.704597,33.045422,34.322006,36.523964,33.660065,34.928497,33.674915,33.926392,32.155869,36.069378,...,33.778690,32.218006,32.295509,34.554306,33.075672,33.911652,34.452866,32.434551,32.403702,34.044090
1284,33.778690,33.120991,34.510868,37.242451,33.704597,34.957115,33.749073,33.970577,32.171417,36.715118,...,33.808285,32.233524,32.326458,34.626579,33.196384,34.044090,34.452866,32.496155,32.511536,34.073448
1285,34.117443,33.181320,34.525352,37.443291,33.719429,35.128338,33.763885,34.263683,32.280025,36.945908,...,34.073448,32.249031,32.326458,34.655445,33.226494,34.044090,34.496376,32.603680,32.572994,34.146744
1286,34.971416,33.585712,35.958309,37.960506,33.985291,36.565010,34.828148,34.684292,32.280025,38.209946,...,34.088120,32.280025,32.388271,34.669872,33.241539,34.088120,34.597687,32.756680,32.603680,34.249088


In [51]:
similarity= neighbors_k

In [52]:
Dmean=np.mean(similarity[1,:])

In [53]:
round(Dmean, 2)

10.38

In [54]:
std=np.std(similarity[1,:])

In [55]:
round(std, 2)

5.45

In [56]:
model_AD_limit=Dmean+std*0.5
print(np.round(model_AD_limit, 2))

13.1


In [57]:
neighbors_k_ts= pairwise_distances(x_tr,Y=x_ts, n_jobs=-1)
neighbors_k_ts.sort(0)

In [58]:
x_ts_AD=pd.DataFrame(neighbors_k_ts)
x_ts_AD

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,313,314,315,316,317,318,319,320,321,322
0,11.747340,9.848858,11.704700,11.789826,7.000000,8.660254,6.480741,12.206555,16.613247,6.480741,...,15.264338,5.291502,13.304134,3.162278,12.409674,4.472136,20.199009,21.283796,9.797959,11.618950
1,11.832160,12.688578,20.832666,17.464249,7.000000,10.862781,15.132746,24.617067,17.262676,18.601076,...,16.186415,14.594520,13.304134,3.316625,12.884099,12.649111,20.904545,21.307276,21.260292,14.352700
2,11.832160,19.467922,20.880613,21.377558,7.141428,12.288206,23.366642,25.019993,17.291616,19.849434,...,18.110771,21.610184,13.341664,4.898980,13.379088,13.076696,21.047565,21.447611,21.470911,14.798649
3,11.832160,19.974985,21.095022,21.470911,7.280110,12.529964,23.579653,25.059929,17.320509,20.928450,...,18.220867,21.863211,13.711309,5.656854,14.317822,13.527749,21.447611,21.494184,22.158520,14.832397
4,11.916375,19.974985,21.166010,21.656408,9.327379,12.569805,24.310492,25.298222,17.320509,23.853722,...,18.220867,22.693611,14.628738,5.830952,16.031219,13.674794,21.840330,21.587032,22.583179,14.866069
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1283,36.755951,36.646965,34.132095,35.213634,36.905285,33.763885,38.716923,33.466400,35.902645,33.763885,...,32.572994,33.808285,36.428013,34.336571,33.015148,35.185223,32.924156,34.292858,33.256577,32.572994
1284,37.121422,37.013512,34.234486,36.097092,37.269291,33.896904,38.781441,33.481339,36.180103,33.778690,...,32.603680,33.852623,36.441734,34.336571,33.151169,35.213634,32.924156,34.583233,33.301651,32.603680
1285,37.296112,37.242451,34.307434,36.304268,37.549965,34.336571,38.820099,33.555923,36.221542,33.793491,...,32.619015,33.867390,36.932369,34.380226,33.151169,35.242020,32.939339,34.756294,33.346664,32.634338
1286,38.522720,37.709415,35.608986,37.376465,39.127995,35.156792,39.433487,33.896904,36.796738,33.867390,...,32.726135,33.896904,38.223030,34.452866,33.256577,35.763111,33.181320,35.874783,33.451458,32.664967


In [59]:
similarity_ts= neighbors_k_ts
cpd_AD=similarity_ts[0,:]
cpd_value = np.round(cpd_AD, 3)
print(cpd_value)

[11.747  9.849 11.705 11.79   7.     8.66   6.481 12.207 16.613  6.481
  5.831  6.856 11.747  7.28  12.45   6.782  9.849  3.    18.     4.796
  8.888 15.937  8.718 14.629 12.166  7.071  8.718 15.652  6.083  6.481
 26.627 12.     5.385 16.125 10.63   6.481 22.869  6.481 11.489 11.79
  6.856  6.928  7.416  5.568  2.646 14.071 19.925  9.327 12.923  8.66
 23.324  0.     8.602  1.414  3.    15.166  8.602  1.     6.    10.247
 15.875 14.107  2.646  9.274 12.083  5.196  7.348 12.767  5.099  5.196
  6.782  5.831  1.     7.746 24.413  8.775  5.099 12.288 15.297 11.358
 11.18   6.633  2.    15.843  5.657 12.53  19.494  6.245 20.469  1.414
  8.775  3.606 14.213  4.899  1.414  1.414  8.062  3.317  1.414  8.888
  8.185 10.44   9.22   9.055 20.224 28.337  4.583  4.243 17.378 12.689
  3.162  5.385  5.916  0.    17.635  9.274  8.775 12.57   6.928  6.782
 15.199 12.961  7.874 27.423  9.487  1.414 12.689 11.874 12.728  5.831
 10.909  1.414 13.748 14.629  8.062 28.054 20.712 11.916 12.923 24.413
  5.099 

In [60]:
cpd_AD = np.where(cpd_value <= model_AD_limit, True, False)
print(cpd_AD)

[ True  True  True  True  True  True  True  True False  True  True  True
  True  True  True  True  True  True False  True  True False  True False
  True  True  True False  True  True False  True  True False  True  True
 False  True  True  True  True  True  True  True  True False False  True
  True  True False  True  True  True  True False  True  True  True  True
 False False  True  True  True  True  True  True  True  True  True  True
  True  True False  True  True  True False  True  True  True  True False
  True  True False  True False  True  True  True False  True  True  True
  True  True  True  True  True  True  True  True False False  True  True
 False  True  True  True  True  True False  True  True  True  True  True
 False  True  True False  True  True  True  True  True  True  True  True
 False False  True False False  True  True False  True  True False  True
  True  True False  True  True  True  True  True  True  True  True  True
 False False False  True  True False  True  True  T

In [61]:
print("Coverage = ", sum(cpd_AD) / len(cpd_AD))

Coverage =  0.7430340557275542


In [62]:
print("Indices of substances included in AD = ", np.where(cpd_AD != 0)[0])

Indices of substances included in AD =  [  0   1   2   3   4   5   6   7   9  10  11  12  13  14  15  16  17  19
  20  22  24  25  26  28  29  31  32  34  35  37  38  39  40  41  42  43
  44  47  48  49  51  52  53  54  56  57  58  59  62  63  64  65  66  67
  68  69  70  71  72  73  75  76  77  79  80  81  82  84  85  87  89  90
  91  93  94  95  96  97  98  99 100 101 102 103 106 107 109 110 111 112
 113 115 116 117 118 119 121 122 124 125 126 127 128 129 130 131 134 137
 138 140 141 143 144 145 147 148 149 150 151 152 153 154 155 159 160 162
 163 164 165 166 167 168 169 172 173 174 175 176 177 178 179 182 184 187
 188 189 190 191 192 193 194 195 196 198 199 200 201 204 205 206 207 208
 209 210 211 212 214 215 216 217 218 219 221 222 223 225 226 228 229 230
 232 233 235 236 237 238 240 242 244 245 248 249 250 251 253 255 256 260
 261 262 263 264 265 267 268 269 270 271 273 274 277 278 283 284 285 286
 288 289 290 291 293 294 295 298 299 300 302 303 304 305 307 309 311 312
 314 316 31

In [63]:
out_Ad=list(np.where(cpd_AD == 0)[0])

# Prediction only for molecules included in  AD

In [64]:
y_pred_rf_ad=list(y_pred_rf)

In [65]:
y_pred_rf_ad[:] = [x for i,x in enumerate(y_pred_rf_ad) if i not in out_Ad]

In [66]:
len(y_pred_rf_ad)

240

In [67]:
y_ts_ad=list(y_ts)

In [68]:
y_ts_ad[:] = [x for i,x in enumerate(y_ts_ad) if i not in out_Ad]

In [69]:
len(y_ts_ad)

240

In [70]:
Q2_TS = round(r2_score(y_ts_ad, y_pred_rf_ad), 2)
Q2_TS

0.66

In [71]:
RMSE_TS=round(np.sqrt(mean_absolute_error(y_ts_ad, y_pred_rf_ad)), 2)
RMSE_TS

0.68

In [72]:
x_tr

array([[0., 1., 1., ..., 1., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [1., 1., 1., ..., 0., 1., 1.],
       [1., 1., 1., ..., 0., 0., 1.],
       [1., 1., 1., ..., 0., 0., 1.]], dtype=float32)