In [1]:
import pandas as pd 
import numpy as np
import pickle
import matplotlib.pyplot as plt
import joblib

from sklearn import svm
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

# Single SVM

In [2]:
# Load the selected descriptor (column) names and the train/test frames.
# NOTE: pickle and joblib can execute arbitrary code while loading -- only open
# files that come from a trusted source.
bestDesc = joblib.load('./pickle/with_temp/best_25neue_force1.pkl')

# Context managers close the file handles deterministically instead of leaving
# them open until the garbage collector gets to them.
with open('./pickle/train.pkl', 'rb') as train_file:
    train = pickle.load(train_file)
with open('./pickle/test.pkl', 'rb') as test_file:
    test = pickle.load(test_file)

In [3]:
# Restrict both splits to the selected descriptor columns.
dfTrain, dfTest = (frame.loc[:, bestDesc] for frame in (train, test))

# Show the resulting shapes together with the descriptor list.
dfTrain.shape, dfTest.shape, bestDesc

((74, 25),
 (19, 25),
 ['ATS5m',
  'EE_D',
  'PPSA-3',
  'SpDiam_Dzp',
  'MWC4',
  'ATS3m',
  'WTPT-3',
  'WNSA-1',
  'SpDiam_Dzv',
  'EE_Dt',
  'DELS',
  'ATS5s',
  'GRAVH-2',
  'MOMI-Y',
  'ATSC0e',
  'nHBa',
  'ATS0m',
  'piPC1',
  'GRAV-1',
  'SpMax4_Bhm',
  'SpMax6_Bhs',
  'ETA_Eta_R',
  'TWC',
  'Kier1',
  'VR3_D'])

In [4]:
# 0. Data preparation
# Features: every selected descriptor column of each split.
x_train, x_test = dfTrain.iloc[:, :], dfTest.iloc[:, :]
# Target: the last column of each split, kept as a single-column DataFrame.
y_train, y_test = train.iloc[:, [-1]], test.iloc[:, [-1]]

In [5]:
# Scale the features with MinMaxScaler: the scaler is fitted on the training
# features only and then applied to both splits.
scaler = MinMaxScaler().fit(x_train)
scale_x_train, scale_x_test = scaler.transform(x_train), scaler.transform(x_test)

In [6]:
# Hyper-parameter grid explored by the grid search: regularisation strength C
# and the degree of the polynomial kernel.
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    degree=[2, 3, 4, 5],
)

In [7]:
# 5-fold cross-validated grid search over a polynomial-kernel SVR, scored by
# negative mean squared error and run on all available cores.
gsc = GridSearchCV(
    estimator=SVR(kernel='poly'),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=4,
)

In [8]:
# Run the grid search. y_train is a single-column DataFrame, but scikit-learn
# estimators expect a 1-D target; flatten it explicitly instead of relying on an
# implicit column-vector conversion whose warning is hidden by the notebook-wide
# warnings filter.
gsc.fit(scale_x_train, y_train.values.ravel())

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  93 out of 100 | elapsed: 25.1min remaining:  1.9min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 29.4min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='scale', kernel='poly',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.1, 1, 10, 100, 1000], 'degree': [2, 3, 4, 5]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=4)

In [9]:
# Display the parameter combination selected by the grid search.
gsc.best_params_

{'C': 10, 'degree': 2}

In [10]:
# View the best cross-validated score. The search was configured with
# scoring='neg_mean_squared_error', so this value is a negated MSE (closer to
# zero is better), not an accuracy.
print('Best score:', gsc.best_score_) 

Best score: -0.21728544337200698


In [11]:
# Report the hyper-parameters carried by the best estimator of the grid search.
best_svr = gsc.best_estimator_
print('Best C:', best_svr.C)
print('Best Kernel:', best_svr.kernel)
print('Best Degree:', best_svr.degree)

Best C: 10
Best Kernel: poly
Best Degree: 2


In [12]:
# Build a fresh polynomial-kernel SVR from the winning grid-search parameters
# and display its configuration.
best_params = gsc.best_params_
model = SVR(kernel="poly", C=best_params['C'], degree=best_params['degree'])
model

SVR(C=10, cache_size=200, coef0=0.0, degree=2, epsilon=0.1, gamma='scale',
    kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [13]:
# Fit on the scaled training features. The single-column target frame is
# flattened to the 1-D shape scikit-learn expects, rather than relying on an
# implicit conversion whose warning is hidden by the notebook-wide filter.
model.fit(scale_x_train, y_train.values.ravel())

SVR(C=10, cache_size=200, coef0=0.0, degree=2, epsilon=0.1, gamma='scale',
    kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [14]:
# Predict on both scaled splits and display the raw predictions.
y_train_pred, y_test_pred = (
    model.predict(features) for features in (scale_x_train, scale_x_test)
)
y_train_pred, y_test_pred

(array([6.22657211, 7.06561153, 6.93025481, 7.20654037, 7.36664141,
        7.25114748, 6.21882124, 6.20325135, 7.71618496, 7.20394484,
        6.44419078, 7.93343648, 6.45876249, 6.2110151 , 6.37350634,
        6.4227005 , 7.4018077 , 6.64115087, 6.61532363, 6.18501573,
        7.89079864, 7.33644469, 6.25007038, 7.36835748, 6.20987447,
        7.68061175, 7.80462311, 6.4656494 , 6.30901972, 8.12788207,
        6.63032306, 7.47125224, 8.14627124, 6.90343276, 6.22312144,
        7.66348881, 6.19536109, 7.45097325, 6.89681031, 6.22278235,
        7.00331787, 8.01095843, 7.58725934, 7.78467267, 7.22023443,
        6.6257184 , 6.87307035, 7.36938974, 7.78636855, 7.00117503,
        7.91472279, 6.87012606, 7.5284629 , 7.73216413, 6.20698014,
        6.95593741, 6.25695436, 7.78262824, 7.11838817, 6.22076598,
        8.12456331, 6.53592359, 6.56827238, 6.77553908, 7.62249578,
        6.5494497 , 7.44412577, 6.94116461, 7.6397431 , 6.68897703,
        6.95620562, 6.89919257, 6.20363413, 8.21

In [15]:
# Coefficient of determination (R^2) of the predictions on each split.
r2_train, r2_test = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)

r2_train, r2_test

(0.6649330673982252, 0.43075379817227244)

In [17]:
# Print every training target next to the corresponding prediction.
for actual, predicted in zip(y_train.values, y_train_pred):
    print(actual, predicted)

[5.638] 6.226572111668538
[7.086] 7.065611527556749
[7.921] 6.930254814934793
[7.921] 7.206540365134268
[7.046] 7.366641407696173
[7.921] 7.2511474841410095
[6.076] 6.218821243533371
[6.194] 6.203251353370403
[8.155] 7.71618495541467
[7.638] 7.203944843321702
[6.197] 6.444190780538259
[8.301] 7.93343648415229
[6.038] 6.4587624929660965
[7.276] 6.211015104260714
[6.167] 6.373506336187612
[6.523] 6.422700504041112
[7.745] 7.401807695583253
[6.288] 6.641150872334672
[7.091] 6.615323629933835
[7.244] 6.185015730825235
[8.155] 7.890798641597181
[6.62] 7.336444692063518
[6.509] 6.250070384230421
[6.721] 7.368357477684677
[6.31] 6.209874471772269
[8.155] 7.6806117475823985
[7.569] 7.804623113400263
[6.366] 6.4656494028553
[6.456] 6.309019718301316
[8.155] 8.127882071533728
[7.678] 6.630323055814339
[7.523] 7.471252235492935
[8.046] 8.146271236746012
[6.512] 6.90343276464578
[6.076] 6.223121444941693
[7.553] 7.663488805442208
[5.886] 6.195361093875912
[7.585] 7.45097325156352
[6.444] 6.8968103

In [16]:
# Print every test target next to the corresponding prediction.
for actual, predicted in zip(y_test.values, y_test_pred):
    print(actual, predicted)

[4.456] 6.338598671035593
[4.347] 6.370584096022066
[6.065] 6.450710020269932
[6.886] 6.231191530135003
[7.796] 7.82150537702743
[8.187] 7.150744324216797
[7.076] 6.799962758601064
[6.046] 6.4973969590930185
[8.155] 7.3602213469492686
[7.174] 6.229381776974657
[8.155] 7.014984595188404
[7.678] 6.742534152369851
[5.347] 6.218091165855183
[6.947] 6.827048693081091
[6.319] 6.582066106403412
[5.854] 6.127268274490934
[7.699] 7.065611527556749
[7.194] 7.431516312848013
[8.046] 7.669661669035955


In [19]:
# Label every prediction with the split it came from.
tr_pos = ['train'] * len(y_train_pred)
ts_pos = ['test'] * len(y_test_pred)

# True targets: training rows followed by test rows, original index kept.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to
# stack frames.
ytrue = pd.concat([y_train, y_test])

# Predictions and split labels in the same row order, sharing ytrue's index so
# both frames line up row-for-row when joined below.
pred = np.append(y_train_pred, y_test_pred)
pred_pos = np.append(tr_pos, ts_pos)
ypred = pd.DataFrame({'pred': pred, 'pos': pred_pos}, index=ytrue.index)

# Put targets and predictions side by side, order the rows by index, export
# the table and display it.
poly_pred = pd.concat([ytrue, ypred], axis=1, ignore_index=False)
poly_sort = poly_pred.sort_index(axis=0)
# A forward slash is used as the separator: a backslash is only a path
# separator on Windows, and the other paths in this notebook use '/' as well.
poly_sort.to_csv('data/poly_pred.csv', index=True, header=True, sep=',')
poly_sort

Unnamed: 0,pIC50,pred,pos
0,4.456,6.338599,test
1,4.347,6.370584,test
2,6.046,6.497397,test
3,6.710,6.899193,train
4,6.947,6.827049,test
5,6.038,6.458762,train
6,6.903,7.003318,train
7,7.699,7.065612,test
8,7.119,7.118388,train
9,7.086,7.065612,train


# Looping SVM

In [4]:
# Hyper-parameter grid for the per-descriptor-set search: regularisation
# strength C, epsilon-tube width and polynomial kernel degree.
# NOTE(review): the targets printed earlier in this notebook lie roughly
# between 4 and 8.5, so epsilon values of 10 and above look wider than the
# whole target range -- confirm they are worth searching.
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    epsilon=[0.1, 1, 10, 100, 1000],
    degree=[2, 3, 4, 5],
)

In [6]:
# Load the stored descriptor lists, one per candidate descriptor set.
bd_5 = joblib.load('./pickle/with_temp/best_5desc.pkl')
bd_10 = joblib.load('./pickle/with_temp/best_10desc.pkl')
bd_15 = joblib.load('./pickle/with_temp/best_15desc.pkl')
bd_20 = joblib.load('./pickle/with_temp/best_20desc.pkl')
bd_25 = joblib.load('./pickle/with_temp/best_25neue_force1.pkl')

# Keep them together, in load order, for the evaluation loop.
bd_list = [bd_5, bd_10, bd_15, bd_20, bd_25]

In [7]:
# For every descriptor set: scale the features, grid-search a polynomial SVR,
# refit with the best parameters and record the R^2 scores and predictions.
#
# Results are collected as one record per descriptor set and turned into a
# DataFrame once at the end. DataFrame.append was removed in pandas 2.0, and
# appending inside the loop re-copies the whole frame on every iteration.
result_columns = ['desc','C','degree','epsilon','r2_train','r2_test','train_pred','test_pred']
records = []

for descriptors in tqdm(bd_list):
    # Restrict both splits to the current descriptor columns
    dfTrain = train.loc[:, descriptors]
    dfTest = test.loc[:, descriptors]

    # Preparation data x & y (target = last column of each split)
    x_train = dfTrain.iloc[:, :]
    x_test = dfTest.iloc[:, :]
    y_train = train.iloc[:, [-1]]
    y_test = test.iloc[:, [-1]]

    # Feature scaler using MinMaxScaler(), fitted on the training features only
    scaler = MinMaxScaler()
    scaler.fit(x_train)
    scale_x_train = scaler.transform(x_train)
    scale_x_test = scaler.transform(x_test)

    # Search best params for SVR()
    gsc = GridSearchCV(
        estimator=SVR(kernel='poly'),
        param_grid=param_grid,
        cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)

    # scikit-learn expects a 1-D target, so the single-column frame is flattened
    # explicitly instead of relying on a silent column-vector conversion.
    gsc.fit(scale_x_train, y_train.values.ravel())
    param_c = gsc.best_params_['C']
    param_degree = gsc.best_params_['degree']
    param_epsilon = gsc.best_params_['epsilon']

    # Assign best params to model
    model = SVR(C=param_c, kernel="poly", degree=param_degree, epsilon=param_epsilon)
    model.fit(scale_x_train, y_train.values.ravel())

    # Calculate prediction
    y_train_pred = model.predict(scale_x_train)
    y_test_pred = model.predict(scale_x_test)

    # Calculate r2 score
    r2_train = r2_score(y_train, y_train_pred)
    r2_test = r2_score(y_test, y_test_pred)

    records.append({
        'desc': len(descriptors),
        'C': param_c,
        'degree': param_degree,
        'epsilon': param_epsilon,
        'r2_train': r2_train,
        'r2_test': r2_test,
        'train_pred': y_train_pred,
        'test_pred': y_test_pred,
    })

# Passing the column list keeps the column order fixed and still yields an
# (empty) frame with the expected columns when bd_list is empty.
r2_poly = pd.DataFrame(records, columns=result_columns)

100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [15:04<00:00, 180.96s/it]


In [9]:
# Display the per-descriptor-set summary: best parameters, R^2 scores and
# stored predictions.
r2_poly

Unnamed: 0,desc,C,degree,epsilon,r2_train,r2_test,train_pred,test_pred
0,5,10,2,0.1,0.664933,0.430754,"[6.226572111668538, 7.065611527556749, 6.93025...","[6.338598671035593, 6.370584096022066, 6.45071..."
1,10,10,2,0.1,0.665629,0.382835,"[6.196581285636014, 7.112180299080946, 7.10225...","[6.38988131050668, 6.489418195322997, 6.635390..."
2,15,1,2,0.1,0.602201,0.240142,"[6.338920763439411, 7.084958108420191, 6.77909...","[6.679333770864102, 6.8920217522346165, 6.5475..."
3,20,10,2,0.1,0.767293,0.339209,"[6.305315758378565, 7.093602952773423, 7.18563...","[6.442813166502915, 6.469632761044507, 6.64264..."
4,25,1,2,0.1,0.691145,0.463645,"[6.23477562714978, 7.013270297498258, 7.040723...","[6.337445118754583, 6.266665614097071, 6.48534..."


In [10]:
# Persist the summary frame with joblib so it can be reloaded later.
joblib.dump(r2_poly, './pickle/with_temp/r2_poly_neue.pkl')

['./pickle/with_temp/r2_poly_neue.pkl']