In [3]:
import pandas as pd 
import numpy as np
import pickle
import matplotlib.pyplot as plt
import joblib

from sklearn import svm
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

import warnings
# Suppresses every warning for the rest of the session: no category, module
# or message filter is given, so nothing is let through.
# NOTE(review): consider narrowing this to the specific warnings that are
# known to be harmless, so deprecations and data-conversion issues stay visible.
warnings.filterwarnings('ignore')

# Single SVM

In [10]:
# corr_y = joblib.load('./pickle/corr_100.pkl')
# Names of the selected descriptor columns (used below to index train/test).
bestDesc = joblib.load('./pickle/with_temp/best_25neue_force1.pkl')

# Load the pre-split data sets. The files are opened in `with` blocks so the
# handles are closed deterministically instead of being left to the garbage
# collector.
# SECURITY: unpickling executes arbitrary code embedded in the file; only load
# pickles that were produced by this project.
with open('./pickle/train.pkl', 'rb') as train_file:
    train = pickle.load(train_file)
with open('./pickle/test.pkl', 'rb') as test_file:
    test = pickle.load(test_file)

In [11]:
# Restrict both splits to the selected descriptor columns.
dfTrain, dfTest = (split.loc[:, bestDesc] for split in (train, test))
# dfTrain.head()

In [12]:
# 0. Data preparation
# Features: every selected descriptor column.
x_train, x_test = dfTrain.iloc[:, :], dfTest.iloc[:, :]
# Target: the last column of each original split, kept as a one-column DataFrame.
y_train, y_test = train.iloc[:, [-1]], test.iloc[:, [-1]]

In [13]:
# Min-max scale the features. The scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits.
scaler = MinMaxScaler().fit(x_train)
scale_x_train, scale_x_test = (scaler.transform(part) for part in (x_train, x_test))

In [14]:
# Candidate hyper-parameters for the RBF SVR grid search.
# NOTE(review): the targets printed further down span roughly 4-8, so epsilon
# values of 10 and above are wider than the whole target range - consider a
# finer grid below 1 instead.
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=['auto', 'scale'],
    epsilon=[0.1, 1, 10, 100, 1000],
)

In [15]:
# 5-fold cross-validated grid search over param_grid for an RBF-kernel SVR,
# ranked by negated mean squared error (values closer to 0 are better).
gsc = GridSearchCV(
    estimator=SVR(kernel='rbf'),
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=0,
    n_jobs=-1,
)

# y_train is a one-column DataFrame; scikit-learn expects a 1-D target, so it
# is flattened explicitly rather than relying on an implicit (and silenced)
# column-vector conversion.
gsc.fit(scale_x_train, y_train.values.ravel())

GridSearchCV(cv=5, error_score=nan,
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='scale', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'epsilon': [0.1, 1, 10, 100, 1000],
                         'gamma': ['auto', 'scale']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [16]:
# Parameter combination from param_grid that achieved the best mean CV score.
gsc.best_params_


{'C': 10, 'epsilon': 0.1, 'gamma': 'scale'}

In [17]:
# View the best mean cross-validated score. The search uses
# scoring='neg_mean_squared_error', so this is a negated MSE (closer to 0 is
# better), not an accuracy.
print('Best score:', gsc.best_score_) 

Best score: -0.2211604721562269


In [18]:
# Report the hyper-parameters of the estimator refitted by the grid search.
best_svr = gsc.best_estimator_
print('Best C:', best_svr.C)
print('Best Kernel:', best_svr.kernel)
print('Best Gamma:', best_svr.gamma)

Best C: 10
Best Kernel: rbf
Best Gamma: scale


In [19]:
# Build the final SVR from ALL tuned hyper-parameters. epsilon is part of the
# searched grid, so it has to be forwarded as well; otherwise the model falls
# back to the library default regardless of what the search selected.
model = SVR(
    kernel="rbf",
    C=gsc.best_params_['C'],
    gamma=gsc.best_params_['gamma'],
    epsilon=gsc.best_params_['epsilon'],
)
model

SVR(C=10, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [20]:
# Fit the tuned SVR on the scaled training features. y_train is a one-column
# DataFrame, so it is flattened to the 1-D target shape scikit-learn expects.
model.fit(scale_x_train, y_train.values.ravel())

SVR(C=10, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [21]:
# Predict on both scaled splits with the fitted model.
y_train_pred, y_test_pred = (
    model.predict(features) for features in (scale_x_train, scale_x_test)
)

In [22]:
# Coefficient of determination on the training and the test split.
r2_train, r2_test = (
    r2_score(actual, predicted)
    for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
)

r2_train, r2_test

(0.8986528653006999, 0.619819537133319)

In [63]:
# Print every training target next to its prediction, one pair per line.
for actual, predicted in zip(y_train.values, y_train_pred):
    print(actual, predicted)

[5.638] 6.265343286035508
[7.086] 7.174139167092239
[7.921] 7.066685764113828
[7.921] 7.643774835231581
[7.046] 7.485823728297835
[7.921] 7.4605987260432
[6.076] 6.11648917797229
[6.194] 6.14921484682162
[8.155] 8.043851886604823
[7.638] 7.328939632904187
[6.197] 6.296561244775274
[8.301] 7.978011850938216
[6.038] 6.204728099187532
[7.276] 6.828979059390967
[6.167] 5.970125998101578
[6.523] 6.622933752111291
[7.745] 7.6446731688641965
[6.288] 6.602725844244075
[7.091] 6.6304165343528085
[7.244] 7.143750973903688
[8.155] 7.9852289122553906
[6.62] 7.774672179837969
[6.509] 6.14958315116276
[6.721] 7.675856162444327
[6.31] 6.174215313911869
[8.155] 8.026327953402841
[7.569] 8.190075819646655
[6.366] 6.327133066891134
[6.456] 6.9398893526284855
[8.155] 8.055058260926872
[7.678] 6.574680361961288
[7.523] 7.422885760593127
[8.046] 7.980456890910359
[6.512] 6.683389139540472
[6.076] 6.176364404036471
[7.553] 7.652751590421957
[5.886] 5.966556515045265
[7.585] 7.695682778180304
[6.444] 7.29362

In [62]:
# Print every test target next to its prediction, one pair per line.
for observed, estimate in zip(y_test.values, y_test_pred):
    print(observed, estimate)

[4.456] 6.154785226860104
[4.347] 6.1107107098662174
[6.065] 6.122051070376378
[6.886] 6.245325626752472
[7.796] 7.964052325770385
[8.187] 7.593699861422129
[7.076] 6.923213042912474
[6.046] 6.281347260953152
[8.155] 7.452247355733293
[7.174] 6.567198949110574
[8.155] 7.099578322820442
[7.678] 6.935472576962193
[5.347] 6.617301489143201
[6.947] 6.64950915547399
[6.319] 5.899781401817434
[5.854] 5.557521087036809
[7.699] 7.174139167092239
[7.194] 7.748953090158609
[8.046] 8.072896546147012


In [65]:
# Combine the true and predicted values of both splits into one table,
# labelled by split, sort it by the original row index and export it as CSV.

# Split label for every prediction, in the same order as the predictions.
tr_pos = ['train'] * len(y_train_pred)
ts_pos = ['test'] * len(y_test_pred)

# True values: training rows followed by test rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
ytrue = pd.concat([y_train, y_test])

# Predictions and split labels in the same train-then-test order, re-using
# the index of the true values so the two frames align row by row.
pred = np.append(y_train_pred, y_test_pred)
pred_pos = np.append(tr_pos, ts_pos)
ypred = pd.DataFrame({'pred': pred, 'pos': pred_pos}, index=ytrue.index)

# Side-by-side table of true value, prediction and split label.
rbf_pred = pd.concat([ytrue, ypred], axis=1, ignore_index=False)
rbf_sort = rbf_pred.sort_index(axis=0)

# A forward slash works on every platform (Windows included); the previous
# backslash only addressed the data directory on Windows.
rbf_sort.to_csv('data/rbf_pred.csv', index=True, header=True, sep=',')
rbf_sort

Unnamed: 0,pIC50,pred,pos
0,4.456,6.154785,test
1,4.347,6.110711,test
2,6.046,6.281347,test
3,6.710,6.809937,train
4,6.947,6.649509,test
5,6.038,6.204728,train
6,6.903,7.020454,train
7,7.699,7.174139,test
8,7.119,7.263984,train
9,7.086,7.174139,train


# Looping SVM

In [4]:
# Load the pre-split data sets. `with` closes each file handle as soon as the
# object has been read instead of leaving it open until garbage collection.
# SECURITY: unpickling executes arbitrary code embedded in the file; only load
# pickles that were produced by this project.
with open('./pickle/train.pkl', 'rb') as train_file:
    train = pickle.load(train_file)
with open('./pickle/test.pkl', 'rb') as test_file:
    test = pickle.load(test_file)

In [5]:
# Grid searched for every descriptor subset: C and epsilon share the same
# decade steps, gamma switches between scikit-learn's two named heuristics.
decade_steps = [0.1, 1, 10, 100, 1000]
param_grid = {
    'C': list(decade_steps),
    'gamma': ['auto', 'scale'],
    'epsilon': list(decade_steps),
}

In [6]:
# Load the stored descriptor selections in the order 5, 10, 15, 20, 25.
bd_list = [
    joblib.load(path)
    for path in (
        './pickle/with_temp/best_5neue.pkl',
        './pickle/with_temp/best_10neue.pkl',
        './pickle/with_temp/best_15neue.pkl',
        './pickle/with_temp/best_20neue.pkl',
        './pickle/with_temp/best_25neue_force1.pkl',
    )
]
# Keep the individual names available as well.
bd_5, bd_10, bd_15, bd_20, bd_25 = bd_list

In [7]:
# Fit a grid-searched RBF SVR for every descriptor subset in bd_list and
# record its best hyper-parameters, R2 scores and predictions.
#
# One record per subset is collected in a list and the DataFrame is built once
# after the loop: DataFrame.append was removed in pandas 2.0 and copied the
# whole frame on every call.
result_columns = ['desc','C','Gamma','Epsilon','r2_train','r2_test','train_pred','test_pred']
result_rows = []

# The target is the last column of each split. It does not depend on the
# descriptor subset, so it is selected once, outside the loop.
y_train = train.iloc[:, [-1]]
y_test = test.iloc[:, [-1]]
# scikit-learn expects a 1-D target; flatten the one-column frame explicitly.
y_train_1d = y_train.values.ravel()

for descriptors in tqdm(bd_list):
    # Restrict both splits to the current descriptor subset
    dfTrain = train.loc[:, descriptors]
    dfTest = test.loc[:, descriptors]

    # Feature matrices
    x_train = dfTrain.iloc[:, :]
    x_test = dfTest.iloc[:, :]

    # Min-max scaling, fitted on the training split only and applied to both
    scaler = MinMaxScaler()
    scaler.fit(x_train)
    scale_x_train = scaler.transform(x_train)
    scale_x_test = scaler.transform(x_test)

    # Search the best hyper-parameters for SVR() with 5-fold cross-validation
    gsc = GridSearchCV(
        estimator=SVR(kernel='rbf'),
        param_grid=param_grid,
        cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)

    gsc.fit(scale_x_train, y_train_1d)
    param_c = gsc.best_params_['C']
    param_gamma = gsc.best_params_['gamma']
    param_epsilon = gsc.best_params_['epsilon']

    # Refit a fresh model with the best hyper-parameters on the full training split
    model = SVR(C=param_c, kernel="rbf", gamma=param_gamma, epsilon=param_epsilon)
    model.fit(scale_x_train, y_train_1d)

    # Predictions for both splits
    y_train_pred = model.predict(scale_x_train)
    y_test_pred = model.predict(scale_x_test)

    # R2 score for both splits
    r2_train = r2_score(y_train, y_train_pred)
    r2_test = r2_score(y_test, y_test_pred)

    result_rows.append({
        'desc': len(descriptors),
        'C': param_c,
        'Gamma': param_gamma,
        'Epsilon': param_epsilon,
        'r2_train': r2_train,
        'r2_test': r2_test,
        'train_pred': y_train_pred,
        'test_pred': y_test_pred,
    })

# Passing the column list keeps the column order fixed and still yields an
# empty, correctly labelled frame when bd_list is empty.
r2_rbf = pd.DataFrame(result_rows, columns=result_columns)

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.10s/it]


In [8]:
# Display the summary table: one row per descriptor subset.
r2_rbf

Unnamed: 0,desc,C,Gamma,Epsilon,r2_train,r2_test,train_pred,test_pred
0,5,100,auto,0.1,0.710229,0.363688,"[6.217826773088923, 7.403007490002416, 6.86915...","[6.388194154152309, 6.7085793228074735, 6.1865..."
1,10,100,auto,0.1,0.664918,0.290628,"[6.196962094853949, 7.155056617984262, 6.67280...","[6.6631949880835935, 7.088123366267387, 6.3124..."
2,15,1,auto,0.1,0.566943,0.285819,"[6.317349530848247, 7.1530619412199465, 6.8215...","[6.547425131604254, 6.702612503820566, 6.68479..."
3,20,10,scale,0.1,0.899128,0.516734,"[5.737702617311482, 7.1457372100099805, 7.2917...","[6.1730391244322345, 6.380691821962466, 6.4346..."
4,25,10,scale,0.1,0.898653,0.61982,"[5.73791346221079, 7.185920198936896, 7.474285...","[5.801505179015026, 5.6651440114223455, 6.5158..."


In [14]:
# Save the summary table to disk with joblib.
# NOTE(review): this cell's execution count (14) is out of sequence with the
# loop cells before it (4-8) - confirm with Restart & Run All that the saved
# table is the one shown above.
joblib.dump(r2_rbf, './pickle/with_temp/r2_rbf_neue.pkl')

['./pickle/with_temp/r2_rbf_neue.pkl']