In [1]:
import pandas as pd 
import numpy as np
import pickle
import matplotlib.pyplot as plt
import joblib

from sklearn import svm
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

# Single SVM

In [2]:
bestDesc = joblib.load('./pickle/with_temp/best_25Desc_neue.pkl')
train = pickle.load(open('./pickle/train.pkl', 'rb'))
test = pickle.load(open('./pickle/test.pkl', 'rb'))

In [3]:
dfTrain = train.loc[:,bestDesc]
dfTest = test.loc[:,bestDesc]

dfTrain.shape, dfTest.shape, bestDesc

((74, 5),
 (19, 5),
 ['SpMAD_Dzp', 'nHeavyAtom', 'GRAV-4', 'GRAV-6', 'SpDiam_Dzp'])

In [56]:
# 0. Preparation Data
x_train = dfTrain.iloc[:,:]
x_test = dfTest.iloc[:,:]
y_train = train.iloc[:, [-1]]
y_test = test.iloc[:,[-1]]

In [5]:
# Feature Scaler Using MinMaxScaler()
scaler = MinMaxScaler()
scaler.fit(x_train)
scale_x_train = scaler.transform(x_train)
scale_x_test = scaler.transform(x_test)

In [6]:
param_grid={
            'C': [0.1, 1, 10, 100, 1000],
            'degree': [1],
}

In [7]:
gsc = GridSearchCV(
        estimator=SVR(kernel='poly'),
        param_grid=param_grid,
        cv=5, scoring='neg_mean_squared_error', verbose=4, n_jobs=-1)

In [8]:
gsc.fit(scale_x_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    7.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    7.5s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='scale', kernel='poly',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.1, 1, 10, 100, 1000], 'degree': [1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=4)

In [9]:
gsc.best_params_

{'C': 100, 'degree': 1}

In [10]:
# View the accuracy score
print('Best score:', gsc.best_score_) 

Best score: -0.2256692877066066


In [11]:
# View the best parameters for the model found using grid search
print('Best C:',gsc.best_estimator_.C) 
print('Best Kernel:',gsc.best_estimator_.kernel)
print('Best Degree:',gsc.best_estimator_.degree)

Best C: 100
Best Kernel: poly
Best Degree: 1


In [12]:
model = SVR(C=gsc.best_params_['C'],kernel="poly", degree=gsc.best_params_['degree'])
model

SVR(C=100, cache_size=200, coef0=0.0, degree=1, epsilon=0.1, gamma='scale',
    kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [13]:
model.fit(scale_x_train, y_train)

SVR(C=100, cache_size=200, coef0=0.0, degree=1, epsilon=0.1, gamma='scale',
    kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [14]:
y_train_pred = model.predict(scale_x_train)
y_test_pred = model.predict(scale_x_test)
y_train_pred, y_test_pred

(array([6.10558121, 7.01652171, 6.81515535, 7.02179832, 7.37435662,
        7.15207206, 5.97625945, 6.00257797, 7.65351966, 7.27277819,
        6.49843881, 7.89225472, 6.31902878, 6.55452099, 6.34847506,
        6.7863566 , 7.30784447, 6.82744741, 6.82122083, 6.89205459,
        7.86171997, 7.17082   , 6.12083616, 7.19413034, 5.99840788,
        7.63399572, 7.68531797, 6.56707099, 6.76789282, 8.47001847,
        6.50753454, 7.42229309, 7.9989738 , 6.9088566 , 6.17108057,
        8.20369417, 6.38856612, 7.3869545 , 6.80245285, 6.06602508,
        6.92292992, 7.85869183, 7.50084643, 7.68605752, 7.00522056,
        6.70636358, 6.85146422, 7.22509804, 8.05289124, 6.96816971,
        7.90578404, 6.74715976, 7.35189682, 7.65061201, 6.02075408,
        6.93047624, 6.11510278, 7.74743144, 7.05780409, 6.5119061 ,
        8.09562776, 6.7064193 , 6.78456821, 6.84214512, 7.54930329,
        6.41457752, 7.51455755, 7.00612982, 7.53446931, 7.00665002,
        6.96436048, 6.8102439 , 6.36824059, 8.16

In [15]:
r2_train = r2_score(y_train,y_train_pred)
r2_test = r2_score(y_test,y_test_pred)

r2_train, r2_test

(0.6580718493491047, 0.44870625929559205)

In [17]:
for i in range(len(y_train)):
    print(y_train.values[i], y_train_pred[i])

[5.638] 6.105581209470048
[7.086] 7.016521711850632
[7.921] 6.815155354998441
[7.921] 7.021798320975935
[7.046] 7.3743566184594185
[7.921] 7.152072061436942
[6.076] 5.976259454265214
[6.194] 6.0025779664216286
[8.155] 7.653519659227103
[7.638] 7.272778187393473
[6.197] 6.498438806886074
[8.301] 7.892254722285688
[6.038] 6.319028776214142
[7.276] 6.554520986992318
[6.167] 6.348475064137719
[6.523] 6.786356598689994
[7.745] 7.307844468352786
[6.288] 6.827447405380869
[7.091] 6.821220833243305
[7.244] 6.892054592123389
[8.155] 7.861719967090866
[6.62] 7.170820003691605
[6.509] 6.120836158302214
[6.721] 7.19413033963784
[6.31] 5.9984078764416635
[8.155] 7.633995722984227
[7.569] 7.685317974020594
[6.366] 6.567070991381658
[6.456] 6.767892823136978
[8.155] 8.470018468099656
[7.678] 6.507534544676534
[7.523] 7.422293094967503
[8.046] 7.998973796034792
[6.512] 6.908856597393341
[6.076] 6.171080565456907
[7.553] 8.203694170959384
[5.886] 6.388566116864157
[7.585] 7.386954503747947
[6.444] 6.80

In [18]:
for i in range(len(y_test)):
    print(y_test.values[i], y_test_pred[i])

[4.456] 6.260537752318643
[4.347] 6.348923161974798
[6.065] 6.449012884702003
[6.886] 6.144982640032889
[7.796] 7.827040691994884
[8.187] 7.005820763187134
[7.076] 6.901258636186557
[6.046] 6.405954939456168
[8.155] 7.501103895464811
[7.174] 6.418103691396719
[8.155] 7.031394615572057
[7.678] 7.122204075350768
[5.347] 6.420902939238412
[6.947] 6.757415275420416
[6.319] 6.836812084388302
[5.854] 5.650839021742345
[7.699] 7.016521711850632
[7.194] 7.337849911990733
[8.046] 7.548584155301714


In [131]:
# idx_train = y_train
# idx_test = y_test
tr_pos = ['train'] * len(y_train_pred)
ts_pos = ['test'] * len(y_test_pred)

#Create dataframe for y_true
ytrue = y_train
ytrue = ytrue.append(y_test)

#Create dataframe for y_pred
pred = np.append(y_train_pred,y_test_pred)
pred_pos = np.append(tr_pos,ts_pos)
ypred = pd.DataFrame(pred, columns=['pred'])
ypred['pos'] = pred_pos
ypred.index = ytrue.index

#Create dataframe for y_true & y_pred
linear_pred = pd.DataFrame()
linear_pred = pd.concat([ytrue, ypred], axis=1, ignore_index=False)
lin_sort = linear_pred.sort_index(axis=0)
# lin_sort.to_csv (r'data\linear_pred.csv',index=True, header=True,sep=',')
lin_sort

Unnamed: 0,pIC50,pred,pos
0,4.456,6.260538,test
1,4.347,6.348923,test
2,6.046,6.405955,test
3,6.710,6.810244,train
4,6.947,6.757415,test
5,6.038,6.319029,train
6,6.903,6.922930,train
7,7.699,7.016522,test
8,7.119,7.057804,train
9,7.086,7.016522,train


In [77]:
# train_list = pd.DataFrame()
# train_list = train_list.append(idx_train)
# train_list['pred'] = y_train_pred
# train_list.sort_index(axis=0)

In [76]:
# test_list = pd.DataFrame()
# test_list = test_list.append(idx_test)
# test_list['pred'] = y_test_pred
# test_list.sort_index(axis=0)

In [49]:
idx_uni = idx_true

# Looping  SVM

In [2]:
train = pickle.load(open('./pickle/train.pkl', 'rb'))
test = pickle.load(open('./pickle/test.pkl', 'rb'))

In [3]:
param_grid={
            'C': [0.1, 1, 10, 100, 1000],
            'epsilon': [0.1, 1, 10, 100, 1000],
            'degree': [1],
}

In [4]:
bd_list = []
bd_5 = joblib.load('./pickle/with_temp/best_5desc.pkl')
bd_10 = joblib.load('./pickle/with_temp/best_10desc.pkl')
bd_15 = joblib.load('./pickle/with_temp/best_15desc.pkl')
bd_20 = joblib.load('./pickle/with_temp/best_20desc.pkl')
bd_25 = joblib.load('./pickle/with_temp/best_25neue_force2.pkl')
bd_list.extend([bd_5,bd_10,bd_15,bd_20,bd_25])

In [5]:
r2_linear = pd.DataFrame(columns=['desc','C','degree','epsilon','r2_train','r2_test','train_pred','test_pred'])

for i in tqdm(range(len(bd_list))):
    # Assign labels to train and test
    dfTrain = train.loc[:,bd_list[i]]
    dfTest = test.loc[:,bd_list[i]]
    
    # Preparation data x & y
    x_train = dfTrain.iloc[:,:]
    x_test = dfTest.iloc[:,:]
    y_train = train.iloc[:, [-1]]
    y_test = test.iloc[:,[-1]]
    
    # Feature scaler Using MinMaxScaler()
    scaler = MinMaxScaler()
    scaler.fit(x_train)
    scale_x_train = scaler.transform(x_train)
    scale_x_test = scaler.transform(x_test)
    
    # Search best params for SVR()
    gsc = GridSearchCV(
        estimator=SVR(kernel='poly'),
        param_grid=param_grid,
        cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)

    gsc.fit(scale_x_train, y_train)
    param_c = gsc.best_params_['C']
    param_degree = gsc.best_params_['degree']
    param_epsilon = gsc.best_params_['epsilon']
    
    # Assign best params to model
    model = SVR(C=param_c, kernel="linear", degree=param_degree, epsilon = param_epsilon)
    model.fit(scale_x_train, y_train)
    
    #Calculate prediction
    y_train_pred = model.predict(scale_x_train)
    y_test_pred = model.predict(scale_x_test)

    #Calculate r2 score
    r2_train = r2_score(y_train,y_train_pred)
    r2_test = r2_score(y_test,y_test_pred)
    
    r2_linear = r2_linear.append(pd.Series([len(bd_list[i]),param_c, param_degree, param_epsilon, r2_train, r2_test, y_train_pred, y_test_pred], index=r2_linear.columns ), ignore_index=True)

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:06<00:00,  1.21s/it]


In [6]:
r2_linear

Unnamed: 0,desc,C,degree,epsilon,r2_train,r2_test,train_pred,test_pred
0,5,100.0,1,0.1,0.646213,0.433277,"[6.161268777959449, 7.024454925344428, 6.80713...","[6.290453953988986, 6.369107311264229, 6.46185..."
1,10,1000.0,1,0.1,0.735066,0.383717,"[6.114813422406908, 7.11184879231453, 7.343099...","[6.435277362132436, 6.564580360587596, 6.59022..."
2,15,10.0,1,0.1,0.646906,0.17667,"[6.456152601507498, 7.0942683964659965, 6.7839...","[6.9826344030002945, 7.091034293239329, 6.5321..."
3,20,1000.0,1,0.1,0.791319,0.216127,"[5.7385422873782765, 6.987586679435662, 7.4503...","[6.745952862234116, 6.91371001888292, 6.210958..."
4,25,0.1,1,0.1,0.610495,0.324558,"[6.408861473823444, 7.169741568140091, 6.99255...","[6.579959263885294, 6.620212934918186, 6.82138..."


In [15]:
joblib.dump(r2_linear, './pickle/with_temp/r2_linear_neue.pkl')

['./pickle/with_temp/r2_linear_neue.pkl']