In [1]:
import pandas as pd 
import numpy as np
import pickle
import matplotlib.pyplot as plt
import joblib

from sklearn import svm
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# import warnings
# warnings.filterwarnings('ignore')

# Single SVM

In [2]:
bestDesc = pickle.load(open('./pickle/trial_10/bestDesc_5.pkl', 'rb'))
train = pickle.load(open('./pickle/train.pkl', 'rb'))
test = pickle.load(open('./pickle/test.pkl', 'rb'))

In [3]:
dfTrain = train.loc[:,bestDesc]
dfTest = test.loc[:,bestDesc]

dfTrain.shape, dfTest.shape, bestDesc

((74, 5), (19, 5), ['ATS6s', 'SpDiam_Dzp', 'VR3_Dzs', 'GRAV-4', 'ETA_Eta_R'])

In [4]:
# 0. Preparation Data
x_train = dfTrain.iloc[:,:]
x_test = dfTest.iloc[:,:]
y_train = train.iloc[:, [-1]]
y_test = test.iloc[:,[-1]]

In [5]:
# Feature Scaler Using MinMaxScaler()
scaler = MinMaxScaler()
scaler.fit(x_train)
scale_x_train = scaler.transform(x_train)
scale_x_test = scaler.transform(x_test)

In [1]:
param_grid={
            'C': [0.1, 1, 10, 100, 1000],
            'degree': [2,3,4,5],
}

In [7]:
gsc = GridSearchCV(
        estimator=SVR(kernel='poly'),
        param_grid=param_grid,
        cv=5, scoring='neg_mean_squared_error', verbose=4, n_jobs=-1)

In [8]:
gsc.fit(scale_x_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed: 11.9min finished
  y = column_or_1d(y, warn=True)


GridSearchCV(cv=5, error_score=nan,
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='scale', kernel='poly',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'degree': [1, 2, 3, 4, 5]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=4)

In [9]:
gsc.best_params_

{'C': 10, 'degree': 1}

In [10]:
# View the accuracy score
print('Best score:', gsc.best_score_) 

Best score: -0.24089024202482232


In [11]:
# View the best parameters for the model found using grid search
print('Best C:',gsc.best_estimator_.C) 
print('Best Kernel:',gsc.best_estimator_.kernel)
print('Best Degree:',gsc.best_estimator_.degree)

Best C: 10
Best Kernel: poly
Best Degree: 1


In [12]:
model = SVR(C=gsc.best_params_['C'],kernel="poly", degree=gsc.best_params_['degree'])
model

SVR(C=10, cache_size=200, coef0=0.0, degree=1, epsilon=0.1, gamma='scale',
    kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [13]:
model.fit(scale_x_train, y_train)

  y = column_or_1d(y, warn=True)


SVR(C=10, cache_size=200, coef0=0.0, degree=1, epsilon=0.1, gamma='scale',
    kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [14]:
y_train_pred = model.predict(scale_x_train)
y_test_pred = model.predict(scale_x_test)
y_train_pred, y_test_pred

(array([6.10691129, 7.150529  , 6.99234366, 7.16485508, 7.2269229 ,
        7.23685949, 6.04422193, 6.09436401, 7.6136256 , 7.18997007,
        6.60086257, 7.76143232, 6.50985297, 6.27179132, 6.50867974,
        6.62289433, 7.71921112, 6.62898455, 6.69968721, 6.49279741,
        7.71419177, 7.06213571, 6.20942327, 7.60344337, 6.0530463 ,
        7.58042663, 7.70226363, 6.83612596, 6.42342045, 8.25436141,
        6.66629084, 7.64569639, 7.94635755, 7.03959769, 6.1843009 ,
        8.18738295, 6.67020754, 7.46733309, 6.98049099, 6.08729337,
        7.06841591, 7.87692865, 7.54630293, 7.63065297, 7.14344985,
        6.91285513, 6.97902344, 7.60206141, 8.01627299, 7.09995456,
        7.79210142, 6.8370366 , 7.42530069, 7.62368407, 6.1415538 ,
        7.05310469, 6.2220908 , 7.72593028, 7.19986774, 6.27782606,
        7.85369349, 6.68862117, 6.71615166, 6.94949168, 7.75107013,
        6.62743562, 7.32913346, 7.35889803, 7.60392267, 6.71961874,
        7.04860501, 6.96526426, 6.25719096, 8.07

In [15]:
r2_train = r2_score(y_train,y_train_pred)
r2_test = r2_score(y_test,y_test_pred)

r2_train, r2_test

(0.6147542475148267, 0.43743318709639334)

In [None]:
for i in range(len(y_test)):
    print(y_test.values[i], y_test_pred[i])

# Looping  SVM

In [2]:
param_grid={
            'C': [0.1, 1, 10, 100, 1000],
            'degree': [2,3,4,5],
}

In [17]:
bd_list = []
bd_5 = pickle.load(open('./pickle/trial_10/bestDesc_5.pkl', 'rb'))
bd_10 = pickle.load(open('./pickle/trial_10/bestDesc_10.pkl', 'rb'))
bd_15 = pickle.load(open('./pickle/trial_10/bestDesc_15.pkl', 'rb'))
bd_25 = pickle.load(open('./pickle/trial_10/bestDesc_25.pkl', 'rb'))
bd_list.extend([bd_5,bd_10,bd_15,bd_20,bd_25])

In [19]:
r2_poly = pd.DataFrame(columns=['desc','C','degree','r2_train','r2_test','train_pred','test_pred'])

for i in tqdm(range(len(bd_list))):
    # Assign labels to train and test
    dfTrain = train.loc[:,bd_list[i]]
    dfTest = test.loc[:,bd_list[i]]
    
    # Preparation data x & y
    x_train = dfTrain.iloc[:,:]
    x_test = dfTest.iloc[:,:]
    y_train = train.iloc[:, [-1]]
    y_test = test.iloc[:,[-1]]
    
    # Feature scaler Using MinMaxScaler()
    scaler = MinMaxScaler()
    scaler.fit(x_train)
    scale_x_train = scaler.transform(x_train)
    scale_x_test = scaler.transform(x_test)
    
    # Search best params for SVR()
    gsc = GridSearchCV(
        estimator=SVR(kernel='poly'),
        param_grid=param_grid,
        cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)

    gsc.fit(scale_x_train, y_train)
    param_c = gsc.best_params_['C']
    param_degree = gsc.best_params_['degree']
    
    # Assign best params to model
    model = SVR(C=param_c, kernel="poly", degree=param_degree)
    model.fit(scale_x_train, y_train)
    
    #Calculate prediction
    y_train_pred = model.predict(scale_x_train)
    y_test_pred = model.predict(scale_x_test)

    #Calculate r2 score
    r2_train = r2_score(y_train,y_train_pred)
    r2_test = r2_score(y_test,y_test_pred)
    
    r2_poly = r2_poly.append(pd.Series([len(bd_list[i]),param_c, param_degree, r2_train, r2_test, y_train_pred, y_test_pred], index=r2_poly.columns ), ignore_index=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [14:51<00:00, 222.96s/it][A


In [23]:
r2_poly

Unnamed: 0,desc,C,degree,r2_train,r2_test,train_pred,test_pred
0,5,10,1,0.614754,0.437433,"[6.106911285149552, 7.150528995134584, 6.99234...","[6.325459028971485, 6.30702500712521, 6.601286..."
1,10,1000,1,0.657822,0.385383,"[6.114211496489238, 7.024390235536989, 6.83830...","[6.411437549469261, 6.514869302028833, 6.53516..."
2,15,10,1,0.687265,0.354355,"[6.303800273672581, 7.141007924788847, 7.10497...","[6.512726096759137, 6.639995273997616, 6.45153..."
3,20,100,1,0.741796,0.407337,"[6.0952113967988835, 7.186052811474222, 6.9685...","[6.4054054777079825, 6.699439485516346, 6.3100..."


In [24]:
joblib.dump(r2_poly, './pickle/trial_10/r2_poly.pkl')

['./pickle/trial_10/r2_poly.pkl']