In [1]:
import pandas as pd 
import numpy as np
import pickle
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the selected descriptor names, the pickled training frame and the CSV test set.
# Paths are relative to the notebook's working directory.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that come from a trusted source.
with open('./data/bestDescV2.pkl', 'rb') as desc_file:
    bestDesc = pickle.load(desc_file)
# NOTE(review): dtTrain.pkl is read from the notebook directory while the other
# inputs live under data/ -- confirm this path is intentional.
with open('./dtTrain.pkl', 'rb') as train_file:
    train = pickle.load(train_file)
test = pd.read_csv("data/TestSetNoId.csv")
bestDesc

['TDB2i', 'ATSC8i', 'VR1_D.1', 'AATS8i.1', 'VR2_Dzv.1', 'minHBint8']

In [3]:
# Restrict both splits to the descriptor columns listed in bestDesc.
dfTrain, dfTest = (frame.loc[:, bestDesc] for frame in (train, test))

dfTrain.shape, dfTest.shape

((74, 6), (19, 6))

In [4]:
# 0. Data preparation
# Features: every column of the descriptor-only frames.
x_train, x_test = dfTrain.iloc[:, :], dfTest.iloc[:, :]
# Targets: the last column of each original frame; the list indexer [-1]
# keeps each target as a one-column DataFrame rather than a Series.
y_train, y_test = train.iloc[:, [-1]], test.iloc[:, [-1]]

In [5]:
# Feature Scaler using standard scaler
# sc = StandardScaler()
# scale_x_train = sc.fit_transform(x_train)
# scale_x_test = sc.transform(x_test)

In [6]:
# Feature Scaler
# Using MinMaxScaler()
# scaler = MinMaxScaler()
# scaler.fit(x_train)
# scale_x_train = scaler.transform(x_train)
# scale_x_test = scaler.transform(x_test)

In [7]:
# Hyper-parameter grid for the SVR search: five values of the
# regularisation strength C and polynomial degrees 1 through 5.
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    degree=list(range(1, 6)),
)

In [8]:
# Exhaustive 5-fold cross-validated search over param_grid for a
# polynomial-kernel SVR, scored by negated mean squared error and run on
# all available cores (n_jobs=-1).
gsc = GridSearchCV(
    estimator=SVR(kernel='poly'),
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=0,
    n_jobs=-1,
)

# The estimator expects a 1-D target of shape (n_samples,). Flattening here
# avoids the DataConversionWarning raised when a column vector is passed.
gsc.fit(x_train, np.ravel(y_train))

  y = column_or_1d(y, warn=True)


GridSearchCV(cv=5, error_score=nan,
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='scale', kernel='poly',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'degree': [1, 2, 3, 4, 5]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [9]:
# Parameter combination selected by the grid search.
gsc.best_params_

{'C': 100, 'degree': 5}

In [10]:
# View the best cross-validation score found by the grid search.
# This is not an accuracy: the search is scored with
# 'neg_mean_squared_error', so the value is a negated MSE (closer to 0 is better).
print('Best score:', gsc.best_score_) 

Best score: -0.32537381049445596


In [17]:
# Report the hyper-parameters of the estimator selected by the grid search.
best_svr = gsc.best_estimator_
for label, value in (
    ('Best C:', best_svr.C),
    ('Best Kernel:', best_svr.kernel),
    ('Best Degree:', best_svr.degree),
):
    print(label, value)

Best C: 100
Best Kernel: poly
Best Degree: 5


In [12]:
# Build a fresh (unfitted) polynomial-kernel SVR from the C and degree
# selected by the grid search.
best_params = gsc.best_params_
model = SVR(
    kernel="poly",
    C=best_params['C'],
    degree=best_params['degree'],
)
model

SVR(C=100, cache_size=200, coef0=0.0, degree=5, epsilon=0.1, gamma='scale',
    kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [13]:
# Fit the final model on the full training split. The target is flattened to
# shape (n_samples,), the 1-D shape the estimator expects, which avoids the
# DataConversionWarning raised when a column vector is passed.
model.fit(x_train, np.ravel(y_train))

  y = column_or_1d(y, warn=True)


SVR(C=100, cache_size=200, coef0=0.0, degree=5, epsilon=0.1, gamma='scale',
    kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [14]:
# Predict the target for both splits with the fitted model (train first, then test).
y_train_pred, y_test_pred = map(model.predict, (x_train, x_test))
y_train_pred, y_test_pred

(array([5.68598421, 5.24968763, 6.04567578, 6.40225509, 7.77369367,
        7.95441739, 6.66598541, 6.36577857, 8.23839543, 6.44055532,
        8.06017277, 7.94236715, 6.03891313, 6.62555649, 6.61693815,
        6.40932631, 8.07504955, 8.02084535, 8.14672811, 5.96119166,
        8.07504955, 7.87384442, 7.4819541 , 7.32136373, 7.51080502,
        6.11202221, 6.09397061, 8.21533918, 7.53601606, 6.29699366,
        7.63944939, 6.37673361, 6.30882028, 6.06726654, 6.18545259,
        8.14305027, 6.75497059, 7.1574557 , 6.59795748, 7.70811694,
        7.71143841, 5.98569832, 6.82262179, 6.03777371, 7.63830632,
        7.29478181, 6.58131111, 6.15342945, 8.25488282, 6.67454882,
        7.42286379, 7.13295937, 7.4329373 , 6.19203785, 7.86388987,
        6.63575768, 8.192586  , 7.34189745, 5.98175629, 7.002743  ,
        7.63154706, 8.02818649, 8.06925817, 7.77276081, 6.57509541,
        7.04596526, 6.85769212, 7.7915443 , 6.7091218 , 7.82039154,
        7.16216098, 7.68313867, 8.14430279, 5.91

In [15]:
# Coefficient of determination (R^2) on each split; r2_score takes the
# true values first and the predictions second.
r2_train, r2_test = (
    r2_score(observed, predicted)
    for observed, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
)

r2_train, r2_test

(0.7352477951177665, -0.37681148180488133)

In [20]:
# Print each observed test target next to its prediction, row by row.
for observed, predicted in zip(y_test.values, y_test_pred):
    print(observed, predicted)

[6.71] 6.595694240152738
[7.119] 7.79737889331784
[6.347] 6.043534853434038
[6.057] 6.908664209510106
[6.108] 6.929055161903192
[6.468] 6.322069239669303
[7.301] 6.957084953823709
[7.26] 7.030581007791624
[6.921] 6.7277811198588315
[6.824] 6.563252633411593
[6.468] 6.884948482776735
[6.971] 7.215372291045695
[6.824] 6.859482267981206
[7.523] 7.4470653136216916
[7.337] 7.5691283188807565
[8.] 7.991488995585654
[7.62] 8.283952167396363
[7.222] 8.122202983156738
[8.046] 8.576014492649618


In [None]:
################################

In [18]:
# mse = mean_squared_error(y_test,y_test_pred)
# rmse = np.sqrt(mse)
# print("RMSE :" , rmse)

In [19]:
# r2 = r2_score(y_test,y_test_pred)
# print("r2 :",r2)

In [20]:
# Testing 1 descriptor
# df = pd.DataFrame(list(zip(dfDesc.iloc[:,0], dtTrain.iloc[:,-1])), 
#                columns =[dfDesc.columns.values[1], dtTrain.columns.values[-1]])
# df

In [23]:
# # Get x & y for plot
# X = df.iloc[:,0]
# y = df.iloc[:,-1]
# X = X.to_numpy()
# X = X.reshape(-1,1)

In [None]:
### SVM Here

# Unfitted reference SVR models, one per kernel type, all with C=1000.
svr_rbf = SVR(kernel='rbf', C=1000, gamma=0.1, epsilon=.1)
# gamma only affects the 'rbf', 'poly' and 'sigmoid' kernels; it has no
# effect on the linear model and is kept only for symmetry.
svr_lin = SVR(kernel='linear', C=1000, gamma=0.1)
# The polynomial kernel is (gamma * <x, x'> + coef0) ** degree. With degree=0
# it is identically 1 for every pair of samples, so the model cannot depend
# on the features. A cubic kernel (scikit-learn's default degree) is used instead.
svr_poly = SVR(kernel='poly', C=1000, gamma=0.1, degree=3, epsilon=.1,
               coef0=1)

In [None]:
# Predicted vs. observed target on the training split. Axis labels and a
# title are set so the figure can be read on its own.
plt.scatter(y_train, y_train_pred)
plt.xlabel('Observed target (train)')
plt.ylabel('Predicted target (train)')
plt.title('SVR: predicted vs. observed on the training set');