In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

In [None]:
# Load the raw CSV, hold out a test set, and scale the single feature.
#
# Column 0 of the file is used as the feature and column 1 as the target.
# Both are kept as 2-D column arrays (n_samples, 1) because of the slicing.

# NOTE(review): absolute, machine-specific path — point this at your own copy.
DATA_PATH = "E:/nmtuan97/1_scripts/ml_learning/data_test.csv"
# Fixed seed so that Restart & Run All reproduces the same hold-out split.
SPLIT_SEED = 42

data = np.loadtxt(DATA_PATH, delimiter=',')
X_raw = data[:, :1]
y = data[:, 1:2]

# Split BEFORE scaling: the scaler must only learn min/max from the training
# portion, otherwise information about the test set leaks into preprocessing.
X_for_train_raw, X_test_raw, y_for_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=SPLIT_SEED
)

feature_scaler = MinMaxScaler().fit(X_for_train_raw)
X_for_train = feature_scaler.transform(X_for_train_raw)
# Test values may fall slightly outside [0, 1]; that is expected and correct.
X_test = feature_scaler.transform(X_test_raw)
# Scaled view of the full dataset, used for the overview plot below.
X = feature_scaler.transform(X_raw)

plt.plot(X, y, 'o', label='data looks like')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Scan the RBF kernel width (gamma) of the SVR and plot train vs. validation
# RMSE so a suitable gamma can be read off the curve.

# Training RMSE, one entry per gamma value.
train_rmse_errors = []
# Validation RMSE, one entry per gamma value (the list keeps its historical
# "test" name, but it is computed on the validation split, not on X_test).
test_rmse_errors = []

gamma_ = np.arange(25, 1000, 25)

# Draw the train/validation split ONCE, with a fixed seed. Re-splitting inside
# the loop would give every gamma a different dataset, so differences in RMSE
# would mix the effect of gamma with random split noise.
X_train, X_cv, y_train, y_cv = train_test_split(
    X_for_train, y_for_train, test_size=0.25, random_state=0
)
# SVR.fit expects a 1-D target; ravel handles a (n, 1) column as well as 1-D.
y_train_flat = np.ravel(y_train)

for g in gamma_:
    svr_model = SVR(kernel='rbf', C=350000, gamma=g)
    svr_model.fit(X_train, y_train_flat)

    y_train_predicted_svr_ = svr_model.predict(X_train)
    y_cv_svr_ = svr_model.predict(X_cv)

    rmse_train_ = np.sqrt(mean_squared_error(y_train, y_train_predicted_svr_))
    rmse_cv_ = np.sqrt(mean_squared_error(y_cv, y_cv_svr_))

    train_rmse_errors.append(rmse_train_)
    test_rmse_errors.append(rmse_cv_)

plt.plot(gamma_, train_rmse_errors, '-', label='train rmse values')
plt.plot(gamma_, test_rmse_errors, '-', label='test rmse values')
plt.xlabel('gamma')
plt.ylabel('RMSE')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Train the final SVR model.
#
# Several train/validation splits are tried and the model with the highest
# validation R2 is kept. The search stops early once the target R2 is reached
# and is capped at `max_attempts_` so the cell always terminates (an unbounded
# `while` would spin forever if the target is unreachable for this data).
#
# NOTE(review): picking the split by its validation score makes that score
# optimistic; treat the held-out test set as the only unbiased estimate.
r2_ = 0.99            # validation R2 at which the search stops early
max_attempts_ = 200   # hard cap on the number of re-splits
max_ = -np.inf        # best validation R2 seen so far
best_fit_ = None      # (model, split) belonging to max_

for attempt_ in range(max_attempts_):
    # Seeding with the attempt index gives a different split on every
    # iteration while keeping the whole search reproducible.
    split_ = train_test_split(
        X_for_train, y_for_train, test_size=0.25, random_state=attempt_
    )
    X_train_try_, X_cv_try_, y_train_try_, y_cv_try_ = split_

    candidate_ = SVR(kernel='rbf', C=350000, gamma=325)
    # SVR.fit expects a 1-D target; ravel handles a (n, 1) column as well.
    candidate_.fit(X_train_try_, np.ravel(y_train_try_))

    r2_candidate_ = r2_score(y_cv_try_, candidate_.predict(X_cv_try_))

    if r2_candidate_ > max_:
        max_ = r2_candidate_
        best_fit_ = (candidate_, split_)
        print("R2 score of test is {}".format(max_))

    if max_ >= r2_:
        break

if best_fit_ is None:
    # Only possible if every score was NaN (e.g. a degenerate validation split).
    raise RuntimeError("No SVR candidate produced a finite validation R2 score")

# Expose the winning model and its split under the names later cells use.
svr_model, (X_train, X_cv, y_train, y_cv) = best_fit_

y_train_predicted_svr_ = svr_model.predict(X_train)
y_cv_svr_ = svr_model.predict(X_cv)

# Metrics on the training split of the selected model.
rmse_train_ = np.sqrt(mean_squared_error(y_train, y_train_predicted_svr_))
r2_train_ = r2_score(y_train, y_train_predicted_svr_)
mae_train_ = mean_absolute_error(y_train, y_train_predicted_svr_)
# Metrics on the validation split of the selected model.
rmse_test_ = np.sqrt(mean_squared_error(y_cv, y_cv_svr_))
r2_cv_ = r2_score(y_cv, y_cv_svr_)
mae_test_ = mean_absolute_error(y_cv, y_cv_svr_)

if r2_cv_ < r2_:
    print("Warning: target R2 {} not reached in {} attempts; keeping best model".format(
        r2_, max_attempts_))

print("-------------------------------------------")
print("RMSE of training is {}".format(rmse_train_))
print("R2 score of training  is {}".format(r2_train_))
print("MAE of training is {}".format(mae_train_))
print("RMSE of test is {}".format(rmse_test_))
print("R2 score of test is {}".format(max_))
print("MAE of test is {}".format(mae_test_))

In [None]:
# Evaluate the trained SVR on the held-out test set and plot the fit.

# Order the test samples by feature value so the prediction can be drawn as a
# line. X and y MUST be reordered with the same permutation: sorting each
# array independently would pair every x with the wrong y and corrupt both
# the R2 score and the "true" points in the plot.
# A stable sort keeps this cell idempotent when it is re-run.
test_order = np.argsort(X_test[:, 0], kind="stable")
X_test = X_test[test_order]
y_test = y_test[test_order]

y_pred_svr = svr_model.predict(X_test)

r2_svr = r2_score(y_test, y_pred_svr)
print(f'R2 score of test is: {r2_svr}')

plt.plot(X_test, y_test, 'o', label='true')
plt.plot(X_test, y_pred_svr, '-', label='SVR')
plt.legend()
plt.tight_layout()
plt.show()