In [None]:
import numpy as np # linear algebra
import matplotlib.pyplot as plt
import math
%matplotlib inline

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

In [None]:
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

In [None]:
def rmse(model, x, y):
    """Return the root-mean-squared error of ``model`` on ``(x, y)``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    x : feature matrix passed unchanged to ``model.predict``.
    y : ground-truth targets aligned with the rows of ``x``.

    Returns
    -------
    A single scalar: ``sqrt(mean_squared_error(y, model.predict(x)))``.
    """
    y_pred = model.predict(x)
    # scikit-learn's signature is (y_true, y_pred); keep that order so the call
    # stays correct if the metric is ever swapped for an asymmetric one.
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

In [None]:
import data_pipeline as dp
from data_pipeline.data_preprocessor import PreProcessor as PP
# Load the train / test splits produced by the project's PreProcessor.
# NOTE(review): `dp` is not referenced anywhere in the visible notebook.

pp = PP("total_data.csv")

x_train, y_train, x_test, y_test = pp.process()

pp_d = PP("final_data.csv")
# Second dataset, used further down to compare continuous vs. discrete data.
# NOTE(review): the models below are fitted on x_train only and then applied to
# the *_d splits - this presumably requires both CSVs to yield the same feature
# columns; confirm against PreProcessor.
x_train_d, y_train_d, x_test_d, y_test_d = pp_d.process()

# Bare expression: the notebook displays the training targets as cell output.
y_train

In [None]:
# -----------------------------------------------------------------

In [None]:
# kernel Ridge regression

# KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5) #kernel = 'rbf' , 'sigmoid' 

In [None]:
# Gradient Boosting Regression

# Gradient-boosted trees with a Huber loss; one keyword per line so that
# individual hyper-parameters are easy to diff and tune.
GBoost = GradientBoostingRegressor(
    loss='huber',
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=3,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    random_state=5,  # fixed seed for repeatable fits
)

In [None]:
# LGB Regressor

# LightGBM regressor; one keyword per line, values unchanged.
model_lgb = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=5,
    learning_rate=0.05,
    n_estimators=500,
    max_bin=55,
    # row sub-sampling settings
    bagging_fraction=0.8,
    bagging_freq=5,
    bagging_seed=9,
    # column sub-sampling settings
    feature_fraction=0.2319,
    feature_fraction_seed=9,
    # leaf constraints
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
)

In [None]:
import os
# Restrict CUDA to the listed device ids for libraries initialised after this point.
# NOTE(review): torch is imported in an earlier cell; this presumably still takes
# effect as long as no CUDA context was created before this cell runs - confirm.
# NOTE(review): the value contains a space after the comma ("0, 1"); the usual
# spelling is "0,1" - verify that the second device is actually picked up.
os.environ["CUDA_VISIBLE_DEVICES"] = "0, 1"


In [None]:
# XGBoost Algorithm (without GridSearch)

# XGBoost regressor with hand-picked hyper-parameters; one keyword per line,
# values unchanged.
# NOTE(review): tree_method="gpu_hist" needs a CUDA-enabled XGBoost build and is
# deprecated in XGBoost >= 2.0 (tree_method="hist", device="cuda");
# `nthread` is likewise the legacy spelling of `n_jobs`.
model_xgb = xgb.XGBRegressor(
    n_estimators=500,
    learning_rate=0.12,
    max_depth=4,
    min_child_weight=1.5,
    gamma=0.05,
    subsample=0.5,
    colsample_bytree=0.4,
    reg_alpha=0.5,
    reg_lambda=0.7,
    tree_method="gpu_hist",
    random_state=3,
    nthread=-1,
)

In [None]:
# Random Forest Regressor with GridSearch


# Random forest with explicit hyper-parameters.
# NOTE(review): the values look like a pasted estimator repr (presumably from an
# earlier grid search that is not part of this notebook) - confirm.
#
# `criterion='mse'` and `min_impurity_split=None` were dropped from this call:
# both equal the library defaults, and both were removed from recent
# scikit-learn releases ('mse' was renamed 'squared_error', and
# `min_impurity_split` no longer exists), so passing them breaks the cell there.
regr = RandomForestRegressor(bootstrap=True, max_depth=None,
           max_features=150, max_leaf_nodes=None,
           min_impurity_decrease=0.0,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=90, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

#grid_search.best_estimator_

In [None]:
# -----------------------------------------------------

In [None]:
# Fit the three boosted models on the same training split, in the original
# order (XGBoost, LightGBM, then scikit-learn gradient boosting).
for estimator in (model_xgb, model_lgb):
    estimator.fit(x_train, y_train)
# Kept as the cell's last expression so the fitted estimator is still displayed.
GBoost.fit(x_train, y_train)

In [None]:
# Gradient Boosting RMSE on the training split.
# rmse() returns one scalar, so there is no spread to report: the former
# "(std)" field always printed 0.0000 and has been removed.
score = rmse(GBoost, x_train, y_train)
print("Gradient Boosting score: {:.4f}\n".format(score))

In [None]:
# LightGBM RMSE on the training split.
# rmse() returns one scalar, so there is no spread to report: the former
# "(std)" field always printed 0.0000 and has been removed.
score = rmse(model_lgb, x_train, y_train)
print("LGBM score: {:.4f}\n".format(score))

In [None]:
# XGBoost RMSE on the training split.
# rmse() returns one scalar, so there is no spread to report: the former
# "(std)" field always printed 0.0000 and has been removed.
score = rmse(model_xgb, x_train, y_train)
print("Xgboost score: {:.4f}\n".format(score))

In [None]:
from joblib import load
def inverse_latency(X, robust=None, quantile=None):
    """Map transformed values back through the two persisted transformers.

    The ``quantile`` object's ``inverse_transform`` is applied first, then the
    ``robust`` object's, and the result is flattened to one dimension.

    Parameters
    ----------
    X : array-like
        Values to convert (pandas Series, NumPy array, list, ...). The data is
        reshaped to a single column before being handed to the transformers.
    robust, quantile : objects exposing ``inverse_transform``, optional
        Already-loaded transformers. When omitted they are read from
        ``robust.pkl`` / ``quantile.pkl`` in the working directory, exactly as
        before. Passing them in avoids re-reading the files on every call.

    Returns
    -------
    1-D array holding the inverse-transformed values.
    """
    # joblib.load unpickles arbitrary objects: only point it at trusted files.
    if robust is None:
        robust = load("robust.pkl")
    if quantile is None:
        quantile = load("quantile.pkl")
    # np.asarray covers Series, ndarrays and plain sequences alike, so callers
    # no longer need to pre-convert lists before calling this helper.
    column = np.asarray(X).reshape(-1, 1)
    return robust.inverse_transform(quantile.inverse_transform(column)).reshape(-1)

In [None]:
# Define an RMSE evaluation function (plain RMSE, not RMSLE), computed on inverse-transformed values

# def rmsle(y, y_pred): 
#    return np.sqrt(mean_squared_error(y, y_pred))

def rmse_with_inverse(model, x_test, y_test):
    """RMSE between targets and predictions after ``inverse_latency``.

    Both ``y_test`` and ``model.predict(x_test)`` are passed through
    ``inverse_latency`` before the error is computed.
    """
    raw_pred = model.predict(x_test)
    actual = inverse_latency(y_test)
    predicted = inverse_latency(raw_pred)
    squared_error = mean_squared_error(actual, predicted)
    return np.sqrt(squared_error)

In [None]:
# Inverse-transformed RMSE of each fitted model on the primary test split.
for report_fmt, fitted_model in (
    ("Gradient Boosting rmse: {}", GBoost),
    ("LGBM rmse : {}", model_lgb),
    ("Xgboost rmse : {}", model_xgb),
):
    print(report_fmt.format(rmse_with_inverse(fitted_model, x_test, y_test)))

In [None]:
# Same report as the previous cell, evaluated on the second ("_d") test split.
for report_fmt, fitted_model in (
    ("Gradient Boosting rmse: {}", GBoost),
    ("LGBM rmse : {}", model_lgb),
    ("Xgboost rmse : {}", model_xgb),
):
    print(report_fmt.format(rmse_with_inverse(fitted_model, x_test_d, y_test_d)))

In [None]:
def erratio_winverse(model, x_test, y_test):
    """Mean signed percentage error after ``inverse_latency``.

    Computes ``mean((pred - true) / true * 100)`` on the inverse-transformed
    values. The error is signed, so over- and under-predictions offset each
    other in the average.
    """
    raw_pred = model.predict(x_test)
    actual = np.array(inverse_latency(y_test))
    predicted = np.array(inverse_latency(raw_pred))
    percent_error = (predicted - actual) / actual * 100
    return percent_error.mean()

In [None]:
# Mean signed percentage error of each model on the primary test split.
# The LightGBM label previously read "LGBM rmse error ratio" (copy-paste slip);
# the printed value is the error ratio, not an RMSE.
print("Gradient Boosting error ratio : {}".format(erratio_winverse(GBoost, x_test, y_test)))
print("LGBM error ratio : {}".format(erratio_winverse(model_lgb, x_test, y_test)))
print("Xgboost error ratio : {}".format(erratio_winverse(model_xgb, x_test, y_test)))

In [None]:
# Mean signed percentage error of each model on the second ("_d") test split.
# The LightGBM label previously read "LGBM rmse error ratio" (copy-paste slip);
# the printed value is the error ratio, not an RMSE.
print("Gradient Boosting error ratio : {}".format(erratio_winverse(GBoost, x_test_d, y_test_d)))
print("LGBM error ratio : {}".format(erratio_winverse(model_lgb, x_test_d, y_test_d)))
print("Xgboost error ratio : {}".format(erratio_winverse(model_xgb, x_test_d, y_test_d)))

# XGBoost

In [None]:
# XGBoost RMSE on the second ("_d", discrete) test split.
# rmse() returns one scalar, so the former "(std)" field always printed 0.0000
# and has been removed.
score = rmse(model_xgb, x_test_d, y_test_d)
print("Xgboost score: {:.4f}\n".format(score))

In [None]:
# XGBoost predictions on the training split, plotted against the true targets.
xgb_train_pred = model_xgb.predict(x_train)

# Identity reference line. `x` and `y` are reused by the later plotting cells,
# so these names must stay defined here.
x = np.arange(-4,4)
y = x

plt.plot(x, y, color = 'gray')
# scatter(true, predicted): true targets on the horizontal axis, predictions on
# the vertical one - the axis labels below now match that order.
plt.scatter(y_train, xgb_train_pred, s=0.01, color='black')
plt.xlabel('real latency')
plt.ylabel('predicted latency')
plt.title('XGBoost Result with Continuous Data')
# Save before show(): with the inline backend the figure is gone once shown,
# so calling savefig afterwards writes an empty page.
plt.savefig('xgboost_result.pdf')
plt.show()

In [None]:
# XGBoost predictions on the test split, plotted against the true targets.
xgb_pred = model_xgb.predict(x_test)

# `x` / `y` (identity reference line) come from the first XGBoost plotting cell.
plt.plot(x, y, color = 'gray')
# scatter(true, predicted): the previous labels ('input vector' / 'latency')
# did not describe the plotted quantities.
plt.scatter(y_test, xgb_pred, s=0.01, color='black')
plt.xlabel('real latency')
plt.ylabel('predicted latency')
plt.title('XGBoost Result')
plt.show()

In [None]:
# XGBoost predictions on the "_d" (discrete) training split vs. true targets.
xgb_pred_d = model_xgb.predict(x_train_d)

# `x` / `y` (identity reference line) come from the first XGBoost plotting cell.
plt.plot(x, y, color = 'gray')
# scatter(true, predicted): axis labels now match the argument order.
plt.scatter(y_train_d, xgb_pred_d, s=0.01, color='black')
plt.xlabel('real latency')
plt.ylabel('predicted latency')
plt.title('XGBoost Result with Discrete Data')
# Save before show(): with the inline backend the figure is gone once shown,
# so calling savefig afterwards writes an empty page.
plt.savefig('xgboost_result_with_d.pdf', format='pdf')
plt.show()

In [None]:
# comparing discrete

# XGBoost predictions on the "_d" (discrete) test split vs. true targets.
xgb_pred_d = model_xgb.predict(x_test_d)

# `x` / `y` (identity reference line) come from the first XGBoost plotting cell.
plt.plot(x, y, color = 'gray')
# scatter(true, predicted): the previous labels ('input vector' / 'latency')
# did not describe the plotted quantities.
plt.scatter(y_test_d, xgb_pred_d, s=0.01, color='black')
plt.xlabel('real latency')
plt.ylabel('predicted latency')
plt.title('XGBoost Result with Discrete (TEST)')
plt.show()

In [None]:
# Dump the fitted XGBoost model's per-feature importance values as plain text.
print(model_xgb.feature_importances_)

In [None]:
# Bar chart of the XGBoost feature importances, one bar per feature index.
importances = model_xgb.feature_importances_
bar_positions = range(len(importances))
plt.bar(bar_positions, importances, color='black')
# The figure is written to disk first, then rendered in the notebook.
plt.savefig('feature_importance_graph.pdf', format='pdf')
plt.show()


In [None]:
from xgboost import plot_importance
# Importance plot for every feature of the fitted XGBoost model.
plot_importance(model_xgb, color='black')
# Save before show(): with the inline backend the figure is gone once shown,
# so calling savefig afterwards writes an empty page.
plt.savefig('feature_importance_all.pdf', format='pdf')
plt.show()

In [None]:
# Importance plot limited to the ten highest-ranked features.
plot_importance(model_xgb, max_num_features=10, color='black')
# Save before show(): with the inline backend the figure is gone once shown,
# so calling savefig afterwards writes an empty page.
plt.savefig('feature_importance_top10.pdf', format='pdf')
plt.show()

# Gradient Boost

In [None]:
# Gradient Boosting RMSE on the second ("_d", discrete) test split.
# rmse() returns one scalar, so the former "(std)" field always printed 0.0000
# and has been removed.
score = rmse(GBoost, x_test_d, y_test_d)
print("Gradient Boosting score: {:.4f}\n".format(score))

In [None]:
# Gradient Boosting predictions on the training split vs. true targets.
gb_pred = GBoost.predict(x_train)

# `x` / `y` (identity reference line) come from the first XGBoost plotting cell.
plt.plot(x, y, color = 'gray')
# scatter(true, predicted): axis labels now match the argument order.
plt.scatter(y_train, gb_pred, s=0.01, color='black')
plt.xlabel('real latency')
plt.ylabel('predicted latency')
plt.title('Gradient Boosting Result with Continuous Data')
# Save before show(): with the inline backend the figure is gone once shown,
# so calling savefig afterwards writes an empty page.
plt.savefig('gb_result.pdf', format='pdf')
plt.show()

In [None]:
# Gradient Boosting predictions on the test split vs. true targets.
gb_pred = GBoost.predict(x_test)

# `x` / `y` (identity reference line) come from the first XGBoost plotting cell.
plt.plot(x, y, color = 'gray')
# scatter(true, predicted): the previous labels ('input vector' / 'latency')
# did not describe the plotted quantities.
plt.scatter(y_test, gb_pred, s=0.01, color='black')
plt.xlabel('real latency')
plt.ylabel('predicted latency')
plt.title('Gradient Boost Result (TEST)')
plt.show()

In [None]:
# Gradient Boosting predictions on the "_d" (discrete) training split vs. true targets.
gb_pred = GBoost.predict(x_train_d)

# `x` / `y` (identity reference line) come from the first XGBoost plotting cell.
plt.plot(x, y, color = 'gray')
# scatter(true, predicted): axis labels now match the argument order.
plt.scatter(y_train_d, gb_pred, s=0.01, color='black')
plt.xlabel('real latency')
plt.ylabel('predicted latency')
plt.title('Gradient Boosting Result with Discrete Data')
# Save before show() (afterwards the inline backend has already dropped the
# figure), and use a distinct file name: 'gb_result.pdf' is already written by
# the continuous-data plot and was being overwritten here. The '_with_d' suffix
# follows the XGBoost / LightGBM cells.
plt.savefig('gb_result_with_d.pdf', format='pdf')
plt.show()

In [None]:
# Gradient Boosting predictions on the "_d" (discrete) test split vs. true targets.
gb_pred = GBoost.predict(x_test_d)

# `x` / `y` (identity reference line) come from the first XGBoost plotting cell.
plt.plot(x, y, color = 'gray')
# scatter(true, predicted): the previous labels ('input vector' / 'latency')
# did not describe the plotted quantities.
plt.scatter(y_test_d, gb_pred, s=0.01, color='black')
plt.xlabel('real latency')
plt.ylabel('predicted latency')
plt.title('Gradient Boost Result with Discrete (TEST)')
plt.show()

# Light GBM

In [None]:
# LightGBM RMSE on the second ("_d", discrete) test split.
# rmse() returns one scalar, so the former "(std)" field always printed 0.0000
# and has been removed.
score = rmse(model_lgb, x_test_d, y_test_d)
print("LGBM score: {:.4f}\n".format(score))

In [None]:
# LightGBM predictions on the training split vs. true targets.
lgb_pred = model_lgb.predict(x_train)

# `x` / `y` (identity reference line) come from the first XGBoost plotting cell.
plt.plot(x, y, color = 'gray')
# scatter(true, predicted): axis labels now match the argument order.
plt.scatter(y_train, lgb_pred, s=0.01, color='black')
plt.xlabel('real latency')
plt.ylabel('predicted latency')
plt.title('LGBM Result with Continuous Data')
# Save before show(): with the inline backend the figure is gone once shown,
# so calling savefig afterwards writes an empty page.
plt.savefig('lgbm_result.pdf', format='pdf')
plt.show()

In [None]:
# LightGBM predictions on the test split vs. true targets.
lgb_pred = model_lgb.predict(x_test)

# `x` / `y` (identity reference line) come from the first XGBoost plotting cell.
plt.plot(x, y, color = 'gray')
# scatter(true, predicted): the previous labels ('input vector' / 'latency')
# did not describe the plotted quantities.
plt.scatter(y_test, lgb_pred, s=0.01, color='black')
plt.xlabel('real latency')
plt.ylabel('predicted latency')
plt.title('LGBM Result (TEST)')
plt.show()

In [None]:
# LightGBM predictions on the "_d" (discrete) training split vs. true targets.
lgb_pred = model_lgb.predict(x_train_d)

# `x` / `y` (identity reference line) come from the first XGBoost plotting cell.
plt.plot(x, y, color = 'gray')
# scatter(true, predicted): axis labels now match the argument order.
plt.scatter(y_train_d, lgb_pred, s=0.01, color='black')
plt.xlabel('real latency')
plt.ylabel('predicted latency')
plt.title('LGBM Result with Discrete Data')
# Save before show(): with the inline backend the figure is gone once shown,
# so calling savefig afterwards writes an empty page.
plt.savefig('lgbm_result_with_d.pdf', format='pdf')
plt.show()

In [None]:
# LightGBM predictions on the "_d" (discrete) test split vs. true targets.
lgb_pred = model_lgb.predict(x_test_d)

# `x` / `y` (identity reference line) come from the first XGBoost plotting cell.
plt.plot(x, y, color = 'gray')
# scatter(true, predicted): the previous labels ('input vector' / 'latency')
# did not describe the plotted quantities.
plt.scatter(y_test_d, lgb_pred, s=0.01, color='black')
plt.xlabel('real latency')
plt.ylabel('predicted latency')
plt.title('LGBM Result with Discrete (TEST)')
plt.show()

In [None]:
# ISSUE: what do these results imply?
# The error ratio is about 30% for now, so it may be acceptable - to be confirmed.