In [1]:
import numpy as np # linear algebra
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import pandas as pd

import torchvision

In [3]:
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import GridSearchCV

In [4]:
#Validation function
n_folds = 10

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(x_train.values)
    rmse= np.sqrt(-cross_val_score(model, x_train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [6]:
import data_pipeline as dp
from data_pipeline.data_preprocessor import PreProcessor as pp
# load train set, test set


x_train, y_train, x_test, y_test = pp("final_data").process2()
# print(X_train[0], Y_train[0])

# x_train = Variable(torch.tensor(X_train.value))
# y_train = Variable(torch.tensor(Y_train.value))
y_train

7747   -0.099009
7999    0.658063
6267   -0.389598
3189   -1.175357
8329    0.193445
          ...   
4344   -0.198006
833    -1.087837
547    -0.688432
9556    0.262130
8644    0.203304
Name: latency, Length: 11749, dtype: float64

In [None]:
# -----------------------------------------------------------------

In [7]:
# lasso regression

lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=3))

In [8]:
# ElasticNet regression 

ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))

In [9]:
# kernel Ridge regression

KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5) #kernel = 'rbf' , 'sigmoid' 

In [10]:
# Gradient Boosting Regression

GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)

In [11]:
# LGB Regressor

model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)

In [12]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0, 1"


In [13]:
# XGBoost Algorithm (without GridSearch)

model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, random_state =7, nthread = -1)

In [14]:
# SVR with GridSearch

model_svr = SVR(C=1, cache_size=200, coef0=0, degree=3, epsilon=0.0, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)


#grid_search.best_estimator_

In [15]:
# Random Forest Regressor with GridSearch


regr = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=150, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=90, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

#grid_search.best_estimator_

In [None]:
# -----------------------------------------------------

In [16]:
# lasso score
score = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


Lasso score: 0.4193 (0.0098)



In [17]:
# ElasticNet score
score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

ElasticNet score: 0.4194 (0.0098)



In [18]:
# Kernel Rigde score
score = rmsle_cv(KRR)
print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Kernel Ridge score: 0.2753 (0.0119)



In [19]:
# Gradient Boosting score
score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Gradient Boosting score: 0.2446 (0.0138)



In [20]:
# Light GBM score
score = rmsle_cv(model_lgb)
print("LGBM score: {:.4f} ({:.4f})\n" .format(score.mean(), score.std()))

LGBM score: 0.2707 (0.0127)



In [21]:
# XGBoost score
score = rmsle_cv(model_xgb)
print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Xgboost score: 0.2402 (0.0130)



In [22]:
# SVR score 
score = rmsle_cv(model_svr)
print("SVR score: {:.4f} ({:.4f})\n" .format(score.mean(), score.std()))

SVR score: 0.2764 (0.0111)



In [23]:
# Random Forest score
score = rmsle_cv(regr)
print("Random Forest score: {:.4f} ({:.4f})\n" .format(score.mean(), score.std()))

Random Forest score: 0.2308 (0.0120)



In [None]:
# ----------------------prediction---------------------------

In [None]:
#define a rmsle evaluation function

def rmsle(y, y_pred): 
    return np.sqrt(mean_squared_error(y, y_pred))

In [None]:
# XGBoost

model_xgb.fit(x_train, y_train)
xgb_train_pred = model_xgb.predict(x_train)
xgb_pred = np.expm1(model_xgb.predict(X_test))
print(rmsle(y_train, xgb_train_pred))

In [None]:
# LightGBM

model_lgb.fit(x_train, y_train)
lgb_train_pred = model_lgb.predict(x_train)
lgb_pred = np.expm1(model_lgb.predict(x_test.values))
print(rmsle(y_train, lgb_train_pred))

In [None]:
GBoost.fit(x_train,y_train)
GB_train_pred = GBoost.predict(x_train)
GB_pred = np.expm1(GBoost.predict(x_test.values))
print(rmsle(y_train, GB_train_pred))

In [None]:
ENet.fit(train,y_train)
ENet_train_pred = ENet.predict(x_train)
ENet_pred = np.expm1(ENet.predict(x_test.values))
print(rmsle(y_train, ENet_train_pred))

In [None]:
# ensemble

ensemble = xgb_pred*0.25 + lgb_pred*0.25 + GB_pred*0.5