## Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from math import radians, cos, sin, asin, sqrt
import datetime as dt

import sklearn.metrics as metrics

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelEncoder
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.pipeline import Pipeline
from xgboost.sklearn import XGBRegressor

import warnings
warnings.filterwarnings('ignore')

pd.set_option("display.max_columns", 60)
pd.set_option("display.max_rows", 500)
%matplotlib inline
sns.set()

## Read Data

In [2]:
df_twn_hkg_flights = pd.read_csv('../data/filtered_flights_twn_hkg.csv', index_col=0)

In [3]:
# Convert the four *_dt columns to pandas datetime dtype in one pass.
for dt_col in ('scheduled_departure_dt', 'scheduled_arrival_dt',
               'real_departure_dt', 'estimated_arrival_dt'):
    df_twn_hkg_flights[dt_col] = pd.to_datetime(df_twn_hkg_flights[dt_col])

In [4]:
# Count missing values per column and show only the columns that have any.
null_counts = df_twn_hkg_flights.isnull().sum()
null_counts[null_counts > 0]

aircraft_model               1038
aircraft_registration        1038
airline                      1038
scheduled_departure_utc      5766
scheduled_arrival_utc        5766
estimated_arrival_utc      167191
scheduled_departure_dt       5766
scheduled_arrival_dt         5766
weather_codes              147141
prev_latitude                   3
prev_longitude                  3
dtype: int64

## Feature Engineering

In [5]:
# Number of position reports per 10-minute bucket since departure.
# The bucket labels are strings, so sort_index() orders them lexicographically
# (e.g. '100 < min <= 110' is listed before '20 < min <=30'), not chronologically.
df_twn_hkg_flights['minutes_since_real_departure_interval'].value_counts().sort_index()

0 <= min <= 10      29067
10 < min <= 20       8202
100 < min <= 110     3880
110 < min <= 120      328
20 < min <=30        8881
30 < min <= 40       7532
40 < min <= 50       8446
50 < min <= 60       8480
60 < min <= 70      14669
70 < min <= 80      34607
80 < min <= 90      32189
90 < min <= 100     15485
Name: minutes_since_real_departure_interval, dtype: int64

In [6]:
# Collapse the position reports into one row per flight and 10-minute bucket:
#   time_since_real_departure -> largest value in the bucket (the last report)
#   mean_speed / std_dev_speed -> mean and standard deviation of `speed`
# The reductions are named by string rather than passed as the `max` / `np.mean` /
# `np.std` callables: pandas maps those callables to these same built-in
# reductions, but passing them is deprecated in recent pandas releases.
df_for_modeling = (
    df_twn_hkg_flights
    .groupby(['flight_id', 'minutes_since_real_departure_interval'])
    .agg(
        time_since_real_departure=('time_since_real_departure', 'max'),
        mean_speed=('speed', 'mean'),
        std_dev_speed=('speed', 'std'),
    )
    .reset_index()
)

df_for_modeling.head()

Unnamed: 0,flight_id,minutes_since_real_departure_interval,time_since_real_departure,mean_speed,std_dev_speed
0,c0aabc0,0 <= min <= 10,583.0,266.609756,78.348541
1,c0aabc0,10 < min <= 20,1196.0,437.4,8.221922
2,c0aabc0,20 < min <=30,1781.0,432.444444,1.236033
3,c0aabc0,30 < min <= 40,2358.0,431.666667,1.732051
4,c0aabc0,40 < min <= 50,2964.0,437.214286,2.044827


In [7]:
# Inner-join the per-bucket aggregates back onto the raw reports. Because
# time_since_real_departure is one of the join keys, only the raw rows whose
# timestamp equals the bucket maximum survive the join.
merge_keys = ['flight_id', 'minutes_since_real_departure_interval',
              'time_since_real_departure']
df_combined = df_for_modeling.merge(df_twn_hkg_flights, on=merge_keys)

df_combined.head()

Unnamed: 0,flight_id,minutes_since_real_departure_interval,time_since_real_departure,mean_speed,std_dev_speed,timestamp_utc,latitude,longitude,altitude,heading,speed,flight_callsign,aircraft_model,aircraft_registration,airline,origin,destination,scheduled_departure_utc,scheduled_arrival_utc,real_departure_utc,estimated_arrival_utc,real_flight_duration,scheduled_departure_dt,scheduled_arrival_dt,real_departure_dt,estimated_arrival_dt,route,scheduled_flight_duration,forecasted_arrival_dt,forecasted_arrival_dt_nearest_hr,...,rel_humidity,wind_dir,wind_speed,altimeter_pressure,visibility,gust_speed,sky_level_1_coverage,sky_level_1_altitude,weather_codes,apparent_temp,weather_codes_+SHRA,weather_codes_+TSRA,weather_codes_-BR,weather_codes_-DZ,weather_codes_-RA,weather_codes_-SHRA,weather_codes_BR,weather_codes_DZ,weather_codes_HZ,weather_codes_RA,weather_codes_SHRA,weather_codes_TSRA,weather_codes_VCTS,speed_interval,prev_latitude,prev_longitude,calculated_flight_duration,calculated_time_before_arrival,displacement_to_hkg,displacement_fr_twn
0,c0aabc0,0 <= min <= 10,583.0,266.609756,78.348541,1483195308,24.916599,120.714203,25725,225,432,CAL,Boeing 747,B-18717,China Airlines Cargo,Taiwan Taoyuan International Airport,Hong Kong International Airport,1483193000.0,1483198000.0,1483195000.0,,4663.0,2016-12-31 14:00:00,2016-12-31 15:30:00,2016-12-31 14:32:05,2016-12-31 15:49:48,Taiwan Taoyuan International Airport -> Hong K...,5400.0,2016-12-31 15:49:48,2016-12-31 15:00:00,...,77.6,110,10.0,30.18,6.21,0,FEW,2500.0,,66.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,400 < speed <= 500,25.004745,120.811821,5115.0,4532.0,750.539171,53.93731
1,c0aabc0,10 < min <= 20,1196.0,437.4,8.221922,1483195921,24.04525,119.740303,38000,225,430,CAL,Boeing 747,B-18717,China Airlines Cargo,Taiwan Taoyuan International Airport,Hong Kong International Airport,1483193000.0,1483198000.0,1483195000.0,,4663.0,2016-12-31 14:00:00,2016-12-31 15:30:00,2016-12-31 14:32:05,2016-12-31 15:49:48,Taiwan Taoyuan International Airport -> Hong K...,5400.0,2016-12-31 15:49:48,2016-12-31 15:00:00,...,77.6,110,10.0,30.18,6.21,0,FEW,2500.0,,66.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,400 < speed <= 500,24.13603,119.841408,5115.0,3919.0,625.606802,188.339027
2,c0aabc0,20 < min <=30,1781.0,432.444444,1.236033,1483196506,23.224211,118.834839,38000,225,434,CAL,Boeing 747,B-18717,China Airlines Cargo,Taiwan Taoyuan International Airport,Hong Kong International Airport,1483193000.0,1483198000.0,1483195000.0,,4663.0,2016-12-31 14:00:00,2016-12-31 15:30:00,2016-12-31 14:32:05,2016-12-31 15:49:48,Taiwan Taoyuan International Airport -> Hong K...,5400.0,2016-12-31 15:49:48,2016-12-31 15:00:00,...,77.6,110,10.0,30.18,6.21,0,FEW,2500.0,,66.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,400 < speed <= 500,23.318529,118.937943,5115.0,3334.0,514.244163,317.562958
3,c0aabc0,30 < min <= 40,2358.0,431.666667,1.732051,1483197083,22.537901,117.843002,38000,237,431,CAL,Boeing 747,B-18717,China Airlines Cargo,Taiwan Taoyuan International Airport,Hong Kong International Airport,1483193000.0,1483198000.0,1483195000.0,,4663.0,2016-12-31 14:00:00,2016-12-31 15:30:00,2016-12-31 14:32:05,2016-12-31 15:49:48,Taiwan Taoyuan International Airport -> Hong K...,5400.0,2016-12-31 15:49:48,2016-12-31 15:00:00,...,77.6,110,10.0,30.18,6.21,0,FEW,2500.0,,66.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,400 < speed <= 500,22.611099,117.966003,5115.0,2757.0,404.190118,444.40611
4,c0aabc0,40 < min <= 50,2964.0,437.214286,2.044827,1483197689,22.315187,116.577507,31625,268,440,CAL,Boeing 747,B-18717,China Airlines Cargo,Taiwan Taoyuan International Airport,Hong Kong International Airport,1483193000.0,1483198000.0,1483195000.0,,4663.0,2016-12-31 14:00:00,2016-12-31 15:30:00,2016-12-31 14:32:05,2016-12-31 15:49:48,Taiwan Taoyuan International Airport -> Hong K...,5400.0,2016-12-31 15:49:48,2016-12-31 15:00:00,...,77.6,110,10.0,30.18,6.21,0,FEW,2500.0,,66.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,400 < speed <= 500,22.318268,116.714355,5115.0,2151.0,273.531907,563.43905


In [8]:
# Keep only the rows of the first 10-minute bucket after departure. The explicit
# .copy() makes this an independent frame, so columns added to it later are
# written to the frame itself instead of to a temporary slice of df_combined
# (the chained-assignment pattern pandas warns about).
df_first_10_min = df_combined[
    df_combined['minutes_since_real_departure_interval'] == '0 <= min <= 10'
].copy()

Maximum time since departure (in seconds) among the rows of the first 10-minute interval:

In [9]:
df_first_10_min['time_since_real_departure'].max()

600.0

In [10]:
# Average velocity since departure: displacement_fr_twn divided by time_since_real_departure.
# NOTE(review): the elapsed time looks like seconds (the bucket maximum is 600.0); the unit
# of displacement_fr_twn is not visible here — confirm before interpreting the magnitude.
df_first_10_min['average_velocity'] = df_first_10_min['displacement_fr_twn'] / df_first_10_min['time_since_real_departure']

**Correlation of numerical variables**

In [12]:
df_first_10_min.dtypes

flight_id                                        object
minutes_since_real_departure_interval            object
time_since_real_departure                       float64
mean_speed                                      float64
std_dev_speed                                   float64
timestamp_utc                                     int64
latitude                                        float64
longitude                                       float64
altitude                                          int64
heading                                           int64
speed                                             int64
flight_callsign                                  object
aircraft_model                                   object
aircraft_registration                            object
airline                                          object
origin                                           object
destination                                      object
scheduled_departure_utc                         

In [None]:
# TODO: sky_level_1_coverage holds string categories (e.g. 'FEW') and must be
# dummy (one-hot) encoded before it can be used as a model feature.

In [None]:
target_var = 'calculated_time_before_arrival'

In [None]:
numeric_features = ['time_since_real_departure', 'latitude', 'longitude',
                    'altitude', 'heading', 'speed',
                    'prev_latitude', 'prev_longitude', 
                    'displacement_to_hkg', 'displacement_fr_twn',
                    'average_velocity', 'mean_speed', 'std_dev_speed',
                    'arrivals',
                    'air_temp', 'dewpoint_temp', 'rel_humidity']

air_temp                                        float64
dewpoint_temp                                   float64
rel_humidity                                    float64
wind_dir                                          int64
wind_speed                                      float64
altimeter_pressure                              float64
visibility                                      float64
gust_speed                                        int64
sky_level_1_altitude                            float64
displacement_to_hkg                             float64
displacement_fr_twn                             float64
average_velocity                                float64

In [None]:
# numeric_features = df_first_10_min.dtypes[(df_first_10_min.dtypes == np.int)|
#                                           (df_first_10_min.dtypes == np.float)].index.tolist()
# numeric_features.remove('calculated_time_before_arrival')

In [None]:
nominal_features = [
    'flight_callsign', 
    'aircraft_registration',
    'aircraft_model', 
    'airline'
]

In [None]:
corr = df_first_10_min[numeric_features + [target_var]].corr()[target_var]
corr.sort_values()

In [None]:
sns.pairplot(df_first_10_min, y_vars=numeric_features, x_vars=[target_var])

In [None]:
# # Set the default matplotlib figure size to 7x7:
# fix, ax = plt.subplots(figsize=(7,7))

# # Generate a mask for the upper triangle (taken from seaborn example gallery)
# mask = np.zeros_like(wine_corr, dtype=np.bool)
# mask[np.triu_indices_from(mask)] = True

# # Plot the heatmap with seaborn.
# # Assign the matplotlib axis the function returns. This will let us resize the labels.
# ax = sns.heatmap(wine_corr, mask=mask, ax=ax)

# # Resize the labels.
# ax.set_xticklabels(ax.xaxis.get_ticklabels(), fontsize=14)
# ax.set_yticklabels(ax.yaxis.get_ticklabels(), fontsize=14)

# # If you put plt.show() at the bottom, it prevents those useless printouts from matplotlib.
# plt.show()

**ANOVA of categorical (nominal) variables**

In [None]:
df_minus_cat_nan = df_first_10_min.dropna(subset=nominal_features)

In [None]:
df_cat_features = df_minus_cat_nan[nominal_features]

In [None]:
# One-way ANOVA per nominal feature: does the mean of the (continuous) target
# differ between the categories of that feature?
#
# The previous call, f_classif(label_encoded_categories, target), had the roles
# reversed: f_classif treats its second argument as the grouping variable, so it
# grouped rows by each distinct target value and compared arbitrary label-encoder
# integers across those groups. Here the categories define the groups and the
# target supplies the values being compared.
def _anova_target_by_category(frame, feature, target):
    """Return (F, p) of a one-way ANOVA of `target` across the categories of `feature`.

    Returns (nan, nan) when the feature has fewer than two categories, because
    the test is undefined in that case.
    """
    samples = [values.to_numpy() for _, values in frame.groupby(feature)[target]]
    if len(samples) < 2:
        return np.nan, np.nan
    f_stat, p_val = stats.f_oneway(*samples)
    return f_stat, p_val


_anova_results = [
    _anova_target_by_category(df_minus_cat_nan, feature, target_var)
    for feature in df_cat_features.columns
]

# One entry per column of df_cat_features, in the same order.
f_values = np.array([f_stat for f_stat, _ in _anova_results], dtype=float)
p_values = np.array([p_val for _, p_val in _anova_results], dtype=float)

p_values

In [None]:
# p_values holds one entry per column of df_cat_features, so the significant
# feature names have to be looked up there. Indexing df_minus_cat_nan by those
# positions would return whichever unrelated columns sit first in the full frame.
df_cat_features.columns[np.asarray(p_values) < 0.05].tolist()

In [None]:
selected_features = [
    'latitude', 'longitude', 
    'altitude', 
#     'heading', 
    'speed',
#     'flight_callsign', 
#     'aircraft_registration', 
#     'aircraft_model', 
#     'airline', 
#     'average_velocity',
    'displacement_to_hkg',
    'mean_speed',
    'std_dev_speed'
]

# selected_features = ['time_since_real_departure', 'latitude', 'longitude',
#                     'altitude', 'heading', 'speed',
#                     'prev_latitude', 'prev_longitude', 
#                     'displacement_to_hkg', 'displacement_fr_twn',
#                     'average_velocity']

nominal_features = [
#     'flight_callsign', 
#     'aircraft_registration',
#     'aircraft_model', 
#     'airline'
]

df_first_10_min[selected_features].head()

In [None]:
df_twn_hkg_flights.loc[df_twn_hkg_flights['flight_id'] == 'c0aabc0', 'calculated_time_before_arrival'].plot()

## Modeling

In [None]:
def r2_adj(y_true, y_preds, p):
    """Adjusted R^2 of a set of predictions.

    Computes 1 - [SS_res / (n - p - 1)] / [SS_tot / (n - 1)], where n is
    len(y_true) and p is the number of predictors.

    Parameters
    ----------
    y_true : array-like supporting element-wise arithmetic
        Observed target values.
    y_preds : array-like
        Predicted values, aligned with ``y_true``.
    p : int
        Number of predictors used by the model.
    """
    n_obs = len(y_true)
    resid_ss = np.sum(np.square(y_true - y_preds))
    total_ss = np.sum(np.square(y_true - np.mean(y_true)))

    # Each sum of squares is divided by its degrees of freedom.
    resid_ms = resid_ss / (n_obs - p - 1)
    total_ms = total_ss / (n_obs - 1)
    return 1 - (resid_ms / total_ms)

def get_regression_metrics(y_true, y_pred, p):
    """Compute, print and return the regression metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values.
    p : int
        Number of predictors, forwarded to ``r2_adj``.

    Returns
    -------
    dict
        Keys 'mse', 'rmse', 'mae', 'r2' and 'r2_adjusted'. Note that 'mae'
        holds the *median* absolute error, not the mean absolute error.
    """
    mse = metrics.mean_squared_error(y_true, y_pred)
    results = {
        'mse': mse,
        'rmse': np.sqrt(mse),
        'mae': metrics.median_absolute_error(y_true, y_pred),
        'r2': metrics.r2_score(y_true, y_pred),
        'r2_adjusted': r2_adj(y_true, y_pred, p),
    }

    # Printed label for each result key, in display order.
    report_lines = (
        ('Mean squared error      = ', 'mse'),
        ('Root mean squared error = ', 'rmse'),
        ('Median absolute error   = ', 'mae'),
        ('R^2                     = ', 'r2'),
        ('Adjusted R^2            = ', 'r2_adjusted'),
    )
    for label, key in report_lines:
        print(label, results[key])

    return results

In [None]:
# Train data

# mask = df_twn_hkg_flights['time_since_real_departure'] <= (40*60)
# df_for_modeling = df_twn_hkg_flights[mask].copy()
X = df_first_10_min[selected_features].copy()
y = df_first_10_min['calculated_time_before_arrival'].copy()

In [None]:
y.hist()

In [None]:
X = pd.get_dummies(X, columns=nominal_features, drop_first=True)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, shuffle=True)

### Linear Regression

In [None]:
linreg = LinearRegression()
linreg.fit(X_train, y_train)

In [None]:
# X_train.columns

In [None]:
# plt.figure(figsize=(12,10))
# plt.barh(X_train.columns, linreg.coef_)

In [None]:
y_train_pred = linreg.predict(X_train)

train_regression_metrics = get_regression_metrics(y_train, y_train_pred, X_train.shape[1])

In [None]:
X_test.shape

In [None]:
# Score the fitted linear model on the held-out split.
y_test_pred = linreg.predict(X_test)
test_regression_metrics = get_regression_metrics(y_test, y_test_pred, X_test.shape[1])

# Observed and predicted values side by side; diff = true - pred.
df_test_true_pred = pd.DataFrame({'true': y_test, 'pred': y_test_pred})
df_test_true_pred['diff'] = df_test_true_pred['true'] - df_test_true_pred['pred']

In [None]:
df_test_true_pred.shape

In [None]:
df_test_true_pred[df_test_true_pred['diff'] < 0].shape # if true < pred => earlier than expected

In [None]:
# df_test_true_pred[df_test_true_pred.pred < 0].shape

In [None]:
# Rank the features by absolute coefficient size, largest first.
coef_magnitudes = zip(X_test.columns, np.abs(linreg.coef_))
sorted(coef_magnitudes, key=lambda pair: pair[1], reverse=True)

In [None]:
pd.Series(linreg.coef_, index=X_test.columns).plot.bar(figsize=(15, 7))

In [None]:
df_test_true_pred.true.hist()

In [None]:
def regression_diagnostic_plots(df_true_pred):
    """Draw residual, normal Q-Q and scale-location plots for a set of predictions.

    Parameters
    ----------
    df_true_pred : pandas.DataFrame
        Must contain the columns 'true' (observed target) and 'pred'
        (model prediction).

    Returns
    -------
    None
        The plots are drawn on a new 2x2 figure; the fourth panel is left blank.
    """
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
    axes = axes.ravel()

    residuals = df_true_pred['true'] - df_true_pred['pred']

    # Residual plot: validating the assumption of linearity.
    # seaborn regresses 'true' on 'pred' and plots the residuals of that fit.
    # x / y are passed by keyword because seaborn >= 0.12 no longer accepts
    # the data variables positionally.
    sns.residplot(x='pred', y='true', data=df_true_pred, lowess=True,
                  line_kws={'color': 'red', 'lw': 1, 'alpha': 1},
                  ax=axes[0])
    axes[0].set_title('Residual Plot')
    axes[0].set_ylabel('Residuals')
    axes[0].set_xlabel('Fitted Values')

    # Normal Q-Q plot: validating the assumption of normally distributed residuals.
    stats.probplot(residuals, dist='norm', plot=axes[1])
    axes[1].set_title('Normal Q-Q Plot')

    # Scale-location plot: validating the assumption of homoscedasticity.
    # The quantity plotted is sqrt(|residual|) of the raw (unscaled) residuals,
    # so the axis label says exactly that rather than "standardized".
    residuals_abs_sqrt = np.sqrt(np.abs(residuals))
    sns.regplot(x=df_true_pred['pred'], y=residuals_abs_sqrt,
                scatter=True,
                lowess=True,
                line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8},
                ax=axes[2])
    axes[2].set_title("Scale-Location Plot")
    axes[2].set_ylabel("sqrt(|Residuals|)")
    axes[2].set_xlabel("Fitted Values")

    axes[3].axis('off')

    fig.tight_layout()

In [None]:
regression_diagnostic_plots(df_test_true_pred)

**Standardization of features**

In [None]:
# Learn the scaling parameters from the training split only, then apply the
# same transform to both splits.
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [None]:
# Candidate penalty strengths: 200 log-spaced values from 1e0 to 1e5.
r_alphas = np.logspace(0, 5, 200)
ridge_model = RidgeCV(alphas=r_alphas, store_cv_values=True).fit(X_train_sc, y_train)

# Score the selected model on the (scaled) held-out split.
y_test_pred = ridge_model.predict(X_test_sc)
test_regression_metrics = get_regression_metrics(y_test, y_test_pred, X_test.shape[1])

# Observed and predicted values side by side; diff = true - pred.
df_test_true_pred = pd.DataFrame({'true': y_test, 'pred': y_test_pred})
df_test_true_pred['diff'] = df_test_true_pred['true'] - df_test_true_pred['pred']

In [None]:
df_test_regression_metrics = pd.DataFrame()
df_test_true_pred = pd.DataFrame(columns=['model_name', 'true', 'pred'])

In [None]:
def gridsearch_model_evaluation(model_name, model, hyper_param):
    """Grid-search one regressor, then report its metrics on the held-out split.

    The function reads the notebook-level variables ``X_train_sc``, ``y_train``,
    ``X_test_sc``, ``X_test`` and ``y_test``; they must exist before it is called.

    Parameters
    ----------
    model_name : str
        Label printed with the results and stored under 'model_name' in the
        returned metrics.
    model : estimator
        Unfitted regressor; it becomes the pipeline step named 'reg'.
    hyper_param : dict
        Parameter grid for GridSearchCV. Keys must carry the 'reg__' prefix.

    Returns
    -------
    (dict, array-like)
        The metrics from ``get_regression_metrics`` plus 'model_name', and the
        test-set predictions of the best estimator.
    """
    reg_pipe = Pipeline([
        ('reg', model)
    ])

    # Show the tunable parameter names so the grid keys can be checked against them.
    print(reg_pipe.get_params().keys())
    gs = GridSearchCV(reg_pipe,
                      param_grid=hyper_param,
                      verbose=10, n_jobs=-1)
    gs.fit(X_train_sc, y_train)

    print(model_name)
    print('Best Score: {}'.format(gs.best_score_))
    print('Best Params: {}'.format(gs.best_params_))

    # Best model found by the search
    grid_model = gs.best_estimator_

    print('Model Score on X_train: {}'.format(grid_model.score(X_train_sc, y_train)))

    # Prediction and score on the held-out split
    y_test_pred = grid_model.predict(X_test_sc)

    # Computed once: a second call used to overwrite this dict, which dropped
    # the 'model_name' entry and printed every metric twice.
    test_regression_metrics = get_regression_metrics(y_test, y_test_pred, X_test.shape[1])
    test_regression_metrics['model_name'] = model_name

    df_test_true_pred = pd.DataFrame({'true': y_test, 'pred': y_test_pred})
    df_test_true_pred['diff'] = df_test_true_pred['true'] - df_test_true_pred['pred']
    regression_diagnostic_plots(df_test_true_pred)

    return test_regression_metrics, y_test_pred

## Ridge Regression

In [None]:
# Tune the ridge penalty over 200 log-spaced values from 1e0 to 1e5.
test_regression_metrics, y_test_pred = gridsearch_model_evaluation(
    'Ridge Regression',
    Ridge(),
    {'reg__alpha': np.logspace(0, 5, 200)},
)

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row on
# every pandas version.
df_test_regression_metrics = pd.concat(
    [df_test_regression_metrics, pd.DataFrame([test_regression_metrics])],
    ignore_index=True,
)

## Lasso Regression

In [None]:
# Tune the lasso penalty from 0.001 up to (but excluding) 0.15 in steps of 0.0025.
test_regression_metrics, y_test_pred = gridsearch_model_evaluation(
    'Lasso Regression',
    Lasso(),
    {'reg__alpha': np.arange(0.001, 0.15, 0.0025)},
)

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row on
# every pandas version.
df_test_regression_metrics = pd.concat(
    [df_test_regression_metrics, pd.DataFrame([test_regression_metrics])],
    ignore_index=True,
)

## ElasticNet Regression

In [None]:
# Tune the elastic-net penalty strength with the L1/L2 mix fixed at 0.5.
test_regression_metrics, y_test_pred = gridsearch_model_evaluation(
    'ElasticNet Regression',
    ElasticNet(),
    {'reg__alpha': np.arange(0.5, 1.0, 0.005),
     'reg__l1_ratio': [0.5]},
)

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row on
# every pandas version.
df_test_regression_metrics = pd.concat(
    [df_test_regression_metrics, pd.DataFrame([test_regression_metrics])],
    ignore_index=True,
)

## Random Forest Regression

In [None]:
# https://gdcoder.com/decision-tree-regressor-explained-in-depth/
# https://medium.com/datadriveninvestor/random-forest-regression-9871bc9a25eb

# random_state is fixed so the bootstrap samples, and therefore the reported
# metrics, are the same on every run.
test_regression_metrics, y_test_pred = gridsearch_model_evaluation(
    'Random Forest Regression',
    RandomForestRegressor(random_state=0),
    {'reg__max_depth': range(3, 7),
     'reg__n_estimators': np.arange(10, 70, 10)},
)

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row on
# every pandas version.
df_test_regression_metrics = pd.concat(
    [df_test_regression_metrics, pd.DataFrame([test_regression_metrics])],
    ignore_index=True,
)

## Support Vector Regression

In [None]:
# https://medium.com/pursuitnotes/support-vector-regression-in-6-steps-with-python-c4569acd062d
# https://stackoverflow.com/questions/40568808/coefficient-in-support-vector-regression-svr-using-grid-search-gridsearchcv

# np.logspace(..., 2) yields only the two endpoint values, so the grid tries
# C in {1e-3, 1e2} and gamma in {1e-5, 1e2} for each of the four kernels.
test_regression_metrics, y_test_pred = gridsearch_model_evaluation(
    'Support Vector Regression',
    SVR(),
    {'reg__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
     'reg__C': np.logspace(-3, 2, 2),
     'reg__gamma': np.logspace(-5, 2, 2)},
)

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row on
# every pandas version.
df_test_regression_metrics = pd.concat(
    [df_test_regression_metrics, pd.DataFrame([test_regression_metrics])],
    ignore_index=True,
)

## AdaBoost Regression

In [None]:
# https://www.programcreek.com/python/example/86712/sklearn.ensemble.AdaBoostRegressor

# random_state is fixed so the boosting resamples, and therefore the reported
# metrics, are the same on every run.
# NOTE(review): newer scikit-learn releases name this argument `estimator`
# instead of `base_estimator` (the 'reg__base_estimator__max_depth' grid key
# would change with it) — confirm against the installed version.
ada = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), random_state=0)
test_regression_metrics, y_test_pred = gridsearch_model_evaluation(
    'AdaBoost Regression',
    ada,
    {'reg__n_estimators': [25, 50],
     'reg__base_estimator__max_depth': [1, 2],
     'reg__learning_rate': np.arange(0.4, 1.1, 0.1)},
)

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row on
# every pandas version.
df_test_regression_metrics = pd.concat(
    [df_test_regression_metrics, pd.DataFrame([test_regression_metrics])],
    ignore_index=True,
)

## XGBoost

In [None]:
# https://www.datacamp.com/community/tutorials/xgboost-in-python
# https://www.kaggle.com/phunter/xgboost-with-gridsearchcv
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

# NOTE(review): 'reg:linear' is the legacy name of the squared-error objective;
# recent XGBoost releases expect 'reg:squarederror' — confirm against the
# installed version.
# The unfinished 'reg_' entry that ended the parameter grid has been removed:
# a bare string inside a dict literal is a SyntaxError, so the cell could not run.
test_regression_metrics, y_test_pred = gridsearch_model_evaluation(
    'XGBoost Regression',
    XGBRegressor(
        objective='reg:linear',
        nthread=12,
        eval_metric='rmse'
    ),
    {
        'reg__learning_rate': [.03, 0.05, .07],  # so called `eta` value
        'reg__max_depth': [5, 6, 7],
        'reg__min_child_weight': [4],
        'reg__subsample': [0.7],
        'reg__colsample_bytree': [0.7],
        'reg__n_estimators': [500],
    },
)

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row on
# every pandas version.
df_test_regression_metrics = pd.concat(
    [df_test_regression_metrics, pd.DataFrame([test_regression_metrics])],
    ignore_index=True,
)

In [None]:
# model_names = [
#                'Linear Regression',
#                'Ridge Regression',
#                'Lasso Regression'
# ]

# models = [
#          LinearRegression(),
#          Ridge(),
#          Lasso()
         
#         ]

# hyper_params = [
#                 {},
#                 {
#                     'clf__alpha': np.logspace(0, 5, 10)
#                 },
#                 {
#                     'clf__alpha': np.arange(0.001, 0.15, 0.0025)
#                     'clf__alpha': np.arange(0.001, 0.005, 0.0025)
#                 }
# ]

In [None]:
# # df_test_regression_metrics = pd.DataFrame()

# for model_name, model, hyper_param in zip(model_names, models, hyper_params):
    