### Load Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import operator
import pickle
from datetime import datetime
from scipy.stats import shapiro, normaltest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

### Load & Prepare Data

In [2]:
def load_dataset():
    df = pd.read_csv('../data/kc_house_data.csv')
    df = df.drop('id',axis=1)
    df['date'] = pd.to_datetime(df['date'])
    df['month'] = df['date'].apply(lambda date:date.month)
    df['year'] = df['date'].apply(lambda date:date.year)
    df = df.drop('date',axis=1)
    df = df.drop('zipcode',axis=1)
    
    X = df.drop('price',axis=1)
    y = df['price']
    
    return X, y

### Regression Models

In [3]:
def get_models():
    models = []
    models.append(('LR', LinearRegression()))
    models.append(('LASSO', Lasso()))
    models.append(('EN', ElasticNet()))
    models.append(('KNN', KNeighborsRegressor()))
    models.append(('CART', DecisionTreeRegressor()))
    models.append(('SVR', SVR(gamma='auto')))
    return models

### Ensemble Models

In [4]:
def get_ensemble_models():
    ensembles = []
    ensembles.append(('AB', AdaBoostRegressor()))
    ensembles.append(('GBM', GradientBoostingRegressor())) 
    ensembles.append(('RF', RandomForestRegressor()))
    ensembles.append(('ET', ExtraTreesRegressor()))
    ensembles.append(('XGB', XGBRegressor()))
    return ensembles

### Model Evaluation Function

In [5]:
# evaluate a model
def evaluate_model(X, y, model):
    # define evaluation procedure
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    # evaluate model
    scores = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=cv, n_jobs=-1)  
    return scores

In [6]:
def display_scores(scores, name=''):
    rmse_scores = np.sqrt(-scores)
    #print("Scores:", rmse_scores)
    print(f"RMSE Mean {name}: {rmse_scores.mean()}")
    print(f"RMSE SD {name}: {rmse_scores.std()}")

### Load the Dataset

In [7]:
X, y = load_dataset()
X.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15,month,year
0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,47.5112,-122.257,1340,5650,10,2014
1,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,47.721,-122.319,1690,7639,12,2014
2,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,47.7379,-122.233,2720,8062,2,2015
3,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,47.5208,-122.393,1360,5000,12,2014
4,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,47.6168,-122.045,1800,7503,2,2015


In [8]:
print(X.shape, y.shape)
print(X.shape[1])

(21597, 19) (21597,)
19


In [9]:
X.columns

Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'lat', 'long',
       'sqft_living15', 'sqft_lot15', 'month', 'year'],
      dtype='object')

### Object & Numeric Column Transfer

In [10]:
# determine categorical and numerical features
#numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns
#categorical_ix = X.select_dtypes(include=['object', 'bool']).columns

# IT SHOULD BE DON WHILE DATA PRE-PROCESSING

# define the data preparation for the columns
#t = [('cat', OneHotEncoder(), categorical_ix)]
#cat_col_transform = ColumnTransformer(transformers=t, remainder='passthrough')

### Normality Test

In [11]:
def is_normal(data):
    alpha = 0.05
    stat, p = normaltest(data)
    if p > alpha:
        normalTest = True
    else:
        normalTest = False
    
    stat, p = shapiro(data)
    if p > alpha:
        shapiroTest = True
    else:
        shapiroTest = False
        
    return normalTest and shapiroTest

In [12]:
Gaussian_Like = []
Non_Gaussian = []

for i, name in enumerate (X.columns):
    if is_normal(X[name]):
        Gaussian_Like.append(name)
    else:
        Non_Gaussian.append(name)
        
print (f"Gaussian Like columns: {Gaussian_Like}")
print (f"Non-Gaussian Like columns: {Non_Gaussian}")

Gaussian Like columns: []
Non-Gaussian Like columns: ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'month', 'year']




In [13]:
# Standardize Only Gaussian-Like Input Variables
# Normalize Only Non-Gaussian Input Variables

In [14]:
if len(Gaussian_Like)> 0 and len(Non_Gaussian) > 0:
    print("Normality Function is working")

### Spot Check Algorithms

In [15]:
init_time = datetime.now()
models = get_models()
for name, model in models:
    scores = evaluate_model(X, y, model)
    display_scores(scores, name)
fin_time = datetime.now()
print("Execution time : ", (fin_time-init_time))

RMSE Mean LR: 202148.7774280055
RMSE SD LR: 12936.354102455933
RMSE Mean LASSO: 202148.73899318252
RMSE SD LASSO: 12936.486855114108
RMSE Mean EN: 226051.84792081185
RMSE SD EN: 12663.621397224997
RMSE Mean KNN: 256648.30884374006
RMSE SD KNN: 16594.000541308156
RMSE Mean CART: 180010.62248766955
RMSE SD CART: 15101.962505019203
RMSE Mean SVR: 377716.3508651177
RMSE SD SVR: 20886.600878293215
Execution time :  0:06:10.969464


### Spot  Check Algorithms with Normalization

In [16]:
init_time = datetime.now()
models = get_models()
for name, model in models:
    # define pipeline
    pipeline = Pipeline(steps=[('norm', MinMaxScaler()), ('m', model)])
    scores = evaluate_model(X, y, pipeline)
    display_scores(scores, name)
fin_time = datetime.now()
print("Execution time : ", (fin_time-init_time))

RMSE Mean LR: 202145.2867384407
RMSE SD LR: 12943.15219668564
RMSE Mean LASSO: 202148.96255300718
RMSE SD LASSO: 12936.425663093607
RMSE Mean EN: 344150.09132223553
RMSE SD EN: 19991.857086988337
RMSE Mean KNN: 178421.54546147387
RMSE SD KNN: 14897.66028131097
RMSE Mean CART: 181904.84861288717
RMSE SD CART: 17154.193472721832
RMSE Mean SVR: 377657.8190079603
RMSE SD SVR: 20888.35441889303
Execution time :  0:04:05.610742


### Spot  Check Algorithms with Standardization

In [17]:
init_time = datetime.now()
models = get_models()
for name, model in models:
    # define pipeline
    pipeline = Pipeline(steps=[('Stnd', StandardScaler()), ('m', model)])
    scores = evaluate_model(X, y, pipeline)
    display_scores(scores, name)
fin_time = datetime.now()
print("Execution time : ", (fin_time-init_time))

RMSE Mean LR: 202154.91237409425
RMSE SD LR: 12962.970109882714
RMSE Mean LASSO: 202148.75997957124
RMSE SD LASSO: 12936.306065518083
RMSE Mean EN: 211384.64692688358
RMSE SD EN: 15067.324752263878
RMSE Mean KNN: 174557.6925252656
RMSE SD KNN: 13264.484181634602
RMSE Mean CART: 183082.97083737174
RMSE SD CART: 16967.704233080956
RMSE Mean SVR: 377142.5577756564
RMSE SD SVR: 20922.33359715182
Execution time :  0:03:33.923775


### Spot Check Ensemble Algorithms

In [18]:
init_time = datetime.now()
models = get_ensemble_models()
for name, model in models:
    scores = evaluate_model(X, y, model)
    display_scores(scores, name)
fin_time = datetime.now()
print("Execution time : ", (fin_time-init_time))

RMSE Mean AB: 350047.7043867652
RMSE SD AB: 25434.634143770087
RMSE Mean GBM: 134865.46673956417
RMSE SD GBM: 10054.581996241734
RMSE Mean RF: 127266.23241093171
RMSE SD RF: 11035.061423551599
RMSE Mean ET: 127482.3896872711
RMSE SD ET: 11414.98254748831
RMSE Mean XGB: 120784.22063826791
RMSE SD XGB: 10614.733343791724
Execution time :  0:05:23.735842


### Spot Check Ensemble Algorithms with Normalization

In [66]:
init_time = datetime.now()
models = get_ensemble_models()
for name, model in models:
    # define pipeline
    pipeline = Pipeline(steps=[('norm', MinMaxScaler()), ('m', model)])
    scores = evaluate_model(X, y, pipeline)
    display_scores(scores, name)
fin_time = datetime.now()
print("Execution time : ", (fin_time-init_time))

RMSE Mean AB: 343356.1299673539
RMSE SD AB: 25251.873982202575
RMSE Mean GBM: 135047.7803414584
RMSE SD GBM: 10182.124077718094
RMSE Mean RF: 126596.66284441769
RMSE SD RF: 10809.758322968812
RMSE Mean ET: 127517.91862967004
RMSE SD ET: 11470.729139705722
RMSE Mean XGB: 120651.20252651487
RMSE SD XGB: 10521.786036697406
Execution time :  0:05:56.099640


### Spot Check Ensemble Algorithms with Standardization

In [67]:
init_time = datetime.now()
models = get_ensemble_models()
for name, model in models:
    # define pipeline
    pipeline = Pipeline(steps=[('Stnd', StandardScaler()), ('m', model)])
    scores = evaluate_model(X, y, pipeline)
    display_scores(scores, name)
fin_time = datetime.now()
print("Execution time : ", (fin_time-init_time))

RMSE Mean AB: 344721.44724643056
RMSE SD AB: 18986.009189849035
RMSE Mean GBM: 135068.75384956726
RMSE SD GBM: 10233.561335932562
RMSE Mean RF: 126905.5855872733
RMSE SD RF: 11039.192063338774
RMSE Mean ET: 127626.69448219675
RMSE SD ET: 12362.584634625216
RMSE Mean XGB: 120788.18884176762
RMSE SD XGB: 10616.407416417767
Execution time :  0:05:43.918308


### XGB with Normalization & Standardization

In [68]:
init_time = datetime.now()
model = XGBRegressor()
# define pipeline
pipeline = Pipeline(steps=[('Stnd', StandardScaler()), ('norm', MinMaxScaler()), ('m', model)])
scores = evaluate_model(X, y, pipeline)
display_scores(scores)
fin_time = datetime.now()
print("Execution time : ", (fin_time-init_time))

RMSE Mean : 120651.20252651487
RMSE SD : 10521.786036697406
Execution time :  0:00:25.712544


### Normalize Only Non-Gaussian Input Variables

In [None]:
if len(Gaussian_Like)> 0 and len(Non_Gaussian) > 0:
    init_time = datetime.now()
    # define the selective transforms
    t = [('e', MinMaxScaler(), Non_Gaussian)]
    norm_transform = ColumnTransformer(transformers=t, remainder='passthrough')
    # define pipeline
    model = XGBRegressor()
    # define pipeline
    pipeline = Pipeline(steps=[('s', norm_transform), ('m', model)])
    scores = evaluate_model(X, y, pipeline)
    display_scores(scores)
    fin_time = datetime.now()
    print("Execution time : ", (fin_time-init_time))

### Standardize Only Gaussian-Like Input Variables

In [None]:
if len(Gaussian_Like)> 0 and len(Non_Gaussian) > 0:
    init_time = datetime.now()
    # define the selective transforms
    t = [('n', StandardScaler(), Gaussian_Like)]
    stnd_transform = ColumnTransformer(transformers=t, remainder='passthrough')
    # define pipeline
    model = XGBRegressor()
    # define pipeline
    pipeline = Pipeline(steps=[('s', stnd_transform), ('m', model)])
    scores = evaluate_model(X, y, pipeline)
    display_scores(scores)
    fin_time = datetime.now()
    print("Execution time : ", (fin_time-init_time))

### XGB with Selectively Normalize and Standardize Input Variables

In [None]:
if len(Gaussian_Like)> 0 and len(Non_Gaussian) > 0:
    init_time = datetime.now()
    # define the selective transforms
    t = [('e', MinMaxScaler(), Non_Gaussian), ('n', StandardScaler(), Gaussian_Like)]
    selective = ColumnTransformer(transformers=t)
    # define pipeline
    model = XGBRegressor()
    # define pipeline
    pipeline = Pipeline(steps=[('s', selective), ('m', model)])
    scores = evaluate_model(X, y, pipeline)
    display_scores(scores)
    fin_time = datetime.now()
    print("Execution time : ", (fin_time-init_time))

### XGB with PowerTransformer

In [30]:
init_time = datetime.now()
model = XGBRegressor()
# define pipeline
pipeline = Pipeline(steps=[('t', PowerTransformer()), ('m', model)])
scores = evaluate_model(X, y, pipeline)
display_scores(scores)
fin_time = datetime.now()
print("Execution time : ", (fin_time-init_time))

RMSE Mean: 120835.38353046724
RMSE SD: 9856.333672781506
Execution time :  0:00:35.081420


### XGB with PCA & Outlier Removal

In [23]:
init_time = datetime.now()
# get a list of models to evaluate
def get_models():
    models = dict()
    for i in range(1,20):
        #steps = [('pca', PCA(n_components=i)), ('m', XGBRegressor())]
        steps = [('svd', TruncatedSVD(n_components=i)), ('m', XGBRegressor())]
        models[str(i)] = Pipeline(steps=steps)
    return models

# get the models to evaluate
models = get_models() 
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(X, y, model)
    scores = np.sqrt(-scores)
    results.append(scores.mean())
    names.append(name)

minIndex = results.index(min(results))
print(f"RMSE of SVD value {names[minIndex]} is {results[minIndex]}")
print(results)
print(names)
fin_time = datetime.now()
print("Execution time : ", (fin_time-init_time))
best_n_components = int(names[minIndex])
print(best_n_components)
#RMSE of PCA value 18 is 153401.49704230417

RMSE of SVD value 18 is 156075.45787923344
[366130.19229220535, 360130.5543315672, 254319.91156130133, 253508.49515443415, 236552.1307046798, 233271.04619099796, 231821.95428368897, 221233.67172203813, 219728.49963490356, 204976.57683429297, 200407.09060808478, 196228.44623024564, 195671.43706768172, 195575.0672501937, 195796.2543774463, 168123.55253226738, 163228.67825907172, 156075.45787923344, nan]
['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19']
Execution time :  0:11:25.416230
18


In [24]:
init_time = datetime.now()

# define transform
#pca = PCA(n_components=best_n_components)
svd = TruncatedSVD(n_components=best_n_components)
# prepare transform on dataset
#pca.fit(X)
svd.fit(X)
# apply transform to dataset
# X_transformed = pca.transform(X)
X_transformed = svd.transform(X)

# identify outliers in the training dataset
lof = LocalOutlierFactor()
yhat = lof.fit_predict(X_transformed)

# select all rows that are not outliers
mask = yhat != -1
X_transformed, y_transformed = X_transformed[mask, :], y[mask]

# fit the model
model = XGBRegressor()

# evaluate the model
scores = evaluate_model(X_transformed, y_transformed, model)
display_scores(scores)

fin_time = datetime.now()
print("Execution time : ", (fin_time-init_time))

# PCA RMSE Mean : 143220.67129159492

RMSE Mean : 146503.68059951623
RMSE SD: 5925.882450907686
Execution time :  0:00:59.039002


In [25]:
init_time = datetime.now()

# identify outliers in the training dataset
lof = LocalOutlierFactor()
yhat = lof.fit_predict(X)

# select all rows that are not outliers
mask = yhat != -1
X_transformed, y_transformed = X[mask, :], y[mask]

# fit the model
model = XGBRegressor()

# evaluate the model
scores = evaluate_model(X_transformed, y_transformed, model)
display_scores(scores)

fin_time = datetime.now()
print("Execution time : ", (fin_time-init_time))

TypeError: '(array([ True,  True, False, ...,  True,  True,  True]), slice(None, None, None))' is an invalid key

### XGB with Feature Union (PCA + TruncatedSVD)

In [19]:
# transforms for the feature union
transforms = list()
transforms.append(('norm', MinMaxScaler()))
transforms.append(('Stnd', StandardScaler()))
transforms.append(('pca', PCA()))
transforms.append(('svd', TruncatedSVD()))

# create the feature union
fu = FeatureUnion(transforms)

In [20]:
init_time = datetime.now()
model = XGBRegressor()
# define pipeline
steps = list()
steps.append(('fu', fu))
steps.append(('m', model))
pipeline = Pipeline(steps=steps)
scores = evaluate_model(X, y, pipeline)
display_scores(scores)
fin_time = datetime.now()
print("Execution time : ", (fin_time-init_time))

RMSE Mean : 129434.03817500819
RMSE SD : 10857.366984540666
Execution time :  0:02:27.711553


### XGB with Normalization and PowerTransformer

In [34]:
init_time = datetime.now()
model = XGBRegressor()
# define pipeline
steps = list()
steps.append(('norm', MinMaxScaler()))
steps.append(('power', PowerTransformer()))
steps.append(('m', model))
pipeline = Pipeline(steps=steps)
scores = evaluate_model(X, y, pipeline)
display_scores(scores)
fin_time = datetime.now()
print("Execution time : ", (fin_time-init_time))

RMSE Mean: 120431.06133167246
RMSE SD: 10085.271522888474
Execution time :  0:00:33.817571


### XGB with Normalization & PCA

In [35]:
init_time = datetime.now()
model = XGBRegressor()
# define pipeline
steps = list()
steps.append(('norm', MinMaxScaler()))
steps.append(('pca', PCA()))
steps.append(('m', model))
pipeline = Pipeline(steps=steps)
scores = evaluate_model(X, y, pipeline)
display_scores(scores)
fin_time = datetime.now()
print("Execution time : ", (fin_time-init_time))

RMSE Mean: 167931.7710757437
RMSE SD: 13606.865201620762
Execution time :  0:01:12.338696


### XGB with Normalization, PCA and PowerTransformer

In [37]:
init_time = datetime.now()
model = XGBRegressor()
# define pipeline
steps = list()
steps.append(('norm', MinMaxScaler()))
steps.append(('power', PowerTransformer()))
steps.append(('pca', PCA()))
steps.append(('m', model))
pipeline = Pipeline(steps=steps)
scores = evaluate_model(X, y, pipeline)
display_scores(scores)
fin_time = datetime.now()
print("Execution time : ", (fin_time-init_time))

RMSE Mean: 164418.13908667807
RMSE SD: 14092.657334348532
Execution time :  0:01:15.276995


### XGB with Grid Search

#### Train, Test Validation Data Split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [22]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.20, random_state=42)

#### Data Scaling & Transforming

In [96]:
scaler = MinMaxScaler()
transformer = PowerTransformer()

scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_valid_scaled = scaler.transform(X_valid)

transformer.fit(X_train_scaled)
X_train_transformed = transformer.transform(X_train_scaled)
X_test_transformed = transformer.transform(X_test_scaled)
X_valid_transformed = transformer.transform(X_valid_scaled)

#### Randomized Search Function 

In [98]:
def get_RandomizedSearch_model(parameters):
    xgb_reg = XGBRegressor(objective='reg:squarederror')
    xgb_grid = RandomizedSearchCV(xgb_reg,
                        parameters,
                        scoring='neg_mean_squared_error',
                        cv = 2,
                        n_jobs = 10,
                        verbose=False)
    xgb_grid.fit(X_train, y_train)
    return xgb_grid

#### First Grid Search Experiment

In [76]:
parameters = {
                'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
                'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
                'learning_rate': [0.0001, 0.001, 0.01, 0.02, 0.1, 0.2, 0.3], 
                'subsample' : [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0],
                'colsample_bytree' : [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0],
                'colsample_bylevel' : [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0],
                'gamma': [0],
                'reg_lambda': [0, 1.0, 3.0, 5.0, 7.0, 10.0, 12.0]
              }

In [77]:
init_time = datetime.now()
results = dict()
for _ in range(10):
  xgb_grid = get_RandomizedSearch_model(parameters)
  xgb_predictions = xgb_grid.predict(X_test)
  rmse = np.sqrt(metrics.mean_squared_error(y_test, xgb_predictions))
  results[rmse] = xgb_grid.best_params_

fin_time = datetime.now()
print("Execution time : ", (fin_time-init_time))
min_error = min(results.keys())
print(f"Minimum Error: {min_error}")
print(f"Parameters: {results[min_error]}")

#Minimum Error: 117461.46650171727

Execution time :  0:05:05.905865
Minimum Error: 119579.95917679991
Parameters: {'subsample': 0.8, 'reg_lambda': 3.0, 'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.7, 'colsample_bylevel': 0.8}


#### Grid Search Function

In [110]:
def get_GridSearch_model(parameters):
    init_time = datetime.now()
    xgb_reg = XGBRegressor(objective='reg:squarederror')
    # define pipeline
    steps = list()
    steps.append(('norm', MinMaxScaler()))
    steps.append(('power', PowerTransformer()))
    steps.append(('xgb', xgb_reg))
    pipeline = Pipeline(steps=steps)
    xgb_grid = GridSearchCV(pipeline,
                        parameters,
                        scoring='neg_mean_squared_error',
                        cv = 5,
                        n_jobs = 10,
                        verbose=False)
    xgb_grid.fit(X_train, y_train)
    xgb_predictions = xgb_grid.predict(X_test)
    print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, xgb_predictions)))
    #print(f"Param: {xgb_grid.best_params_}")
    fin_time = datetime.now()
    print("Execution time : ", (fin_time-init_time))
    
    return xgb_grid

In [111]:
parameters = {
    'xgb__subsample': [0.8], 
    'xgb__reg_lambda': [9.0], 
    'xgb__n_estimators': [900], 
    'xgb__max_depth': [5], 
    'xgb__learning_rate': [0.1], 
    'xgb__gamma': [0], 
    'xgb__colsample_bytree': [0.6], 
    'xgb__colsample_bylevel': [0.9]
}
model = get_GridSearch_model(parameters)

#RMSE: 114726.49126301783
#Param: {'colsample_bylevel': 0.9, 'colsample_bytree': 0.6, 'gamma': 0, 'learning_rate': 0.1, 
#'max_depth': 5, 'n_estimators': 900, 'reg_lambda': 9.0, 'subsample': 0.8 

#RMSE: 114439.38493937258 with Norm + Power

RMSE: 114439.38493937258
Execution time :  0:00:21.328859


In [102]:
avgValue = np.mean(y_test)
print(avgValue)


535407.6787037037


#### Save Model

In [112]:
# write the actual face recognition model to disk
f = open('best_param_model', "wb")
f.write(pickle.dumps(model.best_estimator_))
f.close()

In [105]:
# write the label encoder to disk
#le = LabelEncoder()
#labels = le.fit_transform(data["names"])
#f = open(args["le"], "wb")
#f.write(pickle.dumps(le))
#f.close()

#### Load Model

In [113]:
model = pickle.loads(open('best_param_model', "rb").read())
#le = pickle.loads(open(args["le"], "rb").read())
predictions = model.predict(X_test)
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))

RMSE: 114439.38493937258
