# <center>XGB for 25 X 25 GRID<center> 

## All Imports 

In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.decomposition import PCA
import math
from numpy import mean
from numpy import std

## Reading and Transforming data

In [71]:
dataset = pd.read_csv("Table_25x25_Final.csv")
dataTypeSeries = dataset.dtypes
dataset = dataset.sample(frac=1, random_state=2)
print(dataTypeSeries)

OBJECTID                             int64
Id                                   int64
gridcode                             int64
point_X                            float64
point_Y                            float64
                                    ...   
Mean for elevation                 float64
Mean for Distance_Faults_MEAN      float64
Mean for Distance_Faults_STD       float64
Mean for Distance_QAFaults_MEAN    float64
Mean for Distance_QAFaults_STD     float64
Length: 250, dtype: object


In [72]:
selected_rows = dataset[~dataset["Grad_Geot"].isnull()]

In [73]:
X = pd.DataFrame((selected_rows.drop(['Id', "gridcode", "Shape_Length", "Shape_Area", "HFD", "Grad_Geot", "Count of Points"], axis=1)))
Y = (selected_rows["Grad_Geot"])

### Outliers in label

In [74]:
Y_mean, Y_std = mean(Y), std(Y)
# identify outliers
cut_off = Y_std * 3
lower, upper = Y_mean - cut_off, Y_mean + cut_off
# identify outliers
outliers = [x for x in Y if x < lower or x > upper]
print('Identified outliers: %d' % len(outliers))
# remove outliers
outliers_removed = [x for x in Y if x >= lower and x <= upper]
print('Non-outlier observations: %d' % len(outliers_removed))
print(outliers)

Identified outliers: 3
Non-outlier observations: 114
[41.0, 48.0, 42.0]


### Removing Outliers From Data

In [75]:
FinalData = selected_rows[~selected_rows['Grad_Geot'].isin(outliers)]

In [76]:
X = pd.DataFrame((FinalData.drop(['Id', "gridcode", "Shape_Length", "Shape_Area", "HFD", "Grad_Geot", "Count of Points" ], axis=1)))
Y = (FinalData["Grad_Geot"])

In [77]:
Y.shape

(114,)

In [78]:
Y.describe()

count    114.000000
mean      21.405946
std        5.338412
min       10.000000
25%       17.245000
50%       20.800000
75%       24.469950
max       37.000000
Name: Grad_Geot, dtype: float64

min_max_scaler = MinMaxScaler()
min_max_scaler.fit(X)
min_max_scaler.transform(X)

## Model

In [79]:
Regressor = XGBRegressor(gamma=10,
 learning_rate=0.01, max_depth=5, n_estimators=500, random_state=2, reg_lambda=0.1)
Regressor.fit(X, Y)
pred_Regressor = Regressor.predict(X)

### K fold CV

In [80]:
k = 5
kf = KFold(n_splits=k)
 
result = cross_val_score(Regressor , X, Y, cv = kf)
 
print("Avg accuracy: {}".format(result.mean()))

Avg accuracy: -0.06912646659358908


In [81]:
k = 5
kf = KFold(n_splits=k)
result = cross_val_score(Regressor , X, Y, cv = kf, scoring='neg_mean_absolute_error')
print (result.mean())

-4.356813203828383


In [82]:
k = 5
kf = KFold(n_splits=k)
result = cross_val_score(Regressor , X, Y, cv = kf, scoring='neg_mean_squared_error')
print (result.mean())
rmse = math.sqrt(-result.mean())
rmse

-29.378654477297637


5.420207973620352

### Grid Search CV

In [32]:
n_estimators = [100, 200, 250, 500]
learning_rate = [0.01,0.05,0.1,0.2]
random_state = [2,3,4]
reg_lambda = [0.1,1,10]
gamma = [0.1,1,10]
max_depth = [5, 8, 10, 12, 15]

In [33]:
param_grid = {'n_estimators' : n_estimators, 'random_state' : random_state,
              'gamma' : gamma, 'max_depth' : max_depth, 'learning_rate' : learning_rate, 'reg_lambda' : reg_lambda}

In [34]:
rf_grid = GridSearchCV(estimator=Regressor, param_grid=param_grid, cv=5)
rf_grid.fit(X, Y)

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1,
                                    enable_categorical=False, gamma=0,
                                    gpu_id=-1, importance_type=None,
                                    interaction_constraints='',
                                    learning_rate=0.300000012, max_delta_step=0,
                                    max_depth=6, min_child_weight=1,
                                    missing=nan, monotone_constraints='()',
                                    n_estimators=100, n_jobs=4,
                                    num_parallel_tree=1, predictor='auto',
                                    random_state=0, reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, subsample=1,
                                    tree_method='exact', vali

In [35]:
rf_grid.best_params_

{'gamma': 10,
 'learning_rate': 0.01,
 'max_depth': 5,
 'n_estimators': 500,
 'random_state': 2,
 'reg_lambda': 0.1}

In [87]:
import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(Regressor, random_state=1).fit(X, Y)
eli5.show_weights(perm, feature_names = X.columns.tolist(), top=40)

Weight,Feature
0.2988  ± 0.0919,Bouguer_MEAN
0.2240  ± 0.0721,Fault_STD
0.0962  ± 0.0513,Mean for Fault_STD
0.0713  ± 0.0323,Fault_MAX
0.0653  ± 0.0193,Distance_Faults_STD
0.0559  ± 0.0134,Distance_Faults_MEAN
0.0542  ± 0.0038,Mean for Distance_QAFaults_STD
0.0511  ± 0.0122,Mean_Distance_Faults_MEAN
0.0472  ± 0.0111,Mean for Drain_STD
0.0396  ± 0.0119,OBJECTID


### Select From Model 

In [83]:
model = SelectFromModel(Regressor, prefit=True)
X_new = model.transform(X)



In [84]:
X_new.shape

(114, 65)

In [85]:
Regressor_new = XGBRegressor(gamma=10,
 learning_rate=0.2, max_depth=5, n_estimators=100, random_state=2, reg_lambda=0.1)
Regressor_new.fit(X_new, Y)
predict_new = Regressor_new.predict(X_new)

In [86]:
k = 5
kf = KFold(n_splits=k)
 
result = cross_val_score(Regressor_new , X_new, Y, cv = kf)
 
print("Avg accuracy: {}".format(result.mean()))

Avg accuracy: -0.04385809455400078


In [87]:
k = 5
kf = KFold(n_splits=k)
 
result = cross_val_score(Regressor_new , X_new, Y, cv = kf, scoring='neg_mean_absolute_error')
 
print("Avg accuracy: {}".format(result.mean()))

Avg accuracy: -4.275029736592616


In [88]:
k = 5
kf = KFold(n_splits=k)
 
result = cross_val_score(Regressor_new , X_new, Y, cv = kf, scoring='neg_mean_squared_error')
 
print("Avg accuracy: {}".format(result.mean()))
rmse = math.sqrt(-result.mean())
rmse

Avg accuracy: -28.52190183756533


5.340590027100501

In [48]:
rf_grid = GridSearchCV(estimator=Regressor_new, param_grid=param_grid, cv=5)
rf_grid.fit(X_new, Y)

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1,
                                    enable_categorical=False, gamma=0,
                                    gpu_id=-1, importance_type=None,
                                    interaction_constraints='',
                                    learning_rate=0.300000012, max_delta_step=0,
                                    max_depth=6, min_child_weight=1,
                                    missing=nan, monotone_constraints='()',
                                    n_estimators=100, n_jobs=4,
                                    num_parallel_tree=1, predictor='auto',
                                    random_state=0, reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, subsample=1,
                                    tree_method='exact', vali

In [49]:
rf_grid.best_params_

{'gamma': 10,
 'learning_rate': 0.2,
 'max_depth': 5,
 'n_estimators': 100,
 'random_state': 2,
 'reg_lambda': 0.1}

## Heat flow density

In [89]:
selected_rows = dataset[~dataset["Grad_Geot"].isnull()]

In [103]:
XX = pd.DataFrame((selected_rows.drop(['OBJECTID', "gridcode", "Shape_Length", "Shape_Area", "HFD", "Grad_Geot", "Count of Points"], axis=1)))
YY = (selected_rows["HFD"])

In [104]:
YY_mean, YY_std = mean(YY), std(YY)
# identify outliers
cut_off = YY_std * 3
lower, upper = YY_mean - cut_off, YY_mean + cut_off
# identify outliers
outliers = [x for x in YY if x < lower or x > upper]
print('Identified outliers: %d' % len(outliers))
# remove outliers
outliers_removed = [x for x in YY if x >= lower and x <= upper]
print('Non-outlier observations: %d' % len(outliers_removed))
print(outliers)

Identified outliers: 2
Non-outlier observations: 115
[183.84, 139.5]


In [105]:
selected_rows = dataset[~dataset["HFD"].isnull()]
FinalData = selected_rows[~selected_rows['HFD'].isin(outliers)]

In [106]:
XX = pd.DataFrame((FinalData.drop(['OBJECTID', "gridcode", "Shape_Length", "Shape_Area", "HFD", "Grad_Geot", "Count of Points"], axis=1)))
YY = (FinalData["HFD"])

In [107]:
YY.shape

(115,)

In [108]:
YY.describe()

count    115.000000
mean      62.150376
std       18.538075
min       25.000000
25%       50.332900
50%       60.000000
75%       73.306000
max      114.863000
Name: HFD, dtype: float64

In [109]:
Regressor2 = XGBRegressor(gamma=0.1,
 learning_rate=0.05, max_depth=5, n_estimators=100, random_state=2, reg_lambda=1)
Regressor2.fit(XX, YY)
pred_Regressor = Regressor2.predict(XX)

In [110]:
k = 5
kf = KFold(n_splits=k)
result = cross_val_score(Regressor2 , XX, YY, cv = kf)
print (result.mean())

0.2097214122263976


In [111]:
k = 5
kf = KFold(n_splits=k)
result = cross_val_score(Regressor2 , XX, YY, cv = kf, scoring='neg_mean_absolute_error')
print (result.mean())

-12.916843949593808


In [112]:
k = 5
kf = KFold(n_splits=k)
result = cross_val_score(Regressor2 , XX, YY, cv = kf, scoring='neg_mean_squared_error')
print (result.mean())
rmse = math.sqrt(-result.mean())
rmse

-268.7441429622854


16.393417671806127

In [67]:
param_grid = {'n_estimators' : n_estimators, 'random_state' : random_state,
              'gamma' : gamma, 'max_depth' : max_depth, 'learning_rate' : learning_rate, 'reg_lambda' : reg_lambda}

In [68]:
rf_grid = GridSearchCV(estimator=Regressor2, param_grid=param_grid, cv=5)
rf_grid.fit(XX, YY)

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1,
                                    enable_categorical=False, gamma=0,
                                    gpu_id=-1, importance_type=None,
                                    interaction_constraints='',
                                    learning_rate=0.300000012, max_delta_step=0,
                                    max_depth=6, min_child_weight=1,
                                    missing=nan, monotone_constraints='()',
                                    n_estimators=100, n_jobs=4,
                                    num_parallel_tree=1, predictor='auto',
                                    random_state=0, reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, subsample=1,
                                    tree_method='exact', vali

In [69]:
rf_grid.best_params_

{'gamma': 0.1,
 'learning_rate': 0.05,
 'max_depth': 5,
 'n_estimators': 100,
 'random_state': 2,
 'reg_lambda': 1}

### Select from model 

In [113]:
model2 = SelectFromModel(Regressor2, prefit=True)
X_new2 = model2.transform(XX)



In [114]:
Regressor_new2 = XGBRegressor(gamma=10,
 learning_rate=0.01, max_depth=5, n_estimators=500, random_state=2, reg_lambda=1)
Regressor_new2.fit(X_new2, YY)
predict_new2 = Regressor_new2.predict(X_new2)

In [115]:
k = 5
kf2 = KFold(n_splits=k)
 
result = cross_val_score(Regressor_new2 , X_new2, YY, cv = kf2)
 
print("Avg accuracy: {}".format(result.mean()))

Avg accuracy: 0.19499734222258752


In [116]:
k = 5
kf2 = KFold(n_splits=k)
 
result = cross_val_score(Regressor_new2 , X_new2, YY, cv = kf2, scoring='neg_mean_absolute_error')
 
print("Avg accuracy: {}".format(result.mean()))

Avg accuracy: -12.930981714857907


In [117]:
k = 5
kf2 = KFold(n_splits=k)
 
result = cross_val_score(Regressor_new2 , X_new2, YY, cv = kf2, scoring='neg_mean_squared_error')
 
print("Avg accuracy: {}".format(result.mean()))
rmse = math.sqrt(-result.mean())
rmse

Avg accuracy: -276.65090814922394


16.63282622254029

In [80]:
rf_grid = GridSearchCV(estimator=Regressor_new2, param_grid=param_grid, cv=5)
rf_grid.fit(X_new2, YY)

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1,
                                    enable_categorical=False, gamma=0,
                                    gpu_id=-1, importance_type=None,
                                    interaction_constraints='',
                                    learning_rate=0.300000012, max_delta_step=0,
                                    max_depth=6, min_child_weight=1,
                                    missing=nan, monotone_constraints='()',
                                    n_estimators=100, n_jobs=4,
                                    num_parallel_tree=1, predictor='auto',
                                    random_state=0, reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, subsample=1,
                                    tree_method='exact', vali

In [81]:
rf_grid.best_params_

{'gamma': 10,
 'learning_rate': 0.01,
 'max_depth': 5,
 'n_estimators': 500,
 'random_state': 2,
 'reg_lambda': 1}

## Predict Full Data

In [121]:
full_data = pd.read_csv("Table_25x25_Final.csv")
X_full = (full_data.drop(['OBJECTID', "gridcode", "Shape_Length", "Shape_Area", "HFD", "Grad_Geot", "Count of Points"], axis=1))
Y_full = (full_data["HFD"])
X_full.shape

(14810, 243)

In [122]:
X_full = model2.transform(X_full)



In [123]:
predict_full = Regressor_new2.predict(X_full)
predict_full

array([55.462055, 48.3778  , 55.86551 , ..., 59.79841 , 62.43442 ,
       56.84914 ], dtype=float32)

In [124]:
prediction = pd.DataFrame(predict_full).to_excel("XGB_25x25_Selected_outliersout_hfd.xlsx")