# <center>Random Forest For 2,5x2,5 Grid size:</center>

## All Imports

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.decomposition import PCA
import math
from numpy import mean
from numpy import std
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest

## Reading and Transforming data

In [62]:
# Load the grid table and show the dtype of every column.
dataset = pd.read_csv("Table_25x25_Final.csv")
dataTypeSeries = dataset.dtypes
print(dataTypeSeries)
# Shuffle all rows with a fixed seed (the KFold splitters used below are created without shuffling).
dataset = dataset.sample(frac=1, random_state=2)

OBJECTID                             int64
Id                                   int64
gridcode                             int64
point_X                            float64
point_Y                            float64
                                    ...   
Mean for elevation                 float64
Mean for Distance_Faults_MEAN      float64
Mean for Distance_Faults_STD       float64
Mean for Distance_QAFaults_MEAN    float64
Mean for Distance_QAFaults_STD     float64
Length: 250, dtype: object


In [63]:
# Keep only the rows whose Grad_Geot value is present.
selected_rows = dataset[dataset["Grad_Geot"].notnull()]

In [94]:
# Features: every column except the ones listed here; label: Grad_Geot.
X = selected_rows.drop(
    columns=["Id", "gridcode", "Shape_Length", "Shape_Area", "HFD", "Grad_Geot", "Count of Points"]
)
Y = selected_rows["Grad_Geot"]

### Outliers in label

In [95]:
# Three-sigma rule on the label: anything further than 3 standard deviations
# from the mean is flagged as an outlier.
Y_mean = mean(Y)
Y_std = std(Y)
cut_off = 3 * Y_std
lower = Y_mean - cut_off
upper = Y_mean + cut_off
# Label values outside [lower, upper].
outliers = Y[(Y < lower) | (Y > upper)].tolist()
print('Identified outliers: %d' % len(outliers))
# Label values inside [lower, upper]; only their count is reported here.
outliers_removed = Y[(Y >= lower) & (Y <= upper)].tolist()
print('Non-outlier observations: %d' % len(outliers_removed))
print(outliers)

Identified outliers: 3
Non-outlier observations: 114
[41.0, 48.0, 42.0]


### Removing Outliers From Data

In [96]:
# Rebuild the labelled subset: rows whose Grad_Geot value is present.
selected_rows = dataset[dataset["Grad_Geot"].notnull()]

In [97]:
# Drop every row whose Grad_Geot value equals one of the flagged outlier values.
is_outlier = selected_rows['Grad_Geot'].isin(outliers)
FinalData = selected_rows[~is_outlier]

In [98]:
# Features and label rebuilt from the outlier-free rows.
X = FinalData.drop(
    columns=["Id", "gridcode", "Shape_Length", "Shape_Area", "HFD", "Grad_Geot", "Count of Points"]
)
Y = FinalData["Grad_Geot"]

In [99]:
# Number of label values left after the outlier rows were removed.
Y.shape

(114,)

In [100]:
# Summary statistics of the Grad_Geot label used for training.
Y.describe()

count    114.000000
mean      21.405946
std        5.338412
min       10.000000
25%       17.245000
50%       20.800000
75%       24.469950
max       37.000000
Name: Grad_Geot, dtype: float64

## MODEL - Geothermal Gradient 

In [101]:
# Random forest for the Grad_Geot label, fitted on every row of X.
Regressor = RandomForestRegressor(
    n_estimators=100,
    criterion='absolute_error',
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=2,
)
Regressor.fit(X, Y)
# In-sample predictions (made on the same rows the forest was fitted on).
pred_Regressor = Regressor.predict(X)

### Kfold

In [102]:
# 5-fold cross-validation; no `scoring` is passed, so the estimator's default scorer is used.
k = 5
kf = KFold(n_splits=k)
result = cross_val_score(Regressor, X, y=Y, cv=kf)
print(result.mean())

0.039130307389368735


In [103]:
# 5-fold cross-validation scored with the negated mean absolute error.
k = 5
kf = KFold(n_splits=k)
result = cross_val_score(Regressor, X, y=Y, scoring='neg_mean_absolute_error', cv=kf)
print(result.mean())

-4.058596305655069


In [104]:
# 5-fold cross-validation scored with the negated mean squared error.
k = 5
kf = KFold(n_splits=k)
result = cross_val_score(Regressor, X, y=Y, scoring='neg_mean_squared_error', cv=kf)
mean_neg_mse = result.mean()
print(mean_neg_mse)
# Flip the sign back before taking the root to obtain the RMSE.
rmse = math.sqrt(-mean_neg_mse)
rmse

-26.542367004527534


5.151928474321779

## Grid Search CV

In [30]:
# Candidate values for the grid search, one list per RandomForestRegressor argument.
criterion = ['absolute_error', 'squared_error']
n_estimators = [100, 200, 250, 500]
max_depth = [5, 8, 10, 12, 15]
min_samples_split = [2, 3]
min_samples_leaf = [1, 2]
# NOTE(review): the seed is searched like a hyperparameter - confirm this is intended.
random_state = [2, 3, 4]

In [31]:
# Map each estimator argument name to its list of candidate values.
param_grid = dict(
    n_estimators=n_estimators,
    criterion=criterion,
    random_state=random_state,
    min_samples_split=min_samples_split,
    max_depth=max_depth,
    min_samples_leaf=min_samples_leaf,
)

In [35]:
# Exhaustive search over param_grid with 5-fold cross-validation for the Grad_Geot forest.
rf_grid = GridSearchCV(
    estimator=Regressor,
    param_grid=param_grid,
    cv=5,
)
rf_grid.fit(X, Y)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid={'criterion': ['absolute_error', 'squared_error'],
                         'max_depth': [5, 8, 10, 12, 15],
                         'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 3],
                         'n_estimators': [100, 200, 250, 500],
                         'random_state': [2, 3, 4]})

In [36]:
# Parameter combination that scored best in the grid search fitted on X, Y.
rf_grid.best_params_

{'criterion': 'absolute_error',
 'max_depth': 5,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 100,
 'random_state': 2}

# Feature Selection

In [49]:
# Pair each feature column with the importance reported by the fitted forest.
df = pd.DataFrame(
    {
        'Feature_names': X.columns,
        'Importances': Regressor.feature_importances_,
    }
)
# Show the 100 most important features, highest first.
ranked = df.sort_values(by='Importances', ascending=False)
print(ranked.nlargest(100, 'Importances'))

                                        Feature_names  Importances
235                          Mean for Temperature_STD     0.088355
3                                        Bouguer_MEAN     0.059248
231                                Mean for Fault_STD     0.049859
241                   Mean for Distance_QAFaults_MEAN     0.044837
0                                            OBJECTID     0.036156
..                                                ...          ...
50                                     ZonaSeismic_05     0.000966
161  MAX_ONEHOT_Descricao1_Dev_nico_do_Bordo_Sudoeste     0.000942
49                                     ZonaSeismic_04     0.000896
40                                          Texture_1     0.000887
23                                       SlopeClass_6     0.000882

[100 rows x 2 columns]


In [156]:
import eli5
from eli5.sklearn import PermutationImportance

# Permutation importance of the fitted forest, computed on the same X, Y it was fitted on.
perm = PermutationImportance(Regressor, random_state=1)
perm.fit(X, Y)
# Display the 40 highest-weighted features.
eli5.show_weights(perm, feature_names=X.columns.tolist(), top=40)

Weight,Feature
0.1097  ± 0.0264,Mean for Temperature_STD
0.1016  ± 0.0404,Bouguer_MEAN
0.0476  ± 0.0190,Mean for Fault_STD
0.0401  ± 0.0195,Mean for Distance_QAFaults_MEAN
0.0335  ± 0.0107,Fault_STD
0.0335  ± 0.0056,Mean for Distance_QAFaults_STD
0.0285  ± 0.0053,OBJECTID
0.0269  ± 0.0170,Fault_SUM
0.0218  ± 0.0071,elevation
0.0218  ± 0.0036,point_X


## SelectFromModel

In [80]:
# Wrap the already-fitted Regressor (prefit=True); no threshold is passed,
# so SelectFromModel's default decides which columns are kept.
model = SelectFromModel(estimator=Regressor, prefit=True)
X_new = model.transform(X)



In [81]:
# Rows and columns remaining after the feature selection.
X_new.shape

(114, 52)

In [82]:
# Forest for the reduced feature set; same settings as Regressor except min_samples_leaf=2.
Regressor_new = RandomForestRegressor(
    n_estimators=100,
    criterion='absolute_error',
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=2,
    random_state=2,
)
Regressor_new.fit(X_new, Y)
# In-sample predictions on the reduced training matrix.
predict_new = Regressor_new.predict(X_new)

### K fold CV

In [83]:
# 5-fold cross-validated R^2 of the reduced-feature Grad_Geot model.
k = 5
kf = KFold(n_splits=k)

result = cross_val_score(Regressor_new, X_new, Y, cv=kf, scoring='r2')

# The value is an R^2 score of a regressor, not an accuracy, so label it accordingly.
print("Avg R2: {}".format(result.mean()))

Avg accuracy: 0.04512467835522431


In [47]:
# 5-fold cross-validated negated mean absolute error of the reduced-feature Grad_Geot model.
k = 5
kf = KFold(n_splits=k)

result = cross_val_score(Regressor_new, X_new, Y, cv=kf, scoring='neg_mean_absolute_error')

# The value is a negated MAE, not an accuracy, so label it accordingly.
print("Avg neg MAE: {}".format(result.mean()))

Avg accuracy: -4.234510884328992


In [48]:
# 5-fold cross-validated negated mean squared error of the reduced-feature Grad_Geot model.
k = 5
kf = KFold(n_splits=k)

result = cross_val_score(Regressor_new, X_new, Y, cv=kf, scoring='neg_mean_squared_error')

# The value is a negated MSE, not an accuracy, so label it accordingly.
print("Avg neg MSE: {}".format(result.mean()))
# Flip the sign back before taking the root to obtain the RMSE.
rmse = math.sqrt(-result.mean())
rmse

Avg accuracy: -29.773037719986625


5.456467512959884

### Grid Search CV

In [32]:
# Repeat the grid search for the forest trained on the reduced feature set.
rf_grid = GridSearchCV(
    estimator=Regressor_new,
    param_grid=param_grid,
    cv=5,
)
rf_grid.fit(X_new, Y)

GridSearchCV(cv=5,
             estimator=RandomForestRegressor(criterion='absolute_error',
                                             max_depth=12, min_samples_leaf=2,
                                             random_state=2),
             param_grid={'criterion': ['absolute_error', 'squared_error'],
                         'max_depth': [5, 8, 10, 12, 15],
                         'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 3],
                         'n_estimators': [100, 200, 250, 500],
                         'random_state': [2, 3, 4]})

In [33]:
# Parameter combination that scored best in the grid search fitted on X_new, Y.
rf_grid.best_params_

{'criterion': 'absolute_error',
 'max_depth': 5,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 100,
 'random_state': 2}

## Predicting Full data (heat flux density model)

In [175]:
# Read the complete table again, without the shuffling or row filtering applied to `dataset`.
full_data = pd.read_csv("Table_25x25_Final.csv")
# Same column list that is dropped to build XX for the HFD model ("Id" stays in).
X_full = full_data.drop(
    columns=["gridcode", "Shape_Length", "Shape_Area", "HFD", "Grad_Geot", "Count of Points"]
)
Y_full = full_data["Grad_Geot"]
X_full.shape

(14810, 244)

In [176]:
# Reduce the full table to the columns selected by model2.
# NOTE(review): model2 is only created in the "Select from model" cell further down,
# so this cell cannot run in a fresh top-to-bottom execution - confirm the intended cell order.
# X_full is overwritten here, so re-running this cell alone feeds the reduced data back into transform.
X_full = model2.transform(X_full)



In [177]:
# Predict for every row of the reduced full table.
# NOTE(review): Regressor_new2 is only created further down in the notebook (HFD section),
# so this cell also depends on out-of-order execution - confirm the intended cell order.
predict_full = Regressor_new2.predict(X_full)
predict_full

array([57.35306102, 53.55647003, 59.43514134, ..., 61.56141343,
       56.7767103 , 61.90545871])

In [178]:
# Wrap the prediction array in a DataFrame (same name is reused) and summarise it.
predict_full = pd.DataFrame(predict_full)
predict_full.describe()

Unnamed: 0,0
count,14810.0
mean,61.84261
std,7.668943
min,29.5627
25%,56.911251
50%,61.969959
75%,66.268201
max,102.699313


In [179]:
# DataFrame.to_excel writes the file and returns None, so keep the frame itself in
# `prediction` and export it in a separate step instead of storing the None result.
prediction = pd.DataFrame(predict_full)
prediction.to_excel("RF_25x25_Selected_outliersout_hfd.xlsx")

# Model - Heat flux density

In [155]:
# Keep only the rows whose HFD value is present: HFD is the label taken from these rows
# in the next cell, so filtering on Grad_Geot could let missing HFD values into the
# mean/std used for the outlier bounds.
selected_rows = dataset[~dataset["HFD"].isnull()]

In [156]:
# Features: every column except the ones listed here; label: HFD.
XX = selected_rows.drop(
    columns=["gridcode", "Shape_Length", "Shape_Area", "HFD", "Grad_Geot", "Count of Points"]
)
YY = selected_rows["HFD"]

In [157]:
# Three-sigma rule on the HFD label: anything further than 3 standard deviations
# from the mean is flagged as an outlier.
YY_mean = mean(YY)
YY_std = std(YY)
cut_off = 3 * YY_std
lower = YY_mean - cut_off
upper = YY_mean + cut_off
# Label values outside [lower, upper].
outliers = YY[(YY < lower) | (YY > upper)].tolist()
print('Identified outliers: %d' % len(outliers))
# Label values inside [lower, upper]; only their count is reported here.
outliers_removed = YY[(YY >= lower) & (YY <= upper)].tolist()
print('Non-outlier observations: %d' % len(outliers_removed))
print(outliers)

Identified outliers: 2
Non-outlier observations: 115
[183.84, 139.5]


In [158]:
# Keep only the rows whose HFD value is present.
selected_rows = dataset[dataset["HFD"].notnull()]

In [159]:
# Drop every row whose HFD value equals one of the flagged outlier values.
is_outlier = selected_rows['HFD'].isin(outliers)
FinalData = selected_rows[~is_outlier]

In [160]:
# Features and HFD label rebuilt from the outlier-free rows.
XX = FinalData.drop(
    columns=["gridcode", "Shape_Length", "Shape_Area", "HFD", "Grad_Geot", "Count of Points"]
)
YY = FinalData["HFD"]

In [161]:
# Number of HFD label values left after the outlier rows were removed.
YY.shape

(115,)

In [162]:
# Summary statistics of the HFD label used for training.
YY.describe()

count    115.000000
mean      62.150376
std       18.538075
min       25.000000
25%       50.332900
50%       60.000000
75%       73.306000
max      114.863000
Name: HFD, dtype: float64

In [163]:
# Random forest for the HFD label, fitted on every row of XX.
Regressor2 = RandomForestRegressor(
    n_estimators=100,
    criterion='absolute_error',
    max_depth=15,
    min_samples_split=2,
    min_samples_leaf=2,
    random_state=4,
)
Regressor2.fit(XX, YY)
# In-sample predictions; this reuses the name pred_Regressor from the Grad_Geot model.
pred_Regressor = Regressor2.predict(XX)

In [165]:
# 5-fold cross-validation; no `scoring` is passed, so the estimator's default scorer is used.
k = 5
kf = KFold(n_splits=k)
result = cross_val_score(Regressor2, XX, y=YY, cv=kf)
print(result.mean())

0.22363973838246407


In [166]:
# 5-fold cross-validation scored with the negated mean absolute error.
k = 5
kf = KFold(n_splits=k)
result = cross_val_score(Regressor2, XX, y=YY, scoring='neg_mean_absolute_error', cv=kf)
print(result.mean())

-13.060064679710143


In [167]:
# 5-fold cross-validation scored with the negated mean squared error.
k = 5
kf = KFold(n_splits=k)
result = cross_val_score(Regressor2, XX, y=YY, scoring='neg_mean_squared_error', cv=kf)
mean_neg_mse = result.mean()
print(mean_neg_mse)
# Flip the sign back before taking the root to obtain the RMSE.
rmse = math.sqrt(-mean_neg_mse)
rmse

-260.27156350432364


16.132934125704587

In [None]:
# Exhaustive search over param_grid with 5-fold cross-validation for the HFD forest.
rf_grid = GridSearchCV(
    estimator=Regressor2,
    param_grid=param_grid,
    cv=5,
)
rf_grid.fit(XX, YY)

In [89]:
# Parameter combination that scored best in the most recently fitted grid search.
rf_grid.best_params_

{'criterion': 'absolute_error',
 'max_depth': 15,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 100,
 'random_state': 4}

### Select from model 

In [168]:
# Wrap the already-fitted Regressor2 (prefit=True); no threshold is passed,
# so SelectFromModel's default decides which columns are kept.
model2 = SelectFromModel(estimator=Regressor2, prefit=True)
X_new2 = model2.transform(XX)



In [169]:
# Rows and columns remaining after the feature selection for the HFD model.
X_new2.shape

(115, 68)

In [170]:
# Forest for the HFD label, fitted on the reduced feature set.
Regressor_new2 = RandomForestRegressor(
    n_estimators=100,
    criterion='squared_error',
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=3,
)
Regressor_new2.fit(X_new2, YY)
# In-sample predictions on the reduced training matrix.
predict_new2 = Regressor_new2.predict(X_new2)

In [171]:
# Pair each XX column with the importance reported by the fitted Regressor2.
df = pd.DataFrame(
    {
        'Feature_names': XX.columns,
        'Importances': Regressor2.feature_importances_,
    }
)
# Show the 120 most important features, highest first.
ranked = df.sort_values(by='Importances', ascending=False)
print(ranked.nlargest(120, 'Importances'))

                                         Feature_names  Importances
3                                              point_Y     0.078364
1                                                   Id     0.050768
243                     Mean for Distance_QAFaults_STD     0.037057
115                                Distance_Faults_STD     0.032206
165               MAX_ONEHOT_Descricao1_Faixa_Piritosa     0.031704
..                                                 ...          ...
129  MAX_ONEHOT_Descricao1_Complexo_Xisto_Grauv_qui...     0.000395
88                       ONEHOT_Zona_Rochas_filonianas     0.000384
49                                      ZonaSeismic_03     0.000315
109                     ONEHOT_Nome_Bacia_do_Tejo_Sado     0.000304
151  MAX_ONEHOT_Descricao1_Dep_sitos_paleog_nicos_d...     0.000267

[120 rows x 2 columns]


In [172]:
# 5-fold cross-validation of the reduced-feature HFD model; no `scoring` is passed,
# so the regressor's default scorer (R^2) is used.
k = 5
kf2 = KFold(n_splits=k)

result = cross_val_score(Regressor_new2, X_new2, YY, cv=kf2)

# The value is an R^2 score of a regressor, not an accuracy, so label it accordingly.
print("Avg R2: {}".format(result.mean()))

Avg accuracy: 0.23181970188969983


In [173]:
# 5-fold cross-validated negated mean absolute error of the reduced-feature HFD model.
k = 5
kf2 = KFold(n_splits=k)

result = cross_val_score(Regressor_new2, X_new2, YY, cv=kf2, scoring='neg_mean_absolute_error')

# The value is a negated MAE, not an accuracy, so label it accordingly.
print("Avg neg MAE: {}".format(result.mean()))

Avg accuracy: -12.92838858602525


In [174]:
# 5-fold cross-validated negated mean squared error of the reduced-feature HFD model.
k = 5
kf2 = KFold(n_splits=k)

result = cross_val_score(Regressor_new2, X_new2, YY, cv=kf2, scoring='neg_mean_squared_error')

# The value is a negated MSE, not an accuracy, so label it accordingly.
print("Avg neg MSE: {}".format(result.mean()))
# Flip the sign back before taking the root to obtain the RMSE.
rmse = math.sqrt(-result.mean())
rmse

Avg accuracy: -259.059936424189


16.095338965805876

In [140]:
# Repeat the grid search for the HFD forest trained on the reduced feature set.
rf_grid = GridSearchCV(
    estimator=Regressor_new2,
    param_grid=param_grid,
    cv=5,
)
rf_grid.fit(X_new2, YY)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=1),
             param_grid={'criterion': ['absolute_error', 'squared_error'],
                         'max_depth': [5, 8, 10, 12, 15],
                         'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 3],
                         'n_estimators': [100, 200, 250, 500],
                         'random_state': [2, 3, 4]})

In [141]:
# Parameter combination that scored best in the grid search fitted on X_new2, YY.
rf_grid.best_params_

{'criterion': 'squared_error',
 'max_depth': 10,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 100,
 'random_state': 3}

## Saved code

n_estimators_range = range(10,1000)
n_est_scores = []
for n in n_estimators_range: 
    rf = RandomForestRegressor(n_estimators= n, random_state=2, min_samples_leaf=3)
    scores = cross_val_score(rf, X, Y, cv=5)
    n_est_scores.append(scores.mean())

%matplotlib inline
plt.plot(n_estimators_range, n_est_scores)