# Import Statements

In [1]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt
import joblib
import optuna
import sklearn 

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier

# Load Data

In [2]:
# read the training set from the Kaggle input directory
TRAIN_CSV = '../input/forest-cover-type-prediction/train.csv'
train = pd.read_csv(TRAIN_CSV)
# preview the first rows
train.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,1,2596,51,3,258,0,510,221,232,148,...,0,0,0,0,0,0,0,0,0,5
1,2,2590,56,2,212,-6,390,220,235,151,...,0,0,0,0,0,0,0,0,0,5
2,3,2804,139,9,268,65,3180,234,238,135,...,0,0,0,0,0,0,0,0,0,2
3,4,2785,155,18,242,118,3090,238,238,122,...,0,0,0,0,0,0,0,0,0,2
4,5,2595,45,2,153,-1,391,220,234,150,...,0,0,0,0,0,0,0,0,0,5


In [3]:
# Remove the Id column from the set.
# The column is dropped by name rather than by slicing off "whatever is first",
# so this cell is safe to re-run: a second execution is a no-op instead of
# silently discarding the next column (Elevation, or Cover_Type once the target
# has been moved to the front).
train = train.drop(columns='Id', errors='ignore')
train.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


# Add Features from Feature Engineering

In [4]:
# Engineered features carried over from the feature-engineering work.
# Pull out the source columns once so each formula below reads cleanly.
elevation = train['Elevation']
hydro_horizontal = train['Horizontal_Distance_To_Hydrology']
hydro_vertical = train['Vertical_Distance_To_Hydrology']
road_horizontal = train['Horizontal_Distance_To_Roadways']
fire_horizontal = train['Horizontal_Distance_To_Fire_Points']

# elevation offset by a scaled horizontal distance
train['Elev_to_Horizontal_Hyd'] = elevation - 0.2 * hydro_horizontal
train['Elev_to_Horizontal_Road'] = elevation - 0.05 * road_horizontal
# elevation offset by the vertical distance to hydrology
train['Elev_to_Verticle_Hyd'] = elevation - hydro_vertical
# averages of the horizontal distances
train['Mean_Horizontal_Dist'] = (fire_horizontal + hydro_horizontal + road_horizontal) / 3
train['Mean_Fire_Hydro'] = (fire_horizontal + hydro_horizontal) / 2

# Preprocessing

In [5]:
# Relocate the target so it becomes column 0:
# take Cover_Type out of the frame ...
cover_type = train.pop('Cover_Type')
# ... and put it back at position 0 under the same name
train.insert(loc=0, column='Cover_Type', value=cover_type)

# view
train.head()

Unnamed: 0,Cover_Type,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Elev_to_Horizontal_Hyd,Elev_to_Horizontal_Road,Elev_to_Verticle_Hyd,Mean_Horizontal_Dist,Mean_Fire_Hydro
0,5,2596,51,3,258,0,510,221,232,148,...,0,0,0,0,0,2544.4,2570.5,2596,2349.0,3268.5
1,5,2590,56,2,212,-6,390,220,235,151,...,0,0,0,0,0,2547.6,2570.5,2596,2275.666667,3218.5
2,2,2804,139,9,268,65,3180,234,238,135,...,0,0,0,0,0,2750.4,2645.0,2739,3189.666667,3194.5
3,2,2785,155,18,242,118,3090,238,238,122,...,0,0,0,0,0,2736.6,2630.5,2667,3181.0,3226.5
4,5,2595,45,2,153,-1,391,220,234,150,...,0,0,0,0,0,2564.4,2575.45,2596,2238.666667,3162.5


In [6]:
# Split the frame by column position into target, categorical and numerical parts.
# target: column 0 (Cover_Type was moved there in the previous cell)
y = train.iloc[:, 0].values

# categorical block: columns 11 through 54
X_cat = train.iloc[:, 11:55].values

# numerical block: columns 1 through 10 followed by the five engineered
# columns appended at positions 55 through 59
A = train.iloc[:, 1:11]
B = train.iloc[:, 55:60]
X_num = pd.concat([A, B], axis=1).values

In [7]:
# Standardize the numerical columns.
# The scaler is fit on the training data and applied to it in one step;
# the fitted object is kept because it is saved to disk later on.
scaler = StandardScaler()
X_num = scaler.fit_transform(X_num)

# report the shape of each piece
print(f'Categorical Shape: {X_cat.shape}')
print(f'Numerical Shape: {X_num.shape}')
print(f'Label Shape: {y.shape}')

Categorical Shape: (15120, 44)
Numerical Shape: (15120, 15)
Label Shape: (15120,)


In [8]:
# join the scaled numerical columns and the categorical columns side by side
# (numerical first, categorical second)
X = np.concatenate((X_num, X_cat), axis=1)
print(X.shape)

(15120, 59)


# Various Models
The following sections include various models we built to predict the Forest Cover Type. We used GridSearchCV and Optuna to identify the optimal parameters for each model.

Our hyperparameter tuning can be found in the notebook: https://www.kaggle.com/emknowles/g2-forestcovertype-modelparams-notebook/

# Logistic Regression
Logistic Regression did not yield high accuracy, even after GridSearchCV and Optuna were used to find optimal parameters.

In [9]:
# Best logistic regression configuration.
# NOTE(review): newer scikit-learn releases expect penalty=None rather than the
# string 'none' -- confirm against the installed version before upgrading.
lr_params = {
    'random_state': 1,
    'penalty': 'none',
    'max_iter': 500,
    'solver': 'saga',
}
lr_model = LogisticRegression(**lr_params).fit(X, y)

# accuracy measured on the same data the model was fit on
print(lr_model.score(X, y))

0.7126322751322751




# Decision Tree
The best parameters from Optuna were {'max_depth': 19, 'min_samples_leaf': 1}

In [10]:
# Best decision tree configuration (see the parameters listed above).
dt_params = dict(random_state=1, max_depth=19, min_samples_leaf=1)
dt_model = DecisionTreeClassifier(**dt_params)
# fit on the full training matrix; the fitted estimator is the cell output
dt_model.fit(X, y)

DecisionTreeClassifier(max_depth=19, random_state=1)

# Random Forest
The best parameters from Optuna were {'n_estimators': 130, 'max_depth': 50, 'min_samples_leaf': 1}

In [11]:
# Best random forest configuration (see the parameters listed above).
rf_params = dict(random_state=1, n_estimators=130, max_depth=50, min_samples_leaf=1)
rf_model = RandomForestClassifier(**rf_params)

# fit on the full training matrix; the fitted estimator is the cell output
rf_model.fit(X, y)

RandomForestClassifier(max_depth=50, n_estimators=130, random_state=1)

# Extra Trees Classifier
The best parameters from Optuna were {'max_depth': 33, 'min_samples_leaf': 1}

In [12]:
# Best extra-trees configuration: tuned depth and leaf size, with 1000 trees.
tree_params = dict(random_state=1, n_estimators=1000, max_depth=33, min_samples_leaf=1)
tree_model = ExtraTreesClassifier(**tree_params)

# fit on the full training matrix; the fitted estimator is the cell output
tree_model.fit(X, y)

ExtraTreesClassifier(max_depth=33, n_estimators=1000, random_state=1)

# Gradient Boosting Classifier
The best parameters from Optuna were {'max_depth': 19, 'min_samples_leaf': 16}

In [13]:
%%time
# best model
gradb_model = GradientBoostingClassifier(random_state = 0,
                                         max_depth = 19, 
                                         min_samples_leaf = 16)

gradb_model.fit(X, y)

CPU times: user 3min 29s, sys: 135 ms, total: 3min 29s
Wall time: 3min 29s


GradientBoostingClassifier(max_depth=19, min_samples_leaf=16, random_state=0)

# Extreme Gradient Boosting Classifier
The best parameters from Optuna were {'max_depth': 14}

In [14]:
%%time
# best model
xgb_model = XGBClassifier(random_state = 0,
                           max_depth = 14,
                           tree_method = 'gpu_hist')

xgb_model.fit(X, y)
xgb_model.score(X, y)



CPU times: user 7.53 s, sys: 364 ms, total: 7.89 s
Wall time: 8.54 s


1.0

# Light Gradient Boosting Classifier
The best parameters from Optuna were {'num_leaves': 10, 'max_depth': 16}

In [15]:
%%time
# best model
lgbm_model = LGBMClassifier(random_state = 0,
                            max_depth = 16,
                            num_leaves = 10)

lgbm_model.fit(X, y)

CPU times: user 2.12 s, sys: 40.9 ms, total: 2.16 s
Wall time: 1.12 s


LGBMClassifier(max_depth=16, num_leaves=10, random_state=0)

# Model Selection 

In [16]:
%%time 
# create ensemble classifier 
ensemble_model = VotingClassifier(
    estimators = [('tree', tree_model), 
                  ('rf', rf_model), 
                  ('gradb', gradb_model), 
                  ('xgb', xgb_model)],
    voting = 'hard'
)

# fit
ensemble_model.fit(X, y)

# print training accuracy
print('Logistic Regression Accuracy', lr_model.score(X, y))
print('Decision Tree Accuracy', dt_model.score(X, y))
print('Random Forest Accuracy', rf_model.score(X, y))
print('Extra Trees Accuracy', tree_model.score(X, y))
print('Gradient Boosting Accuracy', gradb_model.score(X, y))
print('Extra Gradient Boosting Accuracy', xgb_model.score(X, y))
print('LightGBM Accuracy', lgbm_model.score(X, y))
print('Ensemble Accuracy:', ensemble_model.score(X, y))



Logistic Regression Accuracy 0.7126322751322751
Decision Tree Accuracy 0.9808201058201058
Random Forest Accuracy 1.0
Extra Trees Accuracy 1.0
Gradient Boosting Accuracy 1.0
Extra Gradient Boosting Accuracy 1.0
LightGBM Accuracy 0.8923941798941799
Ensemble Accuracy: 1.0
CPU times: user 4min 4s, sys: 891 ms, total: 4min 5s
Wall time: 4min 4s


# Save Preprocessor and Models

In [17]:
# persist the fitted scaler object to a joblib file
joblib.dump(scaler, filename='forest_cover_scaler_final.joblib')

['forest_cover_scaler_final.joblib']

In [18]:
# Write each fitted model to its own joblib file (target filename -> model).
models_to_save = {
    'rf_model_final.joblib': rf_model,
    'tree_model_final.joblib': tree_model,
    'gradb_model_final.joblib': gradb_model,
    'xgb_model_final.joblib': xgb_model,
    'lgbm_model_final.joblib': lgbm_model,
    'ensemble_model_final.joblib': ensemble_model,
}
for filename, model in models_to_save.items():
    joblib.dump(model, filename)
print('Model written to file.')

Model written to file.
