I started this competition by investigating neural networks in this kernel: https://www.kaggle.com/mulargui/keras-nn
In this new kernel I switch to ensembles.
You can find all my notes and versions at https://github.com/mulargui/kaggle-Classify-forest-types

In [57]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Read the competition's training and test tables
dftrain = pd.read_csv('/kaggle/input/learn-together/train.csv')
dftest = pd.read_csv('/kaggle/input/learn-together/test.csv')

####### DATA PREPARATION #####
# Training table: Cover_Type is the label, everything except Id and the label is a feature
y = dftrain['Cover_Type']
x = dftrain.drop(columns=['Id', 'Cover_Type'])

# Test table: keep the Ids aside for the submission file, the other columns are features
Ids = dftest['Id']
x_predict = dftest.drop(columns='Id')

# Stack both feature tables so the feature engineering sees train and test together;
# the outer index level is 0 for the training rows and 1 for the test rows
X = pd.concat([x, x_predict], keys=[0, 1])

In [58]:
###### FEATURE ENGINEERING #####
#https://www.kaggle.com/arateris/2-layer-k-fold-learning-forest-cover 
# Hillshade_3pm repair: estimate a plausible value for the rows where it is 0,
# mainly to avoid zeros in the feature engineering and fake outliers.
from sklearn.ensemble import RandomForestRegressor

cols_for_HS = ['Aspect', 'Slope', 'Hillshade_9am', 'Hillshade_Noon']
HS_zero = X[X['Hillshade_3pm'] == 0]
HS_train = X[X['Hillshade_3pm'] != 0]

# Learn Hillshade_3pm from the predictor columns on the non-zero rows ...
rf_hs = RandomForestRegressor(n_estimators=100)
rf_hs = rf_hs.fit(HS_train[cols_for_HS], HS_train['Hillshade_3pm'])
# ... and predict it (truncated to int) for the zero rows
out = rf_hs.predict(HS_zero[cols_for_HS]).astype(int)
# The write-back below is disabled, so X keeps its zeros and `out` is currently unused
#X.loc[HS_zero.index,'Hillshade_3pm'] = out

# Adding Gaussian Mixture features to perform some unsupervised learning hints from the full data
#https://www.kaggle.com/arateris/2-layer-k-fold-learning-forest-cover 
#https://www.kaggle.com/stevegreenau/stacking-multiple-classifiers-clustering
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
# random_state pins the mixture's random initialisation so the cluster ids (and every
# feature computed from them further down) are identical on each Restart & Run All.
# The value 1 matches the `randomstate` seed used in the ensemble section.
gmix = GaussianMixture(n_components=10, random_state=1)
# Cluster on standardised columns and store each row's component id as a new feature
gaussian = gmix.fit_predict(StandardScaler().fit_transform(X))
X['GM'] = gaussian

#https://www.kaggle.com/evimarp/top-6-roosevelt-national-forest-competition
from itertools import combinations
from bisect import bisect
def features(df):
    """Append hand-crafted distance, elevation, hillshade and aspect columns to ``df``.

    The new columns are written into ``df`` itself (the frame is modified in
    place) and that same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns read below: Elevation, Aspect, Slope,
        Hillshade_9am / Hillshade_Noon / Hillshade_3pm,
        Horizontal_Distance_To_Hydrology, Vertical_Distance_To_Hydrology,
        Horizontal_Distance_To_Roadways and Horizontal_Distance_To_Fire_Points.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the engineered columns appended.
    """
    road = 'Horizontal_Distance_To_Roadways'
    fire = 'Horizontal_Distance_To_Fire_Points'
    hydro = 'Horizontal_Distance_To_Hydrology'
    hydro_v = 'Vertical_Distance_To_Hydrology'

    # Straight-line distance to water from its vertical and horizontal parts
    squared = df[hydro_v]**2 + df[hydro]**2
    df['Euclidean_distance_to_hydro'] = squared**.5

    # Mean / sum of the three horizontal distances, then of every pair
    df['distance_mean'] = df[[road, fire, hydro]].mean(axis=1)
    df['distance_sum'] = df[[road, fire, hydro]].sum(axis=1)
    df['distance_road_fire'] = df[[road, fire]].mean(axis=1)
    df['distance_hydro_fire'] = df[[fire, hydro]].mean(axis=1)
    df['distance_road_hydro'] = df[[road, hydro]].mean(axis=1)

    df['distance_sum_road_fire'] = df[[road, fire]].sum(axis=1)
    df['distance_sum_hydro_fire'] = df[[fire, hydro]].sum(axis=1)
    df['distance_sum_road_hydro'] = df[[road, hydro]].sum(axis=1)

    # Pairwise differences between the horizontal distances
    df['distance_dif_road_fire'] = df[road] - df[fire]
    df['distance_dif_hydro_road'] = df[hydro] - df[road]
    df['distance_dif_hydro_fire'] = df[hydro] - df[fire]

    # Elevation combined with the vertical distance to water
    df['Vertical_dif'] = df['Elevation'] - df[hydro_v]
    df['Vertical_sum'] = df[['Elevation', hydro_v]].sum(axis=1)

    # Hillshade: change between the three times of day, plus total and average
    shade_cols = ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']
    morning, noon, afternoon = (df[c] for c in shade_cols)
    df['shade_noon_diff'] = morning - noon
    df['shade_3pm_diff'] = noon - afternoon
    df['shade_all_diff'] = morning - afternoon
    df['shade_sum'] = df[shade_cols].sum(axis=1)
    df['shade_mean'] = df[shade_cols].mean(axis=1)

    # Elevation corrected by (a fraction of) the distances to water
    elevation = df['Elevation']
    df['ElevationHydro'] = elevation - 0.25 * df['Euclidean_distance_to_hydro']
    df['ElevationV'] = elevation - df[hydro_v]
    df['ElevationH'] = elevation - 0.19 * df[hydro]

    # Non-linear views of the elevation
    df['Elevation2'] = elevation**2
    df['ElevationLog'] = np.log1p(elevation)

    # Aspect is an angle in degrees: expose its cosine / sine and the
    # cosine weighted by the slope
    aspect_rad = np.radians(df['Aspect'])
    df['Aspect_cos'] = np.cos(aspect_rad)
    df['Aspect_sin'] = np.sin(aspect_rad)
    df['Aspectcos_Slope'] = df['Slope'] * df['Aspect_cos']

    # Compass sector of the aspect: the boundaries 45/135/225/315 split the circle
    # into N, E, S, W (anything below 45 or from 315 upwards is N), and each
    # sector is then encoded as E=1, W=-1, N=S=0.
    sector_bounds = list(range(45, 361, 90))
    sector_names = ['N', 'E', 'S', 'W']
    sector_value = {'N': 0, 'E': 1, 'S': 0, 'W':-1}

    def cardinal_code(aspect):
        return sector_value[sector_names[bisect(sector_bounds, aspect) % 4]]

    df['Cardinal'] = df.Aspect.apply(cardinal_code)
    return df

X = features(X)

#adding features based on https://douglas-fraser.com/forest_cover_management.pdf pages 21,22
#note: not all climatic and geologic codes have a soil type

# Soil_Type numbers that make up each climatic / geologic zone.
# The insertion order is the order in which the flag columns are appended to X; the
# one-hot reversal further down slices 'Climatic2':'Climatic8' and
# 'Geologic1':'Geologic7' by position, so the order must be kept.
SOIL_GROUPS = {
    'Climatic2': list(range(1, 7)),
    'Climatic3': [7, 8],
    'Climatic4': list(range(9, 14)),
    'Climatic5': [14, 15],
    'Climatic6': [16, 17, 18],
    'Climatic7': list(range(19, 35)),
    'Climatic8': list(range(35, 41)),
    'Geologic1': [14, 15, 16, 17, 19, 20, 21],
    'Geologic2': [9, 22, 23],
    'Geologic5': [7, 8],
    'Geologic7': list(range(1, 7)) + list(range(10, 14)) + [18] + list(range(24, 41)),
}


def add_soil_group_flags(df, groups=SOIL_GROUPS):
    """Add one 0/1 column per zone in ``groups`` to ``df``.

    A row gets 1 in a zone column when any of the zone's ``Soil_Type<n>`` columns
    equals 1 for that row, otherwise 0. The work is done column-wise, instead of
    calling a Python function once per row and per zone.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ``Soil_Type<n>`` one-hot columns; modified in place.
    groups : dict
        Maps the new column name to the Soil_Type numbers of that zone.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the zone columns appended in the iteration order of ``groups``.
    """
    for zone, soil_numbers in groups.items():
        soil_cols = ['Soil_Type%d' % n for n in soil_numbers]
        # int64 is the dtype the flags have when built from Python ints row by row
        df[zone] = (df[soil_cols] == 1).any(axis=1).astype('int64')
    return df


X = add_soil_group_flags(X)

#Reversing One-Hot-Encoding to Categorical attributes, several articles recommend it for decision tree algorithms
#Doing it for Soil_Type, Wilderness_Area, Geologic and Climatic
#The category ids are also replaced by random numbers, to make it harder for the algorithm
#to find relationships between the values
def _collapse_one_hot(frame, first_col, last_col, name, rng):
    """Replace a run of one-hot columns by a single randomly coded categorical column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the one-hot columns.
    first_col, last_col : str
        Labels bounding the contiguous one-hot columns (both inclusive).
    name : str
        Name of the new column; every existing column whose name starts with
        ``name`` is dropped before the new column is appended.
    rng : numpy.random.RandomState
        Source of the random codes.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``frame`` itself is left untouched.

    Raises
    ------
    ValueError
        If some row does not have exactly one non-zero one-hot cell.
    """
    # np.where lists the (row, column) position of every non-zero cell in row-major
    # order, so the row positions are exactly 0..n-1 when each row has one hot cell.
    row_pos, col_pos = np.where(np.asarray(frame.loc[:, first_col:last_col]))
    if not np.array_equal(row_pos, np.arange(len(frame))):
        raise ValueError('%s: every row must have exactly one non-zero one-hot column' % name)

    category = col_pos + 1
    lo, hi = category.min(), category.max()
    # One random code per category id in lo..hi. Looking the code up by position maps
    # all ids in a single step, so a code that happens to equal another id cannot be
    # replaced a second time (which a chain of sequential replace() calls allows).
    codes = rng.randint(low=1, high=np.iinfo(np.int64).max, size=hi - lo + 1, dtype='int64')

    collapsed = frame.drop(columns=[c for c in frame.columns if c.startswith(name)])
    collapsed[name] = codes[category - lo]
    return collapsed


# Locally seeded generator so the codes are reproducible; 1 is the same value as the
# `randomstate` seed of the ensemble section. No name here rebinds the builtins min/max.
_code_rng = np.random.RandomState(1)
for _name, _first, _last in [
        ('Soil_Type', 'Soil_Type1', 'Soil_Type40'),
        ('Wilderness_Area', 'Wilderness_Area1', 'Wilderness_Area4'),
        ('Climatic', 'Climatic2', 'Climatic8'),
        ('Geologic', 'Geologic1', 'Geologic7')]:
    X = _collapse_one_hot(X, _first, _last, _name, _code_rng)

# Add PCA features
# (approach from https://www.kaggle.com/arateris/2-layer-k-fold-learning-forest-cover)
from sklearn.decomposition import PCA
# A fractional n_components asks for as many components as are needed to explain
# that share of the variance
pca = PCA(n_components=0.99)
pca = pca.fit(X)
trans = pca.transform(X)

# One new column per component: pca1, pca2, ...
for position, component in enumerate(trans.T, start=1):
    X['pca' + str(position)] = component

# Scale and bin features: stretch every column to the 0..100 range, drop the
# fractional part, then store the result as int8
from sklearn.preprocessing import MinMaxScaler
scaled = MinMaxScaler((0, 100)).fit_transform(X)
X.loc[:, :] = np.floor(scaled)
X = X.astype('int8')

In [59]:
# Undo the train/test stacking: outer index key 0 holds the training rows,
# key 1 the rows to predict
x = X.xs(0)
x_predict = X.xs(1)

In [61]:
###### THIS IS THE ENSEMBLE SECTION ######
#https://www.kaggle.com/kwabenantim/forest-cover-stacking-multiple-classifiers
import random
from lightgbm import LGBMClassifier
from mlxtend.classifier import StackingCVClassifier
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# One seed shared by Python's RNG, NumPy's global RNG and every estimator below
randomstate = 1
for seed_rng in (random.seed, np.random.seed):
    seed_rng(randomstate)

# max_features setting passed to the extra-trees model
# (alternative considered: min(30, x.columns.size))
max_features = 30

# Base learners of the stack; every one of them is seeded with `randomstate`.

# AdaBoost over decision trees with at least 2 samples per leaf.
# NOTE(review): newer scikit-learn releases rename `base_estimator` to `estimator`;
# confirm against the installed version before upgrading.
ab_clf = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(min_samples_leaf=2,
                                          random_state=randomstate),
    n_estimators=200,
    random_state=randomstate,
)

# Extremely randomised trees
et_clf = ExtraTreesClassifier(
    n_estimators=300,
    max_depth=50,
    max_features=max_features,
    min_samples_leaf=2,
    min_samples_split=2,
    n_jobs=1,
    random_state=randomstate,
)

# Gradient boosting (LightGBM), logging silenced
lg_clf = LGBMClassifier(
    n_estimators=300,
    num_leaves=128,
    n_jobs=1,
    random_state=randomstate,
    verbose=-1,
)

# Random forest with default tree settings
rf_clf = RandomForestClassifier(
    n_estimators=300,
    n_jobs=1,
    random_state=randomstate,
)

# (label, estimator) pairs, in the order they are cross-validated and stacked
ensemble = [
    ('AdaBoostClassifier', ab_clf),
    ('ExtraTreesClassifier', et_clf),
    ('LGBMClassifier', lg_clf),
    ('RandomForestClassifier', rf_clf),
]

#Cross-validating classifiers
# 5-fold accuracy of every base learner on its own. The scores are printed so these
# (slow) runs actually report something instead of being overwritten on each iteration.
for label, clf in ensemble:
    score = cross_val_score(clf, x, y,
                            cv=5,
                            scoring='accuracy',
                            verbose=0,
                            n_jobs=-1)
    print('{}: accuracy {:.4f} (+/- {:.4f})'.format(label, score.mean(), score.std()))
# Fitting stack
# Level 1: the four base learners, run through stratified, shuffled 5-fold CV.
# Level 2: a random forest fed with their class probabilities (use_probas) together
# with the original features (use_features_in_secondary).
stack_options = dict(
    classifiers=[ab_clf, et_clf, lg_clf, rf_clf],
    meta_classifier=rf_clf,
    cv=5,
    stratify=True,
    shuffle=True,
    use_probas=True,
    use_features_in_secondary=True,
    verbose=1,
    random_state=randomstate,
)
stack = StackingCVClassifier(**stack_options)
stack = stack.fit(x, y)

# Predict the test rows and wrap the labels in a Series that shares the test index
# and the dtype of the training labels
test_labels = stack.predict(x_predict)
y_predict = pd.Series(test_labels, index=x_predict.index, dtype=y.dtype)

Fitting 4 classifiers...
Fitting classifier1: adaboostclassifier (1/4)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  5.6min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting classifier2: extratreesclassifier (2/4)


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting classifier3: lgbmclassifier (3/4)


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.8min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting classifier4: randomforestclassifier (4/4)


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   53.7s finished


In [62]:
# Save predictions to a file for submission: one row per test Id with its predicted label
submission_columns = {'Id': Ids, 'Cover_Type': y_predict}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)

# Show a link to download the file (displayed because it is the cell's last expression)
from IPython.display import FileLink
FileLink(r'submission.csv')