In [12]:
import pandas as pd
import re

train_data = pd.read_csv('train.csv',index_col=0,delimiter=';')
test_data = pd.read_csv('test.csv',index_col=0,delimiter=';')

In [13]:
# Transforming the columns to have an underscore between the column names
train_columns = train_data.columns
for col in train_columns:
    train_data = train_data.rename(
        columns={
            # strip out parentheses, and their contents
            col: re.sub(r'\(.*', '', col)
            .strip() 
            .replace(' ', '_')
            .replace('-', '_') 
            .lower()  # lowercase the column name
        }
    )


In [14]:
# Data exploration on a chosen subset
data_subset_1 = train_data[['national_park', 'elevation', 'aspect', 'slope','horizontal_distance_to_water', 'vertical_distance_to_water','horizontal_distance_to_road', 'light_at_9am', 'light_at_noon','light_at_3pm', 'horizontal_distance_to_fire_ignition_point','forest_type']]
missing_values_1= data_subset_1.isna().sum() # Finding the missing values in the subset
data_description_1 = data_subset_1.describe() # Numerical explenation of the data set
data_column_dtypes_1 = data_subset_1.dtypes

In [15]:
data_subset_2 = train_data[['soil_1','soil_2', 'soil_3', 'soil_4', 'soil_5', 'soil_6', 'soil_7', 'soil_8','soil_9', 'soil_10', 'soil_11', 'soil_12', 'soil_13', 'soil_14','soil_15', 'soil_16', 'soil_17', 'soil_18', 'soil_19', 'soil_20','soil_21', 'soil_22', 'soil_23', 'soil_24', 'soil_25', 'soil_26','soil_27', 'soil_28', 'soil_29', 'soil_30', 'soil_31', 'soil_32','soil_33', 'soil_34', 'soil_35', 'soil_36', 'soil_37', 'soil_38','soil_39', 'soil_40', 'forest_type']]
missing_values_2 = data_subset_2.isna().sum() # Finding the missing values in the subset
data_description_2 = data_subset_2.describe() # Numerical explenation of the data set
data_column_dtypes_2 = data_subset_2.dtypes

In [16]:
print(data_subset_1['national_park'].unique())
print(data_subset_1['forest_type'].unique())
print(data_subset_2['forest_type'].value_counts())
#print(missing_values_1)
#print(missing_values_2)
#data_description_1

['Mount Rainer' 'Yosemite' 'Yellowstone' 'Acadia']
['Cottonwood' 'Lodgepole']
Lodgepole     872095
Cottonwood    526000
Name: forest_type, dtype: int64


In [17]:
# first we clean and force the data to be numerical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df = train_data.copy()
df = df.replace(to_replace= ['Mount Rainer','Yosemite','Yellowstone','Acadia','Cottonwood','Lodgepole'], 
                value= [0,1,0,2,0,1] ) # Cotttenwood = 0 and Lodgepole = 1

x = df.iloc[:,:-1].copy()
y = df.iloc[:,-1].copy()

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.4,random_state=100,stratify=y)


### Piplines for classification algorithms logistic regression, possibly regularized, Support Vector Classifier


In [18]:
# Using regularization methods L1 and L2
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

x_lr_train,y_lr_train,x_lr_test,y_lr_test= x_train.copy(),y_train.copy(),x_test.copy(),y_test.copy()

lr_pipe_l1 = make_pipeline(StandardScaler(),LogisticRegression(random_state=1,solver='liblinear',penalty='l1',C = 0.5))
lr_pipe_l1.fit(x_lr_train, y_lr_train)
lr_pipe_l2 = make_pipeline(StandardScaler(),LogisticRegression(random_state=1,solver='liblinear',penalty='l2', C = 0.5))
lr_pipe_l2.fit(x_lr_train, y_lr_train)

print('Test Accuracy L1 Logistic Regression: %.8f' % lr_pipe_l1.score(x_lr_train, y_lr_train))
print('Test Accuracy L1 Logistic Regression: %.8f' % lr_pipe_l1.score(x_lr_test, y_lr_test))
print()
print('Train Accuracy L2 Logistic Regression: %.8f' % lr_pipe_l2.score(x_lr_train, y_lr_train))
print('Test Accuracy L2 Logistic Regression: %.8f' % lr_pipe_l2.score(x_lr_test, y_lr_test))

Test Accuracy L1 Logistic Regression: 0.94489287
Test Accuracy L1 Logistic Regression: 0.94534527

Train Accuracy L2 Logistic Regression: 0.94488572
Test Accuracy L2 Logistic Regression: 0.94534349


In [19]:
# Pipelines for ensamble methods
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

x_ensamble_train,y_ensamble_train,x_ensamble_test,y_ensamble_test= x_train.copy(),y_train.copy(),x_test.copy(),y_test.copy()
RF = make_pipeline(RandomForestClassifier(n_estimators=40, random_state=10,n_jobs=-1))
RF.fit(x_ensamble_train, y_ensamble_train)

print('Train accuracy Random Forest : {0:.5f}'.format(RF.score(x_ensamble_train, y_ensamble_train)))
print('Test accuracy Random Forest : {0:.5f}'.format(RF.score(x_ensamble_test, y_ensamble_test)))

Train accuracy Random Forest : 0.99995
Test accuracy Random Forest : 0.97234


In [20]:
#print('Train accuracy svm: {0:.5f}'.format(svm.score(X_train_sc, y_train)))
#print('Test accuracy svm: {0:.5f}'.format(svm.score(X_test_sc, y_test)))
print()
print('Train accuracy Random Forest : {0:.5f}'.format(RF.score(x_ensamble_train, y_train)))
print('Test accuracy Random Forest : {0:.5f}'.format(RF.score(x_ensamble_test, y_test)))
print()
print('Test Accuracy L1 Logistic Regression: %.3f' % lr_pipe_l1.score(x_lr_test, y_lr_test))
print('Test Accuracy L1 Logistic Regression: %.3f' % lr_pipe_l1.score(x_lr_test, y_lr_test))
print()
print('Train Accuracy L2 Logistic Regression: %.3f' % lr_pipe_l2.score(x_lr_train, y_lr_train))
print('Test Accuracy L2 Logistic Regression: %.3f' % lr_pipe_l2.score(x_lr_test, y_lr_test))


Train accuracy Random Forest : 0.99995
Test accuracy Random Forest : 0.97234

Test Accuracy L1 Logistic Regression: 0.945
Test Accuracy L1 Logistic Regression: 0.945

Train Accuracy L2 Logistic Regression: 0.945
Test Accuracy L2 Logistic Regression: 0.945


In [25]:
# Learning Keras
import numpy as np
from tensorflow import keras
from keras.models import Sequential
from keras import models
from keras.layers import Dense, Input, Flatten
from keras.utils import to_categorical

In [26]:
df_keras= train_data.copy()

In [42]:
keras1_data = df_keras.copy()
keras1_data = pd.get_dummies(data = keras1_data, columns=['national_park'])
keras1_data['forest_type'] = np.where(keras1_data['forest_type'] == 'Cottonwood', 0, 1)
keras1_data = keras1_data.drop(['national_park_Yellowstone'], axis=1)

x_keras1= keras1_data.drop(['forest_type'],axis = 1)
y_keras1= keras1_data['forest_type']
keras1_x_train, keras1_x_test, keras1_y_train, keras1_y_test = train_test_split(x_keras1,y_keras1, test_size = 0.4, random_state=100)

In [37]:
keras1_data.columns

Index(['elevation', 'aspect', 'slope', 'horizontal_distance_to_water',
       'vertical_distance_to_water', 'horizontal_distance_to_road',
       'light_at_9am', 'light_at_noon', 'light_at_3pm',
       'horizontal_distance_to_fire_ignition_point', 'soil_1', 'soil_2',
       'soil_3', 'soil_4', 'soil_5', 'soil_6', 'soil_7', 'soil_8', 'soil_9',
       'soil_10', 'soil_11', 'soil_12', 'soil_13', 'soil_14', 'soil_15',
       'soil_16', 'soil_17', 'soil_18', 'soil_19', 'soil_20', 'soil_21',
       'soil_22', 'soil_23', 'soil_24', 'soil_25', 'soil_26', 'soil_27',
       'soil_28', 'soil_29', 'soil_30', 'soil_31', 'soil_32', 'soil_33',
       'soil_34', 'soil_35', 'soil_36', 'soil_37', 'soil_38', 'soil_39',
       'soil_40', 'forest_type', 'national_park_Acadia',
       'national_park_Mount Rainer', 'national_park_Yellowstone',
       'national_park_Yosemite'],
      dtype='object')

In [28]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

print(keras1_x_train.shape)
print(keras1_y_train.shape)
print(keras1_x_test.shape)
print(keras1_y_test.shape)

(838857, 51)
(838857,)
(559238, 51)
(559238,)
(838857, 54)
(838857,)
(559238, 54)
(559238,)


In [53]:
# Code for creating and training a ANN with Keras

# 1
model = Sequential()

model.add(Dense(20, input_dim = 53, activation = 'relu', use_bias=True))
model.add(Dense(20, activation = "relu"))
model.add(Dense(10, activation = "relu"))
model.add(Dense(20, activation = "relu"))
model.add(Dense(10, activation = "relu"))
model.add(Dense(20, activation = "relu"))
model.add(Dense(10, activation = "relu"))
model.add(Dense(20, activation = "relu"))
model.add(Dense(10, activation = "relu"))
model.add(Dense(20, activation = "relu"))
model.add(Dense(10, activation = "relu"))
model.add(Dense(20, activation = "relu"))
model.add(Dense(1, activation = "sigmoid"))
model.summary()

# model compile
model.compile(loss = 'binary_crossentropy', optimizer = 'Adamax', metrics = ['accuracy'])

hist = model.fit(keras1_x_train, keras1_y_train, 
                 epochs = 10, 
                 batch_size = 512,
                 validation_data = (keras1_x_test, keras1_y_test))

Model: "sequential_22"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_418 (Dense)            (None, 20)                1080      
_________________________________________________________________
dense_419 (Dense)            (None, 20)                420       
_________________________________________________________________
dense_420 (Dense)            (None, 10)                210       
_________________________________________________________________
dense_421 (Dense)            (None, 20)                220       
_________________________________________________________________
dense_422 (Dense)            (None, 10)                210       
_________________________________________________________________
dense_423 (Dense)            (None, 20)                220       
_________________________________________________________________
dense_424 (Dense)            (None, 10)              

In [None]:
keras2_data = df_keras.copy()
keras2_data = pd.get_dummies(data = keras2_data.copy(), columns=['national_park'])
keras2_data['forest_type'] = np.where(keras2_data['forest_type'] == 'Cottonwood', 0, 1)

#keras2_data = keras2_data.replace(to_replace= ['Cottonwood','Lodgepole'], 
#                value= [0,1] ) # Cotttenwood = 0 and Lodgepole = 1


x_keras2= keras2_data.drop(['forest_type'],axis = 1)
y_keras2= keras2_data['forest_type']

keras2_x_train,keras2_x_test,keras2_y_train,keras2_y_test = train_test_split(x_keras2,y_keras2,test_size=0.4,random_state=100,stratify=y_keras2)

In [None]:
# Code for creating and training a ANN with Keras

# 2
model1 = Sequential()

model1.add(Dense(20, input_dim = 54, activation = 'relu'))
model1.add(Dense(10, activation = "relu"))
model1.add(Dense(20, activation = "relu"))
model1.add(Dense(10, activation = "relu"))
model1.add(Dense(1, activation = "relu"))
model1.summary()

# model compile

model1.compile(loss = 'binary_crossentropy', optimizer = 'Adamax', metrics = ['accuracy'])

hist = model1.fit(keras2_x_train, keras2_y_train, 
                 epochs = 20, 
                 batch_size = 128,
                 validation_data = (keras2_x_test, keras2_y_test))

In [None]:
submission_data = test_data.copy()
submission_data = pd.get_dummies(data = train_data.copy(), columns=['national_park'])
submission_data['forest_type'] = np.where(train_data['forest_type'] == 'Cottonwood', 0, 1)

In [None]:
submission_data['forest_type']

In [None]:
X_sub= submission_data.iloc[:,:-1].copy()
y_sub= submission_data.iloc[:,-1].copy()
sub_x_train, sub_x_test, sub_y_train, sub_y_test = train_test_split(X_sub,y_sub, test_size = 0.4, random_state=100, stratify = y)

In [None]:
model = Sequential()
model.add(Dense(20, input_dim = 54, activation = 'relu'))
model.add(Dense(10, activation = "relu"))
model.add(Dense(1, activation = "sigmoid"))
model.summary()

# model compile
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

hist = model.fit(sub_x_train, sub_y_train, 
                 epochs = 10, 
                 batch_size = 128,
                 validation_data = (sub_x_test, sub_y_test))

In [None]:
keras_predictions = model.predict(submission_data)
output = pd.DataFrame({'index': submission_data.index,'Predicted': keras_predictions})