In [1]:
#pip install seaborn

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sb

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
 #       print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### Inspiration: 
https://www.kaggle.com/roshanchoudhary/forest-cover-walkthrough-in-python-knn-96-51<br>
https://www.kaggle.com/sharmasanthosh/exploratory-study-on-feature-selection

## Data Dictionary

* Elevation = Elevation (altitude) in meters
* Aspect = Aspect in degrees azimuth
* Slope = Slope in degrees
* Horizontal_Distance_To_Hydrology = Horizontal distance to nearest surface water features
* Vertical_Distance_To_Hydrology = Vertical distance to nearest surface water features
* Horizontal_Distance_To_Roadways = Horizontal distance to nearest roadway
* Hillshade_9am = Hill shade index at 9am, summer solstice. Value out of 255
* Hillshade_Noon = Hill shade index at noon, summer solstice. Value out of 255
* Hillshade_3pm = Hill shade index at 3pm, summer solstice. Value out of 255
* Horizontal_Distance_To_Fire_Point = sHorizontal distance to nearest wildfire ignition points
* Wilderness_Area1 = Rawah Wilderness Area
* Wilderness_Area2 = Neota Wilderness Area
* Wilderness_Area3 = Comanche Peak Wilderness Area
* Wilderness_Area4 = Cache la Poudre Wilderness Area
* Soil types 1-40

### Tree species:
1. Spruce/Fir
2. Lodgepole Pine
3. Ponderosa Pine
4. Cottonwood/Willow
5. Aspen
6. Douglas-fir
7. Krummholz

### The problem:
Based on the given data, try to find some patterns in the feature space which influence the result. The result (label) is the forest cover itself, which refers to tree species. With the above features we have to try to find some algorithms that can help us predict the tree species based on environmental/climate parameters on a given plot.
### Real world application:
This prediction can be used for afforestation or reforestation decisions when it comes to finding the right species to plant.
### Approach
The data should be analyzed and statistically described. Then the useful features have to be selected.
This is a multi-class classification problem. We have 7 labels and we have to predict which one applies to given parameters.
To solve classification problems, we can use machine learning algorithms.

### Reading the csv files as a dataframe

In [3]:
df = pd.read_csv('/home/balintp/Downloads/covtype.csv')

### Checking the dimension of the dataframe

In [4]:
print("There are {} records and {} features in the dataset".format(df.shape[0], df.shape[1]))

There are 581012 records and 55 features in the dataset


### Checking data types by column

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 581012 entries, 0 to 581011
Data columns (total 55 columns):
 #   Column                              Non-Null Count   Dtype
---  ------                              --------------   -----
 0   Elevation                           581012 non-null  int64
 1   Aspect                              581012 non-null  int64
 2   Slope                               581012 non-null  int64
 3   Horizontal_Distance_To_Hydrology    581012 non-null  int64
 4   Vertical_Distance_To_Hydrology      581012 non-null  int64
 5   Horizontal_Distance_To_Roadways     581012 non-null  int64
 6   Hillshade_9am                       581012 non-null  int64
 7   Hillshade_Noon                      581012 non-null  int64
 8   Hillshade_3pm                       581012 non-null  int64
 9   Horizontal_Distance_To_Fire_Points  581012 non-null  int64
 10  Wilderness_Area1                    581012 non-null  int64
 11  Wilderness_Area2                    581012 non-null 

### We can see that all columns contain only integers and there are only non-null values
### We can also visualize the missing values by column if any:

In [7]:
#pip install missingno

Defaulting to user installation because normal site-packages is not writeable
Collecting missingno
  Downloading missingno-0.4.2-py3-none-any.whl (9.7 kB)
Installing collected packages: missingno
Successfully installed missingno-0.4.2
Note: you may need to restart the kernel to use updated packages.


In [6]:
import missingno as msno
msno.matrix(df)

ModuleNotFoundError: No module named 'missingno'

### We can see again, the dataset is very clear, no missing values

### We want to see all columns when seeing results, so we need to set this option (*Pandas only shows some columns from the beginning and from the end part of the dataframe by default*)

In [None]:
pd.set_option('display.max_columns', None)

### Displaying the first 5 rows of the dataframe just to have an overview

In [None]:
df.head()

### Displaying the main statistical metrics for each column

In [None]:
df.describe()

### Some interesting facts:
* mean elevation is ~2959 m, minimum is 1859 m, maximum is 3858 m
* mean slope is 14°

### Take a look at the correlation between variables

In [None]:
sb.set(style="white")
df_corr = df.corr()
mask_train = np.triu(np.ones_like(df_corr, dtype=np.bool))
f, ax = plt.subplots(figsize=(20, 20))
cmap_train = sb.diverging_palette(220, 10, as_cmap=True)
sb.heatmap(df_corr, mask=mask_train, cmap=cmap_train, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

### Check skewness

In [None]:
print(df.skew())

In [None]:
df['Cover_Type'].value_counts()

### We'll look at skewness visually as well, but first we need to separate the continuous variables from the dataset

In [None]:
cont_df = df[['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology','Vertical_Distance_To_Hydrology','Horizontal_Distance_To_Roadways',
              'Hillshade_9am','Hillshade_Noon','Hillshade_3pm','Horizontal_Distance_To_Fire_Points']]

In [None]:
cont_df.head(5)

In [None]:
sb.set_style("darkgrid")
for i, col in enumerate(cont_df.columns):
    plt.figure(i)
    t = sb.distplot(cont_df[col],color="g",label="Skewness: {0:.2f}".format(cont_df[col].skew()))
    t.legend()

### On the next visuals we can see not just the quartiles and median but also the probability density of the data at different categories.

In [None]:
df['Cover_Type']=df['Cover_Type'].astype('category')

for i, col in enumerate(cont_df.columns):
    plt.figure(i,figsize=(8,4))
    sb.violinplot(x=df['Cover_Type'], y=col, data=df, palette="mako")

In [None]:
cols = df.columns

#number of rows=r, number of columns=c
r,c = df.shape

#Create a new dataframe with r rows, one column for each encoded category, and target
newdf = pd.DataFrame(index=np.arange(0, r),columns=['Wilderness_Area','Soil_Type','Cover_Type'])

#Make an entry in 'newdf' for each r as category_id, target value
for x in range(0,r):
    w=0;
    s=0;
    
    # Category1
    for y in range(10,14):
        if (df.iloc[x,y] == 1):
            w=y-9  #category class
            break
            
    # Category2       
    for z in range(14,54):
        if (df.iloc[x,z] == 1):
            s=z-13 #category class
            break
    #Make an entry in 'data' for each r as category_id, target value        
    newdf.iloc[x]=[w,s,df.iloc[x,c-1]]

#Category 1:
sb.countplot(x="Wilderness_Area", hue="Cover_Type", data=newdf, palette="viridis")
plt.show()

#Category 2:
plt.rc("figure", figsize=(25, 10))
sb.countplot(x="Soil_Type", hue="Cover_Type", data=newdf, palette="Spectral")
plt.show()

### Let's look at on the correlations between features again, now only with regards to continuous features

In [None]:
cont_df.corr()

In [None]:
plt.figure(figsize=(15,8))
sb.heatmap(cont_df.corr(),cmap='viridis',annot=True)

In [None]:
g = sb.PairGrid(cont_df)
g.map(sb.scatterplot, color="green")

### Let's divide the dataset to features and label

In [None]:
X=df.loc[:,'Elevation':'Soil_Type40']
y=df['Cover_Type']

### Let's remove the features with low variance

In [None]:
colstodrop=['Hillshade_3pm','Soil_Type7','Soil_Type8','Soil_Type14','Soil_Type15',
     'Soil_Type21','Soil_Type25','Soil_Type28','Soil_Type36','Soil_Type37']

In [None]:
X.drop(colstodrop, axis=1, inplace=True)

In [None]:
X.head()

In [None]:
X.shape

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier

### Split the data into train and test set

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.0001, random_state=42)

### Making pipeline including scaling, grid search and cross validation for each model

pipe = make_pipeline(StandardScaler(), SVC())  
param_grid = {'svc__C': [1e3, 1e4, 1e5],
              'svc__gamma': [0.001, 0.01, 0.1]}
gs = GridSearchCV(pipe, param_grid, cv=5)
gs.fit(X_train[:2000], y_train[:2000])

gs.best_params_

gs.best_score_

In [None]:
clf_svc = make_pipeline(StandardScaler(),SVC(kernel='rbf',C=1000,gamma=0.001))

In [None]:
clf_svc.fit(X_train, y_train)

In [None]:
y_pred = clf_svc.predict(X_test)

In [None]:
confusion_matrix(y_test, y_pred, labels=['Spruce/Fir', 'Lodgepole Pine', 'Ponderosa Pine','Cottonwood/Willow','Aspen','Douglas-fir','Krummholz'])

target_names = ['Spruce/Fir', 'Lodgepole Pine', 'Ponderosa Pine','Cottonwood/Willow','Aspen','Douglas-fir','Krummholz']
classification_report(y_test, y_pred, target_names=target_names)