### Tree mortality prediction based on growth patterns

data: [HF213](https://harvardforest1.fas.harvard.edu/exist/apps/datasets/showData.html?id=HF213)

Use classification algorithms to predict the A(live) or D(ead) labels in the __mortality13__ and __mortality14__ columns using these features:  
 - spp: USDA Plants database species code  
 - dbh09: diameter at breast height (1.4 m) in year 2009 (unit: centimeter / missing value: NA)  
 - dbh11: diameter at breast height (1.4 m) in year 2011 (unit: centimeter / missing value: NA)  
 - dbh12: diameter at breast height (1.4 m) in year 2012 (unit: centimeter / missing value: NA)  
 - dbh13: diameter at breast height (1.4 m) in year 2013 (unit: centimeter / missing value: NA)  

In [None]:
# Display the value of every top-level expression in a cell, not only the last one.
# Later cells list several bare expressions and rely on this to show all of them.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import sys, os, pathlib, shutil, platform
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier


from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    roc_auc_score,
    roc_curve,
    auc,
)
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import OrdinalEncoder

from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler


In [None]:
# Species with fewer rows than this are collected in `removeSPP` as removal candidates.
MINIMUM_COUNT = 10
# Fraction passed as `train_size` to StratifiedShuffleSplit when partitioning the data.
TRAIN_DATA = 0.6

In [None]:
# !/opt/conda/bin/conda install -c anaconda seaborn pandas scikit-learn -y 


In [None]:
%matplotlib inline

In [None]:
# !pwd
# !ls -la ./../data/hrvardf/HF213

In [None]:
# Locate the HF213 inventory CSV relative to the notebook and load it into a DataFrame.
dataFileName = 'hf213-01-hf-inventory.csv'
dataPathFull = pathlib.Path('./../data/hrvardf/HF213', dataFileName)
myData = pd.read_csv(str(dataPathFull))

In [None]:
# Quick look at the raw table: its dimensions plus the first and last two rows.
myData.shape
myData.head(2)
myData.tail(2)

In [None]:
# Print the per-column summary produced by DataFrame.info().
myData.info()

In [None]:
# Basic descriptive statistics for the numeric columns of the raw table.
myData.describe()

In [None]:
# Frequency tables (NaN counted as its own level) for the `spp`, `mortality13`
# and `dmg13` columns, followed by a spp x mortality13 table aggregating `dmg13`.
myCols = ['spp', 'mortality13', 'dmg13']
myData[myCols[0]].value_counts(dropna=False)
myData[myCols[1]].value_counts(dropna=False)
myData[myCols[2]].value_counts(dropna=False)
# Name the aggregation instead of passing np.sum: handing NumPy callables to
# pandas aggregations is deprecated (FutureWarning on pandas >= 2.1), and "sum"
# selects the same pandas-native sum that the callable was being mapped to.
myData.pivot_table(
    index=[myCols[0]],
    columns=myCols[1],
    values=myCols[2],
    aggfunc="sum",
    fill_value=0,
)
          


In [None]:
# import seaborn as sns
# sns.countplot(x= myData['spp'],label="spp Count")
# plt.show()

In [None]:
# Count rows per species once (NaN kept as its own level) and reuse the table
# both for display and for picking out the rare species.
sppCounts = myData['spp'].value_counts(dropna=False)
sppCounts
# Species observed fewer than MINIMUM_COUNT times.
removeSPP = sppCounts[sppCounts < MINIMUM_COUNT].index.tolist()
removeSPP

# filteredData = myData.replace(dict.fromkeys(removeSPP, 'TooFew'))
# filteredData['spp'].value_counts(dropna=False)

In [None]:
# Two experiment set-ups: predict `mortality13` from the diameters up to dbh12,
# or `mortality14` from the diameters up to dbh13.  Remove 'spp' from the first
# list to train on the diameters only.
dbhBase = ['dbh09', 'dbh11', 'dbh12']

featureColumn_01 = ['spp'] + dbhBase
labelColumn_01 = 'mortality13'

featureColumn_02 = featureColumn_01 + ['dbh13']
labelColumn_02 = 'mortality14'

# Active set-up used by the rest of the notebook.
featureColumn, labelColumn = featureColumn_02, labelColumn_02

In [None]:
# Columns kept for modelling: the features plus the label, de-duplicated and sorted.
sorted(set(featureColumn+[labelColumn]))

In [None]:
# No species filtering is applied here, so the "filtered" frame is the raw one.
filteredData = myData
# Keep only the modelling columns (features + label), in alphabetical order.
mlColumns = sorted({*featureColumn, labelColumn})
filteredDataML = filteredData[mlColumns]

filteredDataML.shape
filteredDataML.head()
filteredDataML[labelColumn].value_counts(dropna=False)

In [None]:
# Label distribution before filtering.
filteredDataML[labelColumn].value_counts(dropna=False)

# Keep only the rows whose label is one of the two classes of interest.
hasKnownLabel = filteredDataML[labelColumn].isin(['D', 'A'])
filteredDataML = filteredDataML[hasKnownLabel]
filteredDataML.shape
filteredDataML.head()


# Label distribution after filtering.
filteredDataML[labelColumn].value_counts(dropna=False)

In [None]:
# https://scikit-learn.org/stable/auto_examples/ensemble/plot_stack_predictors.html#sphx-glr-auto-examples-ensemble-plot-stack-predictors-py

# Partition the columns by dtype: object columns are treated as categorical,
# float64 columns as numeric.  The masks cover every column of filteredDataML,
# so an object-typed label column ends up in catCols as well.
columnDtypes = filteredDataML.dtypes
catCols = filteredDataML.columns[columnDtypes == 'O']
numCols = filteredDataML.columns[columnDtypes == 'float64']
catCols
numCols


In [None]:
# First stratified split: a TRAIN_DATA share of the rows goes to training, the
# remainder is held out.
stratifySplit = StratifiedShuffleSplit(n_splits=1, train_size=TRAIN_DATA, random_state=1)
labelSeries = filteredDataML[labelColumn]
trainIdx, tstIdx = next(stratifySplit.split(filteredDataML, labelSeries))
len(trainIdx)
len(tstIdx)

# The splitter yields row positions; translate them to index labels before
# selecting rows by label.
trainLabels = filteredDataML.index[trainIdx]
heldOutLabels = filteredDataML.index[tstIdx]

# Shape of the training rows, selected in two different ways for comparison.
filteredDataML.loc[filteredDataML.index.intersection(trainLabels)].shape
filteredDataML[filteredDataML.index.isin(trainLabels)].shape

# Held-out rows, divided once more below.
aa = filteredDataML.loc[filteredDataML.index.intersection(heldOutLabels)]
aa.shape

# Second stratified split, on the held-out rows only.  testIdx and
# validationIdx are therefore row positions within `aa`, not within
# filteredDataML.
stratifySplit = StratifiedShuffleSplit(n_splits=1, train_size=TRAIN_DATA, test_size=1-TRAIN_DATA, random_state=1)
testIdx, validationIdx = next(stratifySplit.split(aa, aa[labelColumn]))

len(testIdx)
len(validationIdx)
filteredDataML.shape

In [None]:
# Materialise the three partitions.
# The split indices are row positions, so select with `iloc` directly instead of
# translating positions to index labels and back (which only works when the
# index is unique).  Sorting the positions keeps the rows in their original
# order.  Each partition is an explicit copy because later cells overwrite
# columns of these frames in place.
trainData = filteredDataML.iloc[np.sort(trainIdx)].copy()
testData = aa.iloc[np.sort(testIdx)].copy()
validationData = aa.iloc[np.sort(validationIdx)].copy()

# Class balance of the full table and of each partition (stratification check).
filteredDataML[labelColumn].value_counts(dropna=False)
trainData[labelColumn].value_counts(dropna=False)
testData[labelColumn].value_counts(dropna=False)
validationData[labelColumn].value_counts(dropna=False)

In [None]:
# Learn the category -> code mapping from the complete filtered table, then
# show the categories that were found for each encoded column.
ordinalEncoder = OrdinalEncoder()
ordinalEncoder.fit(filteredDataML[catCols])
ordinalEncoder.categories_

# Overwrite the categorical columns of every partition with their codes (in place).
for partition in (trainData, testData, validationData):
    partition[catCols] = ordinalEncoder.transform(partition[catCols])

trainData.head()

In [None]:
# Median imputer for missing feature values (NaN is the default missing marker);
# the medians are learned from the training partition only.
# NOTE(review): featureColumn may include the ordinal-coded 'spp' column, for
# which a median of codes is of doubtful meaning — confirm this is intended.
imputer = SimpleImputer(strategy='median')
imputer.fit(trainData[featureColumn])


In [None]:
# Fill missing feature values in every partition with the statistics the
# imputer learned on the training data (columns are overwritten in place).
for partition in (trainData, testData, validationData):
    partition[featureColumn] = imputer.transform(partition[featureColumn])

In [None]:
# Three classifiers with fixed hyper-parameters.
SVC_model = svm.SVC(kernel='rbf', random_state=0, gamma=.1, C=100, probability=True)
KNN_model = KNeighborsClassifier(n_neighbors=50)
# The forest is seeded so that its predictions and metrics are reproducible
# between runs (it was previously unseeded, unlike the SVC above).
# NOTE(review): the class_weight keys 0 / 1 presumably are the ordinal codes the
# encoder assigned to the label; the much heavier weight sits on class 0 —
# confirm that this is the class meant to be emphasised.
RF_model = RandomForestClassifier(
    n_estimators=10,
    class_weight={0: 10000., 1: 10.},
    random_state=0,
)


# Fit each model on the training partition; every call is a top-level
# expression so the fitted estimator is echoed below the cell.
SVC_model.fit(trainData[featureColumn], trainData[labelColumn])
KNN_model.fit(trainData[featureColumn], trainData[labelColumn])
RF_model.fit(trainData[featureColumn], trainData[labelColumn])

In [None]:
# Hard class predictions of each fitted model on the test partition.
testFeatures = testData[featureColumn]
SVC_prediction = SVC_model.predict(testFeatures)
KNN_prediction = KNN_model.predict(testFeatures)
RF_prediction = RF_model.predict(testFeatures)

In [None]:
# Test-set accuracy per model.  Arguments are given in scikit-learn's
# (y_true, y_pred) order; accuracy is symmetric in the two, so the values are
# the same either way.
testLabels = testData[labelColumn]
accuracy_score(testLabels, SVC_prediction)
accuracy_score(testLabels, KNN_prediction)
accuracy_score(testLabels, RF_prediction)

In [None]:
# Test-set confusion matrix per model.
# scikit-learn expects (y_true, y_pred): rows are the true classes and columns
# the predicted ones.  The arguments used to be passed the other way round,
# which transposed every matrix.
confusion_matrix(testData[labelColumn], SVC_prediction)
confusion_matrix(testData[labelColumn], KNN_prediction)
confusion_matrix(testData[labelColumn], RF_prediction)

In [None]:
# Per-class precision / recall / F1 on the test partition for each model.
# classification_report takes (y_true, y_pred); with the arguments swapped,
# precision and recall were exchanged and "support" counted predictions
# instead of true labels.
print(classification_report(testData[labelColumn], SVC_prediction))
print(classification_report(testData[labelColumn], KNN_prediction))
print(classification_report(testData[labelColumn], RF_prediction))

#### SVM SVC ROC curve analysis

In [None]:
# Class-membership probabilities from the SVC on the test partition; the second
# column is used as the score of the positive class for the ROC curve.
SVC_prediction_probs = SVC_model.predict_proba(testData[featureColumn])
positiveScores = SVC_prediction_probs[:, 1]
fpr, tpr, thresholds = roc_curve(testData[labelColumn], positiveScores)
# Area under the curve, integrated from the (fpr, tpr) points.
roc_auc = auc(fpr, tpr)
print("Area under the ROC curve : %f" % roc_auc)

In [None]:
# Plot the SVC ROC curve computed above (fpr, tpr, roc_auc) against the
# chance diagonal.
import pylab as pl
pl.clf()  # start from an empty current figure
pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
pl.plot([0, 1], [0, 1], 'k--')  # dashed reference line from (0, 0) to (1, 1)
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.0])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
# Title corrected: it previously read 'Receiverrating characteristic example'.
pl.title('Receiver operating characteristic example')
pl.legend(loc="lower right")
pl.show()