<h1>Leaf Classification</h1>

In [38]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def warn(*args, **kwargs): pass
import warnings
warnings.warn = warn

from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import StratifiedShuffleSplit

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

<h2>data preparation</h2>

In [39]:
# Swiss army knife function to organize the data

def encode(train, test):
    le = LabelEncoder().fit(train.species) 
    labels = le.transform(train.species)           
    classes = list(le.classes_)                    
    test_ids = test.id                             
    
    train = train.drop(['species', 'id'], axis=1)  
    test = test.drop(['id'], axis=1)
    
    return train, labels, test, test_ids, classes

train, labels, test, test_ids, classes = encode(train, test)
train.head(1)

Unnamed: 0,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,margin9,margin10,margin11,margin12,margin13,margin14,margin15,margin16,margin17,margin18,margin19,margin20,Unnamed: 21
0,0.007812,0.023438,0.023438,0.003906,0.011719,0.009766,0.027344,0,0.001953,0.033203,0.013672,0.019531,0.066406,0,0.029297,0,0.03125,0.011719,0,0.025391,...


In [40]:
sss = StratifiedShuffleSplit(labels, 10, test_size=0.2, random_state=23)

for train_index, test_index in sss:
    X_train, X_test = train.values[train_index], train.values[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

In [41]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025, probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis()]

# Logging for Visual Comparison
log_cols=["Classifier", "Accuracy", "Log Loss"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__
    
    print("="*30)
    print(name)
    
    print('****Results****')
    train_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    
    train_predictions = clf.predict_proba(X_test)
    ll = log_loss(y_test, train_predictions)
    print("Log Loss: {}".format(ll))
    
    log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)
    log = log.append(log_entry)
    
print("="*30)


KNeighborsClassifier
****Results****
Accuracy: 88.8889%
Log Loss: 1.57550751299
SVC
****Results****
Accuracy: 81.8182%
Log Loss: 4.62861251794
NuSVC
****Results****
Accuracy: 88.3838%
Log Loss: 2.46975793767
DecisionTreeClassifier
****Results****
Accuracy: 63.1313%
Log Loss: 12.7339933173
RandomForestClassifier
****Results****
Accuracy: 90.4040%
Log Loss: 0.982589937995
AdaBoostClassifier
****Results****
Accuracy: 4.5455%
Log Loss: 4.1987407785
GradientBoostingClassifier
****Results****
Accuracy: 58.5859%
Log Loss: 2.5244924939
GaussianNB
****Results****
Accuracy: 57.0707%
Log Loss: 14.8272524928
LinearDiscriminantAnalysis
****Results****
Accuracy: 97.9798%
Log Loss: 0.930197776314
QuadraticDiscriminantAnalysis
****Results****
Accuracy: 2.5253%
Log Loss: 33.6665850718


In [51]:
favorite_clf = LinearDiscriminantAnalysis()
favorite_clf.fit(X_train, y_train)
test_predictions = favorite_clf.predict_proba(test)

submission = pd.DataFrame(test_predictions, columns=classes)
submission.insert(0, 'id', test_ids)
submission.reset_index()

submission.to_csv('submission.csv', index=False)
submission.tail()

Unnamed: 0,id,Acer_Capillipes,Acer_Circinatum,Acer_Mono,Acer_Opalus,Acer_Palmatum,Acer_Pictum,Acer_Platanoids,Acer_Rubrum,Acer_Rufinerve,Acer_Saccharinum,Alnus_Cordata,Alnus_Maximowiczii,Alnus_Rubra,Alnus_Sieboldiana,Alnus_Viridis,Arundinaria_Simonii,Betula_Austrosinensis,Betula_Pendula,Callicarpa_Bodinieri,Unnamed: 21
589,1576,6.204136e-54,0.144937,1.475659e-90,5.132873e-26,9.010706e-52,2.352036e-90,1.112248e-124,1.322167e-25,0.1359639,1.731641e-88,7.832327000000001e-23,0.144937,3.477004e-78,9.258767e-53,6.921931e-71,4.452189e-170,0.144937,7.960853e-90,2.781299e-130,...
590,1577,1.900531e-37,1.0459950000000001e-99,1.557514e-158,9.196877e-06,0.0,9.698483e-80,5.454368e-98,0.02602529,0.001802151,6.814215e-235,1.5366589999999998e-19,4.24361e-89,0.1944157,7.084461e-83,6.842750999999999e-124,2.573405e-181,1.495325e-58,4.273743e-16,2.405937e-117,...
591,1579,4.449199e-58,1.2288000000000002e-140,4.226146e-102,9.938732e-15,0.0,9.061245000000001e-61,2.29052e-121,8.680465e-80,8.649427e-82,7.643139e-222,4.354557e-51,1.492633e-144,1.348423e-39,3.5170409999999997e-122,2.239732e-174,5.1310630000000006e-157,3.398926e-109,1.662867e-57,1.7650100000000001e-112,...
592,1580,6.855197e-88,1.544689e-115,4.0609249999999996e-78,4.288282e-21,3.055445e-280,1.27244e-68,4.2573750000000004e-73,1.627633e-21,1.2294520000000001e-60,1.3887619999999999e-239,6.95035e-29,8.311599e-80,5.492875e-94,1.973762e-79,2.562938e-135,2.0101710000000003e-157,1.521816e-84,1.138149e-47,1.86625e-109,...
593,1583,7.590912e-110,3.0412330000000003e-140,9.026623e-76,1.2002839999999999e-34,0.0,1.332657e-22,4.334935e-98,1.949399e-94,1.070657e-89,1.7731379999999998e-270,5.694996e-59,1.832707e-127,2.490247e-128,1.494029e-74,3.730233e-143,0.3333333,4.152657e-130,7.08375e-83,5.356651e-114,...
