In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train_data = pd.read_csv(os.path.join('../../data/raw', 'train.csv'))
test_data = pd.read_csv(os.path.join('../../data/raw', 'test.csv'))
sample_submission = pd.read_csv(os.path.join('../../data/raw', 'sample_submission.csv'))

In [3]:
train_data.head()

Unnamed: 0,id,species,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,1,Acer_Opalus,0.007812,0.023438,0.023438,0.003906,0.011719,0.009766,0.027344,0.0,...,0.007812,0.0,0.00293,0.00293,0.035156,0.0,0.0,0.004883,0.0,0.025391
1,2,Pterocarya_Stenoptera,0.005859,0.0,0.03125,0.015625,0.025391,0.001953,0.019531,0.0,...,0.000977,0.0,0.0,0.000977,0.023438,0.0,0.0,0.000977,0.039062,0.022461
2,3,Quercus_Hartwissiana,0.005859,0.009766,0.019531,0.007812,0.003906,0.005859,0.068359,0.0,...,0.1543,0.0,0.005859,0.000977,0.007812,0.0,0.0,0.0,0.020508,0.00293
3,5,Tilia_Tomentosa,0.0,0.003906,0.023438,0.005859,0.021484,0.019531,0.023438,0.0,...,0.0,0.000977,0.0,0.0,0.020508,0.0,0.0,0.017578,0.0,0.047852
4,6,Quercus_Variabilis,0.005859,0.003906,0.048828,0.009766,0.013672,0.015625,0.005859,0.0,...,0.09668,0.0,0.021484,0.0,0.0,0.0,0.0,0.0,0.0,0.03125


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 990 entries, 0 to 989
Columns: 194 entries, id to texture64
dtypes: float64(192), int64(1), object(1)
memory usage: 1.5+ MB


In [5]:
#sns.pairplot(x_train.iloc[:, :4], hue='species', size=3)

#### Building the Pipeline

In [28]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [33]:
t_train = LabelEncoder().fit_transform(train_data['species'])
x_train = train_data.drop("id", axis=1)
N, M = x_train.shape
print("Training Data:", N)
print("Dimension:", M)

Training Data: 990
Dimension: 193


In [8]:
X_train, X_test, y_train, y_test = train_test_split(x_train.iloc[:, 1:].values,
                                                    t_train,
                                                    test_size=0.4,
                                                    random_state=10)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(594, 192)
(396, 192)
(594,)
(396,)


In [9]:
pipeline = Pipeline([
    ('Standardization', StandardScaler()),  # Step 1 - Normalize data (z-score)
    ('clf', LogisticRegression())  # Step 2 - Classifier
])
print(pipeline.steps)

[('Standardization', StandardScaler()), ('clf', LogisticRegression())]


### Trying Logistic Regression Classifier
Use Cross-Validation to test the accuracy of the pipeline

In [10]:
from sklearn.model_selection import cross_validate

scores = cross_validate(pipeline, X_train, y_train)
print(scores)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

{'fit_time': array([0.90662265, 1.99227285, 1.22870445, 1.12266588, 1.21712589]), 'score_time': array([0.00127125, 0.00081205, 0.00265145, 0.00140715, 0.00180459]), 'test_score': array([0.96638655, 0.97478992, 0.95798319, 1.        , 0.98305085])}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [11]:
print("Average accuracy of pipeline with Logistic Regression:", "%.2f" % (scores['test_score'].mean()*100), "%")

Average accuracy of pipeline with Logistic Regression: 97.64 %


### Trying out other classification algorithms

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [13]:
clfs = []
clfs.append(LogisticRegression())
clfs.append(SVC())
clfs.append(KNeighborsClassifier())
clfs.append(DecisionTreeClassifier())
clfs.append(RandomForestClassifier())
#clfs.append(GradientBoostingClassifier())

for classifier in clfs:
    pipeline.set_params(clf=classifier)
    scores = cross_validate(pipeline, X_train, y_train)
    print('-----------------------------------------------')
    print(str(classifier))
    print('-----------------------------------------------')
    for key, values in scores.items():
        print(key, 'mean ', values.mean())
        print(key, 'std ', values.std())
    
    
    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

-----------------------------------------------
LogisticRegression()
-----------------------------------------------
fit_time mean  1.559008502960205
fit_time std  0.44567430898873645
score_time mean  0.0022132396697998047
score_time std  0.0016062205917440578
test_score mean  0.9764421022646346
test_score std  0.014444550098210666
-----------------------------------------------
SVC()
-----------------------------------------------
fit_time mean  0.16770439147949218
fit_time std  0.016070336183280867
score_time mean  0.022563648223876954
score_time std  0.0006096440998395342
test_score mean  0.9461900014242985
test_score std  0.02287538732959637
-----------------------------------------------
KNeighborsClassifier()
-----------------------------------------------
fit_time mean  0.006565618515014649
fit_time std  0.0005924019860652996
score_time mean  0.01924748420715332
score_time std  0.00027687283414216403
test_score mean  0.9275886625836776
test_score std  0.019687206434200955




-----------------------------------------------
DecisionTreeClassifier()
-----------------------------------------------
fit_time mean  0.19738287925720216
fit_time std  0.01751761298507476
score_time mean  0.000904703140258789
score_time std  8.131738810997403e-05
test_score mean  0.5303375587523145
test_score std  0.016993140994811435




-----------------------------------------------
RandomForestClassifier()
-----------------------------------------------
fit_time mean  0.9178591728210449
fit_time std  0.03623022458896941
score_time mean  0.016502761840820314
score_time std  0.00021854511832674974
test_score mean  0.9612590799031476
test_score std  0.01265783970687586


## Cross-Validation and Hyper-parameters Tuning

In [14]:
from sklearn.model_selection import GridSearchCV
pipeline.set_params(clf=SVC())
print(pipeline.steps)

[('Standardization', StandardScaler()), ('clf', SVC())]


In [21]:
parameters = {
    'clf__kernel': ['linear', 'rbf'],
    'clf__C': np.linspace(0.1, 1.2, 12)
}

cv_grid = GridSearchCV(pipeline, param_grid=parameters)

cv_grid.fit(X_train, y_train)



GridSearchCV(estimator=Pipeline(steps=[('Standardization', StandardScaler()),
                                       ('clf', SVC())]),
             param_grid={'clf__C': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2]),
                         'clf__kernel': ['linear', 'rbf']})

Best combinations of the parameters can be accessed from **best_params_**

In [22]:
print("Best Parameters from Grid Search")
print(cv_grid.best_params_)

Best Parameters from Grid Search
{'clf__C': 0.1, 'clf__kernel': 'linear'}


In [23]:
cv_grid.best_estimator_

Pipeline(steps=[('Standardization', StandardScaler()),
                ('clf', SVC(C=0.1, kernel='linear'))])

In [25]:
cv_grid.best_score_

0.9798034468024497

### Test set prediction

In [26]:
y_predict = cv_grid.predict(X_test)
accuracy = accuracy_score(y_test, y_predict)
print("Accuracy of the best classifier after CV is %.3f%%" % (accuracy*100))

Accuracy of the best classifier after CV is 96.717%


In [27]:
y_predict

array([86, 30, 23, 87, 70, 49, 65, 97, 61, 16, 46, 62, 65, 22,  9, 25, 44,
       73, 91, 79, 55, 57, 40, 13, 94, 10, 76, 43, 87, 96, 74, 35, 62, 31,
       55, 31,  1, 35, 68, 81, 90, 67, 96,  0, 34, 83,  2, 37, 77, 46, 13,
       57, 98, 43, 15, 89, 82, 22, 66, 96, 18, 27, 44, 51, 52, 21, 98, 12,
       40, 29, 85, 94, 34, 49, 81, 98, 40, 33, 43, 14,  3, 96, 17, 45,  7,
       26, 76, 91, 67, 42, 49, 35, 58, 46, 64, 82, 64, 95,  9, 63, 92, 97,
       33, 64, 38, 72, 18, 34, 30, 41, 58, 66, 98, 21, 75, 74,  4, 24, 68,
       11,  6, 60, 17,  5, 87,  8, 43, 89, 11,  5, 22, 61, 27, 77, 56, 18,
        0,  4, 24, 59, 94, 82, 46, 50, 43,  6, 47, 49, 73, 16, 65, 53, 56,
       58, 54, 55, 27, 26, 71, 77, 21,  9, 32, 13, 52, 11, 37, 56, 12, 25,
       70, 63, 77, 93, 25, 54, 72, 57, 66, 72, 27, 52,  0, 70, 40, 42, 12,
       37, 41, 27, 88, 41, 86, 84, 14, 44,  8, 61, 22, 47, 35, 67, 74, 93,
        2, 12,  7, 30, 62, 33, 39,  4, 42, 19, 18, 60, 47, 67, 88, 82, 71,
       61, 37, 81, 83, 68