# Wheat Kernel Classification

Create a classification model to determine the type of wheat seed based on kernel attributes

In [2]:
%matplotlib notebook

import os
import csv
import pickle
import numpy as np
import pandas as pd

from datetime import datetime

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer 
from sklearn.base import BaseEstimator, TransformerMixin 

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier 

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split as tts 

from yellowbrick.classifier import ClassBalance, ConfusionMatrix, ClassificationReport
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

## Data Loading

Load data as a Pandas DataFrame

In [16]:
df = pd.read_csv("data/seeds.csv", delim_whitespace=True, header=None)
df.columns = ['area', 'perimeter', 'compactness', 'kernel_length', 'kernel_width', 'asymmetry_coeff', 'groove_length', 'type']

In [17]:
df.describe()

Unnamed: 0,area,perimeter,compactness,kernel_length,kernel_width,asymmetry_coeff,groove_length,type
count,210.0,210.0,210.0,210.0,210.0,210.0,210.0,210.0
mean,14.847524,14.559286,0.870999,5.628533,3.258605,3.700201,5.408071,2.0
std,2.909699,1.305959,0.023629,0.443063,0.377714,1.503557,0.49148,0.818448
min,10.59,12.41,0.8081,4.899,2.63,0.7651,4.519,1.0
25%,12.27,13.45,0.8569,5.26225,2.944,2.5615,5.045,1.0
50%,14.355,14.32,0.87345,5.5235,3.237,3.599,5.223,2.0
75%,17.305,15.715,0.887775,5.97975,3.56175,4.76875,5.877,3.0
max,21.18,17.25,0.9183,6.675,4.033,8.456,6.55,3.0


In [24]:
# Feature averages by feature type
df.groupby('type').mean()

Unnamed: 0_level_0,area,perimeter,compactness,kernel_length,kernel_width,asymmetry_coeff,groove_length
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,14.334429,14.294286,0.88007,5.508057,3.244629,2.667403,5.087214
2,18.334286,16.135714,0.883517,6.148029,3.677414,3.6448,6.0206
3,11.873857,13.247857,0.849409,5.229514,2.853771,4.7884,5.1164


## Split Test and Train Data

In [29]:
features = [col for col in df.columns if col !='type']

target = 'type'

X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)


In [31]:
# Always check the shape of X and y makes sense 
print("X Train shape is {} y train shape is {}".format(
    X_train.shape, y_train.shape
))

print("X Test shape is {} y Test shape is {}".format(
    X_test.shape, y_test.shape
))

X Train shape is (168, 7) y train shape is (168,)
X Test shape is (42, 7) y Test shape is (42,)


## Fit a Classifier

In [36]:
def simple_evaluate_model(model):
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)
    print("f1: {}".format(f1_score(y_test, y_hat, average='weighted')))

In [37]:
# Simple Evaluation
clf = GradientBoostingClassifier()
simple_evaluate_model(clf)

f1: 0.9523809523809523


In [38]:
cross_val_score(clf, X_train, y_train, cv=12, scoring='f1_macro').mean()

0.91797338464005129

In [39]:
clf = LogisticRegression()
simple_evaluate_model(clf)

f1: 0.9034391534391534


In [40]:
cross_val_score(clf, X_train, y_train, cv=12, scoring='f1_macro').mean()

0.92109387526054187

In [41]:
clf = GaussianNB()
simple_evaluate_model(clf)

f1: 0.951373960436888


In [42]:
cross_val_score(clf, X_train, y_train, cv=12, scoring='f1_macro').mean()

0.90290804874138197

## Model Management

In [21]:
def internal_params(estimator):
    for attr in dir(estimator):
        if attr.endswith("_") and not attr.startswith("_"):
            yield attr

In [22]:
def save_model(model, path=None):
    if path is None:
        path = model.__class__.__name__ + ".pkl"
    with open(path, 'wb') as f:
        pickle.dump(model, f)

In [24]:
list(internal_params(clf))
#save_model(clf)

['class_count_', 'class_prior_', 'classes_', 'sigma_', 'theta_']