# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV

RANDOM_STATE = 42 # Seed passed to the CV splitter and the decision tree below so reruns are reproducible

# Define Evaluation Function

In [2]:
def evaluate_model(pipeline, cv, X, y, params=None):
    """Cross-validate ``pipeline`` and print its mean train/validation scores.

    Parameters
    ----------
    pipeline : estimator
        Object exposing ``set_params`` that ``cross_validate`` can score.
        When ``params`` is given it is reconfigured in place, so the
        caller's object keeps those settings after this call.
    cv : cross-validation splitter
        Forwarded unchanged to ``cross_validate``.
    X, y : features and labels
        Forwarded unchanged to ``cross_validate``.
    params : dict, optional
        Keyword arguments applied through ``pipeline.set_params`` before
        scoring. ``None`` (or an empty dict) leaves the pipeline as is.

    Returns
    -------
    None
        The averaged scores are only printed.
    """
    if params:
        pipeline.set_params(**params)

    # (printed label, scorer name) pairs. Balanced/weighted scorers are used
    # to account for class imbalance; note that the line printed as
    # "Accuracy" is therefore the *balanced* accuracy.
    metrics = (
        ('Accuracy', 'balanced_accuracy'),
        ('F1 Score', 'f1_weighted'),
        ('Precision', 'precision_weighted'),
        ('Recall', 'recall_weighted'),
    )

    cv_results = cross_validate(
        pipeline,
        X,
        y,
        scoring=[scorer for _, scorer in metrics],
        cv=cv,
        n_jobs=-1,
        return_train_score=True,
    )

    # cross_validate stores per-fold scores under "train_<scorer>" and
    # "test_<scorer>"; the "test" entries are the held-out (validation) folds.
    for heading, prefix in (('Training', 'train'), ('Validation', 'test')):
        print('\n%s' % heading)
        for label, scorer in metrics:
            fold_scores = cv_results['%s_%s' % (prefix, scorer)]
            print('%s: %.4f' % (label, np.mean(fold_scores)))

# Load Data

In [10]:
df = pd.read_csv('fruits_dataset.csv')

# Separate the label column from the feature columns
features = df.drop(columns='class')
targets = df['class']
classnames = targets.unique()

# Verify shapes of targets and features (row counts must match)
print(targets.shape)
print(features.shape)
df.head()

(2723,)
(2723, 28)


Unnamed: 0,class,area,perimeter,circularity,convexity,red_mean,green_mean,blue_mean,red_std,green_std,...,v_mean,h_std,s_std,v_std,h_skew,s_skew,v_skew,h_kurt,s_kurt,v_kurt
0,freshapples,2774.0,224.65,0.69,0.98,186.77854,195.670201,144.949637,78.619189,74.775987,...,195.867247,26.064645,74.31321,74.72978,2.127686,0.241026,-1.597067,11.286257,1.207072,4.758536
1,freshapples,1463.5,176.12,0.59,0.95,171.70729,183.934112,132.812281,77.60178,76.236469,...,184.139927,27.110773,73.843719,76.124151,1.712094,0.129645,-1.231315,8.534175,1.242978,3.656074
2,freshapples,20031.5,602.4,0.69,0.95,182.189931,136.880879,132.493662,81.347379,106.574611,...,182.278241,38.365581,94.697792,81.401297,3.49673,0.341599,-0.946086,14.083473,1.22839,2.884416
3,freshapples,20122.0,553.67,0.82,0.98,204.151028,162.443858,143.651188,73.629622,83.117642,...,204.218252,22.782814,76.408299,73.654985,4.610105,0.396334,-1.964857,27.664547,1.527833,5.752591
4,freshapples,61.5,33.56,0.69,0.88,176.65332,117.875877,121.402443,71.781614,87.136889,...,177.002412,80.840884,75.647223,72.09472,-0.270837,0.032657,-1.152348,1.205922,1.43429,3.865


# Feature Selection

In [4]:
# Stratified folds keep the class distribution equal in each fold; the
# shuffle is seeded so the same folds are produced on every run.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Estimator whose fitted importances drive the recursive feature elimination
rfecv_model = DecisionTreeClassifier(random_state=RANDOM_STATE)

# n_jobs=-1 runs the per-fold eliminations in parallel (same setting the
# evaluation helper uses); seeds are fixed, so the selection is unchanged.
# NOTE(review): the selector is fit on the full dataset. If the chosen
# columns are later scored with cross-validation on these same rows, the
# scores may be optimistic -- consider doing the selection inside the pipeline.
rfecv = RFECV(
    estimator=rfecv_model,
    cv=cv,
    scoring="balanced_accuracy",
    n_jobs=-1,
)
rfecv = rfecv.fit(features, targets)

In [5]:
# Report how many features RFECV kept and which columns its mask selects
print("The optimal number of features:", rfecv.n_features_)
print("Best features:", features.columns[rfecv.support_].tolist())

The optimal number of features: 18
Best features: ['area', 'perimeter', 'convexity', 'red_mean', 'green_mean', 'red_std', 'green_std', 'blue_std', 'green_skew', 'blue_kurt', 's_mean', 'h_std', 's_std', 'h_skew', 's_skew', 'v_skew', 's_kurt', 'v_kurt']


# Classification

In [14]:
# Scale first, then classify: the scaler is a pipeline step, so it is
# re-fit together with the classifier on whatever data the pipeline is fit on.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),  # Standardization/Normalization
    ('classifier', KNeighborsClassifier(n_neighbors=3)),  # Estimation/Classification
])

# NOTE(review): this scores the full feature set, not the RFECV-selected
# subset -- confirm that is intended.
evaluate_model(pipeline, cv, features, targets)


Training
Accuracy: 0.9075
F1 Score: 0.9068
Precision: 0.9078
Recall: 0.9075

Validation
Accuracy: 0.8105
F1 Score: 0.8080
Precision: 0.8105
Recall: 0.8098
