# Import Libraries

In [57]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

RANDOM_STATE = 42  # Fixed seed for reproducibility; passed to the shuffled StratifiedKFold split

# Define Evaluation Function

In [58]:
def evaluate_model(pipeline, cv, X, y, params=None):
    """Cross-validate a pipeline and print its mean training/validation metrics.

    Parameters
    ----------
    pipeline : estimator
        Model to evaluate. It is cloned before use, so the object passed in
        is never modified by this function.
    cv : cross-validation splitter
        Forwarded unchanged to ``cross_validate`` as its ``cv`` argument.
    X, y : features and targets
        Forwarded unchanged to ``cross_validate``.
    params : dict, optional
        Parameters applied to the clone via ``set_params`` before evaluation.
        ``None`` (or an empty dict) evaluates the pipeline as given.

    Returns
    -------
    None
        The mean of each metric over the folds is printed, first for the
        training folds and then for the validation folds.
    """
    # Work on an unfitted copy: applying `params` to the caller's pipeline
    # would silently carry them over into every later use of that object.
    estimator = clone(pipeline)
    if params:
        estimator.set_params(**params)

    # (printed label, scorer name). Balanced accuracy and weighted averages
    # are used to account for class imbalance; note that the 'Accuracy'
    # label therefore reports *balanced* accuracy.
    metric_labels = [
        ('Accuracy', 'balanced_accuracy'),
        ('F1 Score', 'f1_weighted'),
        ('Precision', 'precision_weighted'),
        ('Recall', 'recall_weighted'),
    ]
    scoring = [scorer for _, scorer in metric_labels]

    cv_results = cross_validate(estimator,
                                X,
                                y,
                                scoring=scoring,
                                cv=cv,
                                n_jobs=-1,
                                return_train_score=True
                                )

    # cross_validate stores per-fold scores under 'train_<scorer>' and
    # 'test_<scorer>'; the 'test' folds are reported as validation scores.
    for heading, split in (('Training', 'train'), ('Validation', 'test')):
        print('\n' + heading)
        for label, scorer in metric_labels:
            fold_scores = cv_results['%s_%s' % (split, scorer)]
            print('%s: %.4f' % (label, np.mean(fold_scores)))

# Load Data

In [59]:
# Read the dataset: the 'class' column is the label, every other column is a feature
df = pd.read_csv('fruits_dataset.csv')

targets = df['class']
features = df.drop(columns='class')
classnames = targets.unique()

# Sanity check: targets and features should report the same number of rows
print(targets.shape, features.shape, sep='\n')

(2723,)
(2723, 28)


In [76]:
# KNN model: standardise the features, then classify with the 3 nearest neighbours.
# A mean-imputation step is left disabled below; SimpleImputer is not among the
# imports in the import cell, so re-enabling it also needs that import.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    # ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('classifier', KNeighborsClassifier(n_neighbors=3)),
])

# Shuffled, stratified 5-fold split so each fold keeps a similar class distribution
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=RANDOM_STATE,
)
evaluate_model(pipeline, cv, features, targets)


Training
Accuracy: 0.9075
F1 Score: 0.9068
Precision: 0.9078
Recall: 0.9075

Validation
Accuracy: 0.8105
F1 Score: 0.8080
Precision: 0.8105
Recall: 0.8098
