In [107]:
import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, MinMaxScaler, Normalizer, MaxAbsScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
import time


# Load Data
file_path = "../../data/soyabean/dataset_42_soybean.arff"
dataset, meta = arff.loadarff(file_path)
df = pd.DataFrame(data=dataset)

# Convert all byte columns to strings
for column in df.select_dtypes([object]):
    df[column] = df[column].str.decode('utf-8')

df = df.replace('?', np.nan)

# Split data into train and test sets
X = df.drop(columns=['class'])
y = df['class']

# Ensure that the target variable is of type 'category'

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


In [108]:
# Preprocessing
pipeline = Pipeline([
    ('imputer', SimpleImputer()),  # Impute missing values with the most frequent value
    ("ordinal", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
    ("scaler", StandardScaler()),
    ('classifier', KNeighborsClassifier())  # DecisionTreeClassifier
])

In [109]:
param_grid = {
    'imputer__strategy': ['most_frequent', 'constant'], 
    'imputer__fill_value': ['missing'],
    'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler(), MaxAbsScaler()],
    'classifier__n_neighbors': [3, 5, 7, 9, 11, 13, 15], 
    'classifier__weights': ['distance'], 
    'classifier__metric': ['euclidean', 'manhattan'],
}

# Set up GridSearchCV with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro', n_jobs=-1, verbose=1)

# Fit grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and model
print(f"Best parameters: {grid_search.best_params_}")
best_model = grid_search.best_estimator_

# Make predictions with the best model
start_time = time.time()
y_pred = best_model.predict(X_test)
end_time = time.time()

# Calculate time in milliseconds
prediction_time_ms = (end_time - start_time) * 1000

# Print prediction time
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Calculate and print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Best Model Accuracy: {accuracy:.2f}")

# best cross-validation score
best_cv_score = grid_search.best_score_
print(f"Best Cross-validation accuracy: {best_cv_score:.2f}")

print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 112 candidates, totalling 560 fits
Best parameters: {'classifier__metric': 'manhattan', 'classifier__n_neighbors': 3, 'classifier__weights': 'distance', 'imputer__fill_value': 'missing', 'imputer__strategy': 'most_frequent', 'scaler': RobustScaler()}
Prediction Time: 3.10 ms
Best Model Accuracy: 0.93
Best Cross-validation accuracy: 0.94
                             precision    recall  f1-score   support

               2-4-d-injury       1.00      1.00      1.00         2
        alternarialeaf-spot       0.74      1.00      0.85        14
                anthracnose       1.00      1.00      1.00         8
           bacterial-blight       0.75      1.00      0.86         3
          bacterial-pustule       1.00      0.67      0.80         3
                 brown-spot       0.90      0.95      0.92        19
             brown-stem-rot       1.00      1.00      1.00         8
               charcoal-rot       1.00      1.00      1.00         5
           