In [10]:
import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier

# Load Data
file_path = "../../data/soyabean/dataset_42_soybean.arff"
dataset, meta = arff.loadarff(file_path)
df = pd.DataFrame(data=dataset)

# Convert all byte columns to strings
for column in df.select_dtypes([object]):
    df[column] = df[column].str.decode('utf-8')

df = df.replace('?', np.nan)

# Split data into train and test sets
X = df.drop(columns=['class'])
y = df['class']

# Ensure that the target variable is of type 'category'

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [18]:

categorical_features = X.select_dtypes(include=['object']).columns
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numeric_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OrdinalEncoder())
        ]), categorical_features)
    ],
    remainder='passthrough'
)

knn_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),  # Standardizing features is helpful for KNN
    ('classifier', KNeighborsClassifier(n_neighbors=5))
])

# Train the KNN model
knn_pipeline.fit(X_train, y_train)

# Make predictions
y_pred = knn_pipeline.predict(X_test)

# Calculate and print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"KNN Classifier Accuracy: {accuracy:.2f}")


KNN Classifier Accuracy: 0.91


In [15]:
param_grid = {
    'preprocessor__cat__imputer__strategy': ['most_frequent', 'constant'],  # Imputer strategies
    'scaler': [StandardScaler(), MinMaxScaler()],  # Different scalers
    'classifier__n_neighbors': [3, 5, 7, 9, 11, 13, 15],  # Number of neighbors
    'classifier__weights': ['distance'],  # Weight function
    'classifier__metric': ['euclidean', 'manhattan'],  # Distance metrics
    'classifier__p': [1, 2]  # Power parameter for Minkowski distance
}

# Set up GridSearchCV with cross-validation
grid_search = GridSearchCV(knn_pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

# Fit grid search to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and best cross-validation score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)
# Evaluate the model with best parameters on the test set
y_pred = grid_search.best_estimator_.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
test_precision = precision_score(y_test, y_pred, average="macro")
recall = recall_score(y_test, y_pred, average="macro")
f1 = f1_score(y_test, y_pred, average="macro")
conf_matrix = confusion_matrix(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy:.2f}")
print(grid_search.best_params_)


Fitting 5 folds for each of 112 candidates, totalling 560 fits
Best Parameters: {'classifier__metric': 'manhattan', 'classifier__n_neighbors': 3, 'classifier__p': 1, 'classifier__weights': 'distance', 'preprocessor__cat__imputer__strategy': 'most_frequent', 'scaler': MinMaxScaler()}
Best Cross-Validation Accuracy: 0.9029190992493745
Test Accuracy: 0.91
{'classifier__metric': 'manhattan', 'classifier__n_neighbors': 3, 'classifier__p': 1, 'classifier__weights': 'distance', 'preprocessor__cat__imputer__strategy': 'most_frequent', 'scaler': MinMaxScaler()}


In [9]:
y_pred

array(['diaporthe-pod-&-stem-blight', 'brown-spot', 'alternarialeaf-spot',
       'brown-spot', '2-4-d-injury', 'phytophthora-rot',
       'phytophthora-rot', 'bacterial-blight', 'rhizoctonia-root-rot',
       'frog-eye-leaf-spot', 'charcoal-rot', 'anthracnose',
       'alternarialeaf-spot', 'purple-seed-stain', 'frog-eye-leaf-spot',
       'alternarialeaf-spot', 'powdery-mildew', 'phytophthora-rot',
       'brown-stem-rot', 'anthracnose', 'brown-spot',
       'diaporthe-pod-&-stem-blight', 'phytophthora-rot',
       'diaporthe-pod-&-stem-blight', 'diaporthe-stem-canker',
       'frog-eye-leaf-spot', 'frog-eye-leaf-spot', 'alternarialeaf-spot',
       'brown-spot', 'brown-spot', 'rhizoctonia-root-rot',
       'bacterial-pustule', 'herbicide-injury', 'phytophthora-rot',
       'brown-stem-rot', 'frog-eye-leaf-spot', 'brown-spot',
       'phytophthora-rot', 'downy-mildew', 'brown-stem-rot', 'brown-spot',
       'alternarialeaf-spot', 'brown-spot', 'charcoal-rot',
       'diaporthe-pod-&-

In [13]:
print(classification_report(y_test, y_pred))

                             precision    recall  f1-score   support

               2-4-d-injury       1.00      1.00      1.00         5
        alternarialeaf-spot       0.81      0.72      0.76        18
                anthracnose       1.00      1.00      1.00        10
           bacterial-blight       1.00      0.75      0.86         4
          bacterial-pustule       1.00      1.00      1.00         6
                 brown-spot       0.93      0.93      0.93        14
             brown-stem-rot       1.00      1.00      1.00        10
               charcoal-rot       1.00      1.00      1.00         4
              cyst-nematode       1.00      1.00      1.00         2
diaporthe-pod-&-stem-blight       0.83      1.00      0.91         5
      diaporthe-stem-canker       1.00      1.00      1.00         4
               downy-mildew       1.00      1.00      1.00         4
         frog-eye-leaf-spot       0.63      0.80      0.71        15
           herbicide-injury      

In [15]:
print(
    {
        "accuracy": test_accuracy,
        "precision": test_precision,
        "recall": recall,
        "f1": f1,
        "conf_matrix": conf_matrix
    }
)

{'accuracy': 0.9124087591240876, 'precision': np.float64(0.958209668909115), 'recall': np.float64(0.9053049289891394), 'f1': np.float64(0.920985619128034), 'conf_matrix': array([[ 5,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  5,  0,  0,  0,
         0,  0,  0],
       [ 0,  0, 10,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  3,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  0, 13,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  0,  0, 10,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  4,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  0,
         0,  