In [1]:
import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.naive_bayes import CategoricalNB
import time

# Load Data: read the soybean ARFF file into a DataFrame
file_path = "../../data/soyabean/dataset_42_soybean.arff"
dataset, meta = arff.loadarff(file_path)
df = pd.DataFrame(data=dataset)

# Object-dtype columns come back as bytes; decode each one to a UTF-8 string
for column in df.select_dtypes([object]).columns:
    df[column] = df[column].str.decode('utf-8')

# '?' is treated as the missing-value marker and becomes NaN
df = df.replace('?', np.nan)

# Target is the 'class' column; every other column is a feature
y = df['class']
X = df.drop(columns=['class'])

# Hold out 20% of the rows for testing, with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [5]:
# Preprocessing: all feature columns go through the same categorical branch,
# which first fills missing values and then maps categories to integer codes.
categorical_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder()),
])

preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_steps, X.columns.tolist())]
)

# Full model: preprocessing followed by a categorical Naive Bayes classifier.
# The step names are referenced by the hyperparameter grid, so keep them stable.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', CategoricalNB()),
])

In [6]:
# Hyperparameter grid, addressed as <step>__<sub-step>__<parameter>.
param_grid = {
    # Imputation: fill_value only matters for the 'constant' strategy
    'preprocessor__cat__imputer__strategy': ['constant', 'most_frequent'],
    'preprocessor__cat__imputer__fill_value': ['missing'],

    # CategoricalNB settings
    'classifier__alpha': [0.01, 0.1, 1.0, 10.0],
    'classifier__fit_prior': [True, False],
    'classifier__min_categories': [None, 2, 5, 10, 15],
}

# Exhaustive search with 5-fold cross-validation, scored on accuracy.
# error_score='raise' makes any failing fit abort the search instead of
# being recorded as a score.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    error_score='raise',
)

# Run the search on the training split only
grid_search.fit(X_train, y_train)

# Keep the refit best pipeline and report the winning parameter combination
best_model = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")

# Make predictions with the best model and time the call.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring short intervals; time.time() follows the system wall clock, which
# can be coarse or be adjusted while the measurement is running.
# best_model is the whole pipeline, so the timing covers imputation and
# encoding as well as the classifier's prediction step.
start_time = time.perf_counter()
y_pred = best_model.predict(X_test)
end_time = time.perf_counter()
prediction_time_ms = (end_time - start_time) * 1000

# Print prediction time
print(f"Prediction Time: {prediction_time_ms:.2f} ms")

# Accuracy of the best model on the held-out test split
accuracy = accuracy_score(y_test, y_pred)
print(f"Best Model Accuracy: {accuracy:.2f}")

# Cross-validation score of the selected parameter combination
best_cv_score = grid_search.best_score_
print(f"Best Cross-validation accuracy: {best_cv_score:.2f}")

# Per-class precision / recall / F1 on the test split
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 80 candidates, totalling 400 fits
Best parameters: {'classifier__alpha': 0.01, 'classifier__fit_prior': True, 'classifier__min_categories': None, 'preprocessor__cat__imputer__fill_value': 'missing', 'preprocessor__cat__imputer__strategy': 'most_frequent'}
Prediction Time: 2.80 ms
Best Model Accuracy: 0.96
Best Cross-validation accuracy: 0.94
                             precision    recall  f1-score   support

               2-4-d-injury       1.00      1.00      1.00         5
        alternarialeaf-spot       0.85      0.94      0.89        18
                anthracnose       1.00      1.00      1.00        10
           bacterial-blight       1.00      1.00      1.00         4
          bacterial-pustule       1.00      1.00      1.00         6
                 brown-spot       0.93      1.00      0.97        14
             brown-stem-rot       1.00      1.00      1.00        10
               charcoal-rot       1.00      1.00      1.00         4
      