In [94]:
import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import time

file_path = "../../data/soyabean/dataset_42_soybean.arff"
dataset, meta = arff.loadarff(file_path)
df = pd.DataFrame(data=dataset)
for column in df.select_dtypes([object]):
    df[column] = df[column].str.decode('utf-8')


# Split data into train and test sets
X = df.drop(columns=['class'])
y = df['class']

# Ensure that the target variable is of type 'category'
y = y.astype('category')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)


In [95]:
# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Handle missing categorical values
            ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))  # Ordinal encoding
        ]), X.columns.tolist())  # Apply to all columns in X
    ])

pipe = Pipeline([("preprocessor", preprocessor), ("classifier", DecisionTreeClassifier())])


In [96]:
# Fitting & Analysis 
param_grid = {
    # Hyperparameters for the classifier
    'classifier__max_depth': [3, 5, 10, None],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__criterion': ['gini', 'entropy'],
}


grid_search = GridSearchCV(pipe, param_grid, cv=StratifiedKFold(n_splits=2), scoring='accuracy', n_jobs=-1, verbose=1, error_score='raise')

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
best_model = grid_search.best_estimator_

# Make predictions with the best model

start_time = time.time()
y_pred = best_model.predict(X_test)
end_time = time.time()

# Calculate time in milliseconds
prediction_time_ms = (end_time - start_time) * 1000

# Print prediction time
print(f"Prediction Time: {prediction_time_ms:.2f} ms")


# Calculate and print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Best Model Accuracy: {accuracy:.2f}")

# best cross-validation score
best_cv_score = grid_search.best_score_
print(f"Best Cross-validation accuracy: {best_cv_score:.2f}")

print(classification_report(y_test, y_pred))

Fitting 2 folds for each of 24 candidates, totalling 48 fits
Best parameters: {'classifier__criterion': 'entropy', 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1}
Prediction Time: 2.37 ms
Best Model Accuracy: 0.93
Best Cross-validation accuracy: 0.91
                             precision    recall  f1-score   support

               2-4-d-injury       1.00      1.00      1.00         5
        alternarialeaf-spot       0.80      0.71      0.75        17
                anthracnose       1.00      1.00      1.00         8
           bacterial-blight       1.00      1.00      1.00         2
          bacterial-pustule       1.00      1.00      1.00         2
                 brown-spot       1.00      1.00      1.00        18
             brown-stem-rot       1.00      1.00      1.00         9
               charcoal-rot       1.00      1.00      1.00         3
              cyst-nematode       1.00      1.00      1.00         2
diaporthe-pod-&-stem-blight       1.00  