In [37]:
import pandas as pd
import numpy as np
from scipy.io import arff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

# Load Data
file_path = "../../data/soyabean/dataset_42_soybean.arff"
dataset, meta = arff.loadarff(file_path)
df = pd.DataFrame(data=dataset)

# Convert all byte columns to strings
for column in df.select_dtypes([object]):
    df[column] = df[column].str.decode('utf-8')

# Split data into train and test sets
X = df.drop(columns=['class'])
y = df['class']

X = X.replace('?', np.nan)

# Ensure that the target variable is of type 'category'
y = y.astype('category')


# Convert categorical target variable into integers using LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Column transformer for preprocessing (only OrdinalEncoder for categorical features)
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Handle missing categorical values
            ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))  # Ordinal encoding
        ]), X.columns.tolist())  # Apply to all columns in X
    ])

# Apply the column transformer to the data
pipe = Pipeline([("preprocessor", preprocessor), ("classifier", DecisionTreeClassifier())])

# Fit the pipeline to training data
pipe.fit(X_train, y_train)

# Predict on test data
y_pred = pipe.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8467


In [9]:
y_pred_original = label_encoder.inverse_transform(y_pred)
y_pred_original

array([b'alternarialeaf-spot', b'alternarialeaf-spot',
       b'frog-eye-leaf-spot', b'alternarialeaf-spot',
       b'alternarialeaf-spot', b'alternarialeaf-spot',
       b'alternarialeaf-spot', b'frog-eye-leaf-spot',
       b'frog-eye-leaf-spot', b'frog-eye-leaf-spot',
       b'alternarialeaf-spot', b'alternarialeaf-spot',
       b'alternarialeaf-spot', b'frog-eye-leaf-spot',
       b'alternarialeaf-spot', b'alternarialeaf-spot',
       b'alternarialeaf-spot', b'alternarialeaf-spot',
       b'alternarialeaf-spot', b'frog-eye-leaf-spot',
       b'alternarialeaf-spot', b'alternarialeaf-spot',
       b'alternarialeaf-spot', b'alternarialeaf-spot',
       b'alternarialeaf-spot', b'alternarialeaf-spot',
       b'alternarialeaf-spot', b'alternarialeaf-spot',
       b'alternarialeaf-spot', b'alternarialeaf-spot',
       b'alternarialeaf-spot', b'frog-eye-leaf-spot',
       b'alternarialeaf-spot', b'alternarialeaf-spot',
       b'alternarialeaf-spot', b'alternarialeaf-spot',
       b'frog-eye