In [2]:
import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

In [3]:
#sample(10) Load Data
file_path = "../../data/soyabean/dataset_42_soybean.arff"
dataset, meta = arff.loadarff(file_path)
df = pd.DataFrame(data=dataset)
df = df.apply(lambda col: col.map(lambda x: x.decode('ascii') if isinstance(x, bytes) else x))


In [15]:
count_rows_with_question_mark = (df == '?').any(axis=1).sum()
print(count_rows_with_question_mark)
121/683


121


0.17715959004392387

In [173]:
# Split data into train and test sets
X = df.drop(columns=['class'])
y = df['class']

# Ensure that the target variable is of type 'category'
y = y.astype('category')

# Convert categorical target variable into integers using LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [176]:
# Column transformer for preprocessing (only OrdinalEncoder for categorical features)
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Handle missing categorical values
            ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))  # Ordinal encoding
        ]), X.columns.tolist())  # Apply to all columns in X
    ])

In [177]:
pipe = Pipeline([("preprocessor", preprocessor), ("classifier", DecisionTreeClassifier())])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

In [178]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8467


In [179]:
# label_encoder.inverse_transform(y_pred)