In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, f1_score
import joblib

So, based on our exploration and data collection we can assume some things:
1. All features are categorical, so we need models that handle categorical inputs well.
2. The dataset is not that big (fewer than 10,000 rows), so we can use simpler models.
3. However, the relationship between the features and the target is likely non-linear, and some features influence the prediction much more than others (odor, for example).
4. We need high accuracy
5. We have high dimensionality, so we may use models such as SVM, Random Forest, etc.
6. The data is not sensitive to outliers.

In [8]:
# Load the four persisted train/test artifacts in one pass. The paths are
# relative to the notebook's working directory.
# NOTE(review): presumably these were written by an earlier preprocessing
# notebook — confirm. joblib/pickle files can execute code on load, so only
# load artifacts from a trusted source.
X_train_final, X_test_final, y_train_encoded, y_test_encoded = map(
    joblib.load,
    (
        'X_train_final.pkl',
        'X_test_final.pkl',
        'y_train_encoded.pkl',
        'y_test_encoded.pkl',
    ),
)

In [9]:
# Registry of candidate classifiers. Each entry maps a display name to:
#   "estimator" - an unfitted scikit-learn estimator instance
#   "params"    - the hyperparameter grid to search for that estimator
models = {
    # Single tree: the grid varies depth and the split/leaf size limits.
    "Decision Tree": dict(
        estimator=DecisionTreeClassifier(random_state=42),
        params=dict(
            max_depth=[3, 5, 7, None],
            min_samples_split=[2, 5, 10],
            min_samples_leaf=[1, 2, 4],
        ),
    ),
    # Tree ensemble: the grid varies forest size, depth, features per split
    # and minimum leaf size.
    "Random Forest": dict(
        estimator=RandomForestClassifier(random_state=42, n_jobs=-1),
        params=dict(
            n_estimators=[100, 200],
            max_depth=[5, 10, None],
            max_features=["sqrt", "log2", None],
            min_samples_leaf=[1, 2, 4],
        ),
    ),
    # Naive Bayes for categorical features: only the smoothing strength is tuned.
    "Categorical NB": dict(
        estimator=CategoricalNB(),
        params=dict(
            alpha=[0.5, 1.0, 2.0],
        ),
    ),
}

In [15]:
# Tune every candidate in `models` with a 5-fold grid search on the training
# split, then score the winning configuration once on the held-out test split.
# One summary record per model is collected in `results`.
results = []

for name, mp in models.items():
    print(f"\n=== Training and tuning: {name} ===")

    grid = GridSearchCV(
        estimator=mp["estimator"],
        param_grid=mp["params"],
        cv=5,
        scoring='accuracy',
        n_jobs=-1
    )
    grid.fit(X_train_final, y_train_encoded)

    # With GridSearchCV's default refit=True, best_estimator_ is already
    # refit on the full training split using the best parameters.
    model = grid.best_estimator_

    # best_score_ is the mean cross-validated accuracy of the winning
    # parameter set from the cv=5 search above. Reusing it avoids a second,
    # redundant 5-fold cross_val_score run that would recompute the same figure.
    cv_score = grid.best_score_

    # Test accuracy and weighted F1 for this model on the held-out split
    y_pred = model.predict(X_test_final)
    test_acc = accuracy_score(y_test_encoded, y_pred)
    test_f1 = f1_score(y_test_encoded, y_pred, average='weighted')

    results.append({
        "Model": name,
        "CV Accuracy": cv_score,
        "Test Accuracy": test_acc,
        "Test F1": test_f1,
        "Best Hyperparameters": grid.best_params_
    })

    print(f"Best Hyperparameters for {name}: {grid.best_params_}")
    print(f"Test Accuracy: {test_acc:.4f}, F1-score: {test_f1:.4f}")

print(pd.DataFrame(results))


=== Training and tuning: Decision Tree ===
Best Hyperparameters for Decision Tree: {'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 2}
Test Accuracy: 1.0000, F1-score: 1.0000

=== Training and tuning: Random Forest ===
Best Hyperparameters for Random Forest: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 100}
Test Accuracy: 1.0000, F1-score: 1.0000

=== Training and tuning: Categorical NB ===
Best Hyperparameters for Categorical NB: {'alpha': 0.5}
Test Accuracy: 0.9526, F1-score: 0.9525
            Model  CV Accuracy  Test Accuracy   Test F1  \
0   Decision Tree     1.000000       1.000000  1.000000   
1   Random Forest     1.000000       1.000000  1.000000   
2  Categorical NB     0.954609       0.952615  0.952502   

                                Best Hyperparameters  
0  {'max_depth': 7, 'min_samples_leaf': 1, 'min_s...  
1  {'max_depth': 10, 'max_features': 'sqrt', 'min...  
2                                     {'alpha': 0.5}  


In [16]:
# Rank the per-model summary records by held-out accuracy, best first.
results_df = (
    pd.DataFrame(results)
    .sort_values(by="Test Accuracy", ascending=False)
)

# Print the full table without the index column; to_string keeps the
# hyperparameter dicts from being truncated.
print(results_df.to_string(index=False))

         Model  CV Accuracy  Test Accuracy  Test F1                                                                  Best Hyperparameters
 Decision Tree     1.000000       1.000000 1.000000                       {'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 2}
 Random Forest     1.000000       1.000000 1.000000 {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 100}
Categorical NB     0.954609       0.952615 0.952502                                                                        {'alpha': 0.5}
