In [1]:
import joblib
import pandas as pd
from typing import Dict, Any
from pydantic import BaseModel, Field
import numpy as np
from enum import Enum
from abc import ABC, abstractmethod
import sys
import os

# Scikit-learn model and metrics imports
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../../')))
from ThreeWToolkit.core.enums import ModelTypeEnum
from ThreeWToolkit.models import sklearn_models
from ThreeWToolkit.metrics import _classification

In [2]:
# Build a reproducible synthetic binary-classification problem
# (1000 samples, 20 features: 10 informative, 5 redundant).
X, y = make_classification(
    n_samples=1000, n_features=20, n_informative=10, n_redundant=5,
    n_classes=2, random_state=42,
)

# Hold out 30% of the samples for testing; the split is seeded as well.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the min-max scaler on the training split only, then apply that same
# fitted scaling to both splits so no test-set statistics reach training.
scaler = MinMaxScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

Training data shape: (700, 20)
Testing data shape: (300, 20)


In [6]:
# One trainer config per model type to benchmark. `model_params` carries the
# per-model hyperparameters; configs without it pass no explicit ones.
model_configs = [
    sklearn_models.TrainerSklearnModelsConfig(model_type=ModelTypeEnum.LOGISTIC_REGRESSION),
    sklearn_models.TrainerSklearnModelsConfig(model_type=ModelTypeEnum.DECISION_TREE, model_params={"max_depth": 5}),
    sklearn_models.TrainerSklearnModelsConfig(model_type=ModelTypeEnum.RANDOM_FOREST, model_params={"n_estimators": 100, "max_depth": 5}),
    sklearn_models.TrainerSklearnModelsConfig(model_type=ModelTypeEnum.SVM, model_params={"kernel": "rbf", "C": 1.0}),
    sklearn_models.TrainerSklearnModelsConfig(model_type=ModelTypeEnum.KNN, model_params={"n_neighbors": 5}),
    sklearn_models.TrainerSklearnModelsConfig(model_type=ModelTypeEnum.NAIVE_BAYES),
    sklearn_models.TrainerSklearnModelsConfig(model_type=ModelTypeEnum.GRADIENT_BOOSTING, 
                                       model_params={"n_estimators": 100, "learning_rate": 0.1})
]

# Metric callables handed to `evaluate`; the result is read back below by
# metric name (e.g. "accuracy_score").
metrics_to_calculate = [
    _classification.accuracy_score,
    _classification.balanced_accuracy_score,
    _classification.f1_score,
    # NOTE(review): the original note says ROC AUC "will be skipped for SVM".
    # That behavior lives inside SklearnModels.evaluate and is not visible
    # here -- confirm before relying on it.
    _classification.roc_auc_score
]

# Train and evaluate every configured model, collecting metrics per model name.
results = {}
for config in model_configs:
    model_name = config.model_type.name
    print(f"--- Training {model_name} ---")
    
    model = sklearn_models.SklearnModels(config)
    model.train(x=X_train, y=y_train)
    
    metrics = model.evaluate(x=X_test, y=y_test, metrics=metrics_to_calculate)
    results[model_name] = metrics

    # Format the accuracy only when it is actually present: applying the
    # ".4f" format spec to a string fallback such as "N/A" raises ValueError.
    accuracy = metrics.get("accuracy_score")
    accuracy_text = f"{accuracy:.4f}" if accuracy is not None else "N/A"
    print(f"Accuracy: {accuracy_text}\n")

# Display a summary of all results (one row per model, one column per metric).
print("--- Evaluation Summary ---")
results_df = pd.DataFrame(results).T
print(results_df)

--- Training LOGISTIC_REGRESSION ---
Accuracy: 0.8133

--- Training DECISION_TREE ---
Accuracy: 0.8333

--- Training RANDOM_FOREST ---
Accuracy: 0.8900

--- Training SVM ---
Accuracy: 0.9500

--- Training KNN ---
Accuracy: 0.9100

--- Training NAIVE_BAYES ---
Accuracy: 0.7533

--- Training GRADIENT_BOOSTING ---
Accuracy: 0.9067

--- Evaluation Summary ---
                     accuracy_score  balanced_accuracy_score  f1_score  \
LOGISTIC_REGRESSION        0.813333                 0.815179  0.808219   
DECISION_TREE              0.833333                 0.835714  0.829932   
RANDOM_FOREST              0.890000                 0.892411  0.887372   
SVM                        0.950000                 0.950446  0.946996   
KNN                        0.910000                 0.910268  0.904594   
NAIVE_BAYES                0.753333                 0.751339  0.731884   
GRADIENT_BOOSTING          0.906667                 0.908482  0.903448   

                     roc_auc_score  
LOGISTIC_REG

In [10]:
# To demonstrate saving, train one model again so there is a fitted model
# to persist.
rf_config = sklearn_models.TrainerSklearnModelsConfig(model_type=ModelTypeEnum.RANDOM_FOREST)
rf_model = sklearn_models.SklearnModels(rf_config)
rf_model.train(x=X_train, y=y_train)

# Score the in-memory model first; this is the baseline the reloaded model
# must reproduce for the save/load round trip to count as verified.
original_metrics = rf_model.evaluate(
    x=X_test, y=y_test, metrics=[_classification.accuracy_score]
)
original_accuracy = original_metrics["accuracy_score"]

model_filepath = "random_forest_model.pkl"
rf_model.save(model_filepath)

# Load it back from the file.
# NOTE(review): the ".pkl" extension suggests a pickle/joblib-backed format --
# confirm, and only load such files from sources you trust.
loaded_rf_model = sklearn_models.SklearnModels.load(model_filepath, rf_config)

# Verify the loaded model works by evaluating it on the same test split.
loaded_metrics = loaded_rf_model.evaluate(
    x=X_test, y=y_test, metrics=[_classification.accuracy_score]
)
loaded_accuracy = loaded_metrics["accuracy_score"]

# A faithful round trip must give the same score as the model that was saved.
assert np.isclose(loaded_accuracy, original_accuracy), (
    f"Loaded model accuracy {loaded_accuracy:.4f} differs from the "
    f"original model accuracy {original_accuracy:.4f}"
)

print(f"Loaded Random Forest model accuracy: {loaded_accuracy:.4f}")

Loaded Random Forest model accuracy: 0.9167
