In [1]:
import os
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings("ignore")


# Feature tables for the train / validation / test splits, kept as DataFrames.
X_train = pd.read_csv("data/X_train.csv")
X_val = pd.read_csv("data/X_val.csv")
X_test = pd.read_csv("data/X_test.csv")

# Targets are stored as CSV files too; each one is flattened to a 1-D array
# before being handed to the estimator and the metric functions.
y_train = np.ravel(pd.read_csv("data/y_train.csv").values)
y_val = np.ravel(pd.read_csv("data/y_val.csv").values)
y_test = np.ravel(pd.read_csv("data/y_test.csv").values)

print("Data loaded successfully")


# Partition the training columns by dtype so each group gets its own treatment.
numeric_cols = list(X_train.select_dtypes(include="number").columns)
categorical_cols = list(X_train.select_dtypes(include=["object", "bool"]).columns)


# Numeric columns: only median imputation, no scaling step.
numeric_transformer = Pipeline([("imputer", SimpleImputer(strategy="median"))])

# Categorical columns: fill gaps with the literal "Missing", then one-hot
# encode with handle_unknown="ignore" so unseen categories do not raise.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="constant", fill_value="Missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its transformer.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)


# Decision tree with balanced class weights and a fixed seed.
dt = DecisionTreeClassifier(class_weight="balanced", random_state=42)

# Full model: preprocessing followed by the classifier. The step names
# "preproc" and "clf" are what the grid-search parameter keys refer to.
pipeline = Pipeline([("preproc", preprocessor), ("clf", dt)])


# Candidate tree hyper-parameters. The "clf__" prefix addresses the
# pipeline step named "clf".
param_grid = {
    "clf__max_depth": [3, 5, 10, None],
    "clf__min_samples_split": [2, 10, 50],
    "clf__min_samples_leaf": [1, 5, 20],
}

# Exhaustive search over the grid, 3-fold cross-validated on the training
# split and ranked by the "f1" scorer, using all available cores.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="f1",
    cv=3,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)

# The winning pipeline as exposed by the fitted search object.
best_dt = grid.best_estimator_


def evaluate(model, X, y, name):
    """Print classification metrics for ``model`` on one data split.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Features passed to ``model.predict``.
        y: True labels compared against the predictions.
        name: Label for the split, shown in the header line.

    Returns:
        None. Accuracy, F1, the confusion matrix and the classification
        report are written to stdout.
    """
    predictions = model.predict(X)

    print(f"\n{name} results")

    accuracy = accuracy_score(y, predictions)
    print("Accuracy:", accuracy)

    f1 = f1_score(y, predictions)
    print("F1:", f1)

    matrix = confusion_matrix(y, predictions)
    print("Confusion matrix:\n", matrix)

    # zero_division=0 is passed so undefined precision/recall are reported as 0.
    report = classification_report(y, predictions, zero_division=0)
    print(report)

# Report metrics for the tuned pipeline on every split.
evaluate(best_dt, X_train, y_train, "TRAIN")
evaluate(best_dt, X_val, y_val, "VALIDATION")
evaluate(best_dt, X_test, y_test, "TEST")


# Ensure the output directory exists before persisting. Without this, a fresh
# checkout with no "models" folder would raise only at this final step, after
# the whole grid search has already run.
os.makedirs("models", exist_ok=True)
joblib.dump(best_dt, "models/decision_tree_model.joblib")
print("Decision Tree model saved")


Data loaded successfully
Best parameters: {'clf__max_depth': None, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2}

TRAIN results
Accuracy: 0.9844429640659901
F1: 0.5744908896034298
Confusion matrix:
 [[24854   397]
 [    0   268]]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     25251
           1       0.40      1.00      0.57       268

    accuracy                           0.98     25519
   macro avg       0.70      0.99      0.78     25519
weighted avg       0.99      0.98      0.99     25519


VALIDATION results
Accuracy: 0.9764125068568295
F1: 0.2711864406779661
Confusion matrix:
 [[5316   95]
 [  34   24]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      5411
           1       0.20      0.41      0.27        58

    accuracy                           0.98      5469
   macro avg       0.60      0.70      0.63      5469
weighted avg       0.99      0.98      0.9