In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
import matplotlib.pyplot as plt
import joblib

In [2]:
data = pd.read_csv("zbiór_10.csv")

In [3]:
X = data.drop(columns=["default"])
y = data["default"]

# dropujemy szczegolna forma wlasnosci (kazdy ma taka sama 117)
X = X.drop(columns="szczegolnaFormaPrawna_Symbol")

unique_values = X['formaWlasnosci_Symbol'].unique()

categorical_cols = ['formaWlasnosci_Symbol']

# lista kolumn OHE odpowiadających symbolom form własności
numeric_cols = [
    'ohe_fw_214','ohe_fw_215','ohe_fw_113','ohe_fw_216','ohe_fw_225','ohe_fw_226',
    'ohe_fw_224','ohe_fw_227','ohe_fw_234','ohe_fw_111','ohe_fw_112','ohe_fw_235',
    'ohe_fw_132','ohe_fw_123','ohe_fw_133','ohe_fw_122','ohe_fw_338', 'ohe_fw_000'
]

# inicjalizacja OneHotEncoder z ustalonymi kategoriami
ohe = OneHotEncoder(
    categories=[sorted([int(c.split('_')[-1]) for c in numeric_cols])],
    sparse_output=False,  # zmiana z sparse -> sparse_output
    drop=None
)

# dopasowanie i transformacja
ohe_array = ohe.fit_transform(X[['formaWlasnosci_Symbol']])

# utworzenie DataFrame z odpowiednimi nazwami kolumn
df_ohe = pd.DataFrame(ohe_array, columns=numeric_cols, index=X.index)

# połączenie z oryginalnym df
df = pd.concat([X, df_ohe], axis=1)
df = df.drop(columns=["formaWlasnosci_Symbol"])

In [4]:
## analogicznie dla kolumny 'schemat_wsk_bilans'

ohe_cols = ['SFJIN_wsk_bilans', 'SFJMI_wsk_bilans', 'SFJMA_wsk_bilans']

# inicjalizacja OneHotEncoder z ustalonymi kategoriami
ohe = OneHotEncoder(
    categories = [['SFJIN', 'SFJMI', 'SFJMA']],
    sparse_output=False,  # zmiana z sparse -> sparse_output
    drop=None
)

# dopasowanie i transformacja
ohe_array = ohe.fit_transform(X[['schemat_wsk_bilans']])

# utworzenie DataFrame z odpowiednimi nazwami kolumn
df_ohe = pd.DataFrame(ohe_array, columns=ohe_cols, index=X.index)

# połączenie z oryginalnym df
df = pd.concat([df, df_ohe], axis=1)
df = df.drop(columns=["schemat_wsk_bilans"])

In [5]:
## i jeszcze raz dla 'schemat_wsk_rzis'

ohe_cols = ['SFJIN_wsk_rzis', 'SFJMI_wsk_rzis', 'SFJMA_wsk_rzis']

# inicjalizacja OneHotEncoder z ustalonymi kategoriami
ohe = OneHotEncoder(
    categories = [['SFJIN', 'SFJMI', 'SFJMA']],
    sparse_output=False,  # zmiana z sparse -> sparse_output
    drop=None
)

# dopasowanie i transformacja
ohe_array = ohe.fit_transform(X[['schemat_wsk_rzis']])

# utworzenie DataFrame z odpowiednimi nazwami kolumn
df_ohe = pd.DataFrame(ohe_array, columns=ohe_cols, index=X.index)

# połączenie z oryginalnym df
df = pd.concat([df, df_ohe], axis=1)
df = df.drop(columns=["schemat_wsk_rzis"])

In [6]:
# Stratified K Fold


def stratified_train_val_test_split(X, y, test_size = 0.15, val_size = 0.15, random_state = 2137):
    # 1) hold out the test set
    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    # 2) compute val fraction relative to the remaining (X_temp)
    #    val_size is fraction of original; convert to fraction of X_temp
    val_rel = val_size / (1.0 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=val_rel, stratify=y_temp, random_state=random_state
    )

    # quick class-count check
    print("Counts: train", Counter(y_train), "val", Counter(y_val), "test", Counter(y_test))
    return X_train, X_val, X_test, y_train, y_val, y_test


In [7]:
X_train, X_val, X_test, y_train, y_val, y_test = stratified_train_val_test_split(df, y)

Counts: train Counter({0: 1971, 1: 128}) val Counter({0: 423, 1: 28}) test Counter({0: 423, 1: 27})


In [8]:
# wybieramy top_n najczęstszych kategorii
X = df.copy()
top_n = 10
top_values = X_train['pkdKod'].value_counts().nlargest(top_n).index

# tworzymy nową kolumnę w X_train, gdzie rzadkie wartości są zastąpione przez '0'
X_train['pkdKod_grouped'] = X_train['pkdKod'].where(X_train['pkdKod'].isin(top_values), other='0')
X_train = X_train.drop(columns=['pkdKod'])

# tworzymy nową kolumnę w X_val, gdzie rzadkie wartości są zastąpione przez '0'
X_val['pkdKod_grouped'] = X_val['pkdKod'].where(X_val['pkdKod'].isin(top_values), other='0')
X_val = X_val.drop(columns=['pkdKod'])

# tworzymy nową kolumnę w X_test, gdzie rzadkie wartości są zastąpione przez '0'
X_test['pkdKod_grouped'] = X_test['pkdKod'].where(X_test['pkdKod'].isin(top_values), other='0')
X_test = X_test.drop(columns=['pkdKod'])


In [9]:
# łączymy X i y do obliczenia statystyk
train_temp = X_train.copy()
train_temp['default'] = y_train.reindex(train_temp.index)

# agregujemy liczbe przypadkow
agg = train_temp.groupby('pkdKod_grouped')['default'].agg(['sum', 'count']).rename(columns={'sum': 'bad', 'count': 'total'})
agg['good'] = agg['total'] - agg['bad']

# smoothing, żeby uniknąć dzielenia przez zero
smoothing = 0.5
agg['bad_s'] = agg['bad'] + smoothing
agg['good_s'] = agg['good'] + smoothing

total_bad = agg['bad_s'].sum()
total_good = agg['good_s'].sum()

agg['woe'] = np.log((agg['good_s'] / total_good) / (agg['bad_s'] / total_bad))

woe_map = agg['woe'].to_dict()

# dodajemy nową kolumnę do X
X_train['WoE_pkdKod_grouped'] = X_train['pkdKod_grouped'].map(woe_map)

fallback = X_train['WoE_pkdKod_grouped'].mean()  # or 0

X_train = X_train.drop(columns=['pkdKod_grouped'])

X_val['WoE_pkdKod_grouped'] = X_val['pkdKod_grouped'].map(woe_map).fillna(fallback)
X_test['WoE_pkdKod_grouped'] = X_test['pkdKod_grouped'].map(woe_map).fillna(fallback)



In [10]:
# inf na nan
for df in (X_train, X_val, X_test):
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    
base_cols = list(X_train.columns)

for df in (X_train, X_val, X_test):
    # Iteruj TYLKO po oryginalnych kolumnach (przed dodaniem wskaźników)
    # Zapewni to, że w każdym DF dodasz wskaźniki tylko raz
    for c in base_cols:
        # Sprawdź, czy kolumna istnieje w bieżącym DF, aby uniknąć błędów
        if c in df.columns:
            # Twórz wskaźnik NA w bieżącym DF
            df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)

cols_to_impute = list(X_train.columns) 

# exclude indicator columns from numeric imputation — keep them as-is (0/1)
indicator_cols = [c for c in X_train.columns if c.endswith('_mial_braki_danych')]
numeric_impute_cols = [c for c in X_train.columns if c not in indicator_cols]

imputer = SimpleImputer(strategy="median")
imputer.fit(X_train[numeric_impute_cols])

X_train[numeric_impute_cols] = imputer.transform(X_train[numeric_impute_cols])
X_val[numeric_impute_cols]   = imputer.transform(X_val.reindex(columns=numeric_impute_cols))
X_test[numeric_impute_cols]  = imputer.transform(X_test.reindex(columns=numeric_impute_cols))


  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mia

In [11]:

# Wywalamy za bardzo skorelowane
corr_matrix = X_train.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

to_drop = []
for col_a in upper.columns:
    # Znajdź kolumny B, które są silnie skorelowane z A
    highly_correlated_cols = upper.index[upper[col_a] > 0.9].tolist()

    for col_b in highly_correlated_cols:
        # Jeśli żadna z nich nie została jeszcze usunięta
        if col_a not in to_drop and col_b not in to_drop:
            # Określenie, która kolumna jest silniej skorelowana z celem
            corr_a = abs(X_train[[col_a]].join(y_train).corr().iloc[0,1])
            corr_b = abs(X_train[[col_b]].join(y_train).corr().iloc[0,1])

            # Usuń tę, która jest MNIEJ skorelowana z celem
            if corr_a < corr_b:
                to_drop.append(col_a)
            else:
                to_drop.append(col_b)

# Usuwamy kolumny na wszystkich zbiorach
X_train.drop(to_drop, axis=1, inplace=True)
X_val.drop(to_drop, axis=1, inplace=True)
X_test.drop(to_drop, axis=1, inplace=True)


In [12]:
y_train = y_train.reindex(X_train.index)
y_val   = y_val.reindex(X_val.index)
y_test  = y_test.reindex(X_test.index)

dt = DecisionTreeClassifier(random_state=42)

param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 8, 12, 20],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10],
    'ccp_alpha': [0.0, 1e-4, 1e-3, 1e-2]  # cost-complexity pruning
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(dt, param_grid, cv=cv, scoring='f1_macro', n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)

print("Best params from CV (on train):", grid.best_params_)
print("Best CV score (f1_macro):", grid.best_score_)

best_dt = grid.best_estimator_

y_val_pred = best_dt.predict(X_val)
print("\nVALIDATION REPORT")
print(classification_report(y_val, y_val_pred))
print("Accuracy (val):", accuracy_score(y_val, y_val_pred))
if len(np.unique(y_train)) == 2:
    try:
        y_val_proba = best_dt.predict_proba(X_val)[:, 1]
        print("ROC AUC (val):", roc_auc_score(y_val, y_val_proba))
    except Exception:
        pass
print("Confusion matrix (val):\n", confusion_matrix(y_val, y_val_pred))

X_train_val = pd.concat([X_train, X_val], axis=0)
y_train_val = pd.concat([y_train, y_val], axis=0)

final_dt = DecisionTreeClassifier(**grid.best_params_, random_state=42)
final_dt.fit(X_train_val, y_train_val)

y_test_pred = final_dt.predict(X_test)
print("\nTEST REPORT (final model trained on train+val)")
print(classification_report(y_test, y_test_pred))
print("Accuracy (test):", accuracy_score(y_test, y_test_pred))
if len(np.unique(y_train_val)) == 2:
    try:
        y_test_proba = final_dt.predict_proba(X_test)[:, 1]
        print("ROC AUC (test):", roc_auc_score(y_test, y_test_proba))
    except Exception:
        pass
print("Confusion matrix (test):\n", confusion_matrix(y_test, y_test_pred))

# Save final model
joblib.dump(final_dt, "decision_tree_final.joblib")
print("Saved final model to decision_tree_final.joblib")



Fitting 5 folds for each of 640 candidates, totalling 3200 fits
Best params from CV (on train): {'ccp_alpha': 0.0, 'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 5, 'min_samples_split': 2}
Best CV score (f1_macro): 0.5631048813908867


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- pkdKod_grouped
