In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import joblib
from sklearn.model_selection import cross_val_score
from sklearn.metrics import average_precision_score, roc_auc_score, brier_score_loss, log_loss
from scipy.stats import ks_2samp
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, f1_score
import matplotlib as plt

In [3]:
data = pd.read_csv("zbiór_10.csv")

In [4]:
X = data.drop(columns=["default"])
y = data["default"]

# dropujemy szczegolna forma wlasnosci (kazdy ma taka sama 117)
X = X.drop(columns="szczegolnaFormaPrawna_Symbol")

unique_values = X['formaWlasnosci_Symbol'].unique()

categorical_cols = ['formaWlasnosci_Symbol']

# lista kolumn OHE odpowiadających symbolom form własności
numeric_cols = [
    'ohe_fw_214','ohe_fw_215','ohe_fw_113','ohe_fw_216','ohe_fw_225','ohe_fw_226',
    'ohe_fw_224','ohe_fw_227','ohe_fw_234','ohe_fw_111','ohe_fw_112','ohe_fw_235',
    'ohe_fw_132','ohe_fw_123','ohe_fw_133','ohe_fw_122','ohe_fw_338', 'ohe_fw_000'
]

# inicjalizacja OneHotEncoder z ustalonymi kategoriami
ohe = OneHotEncoder(
    categories=[sorted([int(c.split('_')[-1]) for c in numeric_cols])],
    sparse_output=False,  # zmiana z sparse -> sparse_output
    drop=None
)

# dopasowanie i transformacja
ohe_array = ohe.fit_transform(X[['formaWlasnosci_Symbol']])

# utworzenie DataFrame z odpowiednimi nazwami kolumn
df_ohe = pd.DataFrame(ohe_array, columns=numeric_cols, index=X.index)

# połączenie z oryginalnym df
df = pd.concat([X, df_ohe], axis=1)
df = df.drop(columns=["formaWlasnosci_Symbol"])

In [5]:
## analogicznie dla kolumny 'schemat_wsk_bilans'

ohe_cols = ['SFJIN_wsk_bilans', 'SFJMI_wsk_bilans', 'SFJMA_wsk_bilans']

# inicjalizacja OneHotEncoder z ustalonymi kategoriami
ohe = OneHotEncoder(
    categories = [['SFJIN', 'SFJMI', 'SFJMA']],
    sparse_output=False,  # zmiana z sparse -> sparse_output
    drop=None
)

# dopasowanie i transformacja
ohe_array = ohe.fit_transform(X[['schemat_wsk_bilans']])

# utworzenie DataFrame z odpowiednimi nazwami kolumn
df_ohe = pd.DataFrame(ohe_array, columns=ohe_cols, index=X.index)

# połączenie z oryginalnym df
df = pd.concat([df, df_ohe], axis=1)
df = df.drop(columns=["schemat_wsk_bilans"])

In [6]:
## i jeszcze raz dla 'schemat_wsk_rzis'

ohe_cols = ['SFJIN_wsk_rzis', 'SFJMI_wsk_rzis', 'SFJMA_wsk_rzis']

# inicjalizacja OneHotEncoder z ustalonymi kategoriami
ohe = OneHotEncoder(
    categories = [['SFJIN', 'SFJMI', 'SFJMA']],
    sparse_output=False,  # zmiana z sparse -> sparse_output
    drop=None
)

# dopasowanie i transformacja
ohe_array = ohe.fit_transform(X[['schemat_wsk_rzis']])

# utworzenie DataFrame z odpowiednimi nazwami kolumn
df_ohe = pd.DataFrame(ohe_array, columns=ohe_cols, index=X.index)

# połączenie z oryginalnym df
df = pd.concat([df, df_ohe], axis=1)
df = df.drop(columns=["schemat_wsk_rzis"])

In [7]:
## zamiana poprzedniego sposobu liczenia woe na kawalek  pipeline'u  

class PKDKodWoEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, top_n=10, smoothing=0.5):
        self.top_n = top_n
        self.smoothing = smoothing
    
    def fit(self, X, y):
        X = X.copy()
        y = y.copy()
        
        # wybieramy top_n najczęstszych kategorii

        self.top_values_ = X['pkdKod'].value_counts().nlargest(self.top_n).index
        
        # kolumna w ktorej rzadkie wartosci zamieniamy na '0'
        grouped = X['pkdKod'].where(X['pkdKod'].isin(self.top_values_), other='0')
        
        # liczymy woe
        df = pd.DataFrame({'group': grouped, 'target': y})
        agg = df.groupby('group')['target'].agg(['sum', 'count'])
        agg = agg.rename(columns={'sum':'bad', 'count':'total'})
        agg['good'] = agg['total'] - agg['bad']

        agg['bad_s'] = agg['bad'] + self.smoothing
        agg['good_s'] = agg['good'] + self.smoothing

        total_bad = agg['bad_s'].sum()
        total_good = agg['good_s'].sum()

        agg['woe'] = np.log((agg['good_s'] / total_good) / (agg['bad_s'] / total_bad))

        self.woe_map_ = agg['woe'].to_dict()
        self.fallback_ = np.mean(list(self.woe_map_.values()))
        
        return self
    
    def transform(self, X):
        X = X.copy()
        grouped = X['pkdKod'].where(X['pkdKod'].isin(self.top_values_), other='0')
        X['WoE_pkdKod_grouped'] = grouped.map(self.woe_map_).fillna(self.fallback_)
        return X.drop(columns=['pkdKod'])

In [8]:
class MissingValueIndicatorAndImputer(BaseEstimator, TransformerMixin):
    def __init__(self, strategy="median"):
        self.strategy = strategy

    def fit(self, X, y=None):
        X = X.replace([np.inf, -np.inf], np.nan).copy()
        self.base_cols_ = list(X.columns)

        # kolumny do imputacji
        self.imputer_ = SimpleImputer(strategy=self.strategy)
        self.imputer_.fit(X[self.base_cols_])

        # nazwy kolumn wskaźników
        self.indicator_cols_ = [f"{c}_mial_braki_danych" for c in self.base_cols_]

        return self

    def transform(self, X):
        X = X.replace([np.inf, -np.inf], np.nan).copy()

        # imputacja
        X_imputed = pd.DataFrame(
            self.imputer_.transform(X[self.base_cols_]),
            columns=self.base_cols_,
            index=X.index
        )

        # wskaźniki braków danych
        indicator_df = X[self.base_cols_].isna().astype(int)
        indicator_df.columns = self.indicator_cols_
        indicator_df.index = X.index

        # łączymy razem
        X_out = pd.concat([X_imputed, indicator_df], axis=1)

        return X_out

In [9]:
class DropConstantColumns(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # znajdź kolumny, które mają tylko jedną unikalną wartość
        self.cols_to_drop_ = [col for col in X.columns if X[col].nunique() <= 1]
        return self

    def transform(self, X):
        return X.drop(columns=self.cols_to_drop_, errors='ignore')

In [10]:
class CorrelationBasedFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.9):
        self.threshold = threshold

    def fit(self, X, y):
        X = X.copy()
        y = y.copy()

        # Liczymy macierz korelacji
        corr_matrix = X.corr().abs()

        # Bierzemy tylko górny trójkąt
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

        to_drop = []

        # Iterujemy po parach kolumn, które przekraczają próg korelacji
        for col_a in upper.columns:
            # Znajdź kolumny silnie skorelowane z col_a
            highly_corr = upper.index[upper[col_a] > self.threshold].tolist()

            for col_b in highly_corr:
                # Jeśli żadna z kolumn jeszcze nie została usunięta
                if col_a not in to_drop and col_b not in to_drop:
                    
                    # Korelacja każdej z targetem
                    corr_a = abs(np.corrcoef(X[col_a], y)[0,1])
                    corr_b = abs(np.corrcoef(X[col_b], y)[0,1])

                    # Wywalamy tę słabiej skorelowaną z targetem
                    if corr_a < corr_b:
                        to_drop.append(col_a)
                    else:
                        to_drop.append(col_b)

        self.to_drop_ = to_drop
        return self

    def transform(self, X):
        X = X.copy()
        return X.drop(columns=self.to_drop_, errors='ignore')

In [11]:
# Pipeline same transformacje (sprawdzamy czy robi to samo co poprzednia abominacja xddd)

pipeline_transform = Pipeline([
    ("pkd_woe", PKDKodWoEEncoder(top_n=10, smoothing=0.5)),
    ("missing", MissingValueIndicatorAndImputer(strategy="median")),
    ("drop_constant", DropConstantColumns()),
    ("corr_selector", CorrelationBasedFeatureSelector(threshold= 0.9))
])

In [12]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.3, random_state=2137)

X_train_pipe = pipeline_transform.fit_transform(X_train.copy(), y_train.copy())
X_test_pipe  = pipeline_transform.transform(X_test.copy())

In [None]:
## POD ZADNYM POZOREM NIE CZEKAJ AZ TO SIE ZROBI TO CHOLERSTWO KOSZTOWALO PONAD GODZINE MOJEGO ZYCIA
## GOTOWY MODEL JEST NA GITHUBIE!

pipeline = Pipeline([
    ("pkd_woe", PKDKodWoEEncoder(top_n=10, smoothing=0.5)),
    ("missing", MissingValueIndicatorAndImputer(strategy="median")),
    ("drop_constant", DropConstantColumns()),
    ("corr_selector", CorrelationBasedFeatureSelector(threshold=0.8)),
    ("scaler", StandardScaler()),  # WAŻNE dla regresji!
    ("classifier", LogisticRegression(
        random_state=42,
        class_weight="balanced",
        max_iter=1000,
        solver='liblinear'  # dobry dla mniejszych danych
    ))
])


param_grid = {
    "classifier__C": [0.001, 0.01, 0.1, 1, 10, 100],  # siła regularyzacji
    "classifier__penalty": ["l1", "l2"],  # L1 dla selekcji cech, L2 dla stabilności
    "classifier__class_weight": [None, "balanced", {0: 1, 1: 3}, {0: 1, 1: 5}]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# UŻYJ ROC AUC - najlepsze dla nierównowagi
grid = GridSearchCV(pipeline, param_grid, cv=cv, scoring='roc_auc', n_jobs=-1, verbose=1)

grid.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best params from CV: {'classifier__C': 10, 'classifier__class_weight': None, 'classifier__penalty': 'l1'}
Best CV score (ROC AUC): 0.6697408765652524


In [39]:
# wczytanie modelu
loaded_grid = joblib.load('grid_search_model.pkl')

# najlepszy model
best_model = loaded_grid.best_estimator_

print("Best params from CV:", grid.best_params_)
print("Best CV score (ROC AUC):", grid.best_score_)

Best params from CV: {'classifier__C': 10, 'classifier__class_weight': None, 'classifier__penalty': 'l1'}
Best CV score (ROC AUC): 0.6697408765652524


In [40]:
## metryki

# Predykcje na zbiorze testowym
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
y_pred = best_model.predict(X_test)

# Wszystkie wymagane metryki
print("=== OCENA JAKOŚCI MODELU ===")
print(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")
print(f"PR AUC: {average_precision_score(y_test, y_pred_proba):.4f}")
print(f"Log Loss: {log_loss(y_test, y_pred_proba):.4f}")
print(f"Brier Score: {brier_score_loss(y_test, y_pred_proba):.4f}")

# Statystyka KS
def calculate_ks_score(y_true, y_pred_proba):
    scores_0 = y_pred_proba[y_true == 0]
    scores_1 = y_pred_proba[y_true == 1]
    ks_stat, _ = ks_2samp(scores_1, scores_0)
    return ks_stat

ks_stat = calculate_ks_score(y_test, y_pred_proba)
print(f"KS Statistic: {ks_stat:.4f}")

=== OCENA JAKOŚCI MODELU ===
ROC AUC: 0.6850
PR AUC: 0.1271
Log Loss: 0.3838
Brier Score: 0.0592
KS Statistic: 0.3224


In [None]:
# Pobieranie nazw cech po preprocessing
feature_names = []
# Dostosuj do swojego preprocessora - przykład:
# if hasattr(preprocessor, 'get_feature_names_out'):
#     feature_names = preprocessor.get_feature_names_out()
# else:
#     feature_names = [f'feature_{i}' for i in range(len(best_model.coef_[0]))]

# Analiza współczynników
coefficients = best_model.coef_[0]
odds_ratios = np.exp(coefficients)

feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'coefficient': coefficients,
    'odds_ratio': odds_ratios,
    'abs_importance': np.abs(coefficients)
}).sort_values('abs_importance', ascending=False)

print("\n=== INTERPRETACJA GLOBALNA ===")
print("Top 10 najważniejszych cech:")
print(feature_importance_df.head(10))

# Wizualizacja współczynników
plt.figure(figsize=(10, 8))
top_features = feature_importance_df.head(15)
colors = ['red' if coef < 0 else 'blue' for coef in top_features['coefficient']]
plt.barh(top_features['feature'], top_features['coefficient'], color=colors)
plt.xlabel('Współczynnik (log-odds)')
plt.title('Top 15 najważniejszych cech - współczynniki regresji logistycznej')
plt.tight_layout()
plt.savefig('feature_coefficients.png', dpi=300, bbox_inches='tight')
plt.show()

AttributeError: 'Pipeline' object has no attribute 'coef_'