In [72]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from collections import Counter

In [73]:
data = pd.read_csv("zbiór_10.csv")

In [74]:
X = data.drop(columns=["default"])
y = data["default"]

# dropujemy szczegolna forma wlasnosci (kazdy ma taka sama)
X = X.drop(columns="szczegolnaFormaPrawna_Symbol")

unique_values = X['formaWlasnosci_Symbol'].unique()

categorical_cols = ['formaWlasnosci_Symbol']

# lista kolumn OHE odpowiadających symbolom form własności
numeric_cols = [
    'ohe_fw_214','ohe_fw_215','ohe_fw_113','ohe_fw_216','ohe_fw_225','ohe_fw_226',
    'ohe_fw_224','ohe_fw_227','ohe_fw_234','ohe_fw_111','ohe_fw_112','ohe_fw_235',
    'ohe_fw_132','ohe_fw_123','ohe_fw_133','ohe_fw_122','ohe_fw_338', 'ohe_fw_000'
]

# inicjalizacja OneHotEncoder z ustalonymi kategoriami
ohe = OneHotEncoder(
    categories=[sorted([int(c.split('_')[-1]) for c in numeric_cols])],
    sparse_output=False,  # zmiana z sparse -> sparse_output
    drop=None
)

# dopasowanie i transformacja
ohe_array = ohe.fit_transform(X[['formaWlasnosci_Symbol']])

# utworzenie DataFrame z odpowiednimi nazwami kolumn
df_ohe = pd.DataFrame(ohe_array, columns=numeric_cols, index=X.index)

# połączenie z oryginalnym df
df = pd.concat([X, df_ohe], axis=1)
df = df.drop(columns=["formaWlasnosci_Symbol"])

In [75]:
## analogicznie dla kolumny 'schemat_wsk_bilans'

ohe_cols = ['SFJIN_wsk_bilans', 'SFJMI_wsk_bilans', 'SFJMA_wsk_bilans']

# inicjalizacja OneHotEncoder z ustalonymi kategoriami
ohe = OneHotEncoder(
    categories = [['SFJIN', 'SFJMI', 'SFJMA']],
    sparse_output=False,  # zmiana z sparse -> sparse_output
    drop=None
)

# dopasowanie i transformacja
ohe_array = ohe.fit_transform(X[['schemat_wsk_bilans']])

# utworzenie DataFrame z odpowiednimi nazwami kolumn
df_ohe = pd.DataFrame(ohe_array, columns=ohe_cols, index=X.index)

# połączenie z oryginalnym df
df = pd.concat([df, df_ohe], axis=1)
df = df.drop(columns=["schemat_wsk_bilans"])

In [76]:
## i jeszcze raz dla 'schemat_wsk_rzis'

ohe_cols = ['SFJIN_wsk_rzis', 'SFJMI_wsk_rzis', 'SFJMA_wsk_rzis']

# inicjalizacja OneHotEncoder z ustalonymi kategoriami
ohe = OneHotEncoder(
    categories = [['SFJIN', 'SFJMI', 'SFJMA']],
    sparse_output=False,  # zmiana z sparse -> sparse_output
    drop=None
)

# dopasowanie i transformacja
ohe_array = ohe.fit_transform(X[['schemat_wsk_rzis']])

# utworzenie DataFrame z odpowiednimi nazwami kolumn
df_ohe = pd.DataFrame(ohe_array, columns=ohe_cols, index=X.index)

# połączenie z oryginalnym df
df = pd.concat([df, df_ohe], axis=1)
df = df.drop(columns=["schemat_wsk_rzis"])

In [77]:
# Stratified K Fold


def stratified_train_val_test_split(X, y, test_size = 0.15, val_size = 0.15, random_state = 2137):
    # 1) hold out the test set
    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    # 2) compute val fraction relative to the remaining (X_temp)
    #    val_size is fraction of original; convert to fraction of X_temp
    val_rel = val_size / (1.0 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=val_rel, stratify=y_temp, random_state=random_state
    )

    # quick class-count check
    print("Counts: train", Counter(y_train), "val", Counter(y_val), "test", Counter(y_test))
    return X_train, X_val, X_test, y_train, y_val, y_test


In [78]:
X_train, X_val, X_test, y_train, y_val, y_test = stratified_train_val_test_split(df, y)

Counts: train Counter({0: 1971, 1: 128}) val Counter({0: 423, 1: 28}) test Counter({0: 423, 1: 27})


In [79]:
# wybieramy top_n najczęstszych kategorii
X = df.copy()
top_n = 10
top_values = X_train['pkdKod'].value_counts().nlargest(top_n).index

# tworzymy nową kolumnę w X_train, gdzie rzadkie wartości są zastąpione przez '0'
X_train['pkdKod_grouped'] = X_train['pkdKod'].where(X_train['pkdKod'].isin(top_values), other='0')
X_train = X_train.drop(columns=['pkdKod'])

# tworzymy nową kolumnę w X_val, gdzie rzadkie wartości są zastąpione przez '0'
X_val['pkdKod_grouped'] = X_val['pkdKod'].where(X_val['pkdKod'].isin(top_values), other='0')
X_val = X_val.drop(columns=['pkdKod'])

# tworzymy nową kolumnę w X_test, gdzie rzadkie wartości są zastąpione przez '0'
X_test['pkdKod_grouped'] = X_test['pkdKod'].where(X_test['pkdKod'].isin(top_values), other='0')
X_test = X_test.drop(columns=['pkdKod'])


In [80]:
# łączymy X i y do obliczenia statystyk
train_temp = X_train.copy()
train_temp['default'] = y

# agregujemy liczbe przypadkow
agg = train_temp.groupby('pkdKod_grouped')['default'].agg(['sum', 'count']).rename(columns={'sum': 'bad', 'count': 'total'})
agg['good'] = agg['total'] - agg['bad']

# smoothing, żeby uniknąć dzielenia przez zero
smoothing = 0.5
agg['bad_s'] = agg['bad'] + smoothing
agg['good_s'] = agg['good'] + smoothing

total_bad = agg['bad_s'].sum()
total_good = agg['good_s'].sum()

agg['woe'] = np.log((agg['good_s'] / total_good) / (agg['bad_s'] / total_bad))

woe_map = agg['woe'].to_dict()

# dodajemy nową kolumnę do X
X_train['WoE_pkdKod_grouped'] = X_train['pkdKod_grouped'].map(woe_map)

X_train = X_train.drop(columns=['pkdKod_grouped'])

X_val['WoE_pkdKod_grouped'] = X_val['pkdKod_grouped'].map(woe_map)
X_test['WoE_pkdKod_grouped'] = X_test['pkdKod_grouped'].map(woe_map)



In [81]:
# inf na nan
for df in (X_train, X_val, X_test):
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

for df in (X_train, X_val, X_test):
    for c in X_train:
        df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)

cols_to_impute = list(X_train.columns) 

imputer = SimpleImputer(strategy="median")
imputer.fit(X_train[cols_to_impute])

X_train[cols_to_impute] = imputer.transform(X_train[cols_to_impute])
X_val[cols_to_impute]   = imputer.transform(X_val.reindex(columns=cols_to_impute))
X_test[cols_to_impute]  = imputer.transform(X_test.reindex(columns=cols_to_impute))


  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mial_braki_danych"] = df[c].isna().astype(int)
  df[f"{c}_mia

In [83]:

# Wywalamy za bardzo skorelowane
corr_matrix = X_train.corr().abs()

upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

X_train.drop(to_drop, axis=1, inplace=True)

X_val.drop(to_drop, axis=1, inplace=True)
X_test.drop(to_drop, axis=1, inplace=True)