## Задачи
1. Обучить модель логистической регрессии для скоринга заявок не более чем на 20 признаках.
2. Для выбранных переменных рассчитать коэффициенты корреляции и сохранить их в файл формата XLSX.
3. Проскорить тестовую выборку из файла `test_oot.pkl`, получив два поля `application_id_axi` и `score` в формате PKL.


In [1]:
!pip install openpyxl



In [2]:
import warnings

import numpy as np
import pandas as pd
from category_encoders.woe import WOEEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, log_loss, brier_score_loss
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.preprocessing import KBinsDiscretizer

# Silence the "Bins whose width are too small" warning coming from sklearn's
# discretization module (presumably raised by KBinsDiscretizer during the
# quantile binning used further down — confirm). `message` is a regex matched
# against the start of the warning text.
warnings.filterwarnings(
    "ignore",
    message="Bins whose width are too small*",
    module="sklearn.preprocessing._discretization"
)

In [3]:
def convert_float16(df_):
    """Widen every float16 column of ``df_`` to float32.

    The frame is modified in place and also returned, so the call can be
    used either as a statement or as an expression.

    Args:
        df_: DataFrame whose float16 columns should be widened.

    Returns:
        The same DataFrame object, with float16 columns stored as float32.
    """
    half_precision = df_.select_dtypes(include=['float16']).columns
    widened = df_[half_precision].astype(np.float32)
    df_[half_precision] = widened
    return df_


# Columns that are never used as model inputs: the target (`fpd15`), the
# `finalscore_point` score that is later read as the comparison baseline, and
# columns that look like identifiers / bookkeeping fields judging by name.
exclude = [
    'fpd15', 'finalscore_point', 'limit_final', 'application_id_axi',
    'segment', 'client_id', 'loan_id', 'set_type', 'application_date'
]

# Substrings that flag a column as suspicious: any column whose lower-cased
# name contains one of them is removed from the feature set.
# NOTE(review): presumably these mark potential target leakage — confirm.
leak_keywords = [
    'bad', 'good', 'closed', 'pastdue', 'outst', 'paymnt', 'paymtnbki', 'liv', 'reg', 'date', 'name'
]

# NOTE(review): pd.read_pickle can execute arbitrary code embedded in the
# file — only load pickles from a trusted source.
df = pd.read_pickle('data/train.pkl')
# Keep only rows with a known target and rebuild a contiguous index.
df = df[df['fpd15'].notna()].reset_index(drop=True)
df = convert_float16(df)

# Substring match, so e.g. 'application_date' is flagged via 'date'.
suspicious = [col for col in df.columns if any(key in col.lower() for key in leak_keywords)]
# Candidate features: everything neither explicitly excluded nor flagged.
raw_features = [c for c in df.columns if (c not in exclude and c not in suspicious)]

# X keeps every non-target column (including the excluded ones); the
# restriction to `raw_features` is applied after the train/test split.
X = df.drop(columns=['fpd15'])
y = df['fpd15']

In [4]:
# Train/test split

# Stratified 75/25 hold-out split with a fixed seed. X_tr / X_te keep all
# non-target columns because `finalscore_point` is read from them later.
X_tr, X_te, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25, random_state=42)
# Model inputs: copies restricted to the candidate feature columns.
X_train, X_test = X_tr[raw_features].copy(), X_te[raw_features].copy()

In [5]:
def fit_num_features(df, missing_threshold=0.8, var_threshold=0.01, n_bins=5):
    """Fit the numeric preprocessing steps on a training frame.

    Numeric columns are filtered (too many missing values, low variance,
    exact duplicates); a median imputer and a quantile binner are then
    fitted on the columns that survive.

    Args:
        df: Training frame; only its numeric columns are considered.
        missing_threshold: Largest allowed share of missing values per column.
        var_threshold: Threshold handed to ``VarianceThreshold``.
        n_bins: Number of bins handed to ``KBinsDiscretizer``.

    Returns:
        Dict with keys ``'num_cols'`` (names of the kept columns),
        ``'imputer'`` (fitted ``SimpleImputer``) and ``'kbin'`` (fitted
        ``KBinsDiscretizer``).
    """
    candidates = df.select_dtypes(include='number').columns.tolist()

    # 1. Drop columns whose share of missing values exceeds the threshold.
    na_share = df[candidates].isnull().mean()
    candidates = [name for name in candidates if na_share[name] <= missing_threshold]

    # 2. Drop quasi-constant columns.
    selector = VarianceThreshold(threshold=var_threshold)
    selector.fit(df[candidates])
    support = selector.get_support()
    candidates = [name for name, kept in zip(df[candidates].columns, support) if kept]

    # 3. Drop columns whose values duplicate an earlier column.
    deduplicated = df[candidates].T.drop_duplicates()
    candidates = deduplicated.index.tolist()

    # 4. Median imputer, fitted on the training data only.
    imputer = SimpleImputer(strategy='median')
    imputer.fit(df[candidates])
    imputed = pd.DataFrame(
        imputer.transform(df[candidates]), columns=candidates, index=df.index
    )

    # 5. Quantile binning, fitted on the imputed values.
    kbin = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='quantile')
    kbin.fit(imputed[candidates])

    return {'num_cols': candidates, 'imputer': imputer, 'kbin': kbin}


def transform_num_features(df, params):
    """Apply the fitted numeric preprocessing to a frame.

    Args:
        df: Frame containing at least the columns in ``params['num_cols']``.
        params: Dict with keys ``'num_cols'``, ``'imputer'`` and ``'kbin'``;
            the latter two must expose a ``transform`` method.

    Returns:
        Integer DataFrame of bin codes with the selected columns and the
        index of ``df``. The input frame is left untouched.
    """
    selected = df[params['num_cols']].copy()
    labels = {'columns': selected.columns, 'index': selected.index}

    # Impute first, then discretise the imputed values.
    imputed = pd.DataFrame(params['imputer'].transform(selected), **labels)
    binned = pd.DataFrame(params['kbin'].transform(imputed), **labels)
    return binned.astype(int)

In [6]:
def fit_cat_features(df, missing_threshold=0.6):
    """Select usable categorical columns and learn their fill values.

    A column of dtype ``object`` or ``category`` is kept when its share of
    missing values does not exceed ``missing_threshold`` and it has more
    than one distinct non-missing value.

    Args:
        df: Training frame.
        missing_threshold: Largest allowed share of missing values per column.

    Returns:
        Dict with keys ``'cat_cols'`` (kept column names, in frame order)
        and ``'fill_values'`` (column name -> its most frequent value).
    """
    kept = []
    fill_values = {}
    for name in df.select_dtypes(include=['object', 'category']).columns.tolist():
        column = df[name]
        sparse_enough = column.isnull().mean() <= missing_threshold
        if sparse_enough and column.nunique() > 1:
            kept.append(name)
            # The mode is what missing entries will be replaced with later.
            fill_values[name] = column.mode()[0]

    return {'cat_cols': kept, 'fill_values': fill_values}


def transform_cat_features(df, params):
    """Return the selected categorical columns with missing values filled.

    Args:
        df: Frame containing at least the columns in ``params['cat_cols']``.
        params: Dict with keys ``'cat_cols'`` (columns to keep) and
            ``'fill_values'`` (column name -> replacement for missing values).

    Returns:
        New DataFrame restricted to ``params['cat_cols']``; the input frame
        is not modified.
    """
    selected = df[params['cat_cols']].copy()
    # Per-column replacement using the values stored at fit time.
    return selected.fillna(params['fill_values'])

In [7]:
# Fit all preprocessing parameters on the training split only.
num_params = fit_num_features(X_train)
cat_params = fit_cat_features(X_train)

X_train_num = transform_num_features(X_train, num_params)
X_train_cat = transform_cat_features(X_train, cat_params)

# The hold-out split reuses the parameters fitted on train.
X_test_num = transform_num_features(X_test, num_params)
X_test_cat = transform_cat_features(X_test, cat_params)

# Rebind X_train / X_test to the preprocessed frames (binned numerics
# followed by filled categoricals).
X_train = pd.concat([X_train_num, X_train_cat], axis=1)
X_test = pd.concat([X_test_num, X_test_cat], axis=1)


In [8]:
# WOE Encoding

def woe_encode_oof(X, y, cols, n_splits=5, random_state=42):
    """Encode ``X`` with WOE values computed out-of-fold.

    The rows are split into ``n_splits`` shuffled folds. Each fold is
    encoded by a ``WOEEncoder`` fitted on the remaining folds, so a row's
    own target value is not used for its encoding.

    Args:
        X: Feature frame to encode.
        y: Target aligned positionally with ``X``.
        cols: Columns passed to ``WOEEncoder``.
        n_splits: Number of folds.
        random_state: Seed for the fold shuffling.

    Returns:
        DataFrame indexed like ``X`` holding the encoded values.
    """
    encoded = pd.DataFrame(index=X.index)
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for fit_pos, holdout_pos in splitter.split(X):
        holdout = X.iloc[holdout_pos]

        fold_encoder = WOEEncoder(cols=cols)
        fold_encoder.fit(X.iloc[fit_pos], y.iloc[fit_pos])
        holdout_woe = fold_encoder.transform(holdout)

        # Write this fold's encodings into the rows it covers.
        encoded.loc[holdout.index, holdout_woe.columns] = holdout_woe.values

    return encoded


# Training rows get out-of-fold encodings.
X_train_woe = woe_encode_oof(X_train, y_train, X_train.columns)
# A single encoder fitted on the whole training split encodes the hold-out
# split here and is reused for the OOT data later in the notebook.
woe_encoder = WOEEncoder(cols=X_train.columns)
woe_encoder.fit(X_train, y_train)
X_test_woe = woe_encoder.transform(X_test)

In [9]:
# Compute Information Value (IV)

def calc_iv(df, feature, target):
    """Return the Information Value of ``feature`` with respect to ``target``.

    Rows are grouped by each distinct value of ``feature``. For every group
    the share of events (sum of ``target``) and of non-events is computed,
    and the IV is the sum over groups of
    ``(event_share - non_event_share) * log(event_share / non_event_share)``.
    A constant of 1e-8 is added to both shares inside the logarithm.

    Args:
        df: Frame containing both columns.
        feature: Name of the grouping column.
        target: Name of the target column (summed to count events).

    Returns:
        The IV as a float.
    """
    stats = df.groupby(feature, observed=True)[target].agg(['count', 'sum'])
    events = stats['sum']
    non_events = stats['count'] - events

    event_share = events / events.sum()
    non_event_share = non_events / non_events.sum()

    # The small constant keeps the log finite for groups with an empty class.
    woe = np.log((event_share + 1e-8) / (non_event_share + 1e-8))
    return ((event_share - non_event_share) * woe).sum()


# Attach the target once; the joined frame is the same for every feature, so
# rebuilding it inside the comprehension only copied the data repeatedly.
_woe_with_target = pd.concat([X_train_woe, y_train], axis=1)

# IV of every encoded feature against the target, computed on the training split.
iv_dict = {
    col: calc_iv(_woe_with_target, col, 'fpd15')
    for col in X_train_woe.columns
}
# Highest IV first.
iv_series = pd.Series(iv_dict).sort_values(ascending=False)


In [10]:
# Keep only the 20 features with the highest IV

best_features = iv_series.head(20).index.tolist()

# Same column subset for the training and the hold-out split.
X_train_sel = X_train_woe[best_features]
X_test_sel = X_test_woe[best_features]

# Print the selected features as a numbered list.
print("Лучшие фичи: \n" + '\n'.join(f'{i + 1:2d}. {feature}' for i, feature in enumerate(best_features)))

Лучшие фичи: 
 1. sumlimitallactivenbki
 2. avgcurbalall90nbki
 3. avgcurbalall60nbki
 4. avgcurbalall180nbki
 5. sumcurbalallactivenbki
 6. sumcurbalallnbki
 7. avgcurbal_dcurrrequestall90nbki
 8. sumlimitallnbki
 9. sumcurbalall90nbki
10. avgcurbal_dcurrrequestall60nbki
11. avglimitallactivenbki
12. avglimit_dcurrrequestallactivenbki
13. avgcurbal_dlimitall90nbki
14. sumcurbalall180nbki
15. avgcurbal_dcurrrequestall180nbki
16. avglimitall60activenbki
17. sumlimitall365activenbki
18. avglimitall180activenbki
19. avglimitall30nbki
20. avglimitall270activenbki


In [11]:
# Write the Excel file

# Pairwise correlations between the selected (encoded) training features.
corr_matrix = X_train_woe[best_features].corr()
output_path = 'data/correlation_matrix.xlsx'
corr_matrix.to_excel(output_path)

In [12]:
# Hyperparameter search

# Grid over regularisation strength and penalty type; liblinear is the only
# solver tried.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

logreg = LogisticRegression(max_iter=1000, class_weight='balanced')

# 5-fold cross-validated search scored by ROC AUC; `model` is the fitted
# search object and is used for prediction below.
model = GridSearchCV(logreg, param_grid, scoring='roc_auc', cv=5, n_jobs=-1)
model.fit(X_train_sel, y_train)

print("Лучшие параметры:", model.best_params_)

Лучшие параметры: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}


In [13]:
# Metric computation

def compute_continuous_metrics(y_true, probs, baseline=None):
    """Compute quality metrics for predicted probabilities.

    Args:
        y_true: True binary labels.
        probs: Predicted scores/probabilities for the positive class.
        baseline: Optional reference scores aligned with ``probs``. When
            omitted, the correlation entry is NaN.

    Returns:
        Dict with keys ``'roc_auc'``, ``'log_loss'``, ``'brier_score'`` and
        ``'corr_vs_fs'`` (Pearson correlation between ``probs`` and
        ``baseline``).
    """
    if baseline is None:
        # np.corrcoef(probs, None) yields a scalar, so indexing it with
        # [0, 1] would raise; report "not available" instead.
        corr_vs_baseline = float('nan')
    else:
        corr_vs_baseline = np.corrcoef(probs, baseline)[0, 1]

    return {
        'roc_auc': roc_auc_score(y_true, probs),
        'log_loss': log_loss(y_true, probs),
        'brier_score': brier_score_loss(y_true, probs),
        'corr_vs_fs': corr_vs_baseline,
    }

In [14]:
# Predicted probabilities on train and test

p_train = model.predict_proba(X_train_sel)[:, 1]
p_test = model.predict_proba(X_test_sel)[:, 1]
# Reference score taken from the unfiltered split frames.
baseline_train = X_tr['finalscore_point']
baseline_test = X_te['finalscore_point']


train_metrics = compute_continuous_metrics(y_train, p_train, baseline=baseline_train)
test_metrics = compute_continuous_metrics(y_test, p_test, baseline=baseline_test)
# NOTE(review): the baseline metrics are computed on the training split only,
# so the `baseline` column is not directly comparable with the `test` column.
baseline_metrics = compute_continuous_metrics(y_train, X_tr['finalscore_point'], baseline=baseline_train)

# One column per evaluated score, one row per metric.
metrics_df = pd.DataFrame({
    'train': train_metrics,
    'test': test_metrics,
    'baseline': baseline_metrics
})

print(metrics_df)

                train      test  baseline
roc_auc      0.612660  0.604439  0.694049
log_loss     0.673237  0.673046  0.442374
brier_score  0.240128  0.240040  0.141154
corr_vs_fs   0.485413  0.481238  1.000000


In [15]:
# Load the out-of-time test sample

# NOTE(review): pd.read_pickle can execute arbitrary code embedded in the
# file — only load pickles from a trusted source.
df_oot = pd.read_pickle('data/test_oot.pkl')
# Same candidate feature columns as used for training.
X_oot = df_oot[raw_features]

In [16]:
# Feature transformation

# Reuse the parameters fitted on the training split; nothing is refitted here.
X_oot_num = transform_num_features(X_oot, num_params)
X_oot_cat = transform_cat_features(X_oot, cat_params)
X_oot = pd.concat([X_oot_num, X_oot_cat], axis=1)

In [17]:
# WOE

# Encode with the encoder fitted on the full training split, then keep the
# selected features.
X_oot_woe = woe_encoder.transform(X_oot)
X_oot_sel = X_oot_woe[best_features]

In [18]:
# Score computation

# Score = predicted probability of the positive class.
df_oot['score'] = model.predict_proba(X_oot_sel)[:, 1]

In [19]:
# Save the scores

# Only the application id and its score are written out.
output_df = df_oot[['application_id_axi', 'score']]
output_path = 'data/test_scores.pkl'
output_df.to_pickle(output_path)

In [20]:
# Sanity check

# Reload the saved file and preview its first rows.
test = pd.read_pickle('data/test_scores.pkl')
test.head()

Unnamed: 0,application_id_axi,score
2591,6263684-1,0.629545
2592,6263717-1,0.479027
2593,6263719-1,0.475985
2594,6263722-1,0.70774
2595,6263727-1,0.544553
