In [2]:
import re
import time

import lightgbm as lgb
import matplotlib as mp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
import seaborn as sns
from scipy.stats import probplot, ks_2samp
from sklearn.metrics import (roc_auc_score, recall_score, accuracy_score, auc, classification_report, \
                            confusion_matrix, plot_confusion_matrix, precision_recall_curve, \
                            plot_precision_recall_curve, roc_curve, plot_roc_curve)
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder

%matplotlib inline

In [3]:
import warnings
# Hide FutureWarning messages only; every other warning category is still shown.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
# Directory that holds the competition CSV files; change it here if the data moves.
DATA_DIR = '../input/geekbrain-data-analysis'

df_app = pd.read_csv(f'{DATA_DIR}/applications_history.csv')
df_bki = pd.read_csv(f'{DATA_DIR}/bki.csv')
df_clp = pd.read_csv(f'{DATA_DIR}/client_profile.csv')
df_pay = pd.read_csv(f'{DATA_DIR}/payments.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')
df_train = pd.read_csv(f'{DATA_DIR}/train.csv')

# Report the dimensions of every loaded table (same output format as before).
for frame_name, frame in [
    ("df_app", df_app),
    ("df_bki", df_bki),
    ("df_clp", df_clp),
    ("df_pay", df_pay),
    ("df_test", df_test),
    ("df_train", df_train),
]:
    print("{}.shape = {} rows, {} cols".format(frame_name, *frame.shape))

In [5]:
# объединяем трейн и тест
# Stack train and test row-wise into one frame with a fresh 0..n-1 index
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)
data.head(n=2)

In [6]:
# Dimensions of the combined train + test frame
data.shape

In [7]:
def clean_input(data) -> pd.DataFrame:
    """
    Lower-case the column names of a dataset and print its dimensions.

    The column names are replaced on the frame that is passed in, so the
    caller's object is modified; the same object is returned.

    Parameters
    ----------
    data: pandas.core.frame.DataFrame
        Dataset whose column names should be normalised.

    Returns
    -------
    data: pandas.core.frame.DataFrame
        The same frame, now with lower-case column names.

    """
    lowered_names = [name.lower() for name in data.columns]
    data.columns = lowered_names

    n_rows, n_cols = data.shape[0], data.shape[1]
    print(f"shape = {n_rows} rows, {n_cols} cols")

    return data

In [8]:
# Lower-case the client profile column names (also prints the frame's shape)
df_clp = clean_input(df_clp)

In [9]:
# Lower-case the column names of the combined train + test frame
data = clean_input(data)

In [10]:
# Preview the first rows of the client profile table
df_clp.head()

In [11]:
def one_hot_encoder(df, nan_as_category=True):
    """
    One-hot encode every object-dtype column of a DataFrame.

    Parameters
    ----------
    df: pandas.core.frame.DataFrame
        Frame to encode. It is not modified; a new frame is returned.

    nan_as_category: bool, optional, default = True
        Passed to ``pd.get_dummies`` as ``dummy_na``: when True an extra
        indicator column is created for missing values.

    Returns
    -------
    df: pandas.core.frame.DataFrame
        Frame in which the object columns are replaced by indicator columns.

    new_columns: list
        Names of the columns that were not present in the input frame.

    """
    original_columns = set(df.columns)
    # Use the "object" dtype alias: `np.object` was removed in NumPy 1.24
    # and raises AttributeError there.
    categorical_columns = df.select_dtypes(include=["object"]).columns.to_list()
    df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

## Client profile

In [12]:
def create_client_profile_features(X: pd.DataFrame, copy: bool = True) -> pd.DataFrame:
    """
    Build engineered features from the raw client profile table.

    Cleaning applied before the features are computed:

    * 'XNA' in ``gender`` is replaced with NaN;
    * the value 117000000.0 in ``total_salary`` is replaced with the column median;
    * only rows with ``total_salary`` < 20000000 are kept (rows with a missing
      salary do not pass the comparison and are dropped too);
    * 365243 in ``days_on_last_job`` is replaced with NaN;
    * object columns are one-hot encoded with ``one_hot_encoder``
      (no separate NaN indicator column).

    Parameters
    ----------
    X: pandas.core.frame.DataFrame
        Feature matrix with the raw client profile.

    copy: bool, optional, default = True
        Work on a copy of X. With ``copy=False`` the ``gender`` and
        ``total_salary`` replacements are written into the caller's frame;
        the returned frame is a new object in either case because of the
        row filter and the encoding step.

    Returns
    -------
    X_transformed: pandas.core.frame.DataFrame
        Client profile extended with the engineered features.

    """
    if copy:
        X = X.copy()

    scoring_columns = [
        "external_scoring_rating_1", "external_scoring_rating_2", "external_scoring_rating_3"
    ]

    X['gender'] = X['gender'].replace('XNA', np.nan)    # treat the 'XNA' placeholder as missing
    X['total_salary'] = X['total_salary'].replace(117000000.0, X['total_salary'].median())
    # Drop salary outliers; .copy() so the assignments below write to an
    # independent frame instead of a slice (avoids SettingWithCopyWarning).
    X = X[X['total_salary'] < 20000000].copy()
    # 365243 is used as a placeholder in 'days_on_last_job' -> NaN
    X["days_on_last_job"] = X["days_on_last_job"].replace(365243, np.nan)

    # One-hot encoding without a NaN indicator column
    X, _ = one_hot_encoder(X, nan_as_category=False)

    def get_age_label(age):
        """ Return the age group label (int); `age` is divided by 365 to get years. """
        age_years = age / 365
        if age_years < 27: return 1
        elif age_years < 40: return 2
        elif age_years < 50: return 3
        elif age_years < 65: return 4
        elif age_years < 99: return 5
        else: return 0      # also reached for NaN, since every comparison is False

    # Age segmentation
    X['age_range'] = X['age'].apply(get_age_label)

    # Combinations of the external scoring ratings
    X["external_scoring_prod"] = X["external_scoring_rating_1"] * X["external_scoring_rating_2"] * X["external_scoring_rating_3"]
    X["external_scoring_weighted"] = X["external_scoring_rating_1"] * 2 + X["external_scoring_rating_2"] * 1 + X["external_scoring_rating_3"] * 3

    # Row-wise statistics over the three external scoring ratings.
    # getattr resolves the same NumPy functions the former eval("np.<name>") did.
    for function_name in ["min", "max", "mean", "nanmedian", "var"]:
        feature_name = "external_scoring_rating_{}".format(function_name)
        X[feature_name] = getattr(np, function_name)(X[scoring_columns], axis=1)

    # Ratios of time-related features
    # experience relative to age
    X['ratio_experience_to_age'] = X["days_on_last_job"] / X["age"]
    X['ratio_salary_to_credit'] = X['total_salary'] / X['amount_credit']
    # how many salaries the client wants to borrow = credit amount / salary
    X['ratio_credit_to_salary'] = X['amount_credit'] / X['total_salary']
    X['income_per_person'] = X['total_salary'] / X['family_size']
    # share of the salary that goes to the loan payment = annuity / salary
    X['percent_annuity_salary'] = X['amount_annuity'] / X['total_salary']
    # credit amount divided by the annuity payment
    X['ratio_credit_to_annuity'] = X['amount_credit'] / X['amount_annuity']

    # Financial figures relative to age and experience
    X["ratio_salary_to_age"] = X["total_salary"] / X["age"]
    X["ratio_salary_to_experience"] = X["total_salary"] / X["days_on_last_job"]
    X["ratio_annuity_to_age"] = X["amount_annuity"] / X["age"]
    X["ratio_annuity_to_experience"] = X["amount_annuity"] / X["days_on_last_job"]
    X["ratio_credit_to_age"] = X["amount_credit"] / X["age"]
    X["ratio_credit_to_experience"] = X["amount_credit"] / X["days_on_last_job"]

    # Salary left after the annuity payment
    X["total_salary_net"] = X["total_salary"] - X["amount_annuity"]

    X["ratio_car_to_age"] = X["own_car_age"] / X["age"]
    X["ratio_car_to_experience"] = X["own_car_age"] / X["days_on_last_job"]
    # NOTE(review): named "ratio" but computed as a product; kept unchanged - confirm the intent.
    X["ratio_salary_to_region_population"] = X["total_salary"] * X["region_population"]
    # External ratings relative to age (age / 365.25)
    X['ratio_external_scoring_rating_1_to_age'] = X['external_scoring_rating_1'] / (X['age'] / 365.25)
    X['ratio_external_scoring_rating_2_to_age'] = X['external_scoring_rating_2'] / (X['age'] / 365.25)
    X['ratio_external_scoring_rating_3_to_age'] = X['external_scoring_rating_3'] / (X['age'] / 365.25)
    # External rating relative to the time spent on the last job
    X['ratio_external_scoring_rating_1_to_job'] = X['external_scoring_rating_1'] / (X['days_on_last_job'] / 365.25)
    # NOTE(review): named "ratio ... to_age" but computed as a product; kept unchanged - confirm the intent.
    X['ratio_external_scoring_rating_1_external_scoring_rating_2_to_age'] = X['external_scoring_rating_1'] * X['external_scoring_rating_2'] * X['age']
    X['ratio_external_scoring_rating_1_to_fam_size'] = X['external_scoring_rating_1'] / X['family_size']
    # Ratios between the external ratings and to the credit amount
    X['ratio_external_scoring_rating_1_to_amount_credit'] = X['external_scoring_rating_1'] / X['amount_credit']
    X['ratio_external_scoring_rating_1_to_scoring_2'] = X['external_scoring_rating_1'] / X['external_scoring_rating_2']
    X['ratio_external_scoring_rating_1_to_scoring_3'] = X['external_scoring_rating_1'] / X['external_scoring_rating_3']
    # Fixed: the denominator was 'amount_credit' (copy-paste slip), which did not match the feature name.
    X['ratio_external_scoring_rating_2_to_scoring_3'] = X['external_scoring_rating_2'] / X['external_scoring_rating_3']
    # Products of the external ratings with the region population and with each other
    # Fixed: this "product" feature was computed with a division.
    X['external_scoring_rating_2*region_population'] = X['external_scoring_rating_2'] * X['region_population']
    X['external_scoring_rating_1*external_scoring_rating_2'] = X['external_scoring_rating_1'] * X['external_scoring_rating_2']
    X['external_scoring_rating_1*external_scoring_rating_3'] = X['external_scoring_rating_1'] * X['external_scoring_rating_3']
    X['external_scoring_rating_2*external_scoring_rating_3'] = X['external_scoring_rating_2'] * X['external_scoring_rating_3']
    # Products of the external ratings with the days on the last job
    X['external_scoring_rating_1*days_on_last_job'] = X['external_scoring_rating_1'] * X['days_on_last_job']
    X['external_scoring_rating_2*days_on_last_job'] = X['external_scoring_rating_2'] * X['days_on_last_job']
    X['external_scoring_rating_3*days_on_last_job'] = X['external_scoring_rating_3'] * X['days_on_last_job']

    X['total_salary/12_amount_annuity_ratio_diff'] = X['total_salary'] / 12. - X['amount_annuity']
    X['age_days_on_last_job_diff'] = X['age'] - X['days_on_last_job']

    # External rating multiplied by the credit amount / annuity
    # (the original author describes these as "expected loss" features).
    X["expected_total_loss_1"] = X["external_scoring_rating_1"] * X["amount_credit"]
    X["expected_total_loss_2"] = X["external_scoring_rating_2"] * X["amount_credit"]
    X["expected_total_loss_3"] = X["external_scoring_rating_3"] * X["amount_credit"]
    X["expected_monthly_loss_1"] = X["external_scoring_rating_1"] * X["amount_annuity"]
    X["expected_monthly_loss_2"] = X["external_scoring_rating_2"] * X["amount_annuity"]
    X["expected_monthly_loss_3"] = X["external_scoring_rating_3"] * X["amount_annuity"]

    return X

In [13]:
# создадим новые признаки
df_clp = create_client_profile_features(df_clp)
df_clp.head()

In [14]:
# Attach the client profile features to every application (left join keeps all rows of `data`)
data = pd.merge(data, df_clp, how="left", on="application_number")

In [15]:
# Dimensions after attaching the client profile features
data.shape

In [17]:
# выделяем по маске данные из трейна и из теста
mask = data["target"].isnull()
features_to_drop = ["application_number", "target"]

# разделяем на трейн и тест
train, test = data.loc[~mask], data.loc[mask]

target, test_id = train["target"], test["application_number"]
train = train.drop(features_to_drop, axis=1)
test = test.drop(features_to_drop, axis=1)

# заменяем некорректные значения, которые могли образоваться в результате деления
train = train.replace(np.inf, np.nan)
train = train.replace(-np.inf, np.nan)
test = test.replace(np.inf, np.nan)
test = test.replace(-np.inf, np.nan)

In [18]:
# Integer codes for the two contract types
name_contract_type_dict = {'Cash': 0, 'Credit Card': 1}

In [19]:
# Encode the contract type as an integer; values missing from the mapping become NaN.
# NOTE(review): re-running this cell maps the already encoded 0/1 values to NaN.
train["name_contract_type"] = train["name_contract_type"].map(name_contract_type_dict)
test["name_contract_type"] = test["name_contract_type"].map(name_contract_type_dict)

In [20]:
# Distinct encoded contract types in the test part (NaN would mark unmapped values)
test.name_contract_type.unique()

In [21]:
def _sanitize_column_name(name):
    """Remove every character other than ASCII letters, digits and underscore."""
    return re.sub('[^A-Za-z0-9_]+', '', name)


# Several engineered feature names contain '*' and '/'; strip such characters from
# the column names of both parts (presumably because LightGBM rejects special
# characters in feature names - TODO confirm).
train = train.rename(columns=_sanitize_column_name)
test = test.rename(columns=_sanitize_column_name)

## Cross-validation (LightGBM)

In [None]:
def lightgbm_cross_validation(params, X, y, cv, categorical = None):
    """
    Cross-validation for a LightGBM classifier.

    Parameters
    ----------
    params: dict
        Model hyperparameters, passed to ``lgb.LGBMClassifier``.

    X: pandas.core.frame.DataFrame
        Feature matrix used to train the model. It is not modified.

    y: pandas.core.frame.Series
        Target vector used to train the model.

    cv: KFold or StratifiedKFold generator.
        Splitter object that defines the cross-validation strategy.

    categorical: list of str, optional, default = None
        Names of categorical features. They are label-encoded before
        training. The fitted encoders are not returned, so the same encoding
        cannot be re-applied to other data by the caller.

    Returns
    -------
    estimators: list
        Fitted model for every fold.

    oof_preds: np.array
        Out-of-fold predicted probabilities of the positive class.

    """
    if categorical:
        # Encode on a copy so the caller's frame is left untouched.
        X = X.copy()
        for feature in categorical:
            encoder = LabelEncoder()
            # Fill missing values before the string cast: on pandas < 3
            # astype(str) turns NaN into the string "nan", which made the
            # later fillna a no-op.
            X[feature] = encoder.fit_transform(
                X[feature].astype(object).fillna("NA").astype(str)
            )

    estimators, folds_scores = [], []
    oof_preds = np.zeros(X.shape[0])
    y_values = np.asarray(y)
    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        # cv.split yields positional indices, so index by position (.iloc /
        # array indexing) rather than by label; the two only coincide when the
        # index is exactly 0..n-1.
        x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y_values[train_idx], y_values[valid_idx]

        model = lgb.LGBMClassifier(**params)
        # NOTE(review): `verbose` and `early_stopping_rounds` as fit() keywords
        # were removed in LightGBM 4.0 (replaced by callbacks) - confirm the
        # installed version before upgrading.
        model.fit(
            x_train, y_train,
            eval_set=[(x_valid, y_valid)],
            eval_metric="auc", verbose=0, early_stopping_rounds=5000,
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)[:, 1]
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds

In [None]:
# Hyperparameters passed to lgb.LGBMClassifier.
# NOTE(review): `subsample` is set without `subsample_freq`; in LightGBM bagging
# is documented as active only with a non-zero frequency - confirm it has an effect.
lgbm_params = dict(
    boosting_type='gbdt',
    n_estimators=10000,
    learning_rate=0.05134,
    num_leaves=54,
    max_depth=10,
    subsample_for_bin=240000,
    reg_alpha=0.436193,
    reg_lambda=0.479169,
    colsample_bytree=0.508716,
    min_split_gain=0.024766,
    subsample=0.7,
    is_unbalance=False,
    random_state=27,
    silent=-1,
    verbose=-1,
)

In [None]:
# 8 shuffled folds with a fixed seed, so the splits are the same on every run
cv = KFold(n_splits=8, shuffle=True, random_state=282)

lgb_estimators, lgb_oof_preds = lightgbm_cross_validation(
    params=lgbm_params,
    X=train,
    y=target,
    cv=cv,
)

# Quality metric: ROC AUC over all out-of-fold predictions
lgb_oof_score = roc_auc_score(target, lgb_oof_preds)
print(f"OOF-score = {round(lgb_oof_score, 5)}")

In [None]:
# Accumulator for the test predictions, one slot per test row
y_pred = np.zeros(test.shape[0])

In [None]:
estimators = lgb_estimators

# (Re)initialise the accumulator in this cell so that running it again does not
# add the fold predictions on top of an earlier run (or onto a non-array object
# that the name `y_pred` may have been rebound to).
y_pred = np.zeros(test.shape[0])

# Sum of the positive-class probabilities predicted by every fold model
for estimator in estimators:
    y_pred += estimator.predict_proba(test)[:, 1]

In [None]:
# Preview the test feature matrix
test.head()

In [None]:
# Submission table: application ids plus the summed fold predictions divided by the
# number of folds.
# NOTE(review): this rebinds `y_pred` from an array to a DataFrame, so the cell cannot
# simply be run twice in a row.
y_pred = pd.DataFrame({
    "APPLICATION_NUMBER": test_id,
    "TARGET": y_pred / cv.n_splits
})

In [None]:
# Write the submission file without the index column
y_pred.to_csv('submit_13.csv', index=False, encoding='utf-8')

## Cross-validation (CatBoost)

In [None]:
def catboost_cross_validation(params, X, y, cv, categorical = None):
    """
    Cross-validation for a CatBoost classifier.

    NOTE(review): the function uses the name ``cb``, which is not imported in
    the visible part of the notebook; ``import catboost as cb`` is presumably
    required before calling it - confirm.

    Parameters
    ----------
    params: dict
        Model hyperparameters, passed to ``cb.CatBoostClassifier``.

    X: pandas.core.frame.DataFrame
        Feature matrix used to train the model. It is not modified.

    y: pandas.core.frame.Series
        Target vector used to train the model.

    cv: KFold or StratifiedKFold generator.
        Splitter object that defines the cross-validation strategy.

    categorical: list of str, optional, default = None
        Names of categorical features. They are label-encoded and also passed
        to the model as ``cat_features``.

    Returns
    -------
    estimators: list
        Fitted model for every fold.

    encoders: dict
        Fitted LabelEncoder for every categorical feature, keyed by feature name.

    oof_preds: np.array
        Out-of-fold predicted probabilities of the positive class.

    """
    estimators, encoders = [], {}
    oof_preds = np.zeros(X.shape[0])

    if categorical:
        # Encode on a copy so the caller's frame is left untouched.
        X = X.copy()
        for feature in categorical:
            encoder = LabelEncoder()
            # Fill missing values before the string cast: on pandas < 3
            # astype(str) turns NaN into the string "nan", which made the
            # later fillna a no-op.
            X[feature] = encoder.fit_transform(
                X[feature].astype(object).fillna("NA").astype(str)
            )
            encoders[feature] = encoder

    y_values = np.asarray(y)
    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        # cv.split yields positional indices, so index by position rather than
        # by label; the two only coincide when the index is exactly 0..n-1.
        x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y_values[train_idx], y_values[valid_idx]

        model = cb.CatBoostClassifier(**params)
        model.fit(
            x_train, y_train,
            eval_set=[(x_train, y_train), (x_valid, y_valid)],
            early_stopping_rounds=5000,
            cat_features=categorical)

        oof_preds[valid_idx] = model.predict_proba(x_valid)[:, 1]
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        estimators.append(model)

    return estimators, encoders, oof_preds

## Cross-validation (XGBoost)

In [None]:
def xgboost_cross_validation(params, X, y, cv, categorical = None):
    """
    Cross-validation for an XGBoost classifier.

    NOTE(review): the function uses the name ``xgb``, which is not imported in
    the visible part of the notebook; ``import xgboost as xgb`` is presumably
    required before calling it - confirm.

    Parameters
    ----------
    params: dict
        Model hyperparameters, passed to ``xgb.XGBClassifier``.

    X: pandas.core.frame.DataFrame
        Feature matrix used to train the model. It is not modified.

    y: pandas.core.frame.Series
        Target vector used to train the model.

    cv: KFold or StratifiedKFold generator.
        Splitter object that defines the cross-validation strategy.

    categorical: list of str, optional, default = None
        Names of categorical features. They are label-encoded before training.

    Returns
    -------
    estimators: list
        Fitted model for every fold.

    encoders: dict
        Fitted LabelEncoder for every categorical feature, keyed by feature name.

    oof_preds: np.array
        Out-of-fold predicted probabilities of the positive class.

    """
    estimators, encoders = [], {}
    oof_preds = np.zeros(X.shape[0])

    if categorical:
        # Encode on a copy so the caller's frame is left untouched.
        X = X.copy()
        for feature in categorical:
            encoder = LabelEncoder()
            # Fill missing values before the string cast: on pandas < 3
            # astype(str) turns NaN into the string "nan", which made the
            # later fillna a no-op.
            X[feature] = encoder.fit_transform(
                X[feature].astype(object).fillna("NA").astype(str)
            )
            encoders[feature] = encoder

    y_values = np.asarray(y)
    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        # cv.split yields positional indices, so index by position rather than
        # by label; the two only coincide when the index is exactly 0..n-1.
        x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y_values[train_idx], y_values[valid_idx]

        model = xgb.XGBClassifier(**params)
        # NOTE(review): newer XGBoost releases expect `early_stopping_rounds`
        # on the estimator rather than in fit() - confirm against the
        # installed version.
        model.fit(
            x_train, y_train,
            eval_set=[(x_train, y_train), (x_valid, y_valid)],
            early_stopping_rounds=5000,
            verbose=50)

        oof_preds[valid_idx] = model.predict_proba(x_valid)[:, 1]
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        estimators.append(model)

    return estimators, encoders, oof_preds
