Обучить алгоритмы LightGBM и XGBoost, получить OOF прогнозы, оценить корреляцию прогнозов на обучающей выборке. Применить модели на тестовую выборку и оценить корреляцию.

In [35]:
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score

In [2]:
client_profile = pd.read_csv('data/client_profile.csv')
client_profile.head(2)

Unnamed: 0,APPLICATION_NUMBER,GENDER,CHILDRENS,TOTAL_SALARY,AMOUNT_CREDIT,AMOUNT_ANNUITY,EDUCATION_LEVEL,FAMILY_STATUS,REGION_POPULATION,AGE,...,FAMILY_SIZE,EXTERNAL_SCORING_RATING_1,EXTERNAL_SCORING_RATING_2,EXTERNAL_SCORING_RATING_3,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,123666076,F,0,157500.0,270000.0,13500.0,Incomplete higher,Civil marriage,0.008068,8560,...,2.0,0.329471,0.236315,0.678568,0.0,0.0,0.0,0.0,1.0,2.0
1,123423688,F,0,270000.0,536917.5,28467.0,Secondary / secondary special,Married,0.020246,23187,...,2.0,,0.442295,0.802745,0.0,0.0,0.0,0.0,1.0,1.0


In [5]:
test = pd.read_csv('data/test.csv')
test.head(2)

Unnamed: 0,APPLICATION_NUMBER,NAME_CONTRACT_TYPE
0,123724268,Cash
1,123456549,Cash


In [6]:
train = pd.read_csv('data/train.csv')
train.head(2)

Unnamed: 0,APPLICATION_NUMBER,TARGET,NAME_CONTRACT_TYPE
0,123687442,0,Cash
1,123597908,1,Cash


In [7]:
train_index = train['APPLICATION_NUMBER']
test_index = test['APPLICATION_NUMBER']

In [9]:
train = train.merge(client_profile, on='APPLICATION_NUMBER', how='left')
test = test.merge(client_profile, on='APPLICATION_NUMBER', how='left')

In [20]:
train.shape

(110093, 47)

In [19]:
test.shape

(165141, 47)

In [10]:
target = train['TARGET']

In [11]:
train = train.drop(['APPLICATION_NUMBER', 'TARGET'], axis=1)
test = test.drop(['APPLICATION_NUMBER'], axis=1)

In [17]:
def encode_labels(df):
    categorical_cols = df.select_dtypes(include="object")
    for col in categorical_cols:
        df[col] = LabelEncoder().fit_transform(df[col].fillna('null'))
        
    return df

In [18]:
train = encode_labels(train)
test = encode_labels(test)

In [65]:
def make_cross_validation(X: pd.DataFrame,
                          y: pd.Series,
                          estimator: object,
                          metric: callable,
                          cv_strategy):
    """
    Кросс-валидация.

    Parameters
    ----------
    X: pd.DataFrame
        Матрица признаков.

    y: pd.Series
        Вектор целевой переменной.

    estimator: callable
        Объект модели для обучения.

    metric: callable
        Метрика для оценки качества решения.
        Ожидается, что на вход будет передана функция,
        которая принимает 2 аргумента: y_true, y_pred.

    cv_strategy: cross-validation generator
        Объект для описания стратегии кросс-валидации.
        Ожидается, что на вход будет передан объект типа
        KFold или StratifiedKFold.

    Returns
    -------
    oof_score: float
        Значение метрики качества на OOF-прогнозах.

    fold_train_scores: List[float]
        Значение метрики качества на каждом обучающем датасете кросс-валидации.

    fold_valid_scores: List[float]
        Значение метрики качества на каждом валидационном датасете кросс-валидации.

    oof_predictions: np.array
        Прогнозы на OOF.

    """
    estimators, fold_train_scores, fold_valid_scores = [], [], []
    oof_predictions = np.zeros(X.shape[0])

    for fold_number, (train_idx, valid_idx) in enumerate(cv_strategy.split(X, y)):
        x_train, x_valid = X.loc[train_idx], X.loc[valid_idx]
        y_train, y_valid = y.loc[train_idx], y.loc[valid_idx]

        estimator.fit(x_train, y_train)
        y_train_pred = estimator.predict_proba(x_train)[:, 1]
        y_valid_pred = estimator.predict_proba(x_valid)[:, 1]

        fold_train_scores.append(metric(y_train, y_train_pred))
        fold_valid_scores.append(metric(y_valid, y_valid_pred))
        oof_predictions[valid_idx] = y_valid_pred

        msg = (
            f"Fold: {fold_number+1}, train-observations = {len(train_idx)}, "
            f"valid-observations = {len(valid_idx)}\n"
            f"train-score = {round(fold_train_scores[fold_number], 4)}, "
            f"valid-score = {round(fold_valid_scores[fold_number], 4)}" 
        )
        print(msg)
        print("="*69)
        estimators.append(estimator)

    oof_score = metric(y, oof_predictions)
    print(f"CV-results train: {round(np.mean(fold_train_scores), 4)} +/- {round(np.std(fold_train_scores), 3)}")
    print(f"CV-results valid: {round(np.mean(fold_valid_scores), 4)} +/- {round(np.std(fold_valid_scores), 3)}")
    print(f"OOF-score = {round(oof_score, 4)}")

    return estimators, oof_score, fold_train_scores, fold_valid_scores, oof_predictions

In [54]:
x_train, x_valid = train_test_split(
   train, train_size=0.75, shuffle=True, random_state=1,
)
y_train, y_valid = train_test_split(
    target, train_size=0.75, shuffle=True, random_state=1,
)

In [55]:
dtrain = xgb.DMatrix(
    data=x_train, label=y_train
)
dvalid = xgb.DMatrix(
    data=x_valid, label=y_valid
)

dtest = xgb.DMatrix(data=test)


In [61]:
params = {

    "objective": "binary:logistic",
    "eval_metric": "auc",
    "learning_rate": 0.01,
    "n_estimators": 100000,
    "max_depth": 5,
    "seed": 99,
    'grow_policy': 'lossguide',
    'max_leaves' : 1000
}

model_xgb = xgb.XGBClassifier(params=params,
    dtrain=dtrain,
    num_boost_round=10000,
    early_stopping_rounds=100,
    verbose_eval=100)

In [66]:
cv_strategy = KFold(n_splits=5, random_state=1)

xgb_cv = make_cross_validation(
    train, target, model_xgb, metric=roc_auc_score, cv_strategy=cv_strategy
)



Parameters: { dtrain, early_stopping_rounds, num_boost_round, params, verbose_eval } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Fold: 1, train-observations = 88074, valid-observations = 22019
train-score = 0.8595, valid-score = 0.7029
Parameters: { dtrain, early_stopping_rounds, num_boost_round, params, verbose_eval } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Fold: 2, train-observations = 88074, valid-observations = 22019
train-score = 0.8591, valid-score = 0.7005
Parameters: { dtrain, early_stopping_rounds, num_boost_round, params, verbose_eval } might not be 

In [68]:
dtrain = lgb.Dataset(
    data=x_train, label=y_train
)
dvalid = lgb.Dataset(
    data=x_valid, label=y_valid
)

dtest = lgb.Dataset(
    data=test
)

In [80]:
params = {
    "boosting_type": "goss",
    "objective": "binary",
    "metric": "auc",
    "learning_rate": 0.01,
    "n_estimators": 10000,
    "n_jobs": 6,
    "seed": 27
}

model_lgbm = lgb.LGBMClassifier(**params)


In [81]:
lgbm_cv = make_cross_validation(
    train, target, model_lgbm, metric=roc_auc_score, cv_strategy=cv_strategy
)

Fold: 1, train-observations = 88074, valid-observations = 22019
train-score = 0.9598, valid-score = 0.7003
Fold: 2, train-observations = 88074, valid-observations = 22019
train-score = 0.9599, valid-score = 0.6993
Fold: 3, train-observations = 88074, valid-observations = 22019
train-score = 0.9596, valid-score = 0.7067
Fold: 4, train-observations = 88075, valid-observations = 22018
train-score = 0.96, valid-score = 0.698
Fold: 5, train-observations = 88075, valid-observations = 22018
train-score = 0.96, valid-score = 0.7067
CV-results train: 0.9598 +/- 0.0
CV-results valid: 0.7022 +/- 0.004
OOF-score = 0.7015
