In [1]:
import numpy as np
import pandas as pd
import catboost as cb
import lightgbm as lgb
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Optional

import warnings
warnings.filterwarnings('ignore')

from scipy.stats import gmean
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from scipy.stats import skew, kurtosis

In [2]:
def concat_features(df, features, name=None):
    df_copy = df.copy()
    if name is None:
        name = '_'.join(features)
    df_copy[name] = df_copy[features[0]]
    for i, feature in enumerate(features):
        if i == 0:
            continue
        df_copy[name] += '_' + df_copy[feature].astype(str)
    return df_copy

def get_lower(x: str) -> str:
    return x.replace(' ', '_').lower()

def agg_features(df, df_out, by: str, func='mean'):
    df_copy = df.copy()
    df_out_copy = df_out.copy()
    cols = df_copy.drop(by, axis=1).columns
    for feature in cols:
        feature_groupby = df_copy.groupby(by, as_index=False)
        if func == 'mean':
            feature_counts = feature_groupby[feature].mean()
        elif func == 'count':
            feature_counts = feature_groupby[feature].count()
        feature_counts = feature_counts.rename(columns={
            feature: feature + f'_{func}'
        })
        df_out_copy = df_out_copy.merge(feature_counts, how="left", on=by)
    return df_out_copy

def get_input(data_path: str,
              base_path: str= './data/',
              col_lower=True) -> pd.DataFrame:
    data = pd.read_csv(f'{base_path}/{data_path}')
    if col_lower:
        data.columns = [col.lower() for col in data.columns]
    print(f'{data_path}: {data.shape[0]} rows, {data.shape[1]} cols')
    return data


def clip_outliers(X, lower=0.001, upper=0.999):
    new_X = X.copy()
    new_X = new_X.apply(lambda x: x.clip(*x.quantile([lower, upper])), axis=1)
    return new_X

def get_isna_features(X, target_name, treshhold=0.2):
    X_copy = X.copy()
    isna_features = []
    for feature in X.columns:
        new_f = np.zeros(X.shape[0])
        new_f = X[feature].isna()
        new_f_name = f'isna_{feature}'
        X_copy[new_f_name] = new_f.astype(int)
        isna_features.append(new_f_name)

    # Считаем влияние пропуска в признаке на распределение таргета в каждом признаке
    isna_influence = dict()
    for feature in isna_features:
        if feature in ['isna_application_number','isna_target','isna_name_contract_type']:
            continue
        data_target = X_copy.loc[X_copy[target_name] == 1]
        a = X_copy.loc[X_copy[feature] == 0, feature].count()
        a = a / X_copy.loc[X_copy[feature] == 1, feature].count()
        b = data_target.loc[data_target[feature] == 0, feature].count()
        b = b / data_target.loc[data_target[feature] == 1, feature].count()
        isna_influence[feature] = abs(a - b)

    isna_influence = pd.DataFrame(isna_influence.items(), 
                                  columns=['feature', 'influence'])\
                    .sort_values(by='influence', ascending=False)
    isna_influence_list = isna_influence.loc[ \
                isna_influence['influence'] > treshhold, 'feature'].tolist()
    return X_copy[isna_influence_list]

def category_encoding(series, bins=None, schema: dict=None, fillNa=0):
    series = series.copy()
    if not fillNa is None:
        series = series.fillna(value=fillNa)
    names = series.value_counts().keys().tolist()
    k = len(names) // bins if bins else 1
    if schema is None:
        schema = {value: int(key // k) for key, value in enumerate(names, start=1)}
    for name, i in schema.items():
        series[series == name] = i
    return series, schema

In [3]:
def create_categorical_aggs(data: pd.DataFrame,
                            groupby_id: str,
                            features: List[str],
                            prefix: Optional[str] = None,
                            suffix: Optional[str] = None,
                            ) -> pd.DataFrame:
    """
    Построение агрегаций для категориальных признаков.
    Для категориальных признако считаются счетчики для
    каждого значения категории и среднее значение счетчика
    для каждого значения категории.

    Parameters
    ----------
    data: pandas.core.frame.DataFrame
        Выборка для построения агрегаций.

    groupby_id: str
        Название ключа, по которому нужно произвести группировку.

    features: List[str]
        Список с названием признаков, для которых произвести
        группировку.

    prefix: str, optional, default = None
        Префикс для названия признаков.
        Опциональный параметр, по умолчанию, не используется.

    suffix: str, optional, default = None
        Суффикс для названия признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    stats: pandas.core.frame.DataFrame
        Выборка с рассчитанными агрегациями.

    """
    if not prefix:
        prefix = ""
    if not suffix:
        suffix = ""

    categorical = pd.get_dummies(data[features])
    columns_to_agg = categorical.columns

    categorical[groupby_id] = data[groupby_id]
    data_grouped = categorical.groupby(groupby_id)
    stats = data_grouped.agg({col: ["mean", "sum"] for col in columns_to_agg})
    stats.columns = [f"{prefix}{feature}_{stat}{suffix}".lower() for feature, stat in stats]
    stats.columns = [col.replace("MEAN", "RATIO") for col in stats.columns]
    stats.columns = [col.replace("SUM", "TOTAL") for col in stats.columns]
    stats = stats.reset_index()

    return stats

def create_numerical_aggs(data: pd.DataFrame,
                          groupby_id: str,
                          aggs: dict,
                          prefix: Optional[str] = None,
                          suffix: Optional[str] = None,
                          ) -> pd.DataFrame:
    """
    Построение агрегаций для числовых признаков.

    Parameters
    ----------
    data: pandas.core.frame.DataFrame
        Выборка для построения агрегаций.

    groupby_id: str
        Название ключа, по которому нужно произвести группировку.

    aggs: dict
        Словарь с названием признака и списка функций.
        Ключ словаря - название признака, который используется для
        вычисления агрегаций, значение словаря - список с названием
        функций для вычисления агрегаций.

    prefix: str, optional, default = None
        Префикс для названия признаков.
        Опциональный параметр, по умолчанию, не используется.

    suffix: str, optional, default = None
        Суффикс для названия признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    stats: pandas.core.frame.DataFrame
        Выборка с рассчитанными агрегациями.

    """
    if not prefix:
        prefix = ""
    if not suffix:
        suffix = ""

    data_grouped = data.groupby(groupby_id)
    stats = data_grouped.agg(aggs)
    stats.columns = [f"{prefix}{feature}_{stat}{suffix}".lower() for feature, stat in stats]
    stats = stats.reset_index()

    return stats

In [4]:
def create_client_profile_features(X: pd.DataFrame, copy=True) -> pd.DataFrame:
    if copy:
        X = X.copy()
        
    X['days_on_last_job'] = X['days_on_last_job'].replace(365243, np.nan)
    bki_flags = [flag for flag in X.columns if 'amt_req_credit_bureau' in flag]
    X['bki_request_count'] = X[bki_flags].sum(axis=1)
    X['bki_kurtosis'] = X[bki_flags].kurtosis(axis=1)
    
    X['external_scoring_prod'] = X['external_scoring_rating_1'] * \
            X['external_scoring_rating_2'] * X['external_scoring_rating_3']
    X['external_scoring_weighted'] = X['external_scoring_rating_1']*2 + \
            X['external_scoring_rating_2']*1 + X['external_scoring_rating_3']*3
    
    for function_name in ['min', 'max', 'mean', 'nanmedian', 'var']:
        feature_name = 'external_scoring_rating_{}'.format(function_name)
        X[feature_name] = eval('np.{}'.format(function_name))(
                                            X[['external_scoring_rating_1', 
                                               'external_scoring_rating_2', 
                                               'external_scoring_rating_3']],
                                            axis=1)
    
    # Отношения между основными финансовыми показателями
    X['ratio_credit_to_annuity'] = X['amount_credit'] / X['amount_annuity']
    X['ratio_annuity_to_salary'] = X['amount_annuity'] / X['total_salary']
    X['ratio_credit_to_salary'] = X['amount_credit'] / X['total_salary']
    X['total_salary_net'] = X['total_salary'] - X['amount_annuity']
    
    # Отношение финансовых показателей к возрасту и временным признакам
    X['ratio_annuity_to_age'] = X['amount_annuity'] / X['age']
    X['ratio_credit_to_age'] = X['amount_credit'] / X['age']
    X['ratio_salary_to_age'] = X['total_salary'] / X['age']
    X['ratio_salary_to_experience'] = X['total_salary'] / X['days_on_last_job']
    X['ratio_credit_to_experience'] = X['amount_credit'] / X['days_on_last_job']
    X['ratio_annuity_to_experience'] = X['amount_annuity'] / X['days_on_last_job']
    
    # Отношение временных признаков
    X['ratio_age_to_experience'] = X['age'] / X['days_on_last_job']
    X['ratio_salary_to_region_population'] = X['total_salary'] / X['region_population']
    X['ratio_car_to_experience'] = X['own_car_age'] / X['days_on_last_job']
    X['ratio_car_to_age'] = X['own_car_age'] / X['age']
    
    # Произведение финансовых показателей кредита на вероятность дефолта
    # (математическое ожидание дефолта или ожидаемые потери)
    X['expected_total_loss_1'] = X['external_scoring_rating_1'] * X['amount_credit']
    X['expected_total_loss_2'] = X['external_scoring_rating_2'] * X['amount_credit']
    X['expected_total_loss_3'] = X['external_scoring_rating_3'] * X['amount_credit']
    X['expected_monthly_loss_1'] = X['external_scoring_rating_1'] * X['amount_annuity']
    X['expected_monthly_loss_2'] = X['external_scoring_rating_2'] * X['amount_annuity']
    X['expected_monthly_loss_3'] = X['external_scoring_rating_3'] * X['amount_annuity']
    
#     X['external_scoring_rating_1_plus_2'] = np.nansum(
#         X[['external_scoring_rating_1', 'external_scoring_rating_2']], axis=1
#     )
#     X['external_scoring_rating_1_plus_3'] = np.nansum(
#         X[['external_scoring_rating_1', 'external_scoring_rating_3']], axis=1
#     )
#     X['external_scoring_rating_2_plus_3'] = np.nansum(
#         X[['external_scoring_rating_2', 'external_scoring_rating_3']], axis=1
# #     )
#     X['external_scoring_rating_1_is_nan'] = np.isnan(X['external_scoring_rating_1'])
#     X['external_scoring_rating_2_is_nan'] = np.isnan(X['external_scoring_rating_2'])
#     X['external_scoring_rating_3_is_nan'] = np.isnan(X['external_scoring_rating_3'])
    
    # Дополнительные признаки
#     X['ratio_credit_per_family_size'] = X['amount_credit'] / X['family_size']
#     X['ratio_credit_per_childrens'] = X['amount_credit'] / (1 + X['childrens'])
#     X['ratio_children_to_family_size'] = X['childrens'] / X['family_size']
#     X['ratio_salary_per_family_size'] = X['total_salary'] / X['family_size']
#     X['ratio_salary_per_child'] = X['total_salary'] / X['childrens']
#     X['non_child'] = X['family_size'] - X['childrens']
#     X['ratio_child_to_non_child'] = X['childrens'] / X['non_child']
#     X['ratio_salary_per_non_child'] = X['total_salary'] / X['non_child']
#     X['ratio_credit_per_non_child'] = X['amount_credit'] / X['non_child']
    
    return X

def create_app_history_features(X: pd.DataFrame, copy=True) -> pd.DataFrame:
    if copy:
        X = X.copy()
        
    X['amount_annuity_amount_credit'] = X['amount_annuity'] / X['amount_credit']
    X['delay_first_payment'] = X['days_first_drawing'] - X['days_first_due']
    X['delay_last_payment'] = X['days_last_due'] - X['days_last_due_1st_version']
    X['payment_in_advance'] = X['days_termination'] - X['days_last_due_1st_version']
    
    return X

def create_bki_features(X: pd.DataFrame, copy=True) -> pd.DataFrame:
    if copy:
        X = X.copy()
        
    X['delays'] = X['amt_credit_sum_overdue'] * X['amt_credit_sum_debt'] * \
                  X['credit_day_overdue'] * X['amt_annuity'] / X['days_enddate_fact']
    
    return X

def create_payments_features(X: pd.DataFrame, copy=True) -> pd.DataFrame:
    if copy:
        X = X.copy()
        
    X['last_delay'] = X['days_instalment'] - X['days_entry_payment']
    X['last_paid_amount'] = (X['amt_instalment'] - X['amt_payment']) \
                            * X['num_instalment_version']
    
    return X

In [5]:
train = get_input('train.csv')
test = get_input('test.csv')

data = pd.concat([train, test], axis=0)
data = data.reset_index(drop=True)
data.head(3)

train.csv: 110093 rows, 3 cols
test.csv: 165141 rows, 2 cols


Unnamed: 0,application_number,target,name_contract_type
0,123687442,0.0,Cash
1,123597908,1.0,Cash
2,123526683,0.0,Cash


In [6]:
target_name = 'target'

### profile

In [7]:
profile = get_input('client_profile.csv')
profile.head(3)

client_profile.csv: 250000 rows, 24 cols


Unnamed: 0,application_number,gender,childrens,total_salary,amount_credit,amount_annuity,education_level,family_status,region_population,age,...,family_size,external_scoring_rating_1,external_scoring_rating_2,external_scoring_rating_3,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year
0,123666076,F,0,157500.0,270000.0,13500.0,Incomplete higher,Civil marriage,0.008068,8560,...,2.0,0.329471,0.236315,0.678568,0.0,0.0,0.0,0.0,1.0,2.0
1,123423688,F,0,270000.0,536917.5,28467.0,Secondary / secondary special,Married,0.020246,23187,...,2.0,,0.442295,0.802745,0.0,0.0,0.0,0.0,1.0,1.0
2,123501780,M,1,427500.0,239850.0,23850.0,Incomplete higher,Married,0.072508,14387,...,3.0,0.409017,0.738159,,,,,,,


In [8]:
numerical_features = profile.select_dtypes(exclude=["object"]).columns.tolist()
categorical_features = profile.select_dtypes(include=["object"]).columns.tolist()

In [9]:
categorical_features

['gender', 'education_level', 'family_status']

In [10]:
profile = concat_features(profile, ['education_level', 'family_status'])

In [11]:
profile = create_client_profile_features(profile)

In [12]:
data = data.merge(profile, how="left", on='application_number')

In [13]:
aggs = {
    "total_salary": ["mean"],
    "amount_credit": ["mean"],
}

stats = create_numerical_aggs(
    profile, groupby_id="education_level", aggs=aggs, suffix="_by_education"
)
stats

Unnamed: 0,education_level,total_salary_mean_by_education,amount_credit_mean_by_education
0,Academic degree,244621.323529,729561.606618
1,Higher education,208989.672806,689809.957142
2,Incomplete higher,181446.844502,565921.004786
3,Lower secondary,130223.217137,491498.704966
4,Secondary / secondary special,155414.744584,571624.325064


In [14]:
features = ['application_number', 'education_level'] + list(aggs.keys())
profile_stats = profile[features]
profile_stats = profile_stats.merge(
    stats, how="left", on="education_level"
)
profile_stats["total_salary_to_mean_salary_by_education"] = profile_stats["total_salary"] / profile_stats["total_salary_mean_by_education"]
profile_stats["delta_salary_to_mean_salary_by_education"] = profile_stats["total_salary"] - profile_stats["total_salary_mean_by_education"]
profile_stats.head(3)

Unnamed: 0,application_number,education_level,total_salary,amount_credit,total_salary_mean_by_education,amount_credit_mean_by_education,total_salary_to_mean_salary_by_education,delta_salary_to_mean_salary_by_education
0,123666076,Incomplete higher,157500.0,270000.0,181446.844502,565921.004786,0.868023,-23946.844502
1,123423688,Secondary / secondary special,270000.0,536917.5,155414.744584,571624.325064,1.737287,114585.255416
2,123501780,Incomplete higher,427500.0,239850.0,181446.844502,565921.004786,2.356062,246053.155498


In [15]:
data = data.merge(profile_stats, how="left", on='application_number')

### applications history

In [16]:
app_history = get_input('applications_history.csv')
app_history = create_app_history_features(app_history)
app_history.head(3)

applications_history.csv: 1670214 rows, 26 cols


Unnamed: 0,prev_application_number,application_number,name_contract_type,amount_annuity,amt_application,amount_credit,amount_payment,amount_goods_payment,name_contract_status,days_decision,...,days_first_drawing,days_first_due,days_last_due_1st_version,days_last_due,days_termination,nflag_insured_on_approval,amount_annuity_amount_credit,delay_first_payment,delay_last_payment,payment_in_advance
0,49298709,123595216,,1730.43,17145.0,17145.0,0.0,17145.0,Approved,73,...,365243.0,42.0,300.0,42.0,37.0,0.0,0.100929,365201.0,-258.0,-263.0
1,50070639,123431468,Cash,25188.615,607500.0,679671.0,,607500.0,Approved,164,...,365243.0,134.0,916.0,365243.0,365243.0,1.0,0.03706,365109.0,364327.0,364327.0
2,49791680,123445379,Cash,15060.735,112500.0,136444.5,,112500.0,Approved,301,...,365243.0,271.0,59.0,365243.0,365243.0,1.0,0.11038,364972.0,365184.0,365184.0


In [17]:
numerical_features = app_history.select_dtypes(exclude=["object"]).columns.tolist()
categorical_features = app_history.select_dtypes(include=["object"]).columns.tolist()

In [18]:
numerical_features

['prev_application_number',
 'application_number',
 'amount_annuity',
 'amt_application',
 'amount_credit',
 'amount_payment',
 'amount_goods_payment',
 'days_decision',
 'sellerplace_area',
 'cnt_payment',
 'days_first_drawing',
 'days_first_due',
 'days_last_due_1st_version',
 'days_last_due',
 'days_termination',
 'nflag_insured_on_approval',
 'amount_annuity_amount_credit',
 'delay_first_payment',
 'delay_last_payment',
 'payment_in_advance']

In [19]:
aggs = { 
         'amount_annuity': [np.mean, np.sum],
         'amt_application': [np.mean, np.sum, "max"],
         'amount_credit': ["max"],
         'amount_payment': [np.sum],
         'amount_goods_payment': [np.mean, "max"],
         'days_decision': ["min"],
#          'sellerplace_area': [],
         'cnt_payment': [np.mean],
#          'days_first_drawing': [np.mean, np.sum],
#          'days_first_due': ["max"],
#          'days_last_due_1st_version': [],
         'days_last_due': ["min"],
         'days_termination': ["min"],
         'nflag_insured_on_approval': [np.sum],
         'amount_annuity_amount_credit': ["max"],
         'delay_first_payment': ["min"],
         'delay_last_payment': ["min"],
         'payment_in_advance': ["min"],
}

mask = app_history["name_contract_status"] == "Approved"
stats = create_numerical_aggs(
    app_history[mask], groupby_id="application_number", aggs=aggs, prefix="prev_app_", suffix="_approved_apps"
)
stats.head(2)

Unnamed: 0,application_number,prev_app_amount_annuity_mean_approved_apps,prev_app_amount_annuity_sum_approved_apps,prev_app_amt_application_mean_approved_apps,prev_app_amt_application_sum_approved_apps,prev_app_amt_application_max_approved_apps,prev_app_amount_credit_max_approved_apps,prev_app_amount_payment_sum_approved_apps,prev_app_amount_goods_payment_mean_approved_apps,prev_app_amount_goods_payment_max_approved_apps,prev_app_days_decision_min_approved_apps,prev_app_cnt_payment_mean_approved_apps,prev_app_days_last_due_min_approved_apps,prev_app_days_termination_min_approved_apps,prev_app_nflag_insured_on_approval_sum_approved_apps,prev_app_amount_annuity_amount_credit_max_approved_apps,prev_app_delay_first_payment_min_approved_apps,prev_app_delay_last_payment_min_approved_apps,prev_app_payment_in_advance_min_approved_apps
0,123423340,3951.0,3951.0,24835.5,24835.5,24835.5,23787.0,2520.0,24835.5,24835.5,1740,8.0,1619.0,1612.0,0.0,0.166099,363534.0,120.0,113.0
1,123423341,9251.775,9251.775,179055.0,179055.0,179055.0,179055.0,0.0,179055.0,179055.0,606,24.0,25.0,17.0,0.0,0.05167,364678.0,-100.0,-108.0


In [20]:
data = data.merge(stats, how="left", on='application_number')

In [21]:
categorical_features

['name_contract_type',
 'name_contract_status',
 'name_payment_type',
 'code_reject_reason',
 'name_type_suite',
 'name_client_type',
 'name_goods_category',
 'name_portfolio',
 'name_product_type',
 'name_yield_group']

In [22]:
stats = create_categorical_aggs(
    app_history, groupby_id="application_number", features=["name_contract_status"], prefix=""
)
stats.head(2)

Unnamed: 0,application_number,name_contract_status_approved_mean,name_contract_status_approved_sum,name_contract_status_canceled_mean,name_contract_status_canceled_sum,name_contract_status_refused_mean,name_contract_status_refused_sum,name_contract_status_unused offer_mean,name_contract_status_unused offer_sum
0,123423340,1.0,1,0.0,0,0.0,0,0.0,0
1,123423341,1.0,1,0.0,0,0.0,0,0.0,0


In [23]:
data = data.merge(stats, how="left", on='application_number')

In [24]:
data.shape

(275234, 89)

### bki

In [25]:
bki = get_input('bki.csv')
bki = create_bki_features(bki)
bki.head(3)

bki.csv: 945234 rows, 17 cols


Unnamed: 0,application_number,bureau_id,credit_active,credit_currency,days_credit,credit_day_overdue,days_credit_enddate,days_enddate_fact,amt_credit_max_overdue,cnt_credit_prolong,amt_credit_sum,amt_credit_sum_debt,amt_credit_sum_limit,amt_credit_sum_overdue,credit_type,days_credit_update,amt_annuity,delays
0,123538884,5223613,Active,currency 1,718.0,0,377.0,,19386.81,0,675000.0,320265.495,0.0,0.0,Consumer credit,39.0,,
1,123436670,6207544,Closed,currency 1,696.0,0,511.0,511.0,0.0,0,93111.66,0.0,0.0,0.0,Consumer credit,505.0,,
2,123589020,6326395,Closed,currency 1,165.0,0,149.0,160.0,,0,36000.0,0.0,0.0,0.0,Consumer credit,150.0,0.0,0.0


In [26]:
numerical_features = bki.select_dtypes(exclude=["object"]).columns.tolist()
categorical_features = bki.select_dtypes(include=["object"]).columns.tolist()

In [27]:
numerical_features

['application_number',
 'bureau_id',
 'days_credit',
 'credit_day_overdue',
 'days_credit_enddate',
 'days_enddate_fact',
 'amt_credit_max_overdue',
 'cnt_credit_prolong',
 'amt_credit_sum',
 'amt_credit_sum_debt',
 'amt_credit_sum_limit',
 'amt_credit_sum_overdue',
 'days_credit_update',
 'amt_annuity',
 'delays']

In [28]:
aggs = { 
        'days_credit': ['min'],
#         'credit_day_overdue': ['max'],
#         'days_credit_enddate': ['max'],
        'days_enddate_fact': ['min'],
#         'amt_credit_max_overdue': ['mean'],
        'cnt_credit_prolong': ['sum'],
        'amt_credit_sum': ['mean'],
#         'amt_credit_sum_debt': ['max'],
        'amt_credit_sum_limit': ['mean'],
        'amt_credit_sum_overdue': ['sum'],
        'days_credit_update': ['min'],
#         'amt_annuity': ['mean'],
        'delays': ['max', 'mean']
}


stats = create_numerical_aggs(
    bki[mask], groupby_id="application_number", aggs=aggs, prefix="prev_credit_"
)
stats.head(2)

Unnamed: 0,application_number,prev_credit_days_credit_min,prev_credit_days_enddate_fact_min,prev_credit_cnt_credit_prolong_sum,prev_credit_amt_credit_sum_mean,prev_credit_amt_credit_sum_limit_mean,prev_credit_amt_credit_sum_overdue_sum,prev_credit_days_credit_update_min,prev_credit_delays_max,prev_credit_delays_mean
0,123423340,559.0,,0,337680.0,0.0,0.0,6.0,,
1,123423341,103.0,36.0,0,167555.89125,10662.855,0.0,7.0,0.0,0.0


In [29]:
data = data.merge(stats, how="left", on='application_number')

In [30]:
categorical_features

['credit_active', 'credit_currency', 'credit_type']

In [31]:
data.shape

(275234, 98)

### payments

In [32]:
payments = get_input('payments.csv')
payments = create_payments_features(payments)
payments.head(3)

payments.csv: 1023932 rows, 8 cols


Unnamed: 0,prev_application_number,application_number,num_instalment_version,num_instalment_number,days_instalment,days_entry_payment,amt_instalment,amt_payment,last_delay,last_paid_amount
0,49011181,123664960,1.0,5,1002.0,1015.0,12156.615,12156.615,-13.0,0.0
1,48683432,123497205,1.0,13,442.0,432.0,18392.535,10047.645,10.0,8344.89
2,48652024,123749925,1.0,10,8.0,23.0,5499.945,5499.945,-15.0,0.0


In [33]:
numerical_features = payments.select_dtypes(exclude=["object"]).columns.tolist()
categorical_features = payments.select_dtypes(include=["object"]).columns.tolist()

In [34]:
numerical_features

['prev_application_number',
 'application_number',
 'num_instalment_version',
 'num_instalment_number',
 'days_instalment',
 'days_entry_payment',
 'amt_instalment',
 'amt_payment',
 'last_delay',
 'last_paid_amount']

In [35]:
aggs = {
#         'num_instalment_version': ['sum'],
#         'num_instalment_number': [],
#         'days_instalment': [],
#         'days_entry_payment': [],
#         'amt_instalment': [],
#         'amt_payment': [],
        'last_delay': ['mean', 'max'],
        'last_paid_amount': ['mean', 'max']
}

stats = create_numerical_aggs(
    payments[mask], groupby_id="application_number", aggs=aggs, prefix="prev_pay_"
)
stats.head(2)

Unnamed: 0,application_number,prev_pay_last_delay_mean,prev_pay_last_delay_max,prev_pay_last_paid_amount_mean,prev_pay_last_paid_amount_max
0,123423340,-10.0,-9.0,0.0,0.0
1,123423341,-20.0,-20.0,0.0,0.0


In [36]:
data = data.merge(stats, how="left", on='application_number')

In [37]:
categorical_features

[]

In [38]:
data.shape

(275234, 102)

### isna features

In [39]:
data_isna = get_isna_features(data, target_name, treshhold=0.2)
data = pd.concat([data, data_isna], axis=1)
data['isna_sum'] = data_isna.sum(axis=1)

In [40]:
data.shape

(275234, 125)

### category encode

In [41]:
numerical_features = data.select_dtypes(exclude=["object"]).columns.tolist()
categorical_features = data.select_dtypes(include=["object"]).columns.tolist()

In [42]:
for feature in categorical_features:
    print(f'\n{data[feature].value_counts()}')


Cash           248983
Credit Card     26251
Name: name_contract_type, dtype: int64

F      147444
M       76267
XNA         4
Name: gender, dtype: int64

Secondary / secondary special    158887
Higher education                  54516
Incomplete higher                  7456
Lower secondary                    2736
Academic degree                     120
Name: education_level_x, dtype: int64

Married                 143149
Single / not married     32888
Civil marriage           21687
Separated                14295
Widow                    11694
Unknown                      2
Name: family_status, dtype: int64

Secondary / secondary special_Married                 101852
Higher education_Married                               35564
Secondary / secondary special_Single / not married     21331
Secondary / secondary special_Civil marriage           16078
Secondary / secondary special_Separated                10056
Secondary / secondary special_Widow                     9570
Higher education_Si

In [43]:
for feature in categorical_features:
    data[feature] = category_encoding(data[feature])[0]

In [44]:
for feature in data.columns:
    data = data.rename(columns={
        feature: get_lower(feature)
    })

In [45]:
import re
data = data.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

In [46]:
data = data.replace(np.inf, np.nan)

In [47]:
data.shape

(275234, 125)

In [50]:
categorical_features

['name_contract_type',
 'gender',
 'education_level_x',
 'family_status',
 'education_level_family_status',
 'education_level_y']

In [49]:
data.to_pickle('prepared_data.pkl')

In [51]:
data = clip_outliers(data, lower=0.001, upper=0.999)

In [52]:
data.to_pickle('prepared_data_without_outliers.pkl')