In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer

from functions import *

pd.set_option('display.max_columns', 100)

In [2]:
def application(df):

    # general cleaning procedures
    df = df[df['CODE_GENDER'] != 'XNA']
    df = df[df['AMT_INCOME_TOTAL'] < 20000000] # remove a outlier 117M
    
    # NaN values for DAYS_EMPLOYED: 365.243 -> nan
    df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True) # set null value
    df['DAYS_LAST_PHONE_CHANGE'].replace(0, np.nan, inplace=True) # set null value

    # Categorical features with Binary encode (0 or 1; two categories)
    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        df[bin_feature], uniques = pd.factorize(df[bin_feature])
    
    # Categorical features with One-Hot encode
    df, cat_cols = one_hot_encoder(df, nan_as_category)

    # Flag_document features - count and kurtosis
    docs = [f for f in df.columns if 'FLAG_DOC' in f]
    df['DOCUMENT_COUNT'] = df[docs].sum(axis=1)
    df['NEW_DOC_KURT'] = df[docs].kurtosis(axis=1)

    def get_age_label(days_birth):
        """ Return the age group label (int). """
        age_years = -days_birth / 365
        if age_years < 27: return 1
        elif age_years < 40: return 2
        elif age_years < 50: return 3
        elif age_years < 65: return 4
        elif age_years < 99: return 5
        else: return 0

    # Categorical age - based on target=1 plot
    df['AGE_RANGE'] = df['DAYS_BIRTH'].apply(lambda x: get_age_label(x))

    # New features based on External sources
    df['EXT_SOURCES_PROD'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['EXT_SOURCE_3']
    df['EXT_SOURCES_WEIGHTED'] = df.EXT_SOURCE_1 * 2 + df.EXT_SOURCE_2 * 1 + df.EXT_SOURCE_3 * 3
    np.warnings.filterwarnings('ignore', r'All-NaN (slice|axis) encountered')
    for function_name in ['min', 'max', 'mean', 'nanmedian', 'var']:
        feature_name = 'EXT_SOURCES_{}'.format(function_name.upper())
        df[feature_name] = eval('np.{}'.format(function_name))(
            df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']], axis=1)

    # Some simple new features (percentages)
    df['DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
    df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
    df['ANNUITY_INCOME_PERC'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']

    # Credit ratios
    df['CREDIT_TO_GOODS_RATIO'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
    
    # Income ratios
    df['INCOME_TO_EMPLOYED_RATIO'] = df['AMT_INCOME_TOTAL'] / df['DAYS_EMPLOYED']
    df['INCOME_TO_BIRTH_RATIO'] = df['AMT_INCOME_TOTAL'] / df['DAYS_BIRTH']
    
    # Time ratios
    df['ID_TO_BIRTH_RATIO'] = df['DAYS_ID_PUBLISH'] / df['DAYS_BIRTH']
    df['CAR_TO_BIRTH_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_BIRTH']
    df['CAR_TO_EMPLOYED_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_EMPLOYED']
    df['PHONE_TO_BIRTH_RATIO'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_BIRTH']

    # EXT_SOURCE_X FEATURE
    df['APPS_EXT_SOURCE_MEAN'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
    df['APPS_EXT_SOURCE_STD'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
    df['APPS_EXT_SOURCE_STD'] = df['APPS_EXT_SOURCE_STD'].fillna(df['APPS_EXT_SOURCE_STD'].mean())
    df['APP_SCORE1_TO_BIRTH_RATIO'] = df['EXT_SOURCE_1'] / (df['DAYS_BIRTH'] / 365.25)
    df['APP_SCORE2_TO_BIRTH_RATIO'] = df['EXT_SOURCE_2'] / (df['DAYS_BIRTH'] / 365.25)
    df['APP_SCORE3_TO_BIRTH_RATIO'] = df['EXT_SOURCE_3'] / (df['DAYS_BIRTH'] / 365.25)
    df['APP_SCORE1_TO_EMPLOY_RATIO'] = df['EXT_SOURCE_1'] / (df['DAYS_EMPLOYED'] / 365.25)
    df['APP_EXT_SOURCE_2*EXT_SOURCE_3*DAYS_BIRTH'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['DAYS_BIRTH']
    df['APP_SCORE1_TO_FAM_CNT_RATIO'] = df['EXT_SOURCE_1'] / df['CNT_FAM_MEMBERS']
    df['APP_SCORE1_TO_GOODS_RATIO'] = df['EXT_SOURCE_1'] / df['AMT_GOODS_PRICE']
    df['APP_SCORE1_TO_CREDIT_RATIO'] = df['EXT_SOURCE_1'] / df['AMT_CREDIT']
    df['APP_SCORE1_TO_SCORE2_RATIO'] = df['EXT_SOURCE_1'] / df['EXT_SOURCE_2']
    df['APP_SCORE1_TO_SCORE3_RATIO'] = df['EXT_SOURCE_1'] / df['EXT_SOURCE_3']
    df['APP_SCORE2_TO_CREDIT_RATIO'] = df['EXT_SOURCE_2'] / df['AMT_CREDIT']
    df['APP_SCORE2_TO_REGION_RATING_RATIO'] = df['EXT_SOURCE_2'] / df['REGION_RATING_CLIENT']
    df['APP_SCORE2_TO_CITY_RATING_RATIO'] = df['EXT_SOURCE_2'] / df['REGION_RATING_CLIENT_W_CITY']
    df['APP_SCORE2_TO_POP_RATIO'] = df['EXT_SOURCE_2'] / df['REGION_POPULATION_RELATIVE']
    df['APP_SCORE2_TO_PHONE_CHANGE_RATIO'] = df['EXT_SOURCE_2'] / df['DAYS_LAST_PHONE_CHANGE']
    df['APP_EXT_SOURCE_1*EXT_SOURCE_2'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2']
    df['APP_EXT_SOURCE_1*EXT_SOURCE_3'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_3']
    df['APP_EXT_SOURCE_1*DAYS_EMPLOYED'] = df['EXT_SOURCE_1'] * df['DAYS_EMPLOYED']
    df['APP_EXT_SOURCE_2*EXT_SOURCE_3'] = df['EXT_SOURCE_2'] * df['EXT_SOURCE_3']
    df['APP_EXT_SOURCE_2*DAYS_EMPLOYED'] = df['EXT_SOURCE_2'] * df['DAYS_EMPLOYED']
    df['APP_EXT_SOURCE_3*DAYS_EMPLOYED'] = df['EXT_SOURCE_3'] * df['DAYS_EMPLOYED']

    # AMT_INCOME_TOTAL : income
    # CNT_FAM_MEMBERS  : the number of family members
    df['APPS_GOODS_INCOME_RATIO'] = df['AMT_GOODS_PRICE'] / df['AMT_INCOME_TOTAL']
    df['APPS_CNT_FAM_INCOME_RATIO'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
    
    # DAYS_BIRTH : Client's age in days at the time of application
    # DAYS_EMPLOYED : How many days before the application the person started current employment
    df['APPS_INCOME_EMPLOYED_RATIO'] = df['AMT_INCOME_TOTAL'] / df['DAYS_EMPLOYED']

    # other feature from better than 0.8
    df['CREDIT_TO_GOODS_RATIO_2'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
    df['APP_AMT_INCOME_TOTAL_12_AMT_ANNUITY_ratio'] = df['AMT_INCOME_TOTAL'] / 12. - df['AMT_ANNUITY']
    df['APP_INCOME_TO_EMPLOYED_RATIO'] = df['AMT_INCOME_TOTAL'] / df['DAYS_EMPLOYED']
    df['APP_DAYS_LAST_PHONE_CHANGE_DAYS_EMPLOYED_ratio'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_EMPLOYED']
    df['APP_DAYS_EMPLOYED_DAYS_BIRTH_diff'] = df['DAYS_EMPLOYED'] - df['DAYS_BIRTH']

    print('"Application_Train_Test" final shape:', df.shape)
    return df

raw

In [3]:
app_train = pd.read_csv('raw-data/dseb63_application_train.csv')
app_train.drop(['Unnamed: 0'], axis=1, inplace=True)
app_train.set_index('SK_ID_CURR', inplace=True)
app_train.head(10)

Unnamed: 0_level_0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,...,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1
278621,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,State servant,Higher education,Married,House / apartment,0.003541,-16765,-1188,-1186.0,-291,,1,1,0,1,1,0,Core staff,2.0,1,1,MONDAY,11,0,0,0,0,0,0,School,0.311267,0.622246,,0.0959,0.0529,0.9851,0.796,0.0605,0.08,0.0345,...,0.0968,0.0529,0.9851,0.7987,0.0608,0.08,0.0345,0.2917,0.3333,0.0132,0.0787,0.0558,0.0039,0.01,reg oper account,block of flats,0.0714,Block,No,1.0,0.0,1.0,0.0,-828.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
139008,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.008019,-19005,-3039,-9833.0,-2437,,1,1,0,1,0,0,Laborers,2.0,2,2,WEDNESDAY,17,0,0,0,0,0,0,Business Entity Type 3,,0.650442,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,2.0,0.0,2.0,0.0,-617.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,
138348,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.028663,-19932,-3038,-4311.0,-3458,,1,1,0,1,0,0,Core staff,1.0,2,2,THURSDAY,11,0,0,0,0,1,1,Religion,,0.322738,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-1106.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
64140,0,Cash loans,M,N,Y,0,99000.0,490495.5,27517.5,454500.0,"Spouse, partner",State servant,Secondary / secondary special,Married,House / apartment,0.035792,-16941,-1588,-4970.0,-477,,1,1,1,1,1,0,Laborers,2.0,2,2,WEDNESDAY,16,0,0,0,0,0,0,Other,,0.354225,0.621226,,,,,,,,...,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-2536.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,1.0
219374,0,Cash loans,M,Y,Y,0,360000.0,1530000.0,42075.0,1530000.0,Unaccompanied,State servant,Higher education,Married,House / apartment,0.003122,-18850,-449,-4597.0,-2379,8.0,1,1,1,1,0,0,Managers,2.0,3,3,MONDAY,16,0,0,0,0,1,1,Other,,0.714279,0.540654,,,,,,,,...,,,,,,,,,,,,,,,,,,,,2.0,0.0,2.0,0.0,-1070.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
53466,0,Cash loans,F,N,Y,0,112500.0,1019610.0,33826.5,913500.0,Children,Pensioner,Secondary / secondary special,Married,House / apartment,0.018634,-20099,365243,-7427.0,-3514,,1,0,0,1,0,0,,2.0,2,2,WEDNESDAY,14,0,0,0,0,0,0,XNA,0.587334,0.205747,0.751724,,,,,,,,...,,,,,,,,,,,,,,,,,,,,1.0,0.0,1.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
27272,0,Revolving loans,M,N,Y,0,135000.0,405000.0,20250.0,405000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.019689,-14469,-2019,-14437.0,-3992,,1,1,0,1,0,0,Laborers,1.0,2,2,THURSDAY,8,0,0,0,0,0,0,Electricity,,0.746644,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,2.0,0.0,2.0,0.0,-1673.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,
218056,0,Cash loans,F,N,Y,1,112500.0,652500.0,21177.0,652500.0,Unaccompanied,Working,Higher education,Married,House / apartment,0.0228,-10197,-679,-4427.0,-738,,1,1,0,1,0,0,Core staff,3.0,2,2,SATURDAY,15,0,0,0,0,0,0,Medicine,0.31976,0.651862,0.363945,,,,,,,,...,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-844.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0
52390,0,Cash loans,M,Y,N,1,225000.0,918468.0,28966.5,697500.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.016612,-14086,-3028,-643.0,-4911,23.0,1,1,0,1,0,0,Drivers,3.0,2,2,THURSDAY,13,0,0,0,0,0,0,Self-employed,,0.566907,0.770087,0.1474,0.0973,0.9806,0.7348,0.0582,0.16,0.1379,...,0.1489,0.0973,0.9806,0.7383,0.0585,0.16,0.1379,0.3333,0.375,0.0947,0.1223,0.1422,0.0,0.0,reg oper account,block of flats,0.1417,Panel,No,0.0,0.0,0.0,0.0,-4.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
268198,0,Cash loans,F,N,Y,0,189000.0,773680.5,32778.0,679500.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.010006,-14583,-203,-615.0,-2056,,1,1,0,1,0,0,Laborers,2.0,2,1,MONDAY,9,0,0,0,0,0,0,Transport: type 2,0.72194,0.642656,,0.3495,0.1335,0.9985,0.9796,0.1143,0.4,0.1724,...,0.3529,0.1335,0.9985,0.9799,0.115,0.4,0.1724,0.6667,0.7083,0.1789,0.2899,0.3842,0.0194,0.1022,reg oper account,block of flats,0.3811,Panel,No,0.0,0.0,0.0,0.0,-188.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,


In [4]:
app_train_clean = application(app_train)
app_train_clean.head(20)

"Application_Train_Test" final shape: (246006, 307)


Unnamed: 0_level_0,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,...,EXT_SOURCES_PROD,EXT_SOURCES_WEIGHTED,EXT_SOURCES_MIN,EXT_SOURCES_MAX,EXT_SOURCES_MEAN,EXT_SOURCES_NANMEDIAN,EXT_SOURCES_VAR,DAYS_EMPLOYED_PERC,INCOME_CREDIT_PERC,INCOME_PER_PERSON,ANNUITY_INCOME_PERC,PAYMENT_RATE,CREDIT_TO_GOODS_RATIO,INCOME_TO_EMPLOYED_RATIO,INCOME_TO_BIRTH_RATIO,ID_TO_BIRTH_RATIO,CAR_TO_BIRTH_RATIO,CAR_TO_EMPLOYED_RATIO,PHONE_TO_BIRTH_RATIO,APPS_EXT_SOURCE_MEAN,APPS_EXT_SOURCE_STD,APP_SCORE1_TO_BIRTH_RATIO,APP_SCORE2_TO_BIRTH_RATIO,APP_SCORE3_TO_BIRTH_RATIO,APP_SCORE1_TO_EMPLOY_RATIO,APP_EXT_SOURCE_2*EXT_SOURCE_3*DAYS_BIRTH,APP_SCORE1_TO_FAM_CNT_RATIO,APP_SCORE1_TO_GOODS_RATIO,APP_SCORE1_TO_CREDIT_RATIO,APP_SCORE1_TO_SCORE2_RATIO,APP_SCORE1_TO_SCORE3_RATIO,APP_SCORE2_TO_CREDIT_RATIO,APP_SCORE2_TO_REGION_RATING_RATIO,APP_SCORE2_TO_CITY_RATING_RATIO,APP_SCORE2_TO_POP_RATIO,APP_SCORE2_TO_PHONE_CHANGE_RATIO,APP_EXT_SOURCE_1*EXT_SOURCE_2,APP_EXT_SOURCE_1*EXT_SOURCE_3,APP_EXT_SOURCE_1*DAYS_EMPLOYED,APP_EXT_SOURCE_2*EXT_SOURCE_3,APP_EXT_SOURCE_2*DAYS_EMPLOYED,APP_EXT_SOURCE_3*DAYS_EMPLOYED,APPS_GOODS_INCOME_RATIO,APPS_CNT_FAM_INCOME_RATIO,APPS_INCOME_EMPLOYED_RATIO,CREDIT_TO_GOODS_RATIO_2,APP_AMT_INCOME_TOTAL_12_AMT_ANNUITY_ratio,APP_INCOME_TO_EMPLOYED_RATIO,APP_DAYS_LAST_PHONE_CHANGE_DAYS_EMPLOYED_ratio,APP_DAYS_EMPLOYED_DAYS_BIRTH_diff
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1
278621,0,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765,-1188.0,-1186.0,-291,,1,1,0,1,1,0,2.0,1,1,11,0,0,0,0,0,0,0.311267,0.622246,,0.0959,0.0529,0.9851,0.796,0.0605,0.08,0.0345,0.2917,0.3333,0.013,0.0773,0.0549,0.0039,0.0098,0.0924,0.0538,...,,,0.311267,0.622246,0.466757,0.466757,0.024177,0.070862,0.208736,135000.0,0.132217,0.027598,1.145199,-227.272727,-16.104981,0.017358,,,0.049389,0.466757,0.219895,-0.006781,-0.013557,,-0.095699,-3247.12516,0.155634,2.755797e-07,2.406391e-07,0.500232,,4.810549e-07,0.622246,0.622246,175.726003,-0.000752,0.193685,,-369.785566,,-739.227981,,4.183333,135000.0,-227.272727,1.145199,-13198.5,-227.272727,0.69697,15577.0
139008,0,0,0,1,0,135000.0,312682.5,29686.5,297000.0,0.008019,-19005,-3039.0,-9833.0,-2437,,1,1,0,1,0,0,2.0,2,2,17,0,0,0,0,0,0,,0.650442,,,,,,,,,,,,,,,,,,...,,,0.650442,0.650442,0.650442,0.650442,0.0,0.159905,0.431748,67500.0,0.2199,0.094941,1.052803,-44.422507,-7.103394,0.128229,,,0.032465,0.650442,0.151115,,-0.012501,,,,,,,,,2.080199e-06,0.325221,0.325221,81.112569,-0.001054,,,,,-1976.692297,,2.2,67500.0,-44.422507,1.052803,-18436.5,-44.422507,0.203027,15966.0
138348,0,1,0,1,0,121500.0,513000.0,21865.5,513000.0,0.028663,-19932,-3038.0,-4311.0,-3458,,1,1,0,1,0,0,1.0,2,2,11,0,0,0,0,1,1,,0.322738,,,,,,,,,,,,,,,,,,...,,,0.322738,0.322738,0.322738,0.322738,0.0,0.152418,0.236842,121500.0,0.179963,0.042623,1.0,-39.993417,-6.095725,0.17349,,,0.055489,0.322738,0.151115,,-0.005914,,,,,,,,,6.291195e-07,0.161369,0.161369,11.259753,-0.000292,,,,,-980.478916,,4.222222,121500.0,-39.993417,1.0,-11740.5,-39.993417,0.364055,16894.0
64140,0,1,0,1,0,99000.0,490495.5,27517.5,454500.0,0.035792,-16941,-1588.0,-4970.0,-477,,1,1,1,1,1,0,2.0,2,2,16,0,0,0,0,0,0,,0.354225,0.621226,,,,,,,,,,,,,,,,,...,,,0.354225,0.621226,0.487726,0.487726,0.017822,0.093737,0.201837,49500.0,0.277955,0.056101,1.079198,-62.342569,-5.843811,0.028157,,,0.149696,0.487726,0.188799,,-0.007637,-0.013394,,,,,,,,7.221773e-07,0.177112,0.177112,9.896757,-0.00014,,,,0.220054,-562.508874,-986.507425,4.590909,49500.0,-62.342569,1.079198,-19267.5,-62.342569,1.596977,15353.0
219374,0,1,1,1,0,360000.0,1530000.0,42075.0,1530000.0,0.003122,-18850,-449.0,-4597.0,-2379,8.0,1,1,1,1,0,0,2.0,3,3,16,0,0,0,0,1,1,,0.714279,0.540654,,,,,,,,,,,,,,,,,...,,,0.540654,0.714279,0.627467,0.627467,0.007536,0.02382,0.235294,180000.0,0.116875,0.0275,1.0,-801.781737,-19.098143,0.126207,-0.000424,-0.017817,0.056764,0.627467,0.122771,,-0.01384,-0.010476,,,,,,,,4.668492e-07,0.238093,0.238093,228.789009,-0.000668,,,,0.386178,-320.7114,-242.753848,4.25,180000.0,-801.781737,1.0,-12075.0,-801.781737,2.383073,18401.0
53466,0,0,0,1,0,112500.0,1019610.0,33826.5,913500.0,0.018634,-20099,,-7427.0,-3514,,1,0,0,1,0,0,2.0,2,2,14,0,0,0,0,0,0,0.587334,0.205747,0.751724,,,,,,,,,,,,,,,,,...,0.09084,3.635587,0.205747,0.751724,0.514935,0.587334,0.052303,,0.110336,56250.0,0.30068,0.033176,1.116158,,-5.597293,0.174835,,,,0.514935,0.280096,-0.010673,-0.003739,-0.013661,,-2428.811142,0.293667,6.429491e-07,5.760379e-07,2.854638,0.781316,2.017902e-07,0.102874,0.102874,11.041499,,0.120842,0.441513,,0.154665,,,8.12,56250.0,,1.116158,-24451.5,,,
27272,0,1,0,1,0,135000.0,405000.0,20250.0,405000.0,0.019689,-14469,-2019.0,-14437.0,-3992,,1,1,0,1,0,0,1.0,2,2,8,0,0,0,0,0,0,,0.746644,,,,,,,,,,,,,,,,,,...,,,0.746644,0.746644,0.746644,0.746644,0.0,0.13954,0.333333,135000.0,0.15,0.05,1.0,-66.864785,-9.330292,0.2759,,,0.115627,0.746644,0.151115,,-0.018848,,,,,,,,,1.843565e-06,0.373322,0.373322,37.921866,-0.000446,,,,,-1507.473488,,3.0,135000.0,-66.864785,1.0,-9000.0,-66.864785,0.828628,12450.0
218056,0,0,0,1,1,112500.0,652500.0,21177.0,652500.0,0.0228,-10197,-679.0,-4427.0,-738,,1,1,0,1,0,0,3.0,2,2,15,0,0,0,0,0,0,0.31976,0.651862,0.363945,,,,,,,,,,,,,,,,,...,0.075861,2.383218,0.31976,0.651862,0.445189,0.363945,0.021682,0.066588,0.172414,37500.0,0.18824,0.032455,1.0,-165.684831,-11.032657,0.072374,,,0.082769,0.445189,0.180342,-0.011454,-0.023349,-0.013036,-0.172006,-2125.45872,0.106587,4.900539e-07,4.900539e-07,0.490533,0.878594,9.990227e-07,0.325931,0.325931,28.590453,-0.000772,0.20844,0.116375,-217.117157,0.237242,-442.614524,-247.118817,5.8,37500.0,-165.684831,1.0,-11802.0,-165.684831,1.243004,9518.0
52390,0,1,1,0,1,225000.0,918468.0,28966.5,697500.0,0.016612,-14086,-3028.0,-643.0,-4911,23.0,1,1,0,1,0,0,3.0,2,2,13,0,0,0,0,0,0,,0.566907,0.770087,0.1474,0.0973,0.9806,0.7348,0.0582,0.16,0.1379,0.3333,0.375,0.0931,0.1202,0.1397,0.0,0.0,0.1502,0.101,...,,,0.566907,0.770087,0.668497,0.668497,0.010321,0.214965,0.244973,75000.0,0.12874,0.031538,1.3168,-74.306473,-15.973307,0.348644,-0.001633,-0.007596,0.000284,0.668497,0.14367,,-0.0147,-0.019968,,,,,,,,6.172307e-07,0.283453,0.283453,34.126331,-0.141727,,,,0.436567,-1716.593225,-2331.823648,3.1,75000.0,-74.306473,1.3168,-10216.5,-74.306473,0.001321,11058.0
268198,0,0,0,1,0,189000.0,773680.5,32778.0,679500.0,0.010006,-14583,-203.0,-615.0,-2056,,1,1,0,1,0,0,2.0,2,1,9,0,0,0,0,0,0,0.72194,0.642656,,0.3495,0.1335,0.9985,0.9796,0.1143,0.4,0.1724,0.6667,0.7083,0.1758,0.2849,0.3774,0.0193,0.1001,0.3561,0.1386,...,,,0.642656,0.72194,0.682298,0.682298,0.001571,0.01392,0.244287,94500.0,0.173429,0.042366,1.138603,-931.034483,-12.960296,0.140986,,,0.012892,0.682298,0.056062,-0.018082,-0.016096,,-1.298958,-6765.915145,0.36097,1.062457e-06,9.331239e-07,1.123369,,8.306481e-07,0.321328,0.642656,64.227084,-0.003418,0.463959,,-146.553773,,-130.45921,,3.595238,94500.0,-931.034483,1.138603,-17028.0,-931.034483,0.926108,14380.0


In [5]:
app_train_clean.isnull().sum().sort_values(ascending=False) / len(app_train_clean)

COMMONAREA_MEDI                   0.698877
COMMONAREA_AVG                    0.698877
COMMONAREA_MODE                   0.698877
NONLIVINGAPARTMENTS_MEDI          0.694568
NONLIVINGAPARTMENTS_MODE          0.694568
                                    ...   
OCCUPATION_TYPE_Accountants       0.000000
OCCUPATION_TYPE_Cleaning staff    0.000000
OCCUPATION_TYPE_Cooking staff     0.000000
OCCUPATION_TYPE_Core staff        0.000000
OCCUPATION_TYPE_HR staff          0.000000
Length: 307, dtype: float64

In [6]:
# drop columns with more than 50% missing values
app_train_clean = app_train_clean[app_train_clean.columns[app_train_clean.isnull().mean() < 0.4]]
app_train_clean.isnull().sum().sort_values(ascending=False) / len(app_train_clean)

APP_EXT_SOURCE_3*DAYS_EMPLOYED                    0.342520
APP_DAYS_LAST_PHONE_CHANGE_DAYS_EMPLOYED_ratio    0.274290
APP_EXT_SOURCE_2*EXT_SOURCE_3                     0.199885
APP_SCORE3_TO_BIRTH_RATIO                         0.198467
EXT_SOURCE_3                                      0.198467
                                                    ...   
NAME_HOUSING_TYPE_House / apartment               0.000000
NAME_HOUSING_TYPE_Municipal apartment             0.000000
NAME_HOUSING_TYPE_Office apartment                0.000000
NAME_HOUSING_TYPE_Rented apartment                0.000000
WEEKDAY_APPR_PROCESS_START_MONDAY                 0.000000
Length: 247, dtype: float64

In [7]:
# replace inf values with nan
app_train_clean = app_train_clean.replace([np.inf, -np.inf], np.nan)

In [8]:
X, y = app_train_clean.drop(['TARGET'], axis=1), app_train_clean['TARGET']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# fill missing values with median
imputer = KNNImputer(n_neighbors=5)
imputer.fit(X_train)
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

# scale values
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

: 

In [None]:
# train model
logreg = LogisticRegression(C=10, class_weight='balanced', solver='newton-cholesky')
logreg.fit(X_train, y_train)

# predict
y_pred = logreg.predict_proba(X_test)[:, 1]

# evaluate
print('ROC AUC: {:.4f}'.format(roc_auc_score(y_test, y_pred)))
print('GINI: {:.4f}'.format(2 * roc_auc_score(y_test, y_pred) - 1))

NameError: name 'X_train' is not defined