In [76]:
# We can scroll horizontally for the output
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

#Use pyspark to manipulate very large dataset
import pyspark
from pyspark.sql import SparkSession
# create spark object to import dataset
spark = SparkSession \
  .builder \
  .appName("Python Spark SQL basic example") \
  .config("spark.memory.fraction", 0.8) \
  .config("spark.executor.memory", "14g") \
  .config("spark.driver.memory", "12g")\
  .config("spark.sql.shuffle.partitions" , "800") \
  .getOrCreate()
from pyspark.sql.window import Window  # Use window function
from pyspark.sql.functions import *  # max, sum, avg....
from pyspark.ml.feature import Imputer  # use imputer to deal with missing values

#pandas - they standard data manipulation package
import pandas as pd
#numpy - scientific computation - matrix operations, etc.
import numpy as np
#for feature selection
from sklearn import feature_selection
import os
from sklearn.preprocessing import MinMaxScaler # Normalization
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer # Approximate missing values
from sklearn.feature_selection import VarianceThreshold # remove low variance features
from sklearn.feature_selection import f_classif # F test
from sklearn.model_selection import train_test_split

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
pd.options.display.max_rows = 2000

# Combine Tables
Data cleaning, feature engineering

- Table 'credit_card_balance'
- Features used：SK_ID_CURR, MONTHS_BALANCE, AMT_BALANCE, AMT_CREDIT_LIMIT_ACTUAL
- No missing value

In [3]:

# Import dataset and select demanded features
cash_balance = spark.read.format("csv").option("header", "true").load("credit_card_balance.csv")

df_cash_balance = cash_balance.select(col("SK_ID_CURR").cast('integer'), 
                                      col("MONTHS_BALANCE").cast('double'), 
                                      col("AMT_BALANCE").cast('double'),
                                      col("AMT_CREDIT_LIMIT_ACTUAL").cast('double'))

df_cash_balance = df_cash_balance.withColumn("pct_balance", col("AMT_BALANCE")/col("AMT_CREDIT_LIMIT_ACTUAL"))
print(df_cash_balance.dtypes)

cash_balance_feats=df_cash_balance.withColumn("pct_balance_0_1_year", when(col("MONTHS_BALANCE") >= -12, col("pct_balance"))) \
  .withColumn("pct_balance_1_2_year", when((col("MONTHS_BALANCE")<-12) & \
                                           (col("MONTHS_BALANCE")>=-24), col("pct_balance"))) \
  .withColumn("pct_balance_2+_year", when(col("MONTHS_BALANCE") < -24, col("pct_balance"))) \
  .groupBy("SK_ID_CURR") \
  .agg(sum("pct_balance").alias("pct_balance_total"),
       sum("pct_balance_0_1_year").alias("pct_balance_0_1_year"),
       sum("pct_balance_1_2_year").alias("pct_balance_1_2_year"),
       sum("pct_balance_2+_year").alias("pct_balance_2+_year"))

# Fill Null with 0
cash_balance_feats=cash_balance_feats.fillna(0, subset=['pct_balance_total', 'pct_balance_0_1_year', \
                                                        'pct_balance_1_2_year', 'pct_balance_2+_year'])
cash_balance_feats.show(10)

'''
# Pandas version: very slow ...
cash_balance = pd.read_csv ('credit_card_balance.csv')
cash_balance[["MONTHS_BALANCE", "AMT_BALANCE"]] = cash_balance[["MONTHS_BALANCE", "AMT_BALANCE"]].apply(pd.to_numeric)

cash_balance_feats = cash_balance.groupby('SK_ID_CURR').apply(lambda x: pd.Series({
      'amt_balance': x['AMT_BALANCE'].sum(),
      'amt_balance_0_1_year': x['AMT_BALANCE'][x['MONTHS_BALANCE'] >= -12].sum(),
      'amt_balance_1_2_ear': x['AMT_BALANCE'][(x['MONTHS_BALANCE'] < -12) & (x['MONTHS_BALANCE'] >= -24)].sum(),
      'amt_balance_2+_year': x['AMT_BALANCE'][(x['MONTHS_BALANCE'] < -24)].sum()
  })
)



'''

[('SK_ID_CURR', 'int'), ('MONTHS_BALANCE', 'double'), ('AMT_BALANCE', 'double'), ('AMT_CREDIT_LIMIT_ACTUAL', 'double'), ('pct_balance', 'double')]
+----------+--------------------+--------------------+--------------------+-------------------+
|SK_ID_CURR|   pct_balance_total|pct_balance_0_1_year|pct_balance_1_2_year|pct_balance_2+_year|
+----------+--------------------+--------------------+--------------------+-------------------+
|    357089|0.014510736842105264|0.014510736842105264|                 0.0|                0.0|
|    424333|                 0.0|                 0.0|                 0.0|                0.0|
|    343570|   43.58243300000001|   5.973389333333333|  11.940160666666667|          25.668883|
|    105665|   48.74947733333333|                 0.0|                 0.0|  48.74947733333333|
|    295286|          28.2322329|           12.278263|           8.7464449|           7.207525|
|    278515|           4.7070325|           3.7024989|           1.0045336|          

'\n# Pandas version: very slow ...\ncash_balance = pd.read_csv (\'credit_card_balance.csv\')\ncash_balance[["MONTHS_BALANCE", "AMT_BALANCE"]] = cash_balance[["MONTHS_BALANCE", "AMT_BALANCE"]].apply(pd.to_numeric)\n\ncash_balance_feats = cash_balance.groupby(\'SK_ID_CURR\').apply(lambda x: pd.Series({\n      \'amt_balance\': x[\'AMT_BALANCE\'].sum(),\n      \'amt_balance_0_1_year\': x[\'AMT_BALANCE\'][x[\'MONTHS_BALANCE\'] >= -12].sum(),\n      \'amt_balance_1_2_ear\': x[\'AMT_BALANCE\'][(x[\'MONTHS_BALANCE\'] < -12) & (x[\'MONTHS_BALANCE\'] >= -24)].sum(),\n      \'amt_balance_2+_year\': x[\'AMT_BALANCE\'][(x[\'MONTHS_BALANCE\'] < -24)].sum()\n  })\n)\n\n\n\n'

In [4]:
# Check if the above output is correct (select ID = 357089)
a = df_cash_balance.where(df_cash_balance["SK_ID_CURR"] == 357089).select("pct_balance")
a.select(sum("pct_balance")).collect()[0][0]

0.014510736842105264

- Table 'installments_payments': 
- Features used: SK_ID_CURR, DAYS_ENTRY_PAYMENT, DAYS_INSTALMENT, AMT_INSTALMENT, AMT_PAYMENT
- No missing value

In [5]:

# Import dataset and select demanded features
install_balance = spark.read.format("csv").option("header", "true").load("installments_payments.csv")

# Create features
df_install_balance = install_balance.withColumn("install_delay", col("DAYS_ENTRY_PAYMENT")-col("DAYS_INSTALMENT")) \
                                       .withColumn("install_default", col("AMT_INSTALMENT")-col("AMT_PAYMENT"))
    
df_install_balance = df_install_balance.select(col("SK_ID_CURR").cast('integer'), 
                                  col("install_delay").cast('double'), 
                                  col("install_default").cast('double'))

install_balance_feats=df_install_balance.withColumn("no_delay_default", when(col("install_delay") <= 0, col("install_default"))) \
  .withColumn("<=10d_default", when((col("install_delay")>0) & (col("install_delay")<=10), col("install_default"))) \
  .withColumn("10-20d_default", when((col("install_delay")>10) & (col("install_delay")<=20), col("install_default"))) \
  .withColumn("20-30d_default", when((col("install_delay")>20) & (col("install_delay")<=30), col("install_default"))) \
  .withColumn(">30d_default", when(col("install_delay")>30, col("install_default"))) \
  .groupBy("SK_ID_CURR") \
  .agg(sum("install_default").alias("install_default_total"),
       sum("no_delay_default").alias("no_delay_default"),
       sum("<=10d_default").alias("<=10d_default"),
       sum("10-20d_default").alias("10-20d_default"),
       sum("20-30d_default").alias("20-30d_default"),
       sum(">30d_default").alias(">30d_default"))

install_balance_feats=install_balance_feats.fillna(0, subset=["install_default_total", \
                                               "no_delay_default", "<=10d_default", "10-20d_default", \
                                                    "20-30d_default", ">30d_default"])

install_balance_feats.show(10)



+----------+---------------------+-------------------+------------------+------------------+----------------+------------+
|SK_ID_CURR|install_default_total|   no_delay_default|     <=10d_default|    10-20d_default|  20-30d_default|>30d_default|
+----------+---------------------+-------------------+------------------+------------------+----------------+------------+
|    145504|   29998.574999999997|          25602.435| 4396.140000000001|               0.0|             0.0|         0.0|
|    197588|                  0.0|                0.0|               0.0|               0.0|             0.0|         0.0|
|    154034|             18798.84|           16291.62|               0.0|2507.2200000000003|             0.0|         0.0|
|    101094|            27132.975|           27128.25| 4.725000000000364|               0.0|             0.0|         0.0|
|    137055|                  0.0|                0.0|               0.0|               0.0|             0.0|         0.0|
|    169588|    

In [6]:
# Check if the above output is correct
a = df_install_balance.where(df_install_balance["SK_ID_CURR"] == 145504).select("install_default")
a.select(sum("install_default")).collect()[0][0]

29998.574999999997

- Table 'POS_CASH_balance':
- Useful features:SK_DPD, SK_DPD_DEF
- Too few observations for these two features. Not gonna use this table

- Table 'bureau_balance':
- Too many observations missing. Not gonna use this table

- Table 'bureau':
- Features used: DAYS_CREDIT, AMT_CREDIT_MAX_OVERDUE, AMT_CREDIT_SUM, AMT_CREDIT_SUM_DEBT, AMT_CREDIT_SUM_LIMIT, AMT_ANNUITY

In [7]:

# Import dataset and select demanded features
bureau = spark.read.format("csv").option("header", "true").load("bureau.csv")

# Create features
df_bureau = bureau.select(col("SK_ID_CURR").cast('integer'), \
                                  col("DAYS_CREDIT").cast('double'), \
                                  col("AMT_CREDIT_MAX_OVERDUE").cast('double'), \
                                  col("AMT_CREDIT_SUM").cast('double'), \
                                  col("AMT_CREDIT_SUM_DEBT").cast('double'), \
                                  col("AMT_CREDIT_SUM_LIMIT").cast('double'), \
                                  col("AMT_ANNUITY").cast('double'))
df_bureau = df_bureau.withColumn("pct_credit-debit", (col("AMT_CREDIT_SUM")-col("AMT_CREDIT_SUM_DEBT")) \
                                                     /col("AMT_CREDIT_SUM_LIMIT"))

bureau_feats = df_bureau.groupby("SK_ID_CURR").agg(
      avg("DAYS_CREDIT").alias("avg_DAYS_CREDIT"),
      avg("AMT_CREDIT_MAX_OVERDUE").alias("avg_AMT_CREDIT_MAX_OVERDUE"),
      sum("pct_credit-debit").alias("pct_credit-debit_total"),
      sum("AMT_ANNUITY").alias("AMT_ANNUITY_total"))

bureau_feats.show(10)

+----------+-------------------+--------------------------+----------------------+-----------------+
|SK_ID_CURR|    avg_DAYS_CREDIT|avg_AMT_CREDIT_MAX_OVERDUE|pct_credit-debit_total|AMT_ANNUITY_total|
+----------+-------------------+--------------------------+----------------------+-----------------+
|    330299|            -1645.0|                   5496.56|                  null|         23906.07|
|    355377|            -1540.0|                       0.0|                  null|             null|
|    295286| -1264.888888888889|                   6191.37|                  null|        14999.985|
|    166160|             -856.0|                      null|                  null|             null|
|    191350|             -151.0|                      null|                  null|             null|
|    436175|-1871.2727272727273|        1804.5549999999998|     4.749633054619763|             null|
|    429364|            -2639.5|                    9265.5|                  null|         

In [8]:
# Check if the above output is correct
a = df_bureau.where(df_bureau["SK_ID_CURR"] == 295286).select("DAYS_CREDIT")
a.select(avg("DAYS_CREDIT")).collect()[0][0]

-1264.888888888889

In [9]:
# Check percentage of missing for each column

bureau_feats = bureau_feats.drop("pct_credit-debit_total") # Drop the column with more than 80% missing values

amount_missing_df = bureau_feats.select([(count(when(isnan(c) | col(c).isNull(), c))/count(lit(1))) \
                                         .alias(c) for c in bureau_feats.columns])
amount_missing_df.show()

# Deal with missing values for multiple columns using imputer (replace with median)
imputer =  Imputer(inputCols=bureau_feats.columns, 
                   outputCols=["{}".format(c) for c in bureau_feats.columns],
                   strategy='median')
bureau_feats = imputer.fit(bureau_feats).transform(bureau_feats)

bureau_feats.show(10)

+----------+---------------+--------------------------+------------------+
|SK_ID_CURR|avg_DAYS_CREDIT|avg_AMT_CREDIT_MAX_OVERDUE| AMT_ANNUITY_total|
+----------+---------------+--------------------------+------------------+
|       0.0|            0.0|        0.3035862019351822|0.6134082815856853|
+----------+---------------+--------------------------+------------------+

+----------+-------------------+--------------------------+-----------------+
|SK_ID_CURR|    avg_DAYS_CREDIT|avg_AMT_CREDIT_MAX_OVERDUE|AMT_ANNUITY_total|
+----------+-------------------+--------------------------+-----------------+
|    330299|            -1645.0|                   5496.56|         23906.07|
|    355377|            -1540.0|                       0.0|          20836.8|
|    295286| -1264.888888888889|                   6191.37|        14999.985|
|    166160|             -856.0|                       0.0|          20836.8|
|    191350|             -151.0|                       0.0|          20836.8|


- Table 'previous_application':
- Features used: AMT_ANNUITY, AMT_CREDIT, AMT_GOODS_PRICE, NAME_YIELD_GROUP, NFLAG_INSURED_ON_APPROVAL

In [10]:
def find_mode(values_list):
    try:
        mode = max(set(values_list), key=values_list.count) #get the median of values in a list in each row
        return mode
    except Exception:
        return None #if there is anything wrong with the given values

mode_finder = udf(find_mode)

In [11]:
# Import dataset and select demanded features
prev_application = spark.read.format("csv").option("header", "true").load("previous_application.csv")

# Select features
df_prev_application = prev_application.select(col("SK_ID_CURR").cast('integer'), \
                                  col("AMT_ANNUITY").cast('double'), \
                                  col("AMT_CREDIT").cast('double'), \
                                  col("AMT_GOODS_PRICE").cast('double'), \
                                  col("NAME_YIELD_GROUP"), \
                                  col("NFLAG_INSURED_ON_APPROVAL").cast('integer'))

df_prev_application = df_prev_application.withColumn("label_YIELD_GROUP", \
              when((df_prev_application["NAME_YIELD_GROUP"] == 'low_action') | \
                   (df_prev_application["NAME_YIELD_GROUP"] == 'low_normal'), 1).otherwise( \
              when(df_prev_application["NAME_YIELD_GROUP"] == 'middle', 2).otherwise( \
              when(df_prev_application["NAME_YIELD_GROUP"] == 'high', 3).otherwise(lit(None)))))

# Check percentage of missing for each column
amount_missing_df = df_prev_application.select([(count(when(isnan(c) | col(c).isNull(), c))/count(lit(1))) \
                                         .alias(c) for c in df_prev_application.columns])
amount_missing_df.show()


+----------+-------------------+--------------------+-------------------+----------------+-------------------------+------------------+
|SK_ID_CURR|        AMT_ANNUITY|          AMT_CREDIT|    AMT_GOODS_PRICE|NAME_YIELD_GROUP|NFLAG_INSURED_ON_APPROVAL| label_YIELD_GROUP|
+----------+-------------------+--------------------+-------------------+----------------+-------------------------+------------------+
|       0.0|0.22286665062081865|5.987256722791211E-7|0.23081772754868538|             0.0|      0.40298129461254667|0.3096698985878456|
+----------+-------------------+--------------------+-------------------+----------------+-------------------------+------------------+



In [12]:
# Deal with missing values for multiple columns using imputer (replace with median)
df_prev_application = df_prev_application.drop("NAME_YIELD_GROUP")
imputer =  Imputer(inputCols=df_prev_application.columns, 
                   outputCols=["{}".format(c) for c in df_prev_application.columns],
                   strategy='median')
df_prev_application = imputer.fit(df_prev_application).transform(df_prev_application)

In [13]:

# Create new features that are the mode for "label_YIELD_GROUP" and "NFLAG_INSURED_ON_APPROVAL"
grouped = df_prev_application.groupBy("SK_ID_CURR", "label_YIELD_GROUP").count()
window = Window.partitionBy("SK_ID_CURR").orderBy(desc("count"))
df_yield = grouped \
    .withColumn("order", row_number().over(window)) \
    .where(col("order") == 1) \
    .select(col("SK_ID_CURR"), col("label_YIELD_GROUP").alias("prev_label_YIELD_mode"))


grouped = df_prev_application.groupBy("SK_ID_CURR", "NFLAG_INSURED_ON_APPROVAL").count()
window = Window.partitionBy("SK_ID_CURR").orderBy(desc("count"))
df_insured = grouped \
    .withColumn("order", row_number().over(window)) \
    .where(col("order") == 1) \
    .select(col("SK_ID_CURR"), col("NFLAG_INSURED_ON_APPROVAL").alias("prev_NFLAG_INSURED_mode"))

# Create other features                                                                      
prev_application_feats = df_prev_application.groupby("SK_ID_CURR").agg(
      sum("AMT_ANNUITY").alias("prev_AMT_ANNUITY_total"),
      sum("AMT_CREDIT").alias("prev_AMT_CREDIT_total"),
      sum("AMT_GOODS_PRICE").alias("prev_GOODS_price_total"))

# Left join all features                                                                    
prev_application_feats = prev_application_feats.join(df_yield, on = 'SK_ID_CURR', how = 'left')
prev_application_feats = prev_application_feats.join(df_insured, on = 'SK_ID_CURR', how = 'left')

prev_application_feats.show(10)

+----------+----------------------+---------------------+----------------------+---------------------+-----------------------+
|SK_ID_CURR|prev_AMT_ANNUITY_total|prev_AMT_CREDIT_total|prev_GOODS_price_total|prev_label_YIELD_mode|prev_NFLAG_INSURED_mode|
+----------+----------------------+---------------------+----------------------+---------------------+-----------------------+
|    420485|     74637.04500000001|             640035.0|              863203.5|                    2|                      0|
|    239142|             233017.38|            4441855.5|             4291650.0|                    2|                      0|
|    201148|            140416.695|            1815498.0|             1781370.0|                    3|                      0|
|    397549|    192520.03499999997|            2653483.5|            2572793.73|                    2|                      0|
|    251529|             56862.405|             778959.0|              463095.0|                    2|         

In [14]:
# Check if the above output is correct
a = df_prev_application.where(df_prev_application["SK_ID_CURR"] == 397549).select("AMT_GOODS_PRICE")
a.select(sum("AMT_GOODS_PRICE")).collect()[0][0]

2572793.73

In [15]:
# Check total number of insured clients (is this feature significant?)
prev_application_feats.select(sum("prev_NFLAG_INSURED_mode")).collect()[0][0]

39275

- Table 'application_train': 
- data cleaning, combine tables, feature selection 
- Use pandas below because training set has fewer observations so have relatively higher running speed, \
also pyspark doesn't have an efficient function to do one-hot encoding)

In [42]:
# Import the training set
train = pd.read_csv('application_train.csv')

print(train.dtypes)

SK_ID_CURR                        int64
TARGET                            int64
NAME_CONTRACT_TYPE               object
CODE_GENDER                      object
FLAG_OWN_CAR                     object
FLAG_OWN_REALTY                  object
CNT_CHILDREN                      int64
AMT_INCOME_TOTAL                float64
AMT_CREDIT                      float64
AMT_ANNUITY                     float64
AMT_GOODS_PRICE                 float64
NAME_TYPE_SUITE                  object
NAME_INCOME_TYPE                 object
NAME_EDUCATION_TYPE              object
NAME_FAMILY_STATUS               object
NAME_HOUSING_TYPE                object
REGION_POPULATION_RELATIVE      float64
DAYS_BIRTH                        int64
DAYS_EMPLOYED                     int64
DAYS_REGISTRATION               float64
DAYS_ID_PUBLISH                   int64
OWN_CAR_AGE                     float64
FLAG_MOBIL                        int64
FLAG_EMP_PHONE                    int64
FLAG_WORK_PHONE                   int64


Categorical features: NAME_CONTRACT_TYPE, CODE_GENDER, FLAG_OWN_CAR, FLAG_OWN_REALTY, NAME_TYPE_SUITE, NAME_INCOME_TYPE
NAME_FAMILY_STATUS, NAME_HOUSING_TYPE, OCCUPATION_TYPE, WEEKDAY_APPR_PROCESS_START, ORGANIZATION_TYPE, FONDKAPREMONT_MODE
HOUSETYPE_MODE, WALLSMATERIAL_MODE, EMERGENCYSTATE_MODE, NAME_EDUCATION_TYPE

In [43]:
# Split response and features
y = train['TARGET']
X = train.drop(['TARGET'],axis=1)

In [44]:
#one-hot encode the categorical variables
X_factorized = pd.get_dummies(X)  # Convert all non-numeric variables to numeric

#clean up infinite values also - just in case
X_factorized.replace([np.inf, -np.inf], np.nan)

X_factorized.head(10)

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,100002,0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461,-637,-3648.0,...,0,0,0,0,0,0,1,0,1,0
1,100003,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765,-1188,-1186.0,...,0,1,0,0,0,0,0,0,1,0
2,100004,0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046,-225,-4260.0,...,0,0,0,0,0,0,0,0,0,0
3,100006,0,135000.0,312682.5,29686.5,297000.0,0.008019,-19005,-3039,-9833.0,...,0,0,0,0,0,0,0,0,0,0
4,100007,0,121500.0,513000.0,21865.5,513000.0,0.028663,-19932,-3038,-4311.0,...,0,0,0,0,0,0,0,0,0,0
5,100008,0,99000.0,490495.5,27517.5,454500.0,0.035792,-16941,-1588,-4970.0,...,0,0,0,0,0,0,0,0,0,0
6,100009,1,171000.0,1560726.0,41301.0,1395000.0,0.035792,-13778,-3130,-1213.0,...,0,0,0,0,0,0,0,0,0,0
7,100010,0,360000.0,1530000.0,42075.0,1530000.0,0.003122,-18850,-449,-4597.0,...,0,0,0,0,0,0,0,0,0,0
8,100011,0,112500.0,1019610.0,33826.5,913500.0,0.018634,-20099,365243,-7427.0,...,0,0,0,0,0,0,0,0,0,0
9,100012,0,135000.0,405000.0,20250.0,405000.0,0.019689,-14469,-2019,-14437.0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
# Delete the features that are not in the test set
X_factorized = X_factorized.drop(['CODE_GENDER_XNA', 'NAME_INCOME_TYPE_Maternity leave', 'NAME_FAMILY_STATUS_Unknown'], \
                                 axis=1)
print(len(X_factorized.columns))

242


In [46]:
#Downsample our data (stratified) for speed 
train_factorized = pd.concat([X_factorized, y], axis=1)
train_downsample = train_factorized[train_factorized['TARGET'] ==1]
train_downsample = train_downsample.append(train_factorized[train_factorized['TARGET'] ==0] \
                                           .sample(frac = 0.1, replace = False))
print(train_factorized.shape, train_downsample.shape)

(307511, 243) (53094, 243)


Now we have the final training set before removing any insignificant features \
We want to merge this training set with the 4 pyspark tables that we generated previously to have all features together

In [47]:
# Convert from pandas to pyspark

train = spark.createDataFrame(train_downsample)

train.show(10)

+----------+------------+----------------+----------+-----------+---------------+--------------------------+----------+-------------+-----------------+---------------+-----------+----------+--------------+---------------+----------------+----------+----------+---------------+--------------------+---------------------------+-----------------------+--------------------------+--------------------------+---------------------------+----------------------+----------------------+-----------------------+-------------------+-------------------+-------------------+--------------+----------------+---------------------------+---------------+--------------+-------------+-------------+-------------+-------------+------------+--------------------+--------------+-----------------------+-----------------+---------------+-----------------+----------------------------+----------------+-------------------+--------------+--------------+--------------+--------------+-------------+---------------------+-----

In [48]:
# Join the all the feature tables with the training dataset

train = train.join(cash_balance_feats, on = 'SK_ID_CURR', how = 'left')
train = train.join(install_balance_feats, on = 'SK_ID_CURR', how = 'left')
train = train.join(bureau_feats, on = 'SK_ID_CURR', how = 'left')
train = train.join(prev_application_feats, on = 'SK_ID_CURR', how = 'left')

train.show(10)

+----------+------------+----------------+----------+-----------+---------------+--------------------------+----------+-------------+-----------------+---------------+-----------+----------+--------------+---------------+----------------+----------+----------+---------------+--------------------+---------------------------+-----------------------+--------------------------+--------------------------+---------------------------+----------------------+----------------------+-----------------------+-------------------+-------------------+-------------------+--------------+----------------+---------------------------+---------------+--------------+-------------+-------------+-------------+-------------+------------+--------------------+--------------+-----------------------+-----------------+-------------------+-----------------+----------------------------+----------------+---------------+--------------+--------------+--------------+--------------+-------------+---------------------+-----

In [49]:
# Convert the final pyspark table back to pandas to do variable screening (pandas is more convenient)
train = train.toPandas()

# It takes around 5 min to run this line on my pc ...

train.head()

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,20-30d_default,>30d_default,avg_DAYS_CREDIT,avg_AMT_CREDIT_MAX_OVERDUE,AMT_ANNUITY_total,prev_AMT_ANNUITY_total,prev_AMT_CREDIT_total,prev_GOODS_price_total,prev_label_YIELD_mode,prev_NFLAG_INSURED_mode
0,101552,0,126000.0,566055.0,16681.5,472500.0,0.028663,-18311,-4927,-4593.0,...,0.0,0.0,-1643.0,1300.78125,20836.8,1906.515,17523.0,17716.5,3.0,0.0
1,113770,1,49500.0,652500.0,21046.5,652500.0,0.00712,-15291,-3378,-477.0,...,0.0,0.0,-1434.285714,531.61875,36360.0,3639.06,33115.5,33115.5,1.0,0.0
2,117180,0,135000.0,835605.0,24561.0,697500.0,0.010556,-19589,-424,-2267.0,...,0.0,0.0,-1008.375,0.0,20836.8,39046.5,830655.0,722529.0,1.0,0.0
3,124966,1,126000.0,495000.0,30411.0,495000.0,0.006852,-12332,-3747,-2530.0,...,0.0,0.0,-1750.5,0.0,20836.8,6974.775,112275.0,136800.0,1.0,0.0
4,136815,0,90000.0,454500.0,15151.5,454500.0,0.006296,-23781,365243,-14303.0,...,0.0,0.0,-344.333333,0.0,20836.8,55439.415,339741.0,628515.0,2.0,0.0


In [50]:
# checking missing data
y = train['TARGET']
X_miss = train.drop(['TARGET'],axis=1)
total_null = X_miss.isnull().count()
percent = (X_miss.isnull().sum()/X_miss.isnull().count()*100)
missing_check  = pd.concat([total_null, percent], axis=1, keys=['Total Null', 'Percent']).sort_values(ascending = False, by = ['Percent'])
print(missing_check)

                                                   Total Null    Percent
COMMONAREA_MEDI                                         53094  71.644630
COMMONAREA_MODE                                         53094  71.644630
COMMONAREA_AVG                                          53094  71.644630
NONLIVINGAPARTMENTS_MEDI                                53094  71.254756
NONLIVINGAPARTMENTS_AVG                                 53094  71.254756
NONLIVINGAPARTMENTS_MODE                                53094  71.254756
pct_balance_0_1_year                                    53094  70.751874
pct_balance_total                                       53094  70.751874
pct_balance_2+_year                                     53094  70.751874
pct_balance_1_2_year                                    53094  70.751874
LIVINGAPARTMENTS_MODE                                   53094  70.281011
LIVINGAPARTMENTS_AVG                                    53094  70.281011
LIVINGAPARTMENTS_MEDI                              

In [51]:
# Approximate missing values using imputer (for all features)

# Use iterative imputer
#imputer = IterativeImputer(random_state=0, initial_strategy='median')
#train_factorized = pd.DataFrame(imputer.fit_transform(train_factorized),columns = train_factorized.columns)

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X_complete = pd.DataFrame(imputer.fit_transform(X_miss),columns = X_miss.columns)
X_complete.head(10)

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,20-30d_default,>30d_default,avg_DAYS_CREDIT,avg_AMT_CREDIT_MAX_OVERDUE,AMT_ANNUITY_total,prev_AMT_ANNUITY_total,prev_AMT_CREDIT_total,prev_GOODS_price_total,prev_label_YIELD_mode,prev_NFLAG_INSURED_mode
0,101552.0,0.0,126000.0,566055.0,16681.5,472500.0,0.028663,-18311.0,-4927.0,-4593.0,...,0.0,0.0,-1643.0,1300.78125,20836.8,1906.515,17523.0,17716.5,3.0,0.0
1,113770.0,1.0,49500.0,652500.0,21046.5,652500.0,0.00712,-15291.0,-3378.0,-477.0,...,0.0,0.0,-1434.285714,531.61875,36360.0,3639.06,33115.5,33115.5,1.0,0.0
2,117180.0,0.0,135000.0,835605.0,24561.0,697500.0,0.010556,-19589.0,-424.0,-2267.0,...,0.0,0.0,-1008.375,0.0,20836.8,39046.5,830655.0,722529.0,1.0,0.0
3,124966.0,1.0,126000.0,495000.0,30411.0,495000.0,0.006852,-12332.0,-3747.0,-2530.0,...,0.0,0.0,-1750.5,0.0,20836.8,6974.775,112275.0,136800.0,1.0,0.0
4,136815.0,0.0,90000.0,454500.0,15151.5,454500.0,0.006296,-23781.0,365243.0,-14303.0,...,0.0,0.0,-344.333333,0.0,20836.8,55439.415,339741.0,628515.0,2.0,0.0
5,137147.0,0.0,135000.0,180000.0,17932.5,180000.0,0.015221,-17721.0,-775.0,-6875.0,...,0.0,0.0,-829.166667,0.0,20836.8,69227.955,953991.0,1154322.0,2.0,0.0
6,142956.0,2.0,202500.0,1305000.0,38286.0,1305000.0,0.010147,-13316.0,-1231.0,-7035.0,...,0.0,0.0,-2089.4,0.0,20836.8,115061.58,1145340.0,1416150.0,2.0,0.0
7,143916.0,0.0,157500.0,781920.0,32998.5,675000.0,0.035792,-16410.0,-595.0,-1587.0,...,0.0,0.0,-451.166667,0.0,13117.5,69327.81,1584036.0,1609042.5,1.0,0.0
8,169522.0,0.0,108000.0,180000.0,9000.0,180000.0,0.010966,-7712.0,-264.0,-3828.0,...,0.0,0.0,-73.8,0.0,20836.8,17507.7,160092.0,160092.0,1.0,0.0
9,171935.0,2.0,123750.0,382500.0,19125.0,382500.0,0.04622,-12722.0,-1374.0,-3821.0,...,0.0,0.0,-963.8,0.0,20836.8,29107.17,31689.0,267255.0,2.0,0.0


Now we have a clean and complete training set. \
Next we need to do variable screening (feature selection)

In [52]:
# Check the variance for all features
print(X_complete.var().sort_values(kind="quicksort",ascending=True))

NAME_INCOME_TYPE_Student                             0.000000e+00
FLAG_DOCUMENT_12                                     0.000000e+00
FLAG_MOBIL                                           0.000000e+00
NAME_INCOME_TYPE_Businessman                         1.883452e-05
FLAG_DOCUMENT_10                                     3.766833e-05
FLAG_DOCUMENT_2                                      7.533382e-05
FLAG_DOCUMENT_4                                      7.533382e-05
ORGANIZATION_TYPE_Industry: type 8                   9.416550e-05
ORGANIZATION_TYPE_Trade: type 5                      1.129965e-04
ORGANIZATION_TYPE_Trade: type 4                      1.129965e-04
FLAG_DOCUMENT_7                                      1.506563e-04
FLAG_DOCUMENT_17                                     1.694851e-04
REGION_POPULATION_RELATIVE                           1.720488e-04
NAME_INCOME_TYPE_Unemployed                          2.071407e-04
ORGANIZATION_TYPE_Religion                           2.636187e-04
NAME_EDUCA

In [53]:
# Remove features with vairance lower than 
selector = VarianceThreshold(threshold=0.001)
selector.fit(X_complete)
X_var = X_complete[X_complete.columns[selector.get_support(indices=True)]]

len(X_var.columns)

229

In [62]:
# Calculate F Statistic and the corresponding p values
F_stat, p_value = f_classif(X_var, y)

# convert to a DF
ANOVA = pd.DataFrame(data = {'Features': X_var.columns.values, \
                'F-value': F_stat, 'p-value': p_value.round(decimals=10)})
ANOVA.sort_values(by = ['F-value'], ascending = False, ignore_index=True, inplace = True)
print(ANOVA)

# select the features with F-value > 3.95
index = ANOVA['F-value']>3.95
X_sel=X_var[ANOVA['Features'][index]]

                                              Features      F-value  \
0                                         EXT_SOURCE_3  4363.283099   
1                                         EXT_SOURCE_2  4241.002720   
2                                         EXT_SOURCE_1  1867.954989   
3                                      avg_DAYS_CREDIT  1238.394950   
4                                           DAYS_BIRTH  1141.815937   
5                          REGION_RATING_CLIENT_W_CITY   716.635675   
6                                 REGION_RATING_CLIENT   673.994964   
7                 NAME_EDUCATION_TYPE_Higher education   635.488809   
8                             NAME_INCOME_TYPE_Working   590.002150   
9                                 pct_balance_0_1_year   558.110440   
10                              DAYS_LAST_PHONE_CHANGE   543.963536   
11                                       CODE_GENDER_F   543.473735   
12                                       CODE_GENDER_M   543.473735   
13   N

In [73]:
# Remove features that might cause multicollinearity issues
train_cov = pd.concat([X_sel, y], axis=1,sort=False)
cor = train_cov.corr().abs()
# Select upper/lower triangle of correlation matrix
upper_triangle = cor.where(np.triu(np.ones(cor.shape), k=1).astype(np.bool))

# Find features with correlation greater than 0.85
drop_lst1 = [column for column in upper_triangle.columns if any(upper_triangle[column] > 0.85)]

print('number of features dropped that can cause multicollinearity: ' + str(len(drop_lst1)))
# Drop features 
train_cov.drop(drop_lst1, axis=1, inplace=True)
print('number of features left: ' + str(len(train_cov.columns)))

number of features dropped that can cause multicollinearity: 44
number of features left: 127


Now we have the finalized training set \
Gonna split the training set into training + valiadation

In [213]:
# Normalized X and y
y = train_cov['TARGET']
X = train_cov.drop(['TARGET'],axis=1)

sc = MinMaxScaler()
X = pd.DataFrame(sc.fit_transform(X), columns=X.columns)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

We are ready to do model fitting 

For the above training and validation sets, we are gonna fit models, do hyperparameter tuning, do cross-evaluation \
on different models to compare the model performance, then select the best model. Then we will use the best model \
to predict for the ultimate test set

Notice we will do further feature engineering and feature selection before we fit the logistic regression model \
(maybe do clustering, stepwise selection and WOE transformation)

Below we will deal with the test set

- Table 'application_test'
- Data cleaning, combine table, and feature selection based on what we did for the training set
- test set doesn't have a reponse variable

In [78]:
# Import the test set
test = pd.read_csv('application_test.csv')

test.head(10)

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,Unaccompanied,Working,Higher education,Married,House / apartment,0.01885,-19241,-2329,-5170.0,-812,,1,1,0,1,0,1,,2.0,2,2,TUESDAY,18,0,0,0,0,0,0,Kindergarten,0.752614,0.789654,0.15952,0.066,0.059,0.9732,,,,0.1379,0.125,,,,0.0505,,,0.0672,0.0612,0.9732,,,,0.1379,0.125,,,,0.0526,,,0.0666,0.059,0.9732,,,,0.1379,0.125,,,,0.0514,,,,block of flats,0.0392,"Stone, brick",No,0.0,0.0,0.0,0.0,-1740.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.035792,-18064,-4469,-9118.0,-1623,,1,1,0,1,0,0,Low-skill Laborers,2.0,2,2,FRIDAY,9,0,0,0,0,0,0,Self-employed,0.56499,0.291656,0.432962,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,,Working,Higher education,Married,House / apartment,0.019101,-20038,-4458,-2175.0,-3503,5.0,1,1,0,1,0,0,Drivers,2.0,2,2,MONDAY,14,0,0,0,0,0,0,Transport: type 3,,0.699787,0.610991,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-856.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.026392,-13976,-1866,-2000.0,-4208,,1,1,0,1,1,0,Sales staff,4.0,2,2,WEDNESDAY,11,0,0,0,0,0,0,Business Entity Type 3,0.525734,0.509677,0.612704,0.3052,0.1974,0.997,0.9592,0.1165,0.32,0.2759,0.375,0.0417,0.2042,0.2404,0.3673,0.0386,0.08,0.3109,0.2049,0.997,0.9608,0.1176,0.3222,0.2759,0.375,0.0417,0.2089,0.2626,0.3827,0.0389,0.0847,0.3081,0.1974,0.997,0.9597,0.1173,0.32,0.2759,0.375,0.0417,0.2078,0.2446,0.3739,0.0388,0.0817,reg oper account,block of flats,0.37,Panel,No,0.0,0.0,0.0,0.0,-1805.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.010032,-13040,-2191,-4000.0,-4262,16.0,1,1,1,1,0,0,,3.0,2,2,FRIDAY,5,0,0,0,0,1,1,Business Entity Type 3,0.202145,0.425687,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-821.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,
5,100042,Cash loans,F,Y,Y,0,270000.0,959688.0,34600.5,810000.0,Unaccompanied,State servant,Secondary / secondary special,Married,House / apartment,0.025164,-18604,-12009,-6116.0,-2027,10.0,1,1,0,1,1,0,Drivers,2.0,2,2,MONDAY,15,0,0,0,0,0,0,Government,,0.628904,0.392774,0.2412,0.0084,0.9821,0.7552,0.0452,0.16,0.1379,0.3333,0.375,0.1683,0.1942,0.2218,0.0116,0.0731,0.2458,0.0088,0.9821,0.7648,0.0457,0.1611,0.1379,0.3333,0.375,0.1721,0.2121,0.2311,0.0117,0.0774,0.2436,0.0084,0.9821,0.7585,0.0455,0.16,0.1379,0.3333,0.375,0.1712,0.1975,0.2258,0.0116,0.0746,not specified,block of flats,0.2151,Block,No,0.0,0.0,0.0,0.0,-1705.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,2.0
6,100057,Cash loans,M,Y,Y,2,180000.0,499221.0,22117.5,373500.0,Unaccompanied,Working,Higher education,Married,House / apartment,0.0228,-16685,-2580,-10125.0,-241,3.0,1,1,0,1,0,0,High skill tech staff,4.0,2,2,THURSDAY,9,0,0,0,0,1,1,Industry: type 9,0.760851,0.571084,0.65126,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,1.0,0.0,-1182.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
7,100065,Cash loans,M,N,Y,0,166500.0,180000.0,14220.0,180000.0,Unaccompanied,Working,Higher education,Single / not married,With parents,0.005144,-9516,-1387,-5063.0,-2055,,1,1,1,1,1,0,Core staff,1.0,2,2,FRIDAY,7,0,0,0,0,0,0,Self-employed,0.56529,0.613033,0.312365,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-1182.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
8,100066,Cash loans,F,N,Y,0,315000.0,364896.0,28957.5,315000.0,Unaccompanied,State servant,Higher education,Married,House / apartment,0.04622,-12744,-1013,-1686.0,-3171,,1,1,0,1,0,0,Core staff,2.0,1,1,THURSDAY,18,0,0,0,0,0,0,School,0.718507,0.808788,0.522697,0.1031,0.1115,0.9781,,,0.0,0.2069,0.1667,,,,,,,0.105,0.1157,0.9782,,,0.0,0.2069,0.1667,,,,,,,0.1041,0.1115,0.9781,,,0.0,0.2069,0.1667,,,,,,,,block of flats,0.0702,"Stone, brick",No,0.0,0.0,0.0,0.0,-829.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,5.0
9,100067,Cash loans,F,Y,Y,1,162000.0,45000.0,5337.0,45000.0,Family,Working,Higher education,Civil marriage,House / apartment,0.018634,-10395,-2625,-8124.0,-3041,5.0,1,1,1,1,1,0,Sales staff,3.0,2,2,TUESDAY,14,0,0,0,0,0,0,Trade: type 2,0.210562,0.444848,0.194068,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.0,0.0,4.0,0.0,-1423.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0


In [79]:
#one-hot encode the categorical variables
test_factorized = pd.get_dummies(test)  # Convert all non-numeric variables to numeric

#clean up infinite values also - just in case
test_factorized.replace([np.inf, -np.inf], np.nan)

test_factorized.head(10)

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,TOTALAREA_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,NAME_CONTRACT_TYPE_Cash loans,NAME_CONTRACT_TYPE_Revolving loans,CODE_GENDER_F,CODE_GENDER_M,FLAG_OWN_CAR_N,FLAG_OWN_CAR_Y,FLAG_OWN_REALTY_N,FLAG_OWN_REALTY_Y,NAME_TYPE_SUITE_Children,NAME_TYPE_SUITE_Family,NAME_TYPE_SUITE_Group of people,NAME_TYPE_SUITE_Other_A,NAME_TYPE_SUITE_Other_B,"NAME_TYPE_SUITE_Spouse, partner",NAME_TYPE_SUITE_Unaccompanied,NAME_INCOME_TYPE_Businessman,NAME_INCOME_TYPE_Commercial associate,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_State servant,NAME_INCOME_TYPE_Student,NAME_INCOME_TYPE_Unemployed,NAME_INCOME_TYPE_Working,NAME_EDUCATION_TYPE_Academic degree,NAME_EDUCATION_TYPE_Higher education,NAME_EDUCATION_TYPE_Incomplete higher,NAME_EDUCATION_TYPE_Lower secondary,NAME_EDUCATION_TYPE_Secondary / secondary special,NAME_FAMILY_STATUS_Civil marriage,NAME_FAMILY_STATUS_Married,NAME_FAMILY_STATUS_Separated,NAME_FAMILY_STATUS_Single / not married,NAME_FAMILY_STATUS_Widow,NAME_HOUSING_TYPE_Co-op apartment,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Office apartment,NAME_HOUSING_TYPE_Rented apartment,NAME_HOUSING_TYPE_With parents,OCCUPATION_TYPE_Accountants,OCCUPATION_TYPE_Cleaning staff,OCCUPATION_TYPE_Cooking staff,OCCUPATION_TYPE_Core staff,OCCUPATION_TYPE_Drivers,OCCUPATION_TYPE_HR staff,OCCUPATION_TYPE_High skill tech staff,OCCUPATION_TYPE_IT staff,OCCUPATION_TYPE_Laborers,OCCUPATION_TYPE_Low-skill Laborers,OCCUPATION_TYPE_Managers,OCCUPATION_TYPE_Medicine staff,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff,WEEKDAY_APPR_PROCESS_START_FRIDAY,WEEKDAY_APPR_PROCESS_START_MONDAY,WEEKDAY_APPR_PROCESS_START_SATURDAY,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY,ORGANIZATION_TYPE_Advertising,ORGANIZATION_TYPE_Agriculture,ORGANIZATION_TYPE_Bank,ORGANIZATION_TYPE_Business Entity Type 1,ORGANIZATION_TYPE_Business Entity Type 2,ORGANIZATION_TYPE_Business Entity Type 3,ORGANIZATION_TYPE_Cleaning,ORGANIZATION_TYPE_Construction,ORGANIZATION_TYPE_Culture,ORGANIZATION_TYPE_Electricity,ORGANIZATION_TYPE_Emergency,ORGANIZATION_TYPE_Government,ORGANIZATION_TYPE_Hotel,ORGANIZATION_TYPE_Housing,ORGANIZATION_TYPE_Industry: type 1,ORGANIZATION_TYPE_Industry: type 10,ORGANIZATION_TYPE_Industry: type 11,ORGANIZATION_TYPE_Industry: type 12,ORGANIZATION_TYPE_Industry: type 13,ORGANIZATION_TYPE_Industry: type 2,ORGANIZATION_TYPE_Industry: type 3,ORGANIZATION_TYPE_Industry: type 4,ORGANIZATION_TYPE_Industry: type 5,ORGANIZATION_TYPE_Industry: type 6,ORGANIZATION_TYPE_Industry: type 7,ORGANIZATION_TYPE_Industry: type 8,ORGANIZATION_TYPE_Industry: type 9,ORGANIZATION_TYPE_Insurance,ORGANIZATION_TYPE_Kindergarten,ORGANIZATION_TYPE_Legal Services,ORGANIZATION_TYPE_Medicine,ORGANIZATION_TYPE_Military,ORGANIZATION_TYPE_Mobile,ORGANIZATION_TYPE_Other,ORGANIZATION_TYPE_Police,ORGANIZATION_TYPE_Postal,ORGANIZATION_TYPE_Realtor,ORGANIZATION_TYPE_Religion,ORGANIZATION_TYPE_Restaurant,ORGANIZATION_TYPE_School,ORGANIZATION_TYPE_Security,ORGANIZATION_TYPE_Security Ministries,ORGANIZATION_TYPE_Self-employed,ORGANIZATION_TYPE_Services,ORGANIZATION_TYPE_Telecom,ORGANIZATION_TYPE_Trade: type 1,ORGANIZATION_TYPE_Trade: type 2,ORGANIZATION_TYPE_Trade: type 3,ORGANIZATION_TYPE_Trade: type 4,ORGANIZATION_TYPE_Trade: type 5,ORGANIZATION_TYPE_Trade: type 6,ORGANIZATION_TYPE_Trade: type 7,ORGANIZATION_TYPE_Transport: type 1,ORGANIZATION_TYPE_Transport: type 2,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_Transport: type 4,ORGANIZATION_TYPE_University,ORGANIZATION_TYPE_XNA,FONDKAPREMONT_MODE_not specified,FONDKAPREMONT_MODE_org spec account,FONDKAPREMONT_MODE_reg oper account,FONDKAPREMONT_MODE_reg oper spec account,HOUSETYPE_MODE_block of flats,HOUSETYPE_MODE_specific housing,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,100001,0,135000.0,568800.0,20560.5,450000.0,0.01885,-19241,-2329,-5170.0,-812,,1,1,0,1,0,1,2.0,2,2,18,0,0,0,0,0,0,0.752614,0.789654,0.15952,0.066,0.059,0.9732,,,,0.1379,0.125,,,,0.0505,,,0.0672,0.0612,0.9732,,,,0.1379,0.125,,,,0.0526,,,0.0666,0.059,0.9732,,,,0.1379,0.125,,,,0.0514,,,0.0392,0.0,0.0,0.0,0.0,-1740.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
1,100005,0,99000.0,222768.0,17370.0,180000.0,0.035792,-18064,-4469,-9118.0,-1623,,1,1,0,1,0,0,2.0,2,2,9,0,0,0,0,0,0,0.56499,0.291656,0.432962,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,100013,0,202500.0,663264.0,69777.0,630000.0,0.019101,-20038,-4458,-2175.0,-3503,5.0,1,1,0,1,0,0,2.0,2,2,14,0,0,0,0,0,0,,0.699787,0.610991,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-856.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,100028,2,315000.0,1575000.0,49018.5,1575000.0,0.026392,-13976,-1866,-2000.0,-4208,,1,1,0,1,1,0,4.0,2,2,11,0,0,0,0,0,0,0.525734,0.509677,0.612704,0.3052,0.1974,0.997,0.9592,0.1165,0.32,0.2759,0.375,0.0417,0.2042,0.2404,0.3673,0.0386,0.08,0.3109,0.2049,0.997,0.9608,0.1176,0.3222,0.2759,0.375,0.0417,0.2089,0.2626,0.3827,0.0389,0.0847,0.3081,0.1974,0.997,0.9597,0.1173,0.32,0.2759,0.375,0.0417,0.2078,0.2446,0.3739,0.0388,0.0817,0.37,0.0,0.0,0.0,0.0,-1805.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,1,0
4,100038,1,180000.0,625500.0,32067.0,625500.0,0.010032,-13040,-2191,-4000.0,-4262,16.0,1,1,1,1,0,0,3.0,2,2,5,0,0,0,0,1,1,0.202145,0.425687,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-821.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,1,0,0,1,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,100042,0,270000.0,959688.0,34600.5,810000.0,0.025164,-18604,-12009,-6116.0,-2027,10.0,1,1,0,1,1,0,2.0,2,2,15,0,0,0,0,0,0,,0.628904,0.392774,0.2412,0.0084,0.9821,0.7552,0.0452,0.16,0.1379,0.3333,0.375,0.1683,0.1942,0.2218,0.0116,0.0731,0.2458,0.0088,0.9821,0.7648,0.0457,0.1611,0.1379,0.3333,0.375,0.1721,0.2121,0.2311,0.0117,0.0774,0.2436,0.0084,0.9821,0.7585,0.0455,0.16,0.1379,0.3333,0.375,0.1712,0.1975,0.2258,0.0116,0.0746,0.2151,0.0,0.0,0.0,0.0,-1705.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,2.0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0
6,100057,2,180000.0,499221.0,22117.5,373500.0,0.0228,-16685,-2580,-10125.0,-241,3.0,1,1,0,1,0,0,4.0,2,2,9,0,0,0,0,1,1,0.760851,0.571084,0.65126,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,1.0,0.0,-1182.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,100065,0,166500.0,180000.0,14220.0,180000.0,0.005144,-9516,-1387,-5063.0,-2055,,1,1,1,1,1,0,1.0,2,2,7,0,0,0,0,0,0,0.56529,0.613033,0.312365,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-1182.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,100066,0,315000.0,364896.0,28957.5,315000.0,0.04622,-12744,-1013,-1686.0,-3171,,1,1,0,1,0,0,2.0,1,1,18,0,0,0,0,0,0,0.718507,0.808788,0.522697,0.1031,0.1115,0.9781,,,0.0,0.2069,0.1667,,,,,,,0.105,0.1157,0.9782,,,0.0,0.2069,0.1667,,,,,,,0.1041,0.1115,0.9781,,,0.0,0.2069,0.1667,,,,,,,0.0702,0.0,0.0,0.0,0.0,-829.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,5.0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
9,100067,1,162000.0,45000.0,5337.0,45000.0,0.018634,-10395,-2625,-8124.0,-3041,5.0,1,1,1,1,1,0,3.0,2,2,14,0,0,0,0,0,0,0.210562,0.444848,0.194068,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.0,0.0,4.0,0.0,-1423.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0,1,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [80]:
# Check if test set has same features as the training set
# If not, delete those features from the traning's features
lst = []
for c in X_factorized.columns:
    if c not in test_factorized.columns:
        lst.append(c)
print(lst)

[]


In [81]:
# Convert from pandas to pyspark

test = spark.createDataFrame(test_factorized)

test.show(10)

+----------+------------+----------------+----------+-----------+---------------+--------------------------+----------+-------------+-----------------+---------------+-----------+----------+--------------+---------------+----------------+----------+----------+---------------+--------------------+---------------------------+-----------------------+--------------------------+--------------------------+---------------------------+----------------------+----------------------+-----------------------+-------------------+-------------------+------------------+--------------+--------------------+---------------------------+---------------+--------------+-------------+-------------+-------------+-------------+------------+--------------------+--------------+-----------------------+-----------------+---------------+-----------------+----------------------------+----------------+---------------+--------------+--------------+--------------+--------------+-------------+---------------------+------

In [82]:
# Join the all the feature tables with the test dataset

test = test.join(cash_balance_feats, on = 'SK_ID_CURR', how = 'left')
test = test.join(install_balance_feats, on = 'SK_ID_CURR', how = 'left')
test = test.join(bureau_feats, on = 'SK_ID_CURR', how = 'left')
test = test.join(prev_application_feats, on = 'SK_ID_CURR', how = 'left')

test.show(10)

+----------+------------+----------------+----------+-----------+---------------+--------------------------+----------+-------------+-----------------+---------------+-----------+----------+--------------+---------------+----------------+----------+----------+---------------+--------------------+---------------------------+-----------------------+--------------------------+--------------------------+---------------------------+----------------------+----------------------+-----------------------+-------------------+-------------------+-------------------+--------------+----------------+---------------------------+---------------+--------------+-------------+-------------+-------------+-------------+------------+--------------------+--------------+-----------------------+-----------------+---------------+-----------------+----------------------------+----------------+---------------+--------------+--------------+--------------+--------------+-------------+---------------------+---------

In [83]:
# Convert the final pyspark table back to pandas to do variable screening (pandas is more convenient)
test = test.toPandas()

# It takes around 5 min to run this line on my pc ...

test.head()

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,TOTALAREA_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,NAME_CONTRACT_TYPE_Cash loans,NAME_CONTRACT_TYPE_Revolving loans,CODE_GENDER_F,CODE_GENDER_M,FLAG_OWN_CAR_N,FLAG_OWN_CAR_Y,FLAG_OWN_REALTY_N,FLAG_OWN_REALTY_Y,NAME_TYPE_SUITE_Children,NAME_TYPE_SUITE_Family,NAME_TYPE_SUITE_Group of people,NAME_TYPE_SUITE_Other_A,NAME_TYPE_SUITE_Other_B,"NAME_TYPE_SUITE_Spouse, partner",NAME_TYPE_SUITE_Unaccompanied,NAME_INCOME_TYPE_Businessman,NAME_INCOME_TYPE_Commercial associate,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_State servant,NAME_INCOME_TYPE_Student,NAME_INCOME_TYPE_Unemployed,NAME_INCOME_TYPE_Working,NAME_EDUCATION_TYPE_Academic degree,NAME_EDUCATION_TYPE_Higher education,NAME_EDUCATION_TYPE_Incomplete higher,NAME_EDUCATION_TYPE_Lower secondary,NAME_EDUCATION_TYPE_Secondary / secondary special,NAME_FAMILY_STATUS_Civil marriage,NAME_FAMILY_STATUS_Married,NAME_FAMILY_STATUS_Separated,NAME_FAMILY_STATUS_Single / not married,NAME_FAMILY_STATUS_Widow,NAME_HOUSING_TYPE_Co-op apartment,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Office apartment,NAME_HOUSING_TYPE_Rented apartment,NAME_HOUSING_TYPE_With parents,OCCUPATION_TYPE_Accountants,OCCUPATION_TYPE_Cleaning staff,OCCUPATION_TYPE_Cooking staff,OCCUPATION_TYPE_Core staff,OCCUPATION_TYPE_Drivers,OCCUPATION_TYPE_HR staff,OCCUPATION_TYPE_High skill tech staff,OCCUPATION_TYPE_IT staff,OCCUPATION_TYPE_Laborers,OCCUPATION_TYPE_Low-skill Laborers,OCCUPATION_TYPE_Managers,OCCUPATION_TYPE_Medicine staff,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff,WEEKDAY_APPR_PROCESS_START_FRIDAY,WEEKDAY_APPR_PROCESS_START_MONDAY,WEEKDAY_APPR_PROCESS_START_SATURDAY,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY,ORGANIZATION_TYPE_Advertising,ORGANIZATION_TYPE_Agriculture,ORGANIZATION_TYPE_Bank,ORGANIZATION_TYPE_Business Entity Type 1,ORGANIZATION_TYPE_Business Entity Type 2,ORGANIZATION_TYPE_Business Entity Type 3,ORGANIZATION_TYPE_Cleaning,ORGANIZATION_TYPE_Construction,ORGANIZATION_TYPE_Culture,ORGANIZATION_TYPE_Electricity,ORGANIZATION_TYPE_Emergency,ORGANIZATION_TYPE_Government,ORGANIZATION_TYPE_Hotel,ORGANIZATION_TYPE_Housing,ORGANIZATION_TYPE_Industry: type 1,ORGANIZATION_TYPE_Industry: type 10,ORGANIZATION_TYPE_Industry: type 11,ORGANIZATION_TYPE_Industry: type 12,ORGANIZATION_TYPE_Industry: type 13,ORGANIZATION_TYPE_Industry: type 2,ORGANIZATION_TYPE_Industry: type 3,ORGANIZATION_TYPE_Industry: type 4,ORGANIZATION_TYPE_Industry: type 5,ORGANIZATION_TYPE_Industry: type 6,ORGANIZATION_TYPE_Industry: type 7,ORGANIZATION_TYPE_Industry: type 8,ORGANIZATION_TYPE_Industry: type 9,ORGANIZATION_TYPE_Insurance,ORGANIZATION_TYPE_Kindergarten,ORGANIZATION_TYPE_Legal Services,ORGANIZATION_TYPE_Medicine,ORGANIZATION_TYPE_Military,ORGANIZATION_TYPE_Mobile,ORGANIZATION_TYPE_Other,ORGANIZATION_TYPE_Police,ORGANIZATION_TYPE_Postal,ORGANIZATION_TYPE_Realtor,ORGANIZATION_TYPE_Religion,ORGANIZATION_TYPE_Restaurant,ORGANIZATION_TYPE_School,ORGANIZATION_TYPE_Security,ORGANIZATION_TYPE_Security Ministries,ORGANIZATION_TYPE_Self-employed,ORGANIZATION_TYPE_Services,ORGANIZATION_TYPE_Telecom,ORGANIZATION_TYPE_Trade: type 1,ORGANIZATION_TYPE_Trade: type 2,ORGANIZATION_TYPE_Trade: type 3,ORGANIZATION_TYPE_Trade: type 4,ORGANIZATION_TYPE_Trade: type 5,ORGANIZATION_TYPE_Trade: type 6,ORGANIZATION_TYPE_Trade: type 7,ORGANIZATION_TYPE_Transport: type 1,ORGANIZATION_TYPE_Transport: type 2,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_Transport: type 4,ORGANIZATION_TYPE_University,ORGANIZATION_TYPE_XNA,FONDKAPREMONT_MODE_not specified,FONDKAPREMONT_MODE_org spec account,FONDKAPREMONT_MODE_reg oper account,FONDKAPREMONT_MODE_reg oper spec account,HOUSETYPE_MODE_block of flats,HOUSETYPE_MODE_specific housing,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,pct_balance_total,pct_balance_0_1_year,pct_balance_1_2_year,pct_balance_2+_year,install_default_total,no_delay_default,<=10d_default,10-20d_default,20-30d_default,>30d_default,avg_DAYS_CREDIT,avg_AMT_CREDIT_MAX_OVERDUE,AMT_ANNUITY_total,prev_AMT_ANNUITY_total,prev_AMT_CREDIT_total,prev_GOODS_price_total,prev_label_YIELD_mode,prev_NFLAG_INSURED_mode
0,104473,1,202500.0,225000.0,25447.5,225000.0,0.030755,-11684,-233,-3559.0,-4197,11.0,1,1,0,1,1,0,3.0,2,2,14,0,0,0,0,0,0,0.488005,0.57162,0.237916,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,1.0,0.0,-2099.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,1.0,1,0,0,1,0,1,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,-1153.25,4862.88,9058.5,19447.695,59143.5,172755.0,2.0,0.0
1,105403,0,90000.0,179865.0,14341.5,148500.0,0.035792,-20541,365243,-9452.0,-3671,,1,0,0,1,0,0,2.0,2,2,14,0,0,0,0,0,0,0.733277,0.564193,0.255332,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-739.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,2.0,3.0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11.624557,10.783308,0.841249,0.0,30801.06,26304.84,4496.22,0.0,0.0,0.0,-779.5,0.0,17208.0,200290.05,2702839.5,3044034.0,2.0,0.0
2,111537,0,144000.0,315000.0,13369.5,315000.0,0.026392,-10567,365243,-5046.0,-2091,,1,0,0,1,0,1,2.0,2,2,11,0,0,0,0,0,0,0.700692,0.748138,0.743559,0.0701,,0.9767,,,0.0,0.1379,0.1667,,0.0495,,0.0605,,0.0086,0.0714,,0.9767,,,0.0,0.1379,0.1667,,0.0506,,0.063,,0.0091,0.0708,,0.9767,,,0.0,0.1379,0.1667,,0.0503,,0.0615,,0.0088,0.0494,1.0,0.0,1.0,0.0,-1029.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,1.0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,,,,,268840.89,210221.28,43551.585,60.48,0.0,15007.545,-1847.0,0.0,157941.0,92454.165,1307209.5,1157719.5,3.0,1.0
3,121621,0,112500.0,599778.0,25542.0,477000.0,0.020713,-19767,-7467,-8736.0,-3207,,1,1,0,1,0,1,2.0,3,3,12,0,0,0,0,0,0,0.550916,0.459738,0.657784,0.0784,0.1019,0.9881,0.8368,0.0779,0.0,0.2069,0.1667,0.2083,0.0486,0.0639,0.0748,0.0,0.0,0.0798,0.1058,0.9881,0.8432,0.0786,0.0,0.2069,0.1667,0.2083,0.0497,0.0698,0.078,0.0,0.0,0.0791,0.1019,0.9881,0.839,0.0784,0.0,0.2069,0.1667,0.2083,0.0495,0.065,0.0762,0.0,0.0,0.0589,7.0,0.0,7.0,0.0,-2404.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,3.0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,-1286.428571,0.0,34591.5,84910.005,1212345.0,1243755.0,2.0,0.0
4,121997,0,112500.0,315000.0,16083.0,315000.0,0.025164,-11821,-3975,-432.0,-4146,,1,1,1,1,0,0,2.0,2,2,14,0,0,0,0,0,0,0.017517,0.073669,0.227613,0.0722,0.0484,0.9767,0.6804,0.0222,0.0,0.1379,0.1667,0.2083,0.0586,0.058,0.0653,0.0039,0.0596,0.0735,0.0502,0.9767,0.6929,0.0224,0.0,0.1379,0.1667,0.2083,0.0599,0.0634,0.068,0.0039,0.063,0.0729,0.0484,0.9767,0.6847,0.0224,0.0,0.1379,0.1667,0.2083,0.0596,0.059,0.0665,0.0039,0.0608,0.0643,0.0,0.0,0.0,0.0,-1791.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,1.0,3.0,2.0,3.0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,3.076487,3.076487,0.0,0.0,23216.625,23186.25,30.375,0.0,0.0,0.0,-651.625,0.0,19431.0,52120.665,626634.0,686610.0,3.0,0.0


In [85]:
# Approximate missing values using imputer (fit on the training dataset and apply on the test set)

# Use iterative imputer
#imputer = IterativeImputer(random_state=0, initial_strategy='median')
#train_factorized = pd.DataFrame(imputer.fit_transform(train_factorized),columns = train_factorized.columns)
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
test_complete = pd.DataFrame(imputer.fit(X_miss).transform(test),columns = test.columns)
test_complete.head(10)

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,TOTALAREA_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,NAME_CONTRACT_TYPE_Cash loans,NAME_CONTRACT_TYPE_Revolving loans,CODE_GENDER_F,CODE_GENDER_M,FLAG_OWN_CAR_N,FLAG_OWN_CAR_Y,FLAG_OWN_REALTY_N,FLAG_OWN_REALTY_Y,NAME_TYPE_SUITE_Children,NAME_TYPE_SUITE_Family,NAME_TYPE_SUITE_Group of people,NAME_TYPE_SUITE_Other_A,NAME_TYPE_SUITE_Other_B,"NAME_TYPE_SUITE_Spouse, partner",NAME_TYPE_SUITE_Unaccompanied,NAME_INCOME_TYPE_Businessman,NAME_INCOME_TYPE_Commercial associate,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_State servant,NAME_INCOME_TYPE_Student,NAME_INCOME_TYPE_Unemployed,NAME_INCOME_TYPE_Working,NAME_EDUCATION_TYPE_Academic degree,NAME_EDUCATION_TYPE_Higher education,NAME_EDUCATION_TYPE_Incomplete higher,NAME_EDUCATION_TYPE_Lower secondary,NAME_EDUCATION_TYPE_Secondary / secondary special,NAME_FAMILY_STATUS_Civil marriage,NAME_FAMILY_STATUS_Married,NAME_FAMILY_STATUS_Separated,NAME_FAMILY_STATUS_Single / not married,NAME_FAMILY_STATUS_Widow,NAME_HOUSING_TYPE_Co-op apartment,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Office apartment,NAME_HOUSING_TYPE_Rented apartment,NAME_HOUSING_TYPE_With parents,OCCUPATION_TYPE_Accountants,OCCUPATION_TYPE_Cleaning staff,OCCUPATION_TYPE_Cooking staff,OCCUPATION_TYPE_Core staff,OCCUPATION_TYPE_Drivers,OCCUPATION_TYPE_HR staff,OCCUPATION_TYPE_High skill tech staff,OCCUPATION_TYPE_IT staff,OCCUPATION_TYPE_Laborers,OCCUPATION_TYPE_Low-skill Laborers,OCCUPATION_TYPE_Managers,OCCUPATION_TYPE_Medicine staff,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff,WEEKDAY_APPR_PROCESS_START_FRIDAY,WEEKDAY_APPR_PROCESS_START_MONDAY,WEEKDAY_APPR_PROCESS_START_SATURDAY,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY,ORGANIZATION_TYPE_Advertising,ORGANIZATION_TYPE_Agriculture,ORGANIZATION_TYPE_Bank,ORGANIZATION_TYPE_Business Entity Type 1,ORGANIZATION_TYPE_Business Entity Type 2,ORGANIZATION_TYPE_Business Entity Type 3,ORGANIZATION_TYPE_Cleaning,ORGANIZATION_TYPE_Construction,ORGANIZATION_TYPE_Culture,ORGANIZATION_TYPE_Electricity,ORGANIZATION_TYPE_Emergency,ORGANIZATION_TYPE_Government,ORGANIZATION_TYPE_Hotel,ORGANIZATION_TYPE_Housing,ORGANIZATION_TYPE_Industry: type 1,ORGANIZATION_TYPE_Industry: type 10,ORGANIZATION_TYPE_Industry: type 11,ORGANIZATION_TYPE_Industry: type 12,ORGANIZATION_TYPE_Industry: type 13,ORGANIZATION_TYPE_Industry: type 2,ORGANIZATION_TYPE_Industry: type 3,ORGANIZATION_TYPE_Industry: type 4,ORGANIZATION_TYPE_Industry: type 5,ORGANIZATION_TYPE_Industry: type 6,ORGANIZATION_TYPE_Industry: type 7,ORGANIZATION_TYPE_Industry: type 8,ORGANIZATION_TYPE_Industry: type 9,ORGANIZATION_TYPE_Insurance,ORGANIZATION_TYPE_Kindergarten,ORGANIZATION_TYPE_Legal Services,ORGANIZATION_TYPE_Medicine,ORGANIZATION_TYPE_Military,ORGANIZATION_TYPE_Mobile,ORGANIZATION_TYPE_Other,ORGANIZATION_TYPE_Police,ORGANIZATION_TYPE_Postal,ORGANIZATION_TYPE_Realtor,ORGANIZATION_TYPE_Religion,ORGANIZATION_TYPE_Restaurant,ORGANIZATION_TYPE_School,ORGANIZATION_TYPE_Security,ORGANIZATION_TYPE_Security Ministries,ORGANIZATION_TYPE_Self-employed,ORGANIZATION_TYPE_Services,ORGANIZATION_TYPE_Telecom,ORGANIZATION_TYPE_Trade: type 1,ORGANIZATION_TYPE_Trade: type 2,ORGANIZATION_TYPE_Trade: type 3,ORGANIZATION_TYPE_Trade: type 4,ORGANIZATION_TYPE_Trade: type 5,ORGANIZATION_TYPE_Trade: type 6,ORGANIZATION_TYPE_Trade: type 7,ORGANIZATION_TYPE_Transport: type 1,ORGANIZATION_TYPE_Transport: type 2,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_Transport: type 4,ORGANIZATION_TYPE_University,ORGANIZATION_TYPE_XNA,FONDKAPREMONT_MODE_not specified,FONDKAPREMONT_MODE_org spec account,FONDKAPREMONT_MODE_reg oper account,FONDKAPREMONT_MODE_reg oper spec account,HOUSETYPE_MODE_block of flats,HOUSETYPE_MODE_specific housing,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,pct_balance_total,pct_balance_0_1_year,pct_balance_1_2_year,pct_balance_2+_year,install_default_total,no_delay_default,<=10d_default,10-20d_default,20-30d_default,>30d_default,avg_DAYS_CREDIT,avg_AMT_CREDIT_MAX_OVERDUE,AMT_ANNUITY_total,prev_AMT_ANNUITY_total,prev_AMT_CREDIT_total,prev_GOODS_price_total,prev_label_YIELD_mode,prev_NFLAG_INSURED_mode
0,104473.0,1.0,202500.0,225000.0,25447.5,225000.0,0.030755,-11684.0,-233.0,-3559.0,-4197.0,11.0,1.0,1.0,0.0,1.0,1.0,0.0,3.0,2.0,2.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.488005,0.57162,0.237916,0.0835,0.0754,0.9816,0.7552,0.0204,0.0,0.1379,0.1667,0.2083,0.0468,0.074,0.0711,0.0,0.0034,0.084,0.074,0.9811,0.7583,0.0185,0.0,0.1379,0.1667,0.2083,0.0445,0.0735,0.0707,0.0,0.0009,0.0833,0.0748,0.9816,0.7585,0.0201,0.0,0.1379,0.1667,0.2083,0.0472,0.0744,0.0719,0.0,0.0029,0.0669,1.0,0.0,1.0,0.0,-2099.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.781439,0.073774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1153.25,4862.88,9058.5,19447.695,59143.5,172755.0,2.0,0.0
1,105403.0,0.0,90000.0,179865.0,14341.5,148500.0,0.035792,-20541.0,365243.0,-9452.0,-3671.0,10.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,2.0,2.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.733277,0.564193,0.255332,0.0835,0.0754,0.9816,0.7552,0.0204,0.0,0.1379,0.1667,0.2083,0.0468,0.074,0.0711,0.0,0.0034,0.084,0.074,0.9811,0.7583,0.0185,0.0,0.1379,0.1667,0.2083,0.0445,0.0735,0.0707,0.0,0.0009,0.0833,0.0748,0.9816,0.7585,0.0201,0.0,0.1379,0.1667,0.2083,0.0472,0.0744,0.0719,0.0,0.0029,0.0669,0.0,0.0,0.0,0.0,-739.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.624557,10.783308,0.841249,0.0,30801.06,26304.84,4496.22,0.0,0.0,0.0,-779.5,0.0,17208.0,200290.05,2702839.5,3044034.0,2.0,0.0
2,111537.0,0.0,144000.0,315000.0,13369.5,315000.0,0.026392,-10567.0,365243.0,-5046.0,-2091.0,10.0,1.0,0.0,0.0,1.0,0.0,1.0,2.0,2.0,2.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.700692,0.748138,0.743559,0.0701,0.0754,0.9767,0.7552,0.0204,0.0,0.1379,0.1667,0.2083,0.0495,0.074,0.0605,0.0,0.0086,0.0714,0.074,0.9767,0.7583,0.0185,0.0,0.1379,0.1667,0.2083,0.0506,0.0735,0.063,0.0,0.0091,0.0708,0.0748,0.9767,0.7585,0.0201,0.0,0.1379,0.1667,0.2083,0.0503,0.0744,0.0615,0.0,0.0088,0.0494,1.0,0.0,1.0,0.0,-1029.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.781439,0.073774,0.0,0.0,268840.89,210221.28,43551.585,60.48,0.0,15007.545,-1847.0,0.0,157941.0,92454.165,1307209.5,1157719.5,3.0,1.0
3,121621.0,0.0,112500.0,599778.0,25542.0,477000.0,0.020713,-19767.0,-7467.0,-8736.0,-3207.0,10.0,1.0,1.0,0.0,1.0,0.0,1.0,2.0,3.0,3.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.550916,0.459738,0.657784,0.0784,0.1019,0.9881,0.8368,0.0779,0.0,0.2069,0.1667,0.2083,0.0486,0.0639,0.0748,0.0,0.0,0.0798,0.1058,0.9881,0.8432,0.0786,0.0,0.2069,0.1667,0.2083,0.0497,0.0698,0.078,0.0,0.0,0.0791,0.1019,0.9881,0.839,0.0784,0.0,0.2069,0.1667,0.2083,0.0495,0.065,0.0762,0.0,0.0,0.0589,7.0,0.0,7.0,0.0,-2404.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.781439,0.073774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1286.428571,0.0,34591.5,84910.005,1212345.0,1243755.0,2.0,0.0
4,121997.0,0.0,112500.0,315000.0,16083.0,315000.0,0.025164,-11821.0,-3975.0,-432.0,-4146.0,10.0,1.0,1.0,1.0,1.0,0.0,0.0,2.0,2.0,2.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017517,0.073669,0.227613,0.0722,0.0484,0.9767,0.6804,0.0222,0.0,0.1379,0.1667,0.2083,0.0586,0.058,0.0653,0.0039,0.0596,0.0735,0.0502,0.9767,0.6929,0.0224,0.0,0.1379,0.1667,0.2083,0.0599,0.0634,0.068,0.0039,0.063,0.0729,0.0484,0.9767,0.6847,0.0224,0.0,0.1379,0.1667,0.2083,0.0596,0.059,0.0665,0.0039,0.0608,0.0643,0.0,0.0,0.0,0.0,-1791.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,2.0,3.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,3.076487,3.076487,0.0,0.0,23216.625,23186.25,30.375,0.0,0.0,0.0,-651.625,0.0,19431.0,52120.665,626634.0,686610.0,3.0,0.0
5,123688.0,1.0,157500.0,440784.0,34956.0,360000.0,0.002134,-10277.0,-817.0,-1130.0,-1332.0,10.0,1.0,1.0,0.0,1.0,0.0,0.0,3.0,3.0,3.0,8.0,0.0,0.0,0.0,1.0,1.0,0.0,0.34664,0.618466,0.465069,0.0835,0.0754,0.9816,0.7552,0.0204,0.0,0.1379,0.1667,0.2083,0.0468,0.074,0.0711,0.0,0.0034,0.084,0.074,0.9811,0.7583,0.0185,0.0,0.1379,0.1667,0.2083,0.0445,0.0735,0.0707,0.0,0.0009,0.0833,0.0748,0.9816,0.7585,0.0201,0.0,0.1379,0.1667,0.2083,0.0472,0.0744,0.0719,0.0,0.0029,0.0669,1.0,1.0,1.0,1.0,-982.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.781439,0.073774,0.0,0.0,-17518.185,-27021.105,9502.92,0.0,0.0,0.0,-535.5,4500.0,80730.0,238993.02,2490592.5,2475155.7,1.0,0.0
6,136683.0,0.0,315000.0,790830.0,62613.0,675000.0,0.04622,-10714.0,-1069.0,-850.0,-856.0,10.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,12.0,0.0,0.0,0.0,0.0,1.0,1.0,0.319013,0.387157,0.277886,0.0835,0.0754,0.9816,0.7552,0.0204,0.0,0.1379,0.1667,0.2083,0.0468,0.074,0.0711,0.0,0.0034,0.084,0.074,0.9811,0.7583,0.0185,0.0,0.1379,0.1667,0.2083,0.0445,0.0735,0.0707,0.0,0.0009,0.0833,0.0748,0.9816,0.7585,0.0201,0.0,0.1379,0.1667,0.2083,0.0472,0.0744,0.0719,0.0,0.0029,0.0669,0.0,0.0,0.0,0.0,-734.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.781439,0.073774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-561.4,0.0,27679.5,70974.99,466128.0,672547.5,2.0,0.0
7,152892.0,0.0,135000.0,550489.5,22099.5,454500.0,0.04622,-23566.0,365243.0,-11730.0,-3937.0,10.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.441809,0.734636,0.622922,0.1485,0.0979,0.9836,0.7552,0.0204,0.16,0.1379,0.3333,0.2083,0.0468,0.074,0.1465,0.0,0.0006,0.1513,0.1016,0.9836,0.7583,0.0185,0.1611,0.1379,0.3333,0.2083,0.0445,0.0735,0.1526,0.0,0.0006,0.1499,0.0979,0.9836,0.7585,0.0201,0.16,0.1379,0.3333,0.2083,0.0472,0.0744,0.1491,0.0,0.0006,0.1153,0.0,0.0,0.0,0.0,-469.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,5.781439,0.073774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1180.5,0.0,6750.0,24226.065,255145.5,255145.5,1.0,0.0
8,153094.0,0.0,157500.0,1256103.0,40648.5,1048500.0,0.020713,-15735.0,-831.0,-7323.0,-4257.0,10.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,3.0,1.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.485601,0.458234,0.385915,0.1227,0.0,0.9771,0.7552,0.0204,0.0,0.2069,0.1667,0.2083,0.0484,0.074,0.0964,0.0,0.0029,0.125,0.0,0.9772,0.7583,0.0185,0.0,0.2069,0.1667,0.2083,0.0496,0.0735,0.1004,0.0,0.0031,0.1239,0.0,0.9771,0.7585,0.0201,0.0,0.2069,0.1667,0.2083,0.0493,0.0744,0.0981,0.0,0.003,0.0765,0.0,0.0,0.0,0.0,-650.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,5.781439,0.073774,0.0,0.0,7373.25,7101.63,0.0,271.62,0.0,0.0,-826.5,0.0,61375.5,48739.05,357012.0,369189.0,3.0,0.0
9,154377.0,0.0,225000.0,350415.0,32269.5,292500.0,0.009175,-10259.0,-862.0,-1839.0,-2745.0,7.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,2.0,2.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.151479,0.69477,0.283712,0.0835,0.0754,0.9896,0.8572,0.0146,0.0,0.1379,0.1667,0.2083,0.0468,0.0681,0.0882,0.0,0.0395,0.0851,0.074,0.9896,0.8628,0.0147,0.0,0.1379,0.1667,0.2083,0.0445,0.0744,0.0919,0.0,0.0418,0.0843,0.0748,0.9896,0.8591,0.0147,0.0,0.1379,0.1667,0.2083,0.0472,0.0693,0.0898,0.0,0.0403,0.078,0.0,0.0,0.0,0.0,-432.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,5.781439,0.073774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-435.5,0.0,48474.0,7725.87,72810.0,64296.0,2.0,0.0


In [89]:
# The finalized test set for model prediction after normalization
# Note that we would do further feature selection and feature engineering before fitting the logistic model
test = test_complete[X.columns]

sc = MinMaxScaler()
test = pd.DataFrame(sc.fit_transform(test), columns=test.columns)
test.head()

Unnamed: 0,EXT_SOURCE_3,EXT_SOURCE_2,EXT_SOURCE_1,avg_DAYS_CREDIT,DAYS_BIRTH,REGION_RATING_CLIENT_W_CITY,NAME_EDUCATION_TYPE_Higher education,NAME_INCOME_TYPE_Working,pct_balance_0_1_year,DAYS_LAST_PHONE_CHANGE,CODE_GENDER_F,DAYS_ID_PUBLISH,NAME_INCOME_TYPE_Pensioner,REG_CITY_NOT_WORK_CITY,FLAG_DOCUMENT_3,DAYS_REGISTRATION,EMERGENCYSTATE_MODE_No,FLOORSMAX_AVG,AMT_GOODS_PRICE,OCCUPATION_TYPE_Laborers,ELEVATORS_AVG,prev_label_YIELD_mode,REG_CITY_NOT_LIVE_CITY,WALLSMATERIAL_MODE_Panel,TOTALAREA_MODE,NAME_CONTRACT_TYPE_Cash loans,LIVE_CITY_NOT_WORK_CITY,FLAG_DOCUMENT_6,DEF_30_CNT_SOCIAL_CIRCLE,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_With parents,install_default_total,OCCUPATION_TYPE_Drivers,ORGANIZATION_TYPE_Self-employed,FLAG_WORK_PHONE,HOUR_APPR_PROCESS_START,NAME_FAMILY_STATUS_Single / not married,FLAG_PHONE,NAME_INCOME_TYPE_State servant,NAME_FAMILY_STATUS_Married,OCCUPATION_TYPE_Low-skill Laborers,LIVINGAPARTMENTS_AVG,FLOORSMIN_AVG,OCCUPATION_TYPE_Accountants,FONDKAPREMONT_MODE_reg oper account,ORGANIZATION_TYPE_Business Entity Type 3,NAME_FAMILY_STATUS_Civil marriage,OCCUPATION_TYPE_Core staff,BASEMENTAREA_AVG,NONLIVINGAREA_AVG,NAME_FAMILY_STATUS_Widow,FLAG_OWN_CAR_Y,OCCUPATION_TYPE_Sales staff,COMMONAREA_AVG,CNT_CHILDREN,OCCUPATION_TYPE_Managers,AMT_REQ_CREDIT_BUREAU_YEAR,ORGANIZATION_TYPE_Construction,ENTRANCES_AVG,NAME_HOUSING_TYPE_Rented apartment,OWN_CAR_AGE,OCCUPATION_TYPE_Security staff,pct_balance_1_2_year,AMT_REQ_CREDIT_BUREAU_MON,FLAG_DOCUMENT_13,AMT_ANNUITY,ORGANIZATION_TYPE_Transport: type 3,FLAG_DOCUMENT_14,OCCUPATION_TYPE_High skill tech staff,LANDAREA_MEDI,ORGANIZATION_TYPE_School,FLAG_DOCUMENT_16,prev_NFLAG_INSURED_mode,FONDKAPREMONT_MODE_reg oper spec account,pct_balance_2+_year,YEARS_BUILD_MODE,WALLSMATERIAL_MODE_Monolithic,"WALLSMATERIAL_MODE_Stone, brick",FONDKAPREMONT_MODE_org spec account,NAME_INCOME_TYPE_Commercial associate,ORGANIZATION_TYPE_Police,>30d_default,OBS_30_CNT_SOCIAL_CIRCLE,ORGANIZATION_TYPE_Military,OCCUPATION_TYPE_Private service staff,ORGANIZATION_TYPE_Security,ORGANIZATION_TYPE_Security Ministries,ORGANIZATION_TYPE_Medicine,ORGANIZATION_TYPE_Restaurant,ORGANIZATION_TYPE_Trade: type 3,ORGANIZATION_TYPE_Bank,NAME_TYPE_SUITE_Family,OCCUPATION_TYPE_Medicine staff,prev_GOODS_price_total,AMT_REQ_CREDIT_BUREAU_QRT,NAME_EDUCATION_TYPE_Lower secondary,OCCUPATION_TYPE_Waiters/barmen staff,NAME_TYPE_SUITE_Unaccompanied,FLAG_DOCUMENT_8,<=10d_default,ORGANIZATION_TYPE_Industry: type 3,ORGANIZATION_TYPE_Trade: type 7,OCCUPATION_TYPE_Cooking staff,FLAG_DOCUMENT_18,ORGANIZATION_TYPE_Trade: type 6,ORGANIZATION_TYPE_Industry: type 12,WALLSMATERIAL_MODE_Wooden,WALLSMATERIAL_MODE_Block,ORGANIZATION_TYPE_Government,ORGANIZATION_TYPE_Kindergarten,REG_REGION_NOT_WORK_REGION,ORGANIZATION_TYPE_University,ORGANIZATION_TYPE_Industry: type 9,WEEKDAY_APPR_PROCESS_START_TUESDAY,ORGANIZATION_TYPE_Industry: type 1,WEEKDAY_APPR_PROCESS_START_MONDAY,avg_AMT_CREDIT_MAX_OVERDUE,YEARS_BEGINEXPLUATATION_AVG,ORGANIZATION_TYPE_Agriculture,OCCUPATION_TYPE_Cleaning staff,WEEKDAY_APPR_PROCESS_START_SATURDAY,ORGANIZATION_TYPE_Insurance,NAME_TYPE_SUITE_Other_B,ORGANIZATION_TYPE_Industry: type 4,FLAG_OWN_REALTY_Y,HOUSETYPE_MODE_specific housing
0,0.237916,0.57162,0.488005,-1153.25,-11684.0,2.0,0.0,0.0,0.073774,-2099.0,0.0,-4197.0,0.0,0.0,1.0,-3559.0,0.0,0.1667,225000.0,0.0,0.0,2.0,0.0,0.0,0.0669,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,14.0,0.0,1.0,0.0,1.0,0.0,0.074,0.2083,0.0,0.0,1.0,0.0,0.0,0.0754,0.0034,0.0,1.0,0.0,0.0204,1.0,0.0,1.0,0.0,0.1379,0.0,11.0,0.0,0.0,0.0,0.0,25447.5,0.0,0.0,0.0,0.0472,0.0,0.0,0.0,0.0,0.0,0.7583,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,172755.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4862.88,0.9816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.255332,0.564193,0.733277,-779.5,-20541.0,2.0,0.0,0.0,10.783308,-739.0,1.0,-3671.0,1.0,0.0,1.0,-9452.0,0.0,0.1667,148500.0,0.0,0.0,2.0,0.0,0.0,0.0669,1.0,0.0,0.0,0.0,1.0,0.0,30801.06,0.0,0.0,0.0,14.0,0.0,0.0,0.0,1.0,0.0,0.074,0.2083,0.0,0.0,0.0,0.0,0.0,0.0754,0.0034,0.0,0.0,0.0,0.0204,0.0,0.0,3.0,0.0,0.1379,0.0,10.0,0.0,0.841249,0.0,0.0,14341.5,0.0,0.0,0.0,0.0472,0.0,0.0,0.0,0.0,0.0,0.7583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3044034.0,2.0,0.0,0.0,1.0,0.0,4496.22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9816,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.743559,0.748138,0.700692,-1847.0,-10567.0,2.0,1.0,0.0,0.073774,-1029.0,1.0,-2091.0,1.0,0.0,1.0,-5046.0,1.0,0.1667,315000.0,0.0,0.0,3.0,0.0,0.0,0.0494,1.0,0.0,0.0,0.0,1.0,0.0,268840.89,0.0,0.0,0.0,11.0,0.0,0.0,0.0,1.0,0.0,0.074,0.2083,0.0,0.0,0.0,0.0,0.0,0.0754,0.0086,0.0,0.0,0.0,0.0204,0.0,0.0,1.0,0.0,0.1379,0.0,10.0,0.0,0.0,0.0,0.0,13369.5,0.0,0.0,0.0,0.0503,0.0,0.0,1.0,0.0,0.0,0.7583,0.0,1.0,0.0,0.0,0.0,15007.545,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1157719.5,1.0,0.0,0.0,1.0,0.0,43551.585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.9767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.657784,0.459738,0.550916,-1286.428571,-19767.0,3.0,0.0,1.0,0.073774,-2404.0,0.0,-3207.0,0.0,0.0,1.0,-8736.0,1.0,0.1667,477000.0,1.0,0.0,2.0,0.0,0.0,0.0589,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,1.0,0.0,0.0639,0.2083,0.0,1.0,1.0,0.0,0.0,0.1019,0.0,0.0,0.0,0.0,0.0779,0.0,0.0,3.0,0.0,0.2069,0.0,10.0,0.0,0.0,0.0,0.0,25542.0,0.0,0.0,0.0,0.0495,0.0,0.0,0.0,0.0,0.0,0.8432,0.0,1.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1243755.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.9881,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.227613,0.073669,0.017517,-651.625,-11821.0,2.0,0.0,0.0,3.076487,-1791.0,0.0,-4146.0,0.0,0.0,1.0,-432.0,1.0,0.1667,315000.0,0.0,0.0,3.0,0.0,0.0,0.0643,1.0,0.0,0.0,0.0,1.0,0.0,23216.625,0.0,0.0,1.0,14.0,0.0,0.0,0.0,1.0,0.0,0.058,0.2083,0.0,1.0,0.0,0.0,0.0,0.0484,0.0596,0.0,0.0,0.0,0.0222,0.0,0.0,3.0,0.0,0.1379,0.0,10.0,0.0,0.0,3.0,0.0,16083.0,0.0,0.0,0.0,0.0596,0.0,0.0,0.0,0.0,0.0,0.6929,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,686610.0,2.0,0.0,0.0,1.0,0.0,30.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.9767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- Additional Feature Engineering for Logistic Regression

Deal with training set

In [90]:
# Stepwise logistic regression to do further feature selection
import statsmodels.api as sm

def stepwise_selection(X, y, 
                       initial_list=[], 
                       threshold_in=0.01, 
                       threshold_out = 0.05, 
                       verbose=True):

    included = list(initial_list)
    while True:
        changed=False
        # forward step
        excluded = list(set(X.columns)-set(included))
        pv_list = pd.DataFrame(columns=('VAR_name', 'P-value'))
        for new_column in excluded:
            model = sm.Logit(y, sm.add_constant(pd.DataFrame(X[included+[new_column]]))).fit()
            new_pval = pd.Series([new_column, model.pvalues[new_column]], index=pv_list.columns)
            pv_list = pv_list.append(new_pval,ignore_index=True)
        best_pval = pv_list['P-value'].min()
        if best_pval < threshold_in:
            best_feature = pv_list['VAR_name'].iloc[pv_list['P-value'].idxmin(axis=1)]
            included.append(best_feature)
            changed=True
            if verbose:
                print('Add {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # backward step
        model = sm.Logit(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max() # null if pvalues is empty
        if worst_pval > threshold_out:
            changed=True
            worst_feature = included[pvalues.argmax()]
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included

result = stepwise_selection(X, y)
print('resulting features:')
print(result)

Optimization terminated successfully.
         Current function value: 0.690137
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.690261
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.690931
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.690775
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.690937
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.690780
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.687597
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.690971
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.690398
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.690214
  

Optimization terminated successfully.
         Current function value: 0.690504
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.689874
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.690929
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.686932
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.690738
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.688204
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.690722
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.690342
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.687090
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.690910
  

Optimization terminated successfully.
         Current function value: 0.651083
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.645693
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.651412
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.651476
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.651696
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.651648
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.651633
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.650962
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.647066
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.650970
  

Add EXT_SOURCE_2                   with p-value 0.0
Optimization terminated successfully.
         Current function value: 0.621526
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.621028
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.621267
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.621515
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.621346
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.621521
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.621508
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.619131
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.621450
         Iterations 5
Optimization terminated succes

Optimization terminated successfully.
         Current function value: 0.621488
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.619338
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.621299
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.619705
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.621408
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.621249
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.620131
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.621391
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.621306
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.621320
  

Optimization terminated successfully.
         Current function value: 0.613214
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.613125
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.613237
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.613138
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.610941
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.613038
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.613312
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.613078
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.613164
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.613305
  

Optimization terminated successfully.
         Current function value: 0.610098
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.610197
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.610360
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.610249
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.610396
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.610384
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.608570
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.610289
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.610292
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.610015
  

Optimization terminated successfully.
         Current function value: 0.610375
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.609284
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.610193
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.609343
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.610300
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.610254
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.609595
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.610293
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.610151
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.610239
  

Optimization terminated successfully.
         Current function value: 0.607615
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.607761
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.607706
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.607808
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.607844
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.605443
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.607561
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.607862
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.607690
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.607731
  

Optimization terminated successfully.
         Current function value: 0.605316
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.605219
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.605330
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.605434
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.605442
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.603478
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.605289
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.605324
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.605189
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.604682
  

Optimization terminated successfully.
         Current function value: 0.605260
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.605191
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.605376
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.605024
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.604965
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.605397
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.605348
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.605323
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.605424
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.605442
  

Optimization terminated successfully.
         Current function value: 0.603448
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.603274
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.603424
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.603409
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.602944
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.603477
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.603312
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.603402
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.603415
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.603447
  

Optimization terminated successfully.
         Current function value: 0.602265
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.602365
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.602363
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.602199
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.602242
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.602152
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.601618
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.602167
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.602354
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.602347
  

Optimization terminated successfully.
         Current function value: 0.601972
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.602299
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.602281
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.602246
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.602316
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.602366
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.602339
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.601772
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.602363
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.601846
  

Optimization terminated successfully.
         Current function value: 0.601524
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.601367
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.601298
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.601475
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.601498
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.601452
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.601188
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.601416
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.601484
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.601468
  

Optimization terminated successfully.
         Current function value: 0.600629
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.600570
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.600023
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.600565
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.600760
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.600746
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.600440
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.600322
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.600764
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.600393
  

Optimization terminated successfully.
         Current function value: 0.600745
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.600169
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.600764
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.600364
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.600524
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.600679
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.600655
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.600754
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.600729
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.600646
  

Optimization terminated successfully.
         Current function value: 0.599968
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.600014
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.599787
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.599902
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.599940
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.599683
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.599766
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.599880
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.600000
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.599940
  

Optimization terminated successfully.
         Current function value: 0.599414
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.599101
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.599432
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.599262
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.599292
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.599369
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.599327
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.599347
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.599424
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.599298
  

Optimization terminated successfully.
         Current function value: 0.599035
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.599432
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.599141
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.599336
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.599431
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.599429
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.599146
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.599127
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.599398
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.599338
  

Optimization terminated successfully.
         Current function value: 0.598895
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.598809
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.598900
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.598771
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.598827
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.598697
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.598638
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.598855
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.598782
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.598882
  

Optimization terminated successfully.
         Current function value: 0.598432
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.598399
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.598290
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.598289
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.598456
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.598284
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.598428
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.598417
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597981
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.598477
  

Optimization terminated successfully.
         Current function value: 0.597878
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597980
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597980
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597788
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597836
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597866
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597810
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597787
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597967
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597614
  

Optimization terminated successfully.
         Current function value: 0.597966
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597978
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597698
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597749
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597893
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597860
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.597974
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597946
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597857
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597897
  

Optimization terminated successfully.
         Current function value: 0.597434
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597428
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597274
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597425
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597482
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597422
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.597477
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597460
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597429
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597507
  

Optimization terminated successfully.
         Current function value: 0.597101
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597103
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596995
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597031
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596911
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596999
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597067
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597007
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597067
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597050
  

Optimization terminated successfully.
         Current function value: 0.596881
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597065
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597085
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597063
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.597097
         Iterations 5
Add install_default_total          with p-value 2.27243e-10
Optimization terminated successfully.
         Current function value: 0.596719
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596675
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596665
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596672
         Iterations 5
Optimization terminate

Optimization terminated successfully.
         Current function value: 0.596683
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596706
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596488
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596611
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596646
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596627
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596656
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596657
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596719
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596705
  

Optimization terminated successfully.
         Current function value: 0.596325
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596396
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596367
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596392
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596423
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596394
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596359
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596350
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596190
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596344
  

Optimization terminated successfully.
         Current function value: 0.595965
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595971
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596018
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595990
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596069
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596071
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595960
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596078
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595877
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595968
  

Optimization terminated successfully.
         Current function value: 0.595900
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595858
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596039
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596054
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596030
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.596064
         Iterations 5
Add ORGANIZATION_TYPE_Self-employed with p-value 1.37072e-07
Optimization terminated successfully.
         Current function value: 0.595817
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595772
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595767
         Iterations 5
Optimization terminat

Optimization terminated successfully.
         Current function value: 0.595597
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595765
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595806
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595686
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595697
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595761
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595735
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595745
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595766
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595817
  

Optimization terminated successfully.
         Current function value: 0.595509
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595536
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595557
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595549
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595500
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595495
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595408
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595488
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595568
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595492
  

Optimization terminated successfully.
         Current function value: 0.595303
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595207
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595311
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595129
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595229
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595269
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595199
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595282
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595249
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595298
  

Optimization terminated successfully.
         Current function value: 0.595024
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595019
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595068
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595026
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595069
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595029
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594942
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594934
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594935
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594975
  

Optimization terminated successfully.
         Current function value: 0.595037
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595060
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594995
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595020
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594941
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.595047
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595065
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594963
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595047
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595064
  

Optimization terminated successfully.
         Current function value: 0.594776
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594843
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594843
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594827
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594842
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594635
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594824
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594705
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594707
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594816
  

Optimization terminated successfully.
         Current function value: 0.594554
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594632
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594595
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594446
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594528
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594616
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594609
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594599
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594564
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594612
  

Optimization terminated successfully.
         Current function value: 0.594411
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594317
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594316
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594404
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594275
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594406
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594406
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594311
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594415
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594262
  

Optimization terminated successfully.
         Current function value: 0.594403
         Iterations 5
Add OWN_CAR_AGE                    with p-value 5.80747e-06
Optimization terminated successfully.
         Current function value: 0.594222
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594175
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594174
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594215
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594184
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594221
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594183
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594117
         Iterations 5
Optimization terminate

Optimization terminated successfully.
         Current function value: 0.594221
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594186
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594208
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594155
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594184
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594110
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.594193
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594217
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594124
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594199
  

Optimization terminated successfully.
         Current function value: 0.594000
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594017
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594004
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.593929
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.593892
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.593993
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594019
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594003
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.594009
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.593990
  

Optimization terminated successfully.
         Current function value: 0.593828
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.593796
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.593836
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.593802
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.593825
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.593845
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.593842
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.593797
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.593791
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.593735
  

Optimization terminated successfully.
         Current function value: 0.593701
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.593700
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.593686
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.593685
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.593679
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.593702
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.593698
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.593691
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.593675
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.593701
  

Optimization terminated successfully.
         Current function value: 0.593460
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593499
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593514
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593469
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593458
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593491
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593418
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593512
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593462
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593473
  

Optimization terminated successfully.
         Current function value: 0.593507
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593458
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593518
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593497
         Iterations 6
Add OCCUPATION_TYPE_Security staff with p-value 0.000126441
Optimization terminated successfully.
         Current function value: 0.593390
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593390
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593336
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593362
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593326
         Iterations 6
Optimization terminate

Optimization terminated successfully.
         Current function value: 0.593264
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593372
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593340
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593291
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593318
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593332
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593333
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593368
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593353
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593322
  

Optimization terminated successfully.
         Current function value: 0.593240
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593133
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593239
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593261
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593247
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593242
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593224
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593245
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593194
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593146
  

Optimization terminated successfully.
         Current function value: 0.593032
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593045
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593107
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593120
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593089
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593136
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593068
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593061
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593138
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593132
  

Optimization terminated successfully.
         Current function value: 0.592961
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592890
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592990
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.593005
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592974
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592918
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592950
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592991
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592966
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592966
  

Optimization terminated successfully.
         Current function value: 0.592896
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592895
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592882
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592877
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592886
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592897
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592896
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592887
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592873
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592896
  

Optimization terminated successfully.
         Current function value: 0.592757
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592709
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592720
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592757
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592714
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592694
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592775
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592690
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.592725
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592775
  

Optimization terminated successfully.
         Current function value: 0.592641
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592635
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592575
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592611
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592628
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592638
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592624
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592605
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592531
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592623
  

Optimization terminated successfully.
         Current function value: 0.592451
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592445
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592399
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592434
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592413
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592451
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592420
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592449
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592449
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592386
  

Optimization terminated successfully.
         Current function value: 0.592371
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592376
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592395
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592384
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592440
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592425
         Iterations 6
Add ORGANIZATION_TYPE_Industry: type 9 with p-value 0.000329283
Optimization terminated successfully.
         Current function value: 0.592328
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592321
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592278
         Iterations 6
Optimization termi

Optimization terminated successfully.
         Current function value: 0.592279
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592303
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592287
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592261
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592320
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592323
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592250
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592251
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592274
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592262
  

Optimization terminated successfully.
         Current function value: 0.592174
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592211
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592178
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592199
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592156
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592164
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592191
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592171
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592143
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592203
  

Optimization terminated successfully.
         Current function value: 0.592103
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592063
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592072
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592101
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592077
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592114
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592082
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592102
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592059
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592067
  

Optimization terminated successfully.
         Current function value: 0.592006
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592024
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591986
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592016
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591973
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591975
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592010
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591986
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.592024
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591992
  

Optimization terminated successfully.
         Current function value: 0.591918
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591942
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591924
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591943
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591905
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591935
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591917
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591894
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591929
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591906
  

Optimization terminated successfully.
         Current function value: 0.591840
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591836
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591860
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591858
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591861
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591823
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591852
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591835
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591811
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591847
  

Optimization terminated successfully.
         Current function value: 0.591783
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591763
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591760
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591784
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591782
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591785
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591747
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591776
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591759
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591734
  

Optimization terminated successfully.
         Current function value: 0.591710
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591690
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591693
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591711
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591709
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591712
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591682
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591703
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591686
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591662
  

Optimization terminated successfully.
         Current function value: 0.591619
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591637
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591635
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591638
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591609
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591629
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591612
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591584
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591625
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591603
  

Optimization terminated successfully.
         Current function value: 0.591550
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591566
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591566
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591566
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591568
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591554
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591552
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591566
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591525
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591509
  

Optimization terminated successfully.
         Current function value: 0.591487
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591472
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591470
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591484
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591450
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591427
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591488
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591457
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591457
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591474
  

Optimization terminated successfully.
         Current function value: 0.591317
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591379
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591349
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591350
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591365
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591370
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591374
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591378
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591326
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591372
  

Optimization terminated successfully.
         Current function value: 0.591306
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591310
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591258
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591304
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591277
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591311
         Iterations 6
Add ORGANIZATION_TYPE_Security Ministries with p-value 0.00967953
Optimization terminated successfully.
         Current function value: 0.591247
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591246
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.591148
         Iterations 7
Optimization ter

In [91]:
# Final selected variables
features_lst = pd.DataFrame({"variables":result})
print(features_lst)
print('final number of variables: ' + str(len(features_lst)))

                                   variables
0                               EXT_SOURCE_3
1                               EXT_SOURCE_2
2                               EXT_SOURCE_1
3                       pct_balance_0_1_year
4       NAME_EDUCATION_TYPE_Higher education
5                              CODE_GENDER_F
6                            FLAG_DOCUMENT_3
7                             FLAG_OWN_CAR_Y
8                            DAYS_ID_PUBLISH
9                REGION_RATING_CLIENT_W_CITY
10                  DEF_30_CNT_SOCIAL_CIRCLE
11                     prev_label_YIELD_mode
12                  NAME_INCOME_TYPE_Working
13                               AMT_ANNUITY
14                NAME_FAMILY_STATUS_Married
15                  WALLSMATERIAL_MODE_Panel
16                    DAYS_LAST_PHONE_CHANGE
17                     install_default_total
18                 AMT_REQ_CREDIT_BUREAU_QRT
19                    prev_GOODS_price_total
20           ORGANIZATION_TYPE_Self-employed
21  ORGANI

In [214]:
# Select the features form the stepwise selection
X_logit = X[result].copy()

In [215]:
# WOE transformation. Deal with categorical and cts variables separately
print(X_logit.nunique())
index_categ = X_logit.nunique()<7
index_numeric = X_logit.nunique()>=7
df_categ = X_logit[X_logit.columns[index_categ]]
df_numeric = X_logit[X_logit.columns[index_numeric]]

print(df_categ.nunique())
print('number of continuous features: ' + str(len(df_numeric.columns)))
print('number of categorical features: ' + str(len(df_categ.columns)))

EXT_SOURCE_3                                  776
EXT_SOURCE_2                                42307
EXT_SOURCE_1                                21926
pct_balance_0_1_year                         8046
NAME_EDUCATION_TYPE_Higher education            2
CODE_GENDER_F                                   2
FLAG_DOCUMENT_3                                 2
FLAG_OWN_CAR_Y                                  2
DAYS_ID_PUBLISH                              5744
REGION_RATING_CLIENT_W_CITY                     3
DEF_30_CNT_SOCIAL_CIRCLE                        7
prev_label_YIELD_mode                           3
NAME_INCOME_TYPE_Working                        2
AMT_ANNUITY                                  8746
NAME_FAMILY_STATUS_Married                      2
WALLSMATERIAL_MODE_Panel                        2
DAYS_LAST_PHONE_CHANGE                       3366
install_default_total                       24738
AMT_REQ_CREDIT_BUREAU_QRT                      10
prev_GOODS_price_total                      41999


In [216]:
# WOE transformation

# Categorical variables
for c in df_categ.columns:
    col_binned = df_categ[c].copy()
    cross = pd.crosstab(col_binned, y)
    for i in range(len(cross.index)):
        # if some bins have 0 event, use adjust WOE by adding 0.5
        if cross.iloc[i,0] == 0 or cross.iloc[i,1] == 0:
            non_default = (cross.iloc[i,0]+0.5)/cross.iloc[:,0].values.sum()
            default = (cross.iloc[i,1]+0.5)/cross.iloc[:,1].values.sum()
            woe = np.log(non_default/default)
        else:
            non_default = cross.iloc[i,0]/cross.iloc[:,0].values.sum()
            default = cross.iloc[i,1]/cross.iloc[:,1].values.sum()
            woe = np.log(non_default/default)
        col_binned[col_binned == cross.index[i]] = woe
    df_categ = df_categ.drop(c,axis=1)
    df_categ = pd.concat((df_categ, col_binned.rename('WOE_' + c)), axis=1)
df_categ

Unnamed: 0,WOE_NAME_EDUCATION_TYPE_Higher education,WOE_CODE_GENDER_F,WOE_FLAG_DOCUMENT_3,WOE_FLAG_OWN_CAR_Y,WOE_REGION_RATING_CLIENT_W_CITY,WOE_prev_label_YIELD_mode,WOE_NAME_INCOME_TYPE_Working,WOE_NAME_FAMILY_STATUS_Married,WOE_WALLSMATERIAL_MODE_Panel,WOE_ORGANIZATION_TYPE_Self-employed,WOE_ORGANIZATION_TYPE_Business Entity Type 3,WOE_NAME_CONTRACT_TYPE_Cash loans,WOE_ORGANIZATION_TYPE_Construction,WOE_ORGANIZATION_TYPE_Transport: type 3,WOE_FLAG_DOCUMENT_16,WOE_FLAG_DOCUMENT_18,WOE_FLAG_DOCUMENT_13,WOE_OCCUPATION_TYPE_Security staff,WOE_REG_CITY_NOT_LIVE_CITY,WOE_ORGANIZATION_TYPE_Military,WOE_OCCUPATION_TYPE_Drivers,WOE_OCCUPATION_TYPE_Low-skill Laborers,WOE_OCCUPATION_TYPE_Laborers,WOE_OCCUPATION_TYPE_Sales staff,WOE_ORGANIZATION_TYPE_Industry: type 9,WOE_FLAG_DOCUMENT_14,WOE_WEEKDAY_APPR_PROCESS_START_SATURDAY,WOE_WEEKDAY_APPR_PROCESS_START_MONDAY,WOE_NAME_HOUSING_TYPE_House / apartment,WOE_ORGANIZATION_TYPE_Trade: type 3,WOE_FLAG_DOCUMENT_6,WOE_FLAG_WORK_PHONE,WOE_FLAG_PHONE,WOE_ORGANIZATION_TYPE_Police,WOE_ORGANIZATION_TYPE_Security Ministries
0,-0.111635,0.158120,-0.098265,-0.052305,0.023796,-0.173113,-0.186532,0.069433,-0.063358,0.040891,0.045474,-0.034804,0.010787,0.003982,-0.004501,-0.002433,-0.002938,0.009831,-0.422258,-0.003466,0.028177,0.008713,0.076457,0.026672,-0.002423,-0.002583,-0.007623,-0.009725,0.04156,0.003944,-0.032961,0.051120,-0.05466,-0.00341,-0.002691
1,-0.111635,0.158120,-0.098265,-0.052305,0.023796,0.289740,-0.186532,0.069433,-0.063358,0.040891,0.045474,-0.034804,0.010787,0.003982,-0.004501,-0.002433,-0.002938,0.009831,0.045215,-0.003466,0.028177,0.008713,0.076457,0.026672,-0.002423,-0.002583,-0.007623,-0.009725,0.04156,0.003944,-0.032961,-0.184029,-0.05466,-0.00341,-0.002691
2,-0.111635,0.158120,0.283560,-0.052305,-0.404551,0.289740,0.238883,0.069433,0.264034,0.040891,0.045474,-0.034804,0.010787,0.003982,-0.004501,-0.002433,-0.002938,0.009831,0.045215,-0.003466,0.028177,0.008713,-0.298964,0.026672,-0.002423,-0.002583,-0.007623,0.050117,0.04156,-0.311682,-0.032961,0.051120,-0.05466,-0.00341,-0.002691
3,0.437263,0.158120,-0.098265,-0.052305,-0.404551,0.289740,0.238883,0.069433,-0.063358,0.040891,0.045474,-0.034804,0.010787,0.003982,-0.004501,-0.002433,-0.002938,0.009831,0.045215,-0.003466,0.028177,0.008713,0.076457,0.026672,-0.002423,-0.002583,-0.007623,-0.009725,0.04156,0.003944,-0.032961,0.051120,-0.05466,-0.00341,-0.002691
4,-0.111635,0.158120,0.283560,-0.052305,-0.404551,-0.034249,0.238883,-0.113213,-0.063358,0.040891,0.045474,-0.034804,0.010787,0.003982,-0.004501,-0.002433,-0.002938,0.009831,0.045215,-0.003466,0.028177,0.008713,0.076457,0.026672,-0.002423,-0.002583,-0.007623,-0.009725,0.04156,0.003944,0.406590,0.051120,-0.05466,-0.00341,-0.002691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53089,-0.111635,0.158120,-0.098265,0.109776,-0.404551,-0.034249,0.238883,0.069433,-0.063358,0.040891,0.045474,-0.034804,0.010787,0.003982,-0.004501,-0.002433,-0.002938,0.009831,0.045215,-0.003466,0.028177,0.008713,0.076457,0.026672,-0.002423,-0.002583,-0.007623,-0.009725,0.04156,0.003944,-0.032961,0.051120,-0.05466,-0.00341,-0.002691
53090,-0.111635,-0.258537,-0.098265,0.109776,0.023796,-0.034249,-0.186532,0.069433,-0.063358,0.040891,0.045474,-0.034804,0.010787,0.003982,-0.004501,-0.002433,-0.002938,0.009831,0.045215,-0.003466,0.028177,0.008713,-0.298964,0.026672,-0.002423,-0.002583,-0.007623,-0.009725,0.04156,0.003944,-0.032961,-0.184029,-0.05466,-0.00341,-0.002691
53091,-0.111635,-0.258537,-0.098265,-0.052305,0.023796,-0.034249,-0.186532,0.069433,-0.063358,0.040891,-0.146338,-0.034804,0.010787,0.003982,-0.004501,-0.002433,-0.002938,0.009831,0.045215,-0.003466,-0.368362,0.008713,0.076457,0.026672,-0.002423,-0.002583,-0.007623,-0.009725,0.04156,0.003944,-0.032961,-0.184029,0.15183,-0.00341,-0.002691
53092,-0.111635,0.158120,0.283560,0.109776,0.023796,-0.034249,0.238883,0.069433,-0.063358,0.040891,0.045474,-0.034804,0.010787,-0.745903,-0.004501,-0.002433,-0.002938,0.009831,0.045215,-0.003466,-0.368362,0.008713,0.076457,0.026672,-0.002423,-0.002583,-0.007623,-0.009725,0.04156,0.003944,-0.032961,0.051120,-0.05466,-0.00341,-0.002691


In [217]:
# Continuous variables
for c in df_numeric.columns:
    col = df_numeric[c]
    col_binned = pd.cut(col, bins=5, labels = [1,2,3,4,5])
    col_binned = pd.to_numeric(col_binned)
    cross = pd.crosstab(col_binned, y)
    for i in range(len(cross.index)):
        # if some bins have 0 event, use adjust WOE by adding 0.5
        if cross.iloc[i,0] == 0 or cross.iloc[i,1] == 0:
            non_default = (cross.iloc[i,0]+0.5)/cross.iloc[:,0].values.sum()
            default = (cross.iloc[i,1]+0.5)/cross.iloc[:,1].values.sum()
            woe = np.log(non_default/default)
        else:
            non_default = cross.iloc[i,0]/cross.iloc[:,0].values.sum()
            default = cross.iloc[i,1]/cross.iloc[:,1].values.sum()
            woe = np.log(non_default/default)
        col_binned[col_binned == cross.index[i]] = woe
    df_numeric = df_numeric.drop(c,axis=1)
    df_numeric = pd.concat((df_numeric, col_binned.rename('WOE_' + c)), axis=1)
df_numeric

Unnamed: 0,WOE_EXT_SOURCE_3,WOE_EXT_SOURCE_2,WOE_EXT_SOURCE_1,WOE_pct_balance_0_1_year,WOE_DAYS_ID_PUBLISH,WOE_DEF_30_CNT_SOCIAL_CIRCLE,WOE_AMT_ANNUITY,WOE_DAYS_LAST_PHONE_CHANGE,WOE_install_default_total,WOE_AMT_REQ_CREDIT_BUREAU_QRT,WOE_prev_GOODS_price_total,WOE_pct_balance_1_2_year,WOE_AMT_GOODS_PRICE,WOE_OWN_CAR_AGE,WOE_DAYS_REGISTRATION,WOE_>30d_default,WOE_CNT_CHILDREN,WOE_avg_DAYS_CREDIT,WOE_FLOORSMAX_AVG
0,-0.057778,0.872285,-0.019414,0.068969,-0.122416,0.0123,-0.035245,0.286152,-0.004197,0.000922,0.002224,0.014448,-0.072419,0.009267,-0.111779,0.000785,0.002894,0.402655,-0.072515
1,-0.057778,-1.038548,-0.019414,0.068969,0.048055,0.0123,-0.035245,-0.165949,-0.004197,0.000922,0.002224,0.014448,-0.072419,0.009267,-0.111779,0.000785,0.002894,0.402655,-0.072515
2,-0.057778,0.290631,1.104975,0.068969,0.048055,0.0123,-0.035245,0.128415,-0.004197,0.000922,0.002224,0.014448,-0.072419,0.009267,-0.111779,0.000785,0.002894,-0.059609,0.327115
3,0.601663,-0.487754,-0.019414,0.068969,0.048055,0.0123,-0.035245,-0.165949,-0.004197,0.000922,0.002224,0.014448,-0.072419,0.009267,-0.111779,0.000785,0.002894,0.402655,-0.072515
4,-0.057778,0.290631,-0.019414,0.068969,0.309422,0.0123,-0.035245,0.128415,-0.004197,0.000922,0.002224,0.014448,-0.072419,0.009267,0.372959,0.000785,0.002894,-0.453591,-0.072515
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53089,-1.192869,-0.145973,-0.019414,0.068969,0.048055,0.0123,-0.035245,0.286152,-0.004197,0.000922,0.002224,0.014448,-0.072419,0.009267,-0.111779,0.000785,0.002894,-0.059609,-0.072515
53090,-0.057778,0.290631,-0.019414,-0.688763,0.309422,0.0123,-0.035245,-0.165949,-0.004197,0.000922,0.002224,0.014448,-0.072419,-0.184309,0.090967,0.000785,0.002894,-0.453591,-0.072515
53091,-0.508151,0.290631,-0.908500,-0.688763,0.048055,0.0123,-0.035245,0.286152,-0.004197,0.000922,0.002224,0.014448,-0.072419,0.009267,0.090967,0.000785,0.002894,0.402655,-0.072515
53092,0.601663,0.290631,0.629297,-0.688763,-0.230483,0.0123,-0.035245,0.286152,-0.004197,0.000922,0.002224,0.014448,-0.072419,0.009267,0.090967,0.000785,0.002894,0.402655,-0.072515


WOE transformation completed. Now all features are WOE

In [219]:
# Normalized X and y for logistic regression; train/validation split
X_logit = pd.concat((df_numeric, df_categ), axis=1)
y_logit = y

sc = MinMaxScaler()
X_logit = pd.DataFrame(sc.fit_transform(X_logit), columns=X_logit.columns)
X_train_logit, X_val_logit, y_train_logit, y_val_logit = train_test_split(X_logit, y_logit, test_size=0.3, random_state=42)

Deal with test set

In [221]:
# Select the features form the stepwise selection
test_logit = test[result].copy()

In [222]:
# Do WOE transformation. First split categorical and continous features
index_categ = test_logit.nunique()<7
index_numeric = test_logit.nunique()>=7
df_categ = test_logit[test_logit.columns[index_categ]]
df_numeric = test_logit[test_logit.columns[index_numeric]]

In [223]:
# WOE transformation

# Categorical variables
for c in df_categ.columns:
    col_binned = df_categ[c].copy()
    cross = pd.crosstab(col_binned, y)
    for i in range(len(cross.index)):
        # if some bins have 0 event, use adjust WOE by adding 0.5
        if cross.iloc[i,0] == 0 or cross.iloc[i,1] == 0:
            non_default = (cross.iloc[i,0]+0.5)/cross.iloc[:,0].values.sum()
            default = (cross.iloc[i,1]+0.5)/cross.iloc[:,1].values.sum()
            woe = np.log(non_default/default)
        else:
            non_default = cross.iloc[i,0]/cross.iloc[:,0].values.sum()
            default = cross.iloc[i,1]/cross.iloc[:,1].values.sum()
            woe = np.log(non_default/default)
        col_binned[col_binned == cross.index[i]] = woe
    df_categ = df_categ.drop(c,axis=1)
    df_categ = pd.concat((df_categ, col_binned.rename('WOE_' + c)), axis=1)
df_categ

Unnamed: 0,WOE_NAME_EDUCATION_TYPE_Higher education,WOE_CODE_GENDER_F,WOE_FLAG_DOCUMENT_3,WOE_FLAG_OWN_CAR_Y,WOE_REGION_RATING_CLIENT_W_CITY,WOE_prev_label_YIELD_mode,WOE_NAME_INCOME_TYPE_Working,WOE_NAME_FAMILY_STATUS_Married,WOE_WALLSMATERIAL_MODE_Panel,WOE_ORGANIZATION_TYPE_Self-employed,WOE_ORGANIZATION_TYPE_Business Entity Type 3,WOE_NAME_CONTRACT_TYPE_Cash loans,WOE_ORGANIZATION_TYPE_Construction,WOE_ORGANIZATION_TYPE_Transport: type 3,WOE_FLAG_DOCUMENT_16,WOE_FLAG_DOCUMENT_18,WOE_FLAG_DOCUMENT_13,WOE_OCCUPATION_TYPE_Security staff,WOE_REG_CITY_NOT_LIVE_CITY,WOE_ORGANIZATION_TYPE_Military,WOE_OCCUPATION_TYPE_Drivers,WOE_OCCUPATION_TYPE_Low-skill Laborers,WOE_OCCUPATION_TYPE_Laborers,WOE_OCCUPATION_TYPE_Sales staff,WOE_ORGANIZATION_TYPE_Industry: type 9,WOE_FLAG_DOCUMENT_14,WOE_WEEKDAY_APPR_PROCESS_START_SATURDAY,WOE_WEEKDAY_APPR_PROCESS_START_MONDAY,WOE_NAME_HOUSING_TYPE_House / apartment,WOE_ORGANIZATION_TYPE_Trade: type 3,WOE_FLAG_DOCUMENT_6,WOE_FLAG_WORK_PHONE,WOE_FLAG_PHONE,WOE_ORGANIZATION_TYPE_Police,WOE_ORGANIZATION_TYPE_Security Ministries
0,0.004294,-0.000656,-0.003671,0.014601,-0.00194,0.000261,0.025304,0.009492,0.007270,0.002123,-0.043325,-0.00016,8.172534e-07,-0.000349,0.0,0.000786,0.0,-0.000933,0.002271,0.000717,-0.001171,-0.000412,0.008217,0.003355,-0.000493,0.0,-0.001421,-0.002072,-0.047887,-0.000234,-0.005001,0.002116,-0.017222,-0.000487,-0.000678
1,0.004294,0.000322,-0.003671,-0.007420,-0.00194,0.000261,0.025304,0.009492,0.007270,0.002123,0.012412,-0.00016,8.172534e-07,-0.000349,0.0,0.000786,0.0,-0.000933,0.002271,0.000717,-0.001171,-0.000412,0.008217,0.003355,-0.000493,0.0,-0.001421,-0.002072,0.005604,-0.000234,-0.005001,0.002116,0.006154,-0.000487,-0.000678
2,-0.012421,0.000322,-0.003671,-0.007420,-0.00194,0.002295,0.025304,0.009492,0.007270,0.002123,0.012412,-0.00016,8.172534e-07,-0.000349,0.0,0.000786,0.0,-0.000933,0.002271,0.000717,-0.001171,-0.000412,0.008217,0.003355,-0.000493,0.0,-0.001421,-0.002072,0.005604,-0.000234,-0.005001,0.002116,0.006154,-0.000487,-0.000678
3,0.004294,-0.000656,-0.003671,-0.007420,0.01563,0.000261,-0.024930,0.009492,0.007270,0.002123,-0.043325,-0.00016,8.172534e-07,-0.000349,0.0,0.000786,0.0,-0.000933,0.002271,0.000717,-0.001171,-0.000412,-0.038008,0.003355,-0.000493,0.0,-0.001421,-0.002072,-0.047887,-0.000234,-0.005001,0.002116,0.006154,-0.000487,-0.000678
4,0.004294,-0.000656,-0.003671,-0.007420,-0.00194,0.002295,0.025304,0.009492,0.007270,0.002123,0.012412,-0.00016,8.172534e-07,-0.000349,0.0,0.000786,0.0,-0.000933,0.002271,0.000717,-0.001171,-0.000412,0.008217,0.003355,-0.000493,0.0,-0.001421,0.009945,0.005604,-0.000234,-0.005001,-0.008217,0.006154,-0.000487,-0.000678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,0.004294,0.000322,-0.003671,-0.007420,-0.00194,0.000261,0.025304,0.009492,0.007270,0.002123,0.012412,-0.00016,8.172534e-07,-0.000349,0.0,0.000786,0.0,-0.000933,0.002271,0.000717,-0.001171,-0.000412,0.008217,0.003355,-0.000493,0.0,-0.001421,-0.002072,0.005604,-0.000234,-0.005001,0.002116,0.006154,-0.000487,-0.000678
48740,-0.012421,0.000322,-0.003671,-0.007420,0.01563,-0.002501,-0.024930,0.009492,0.007270,0.002123,0.012412,-0.00016,-3.752361e-05,-0.000349,0.0,0.000786,0.0,-0.000933,0.002271,0.000717,-0.001171,-0.000412,-0.038008,0.003355,-0.000493,0.0,-0.001421,-0.002072,0.005604,-0.000234,-0.005001,0.002116,0.006154,-0.000487,-0.000678
48741,-0.012421,-0.000656,-0.003671,-0.007420,-0.00194,0.000261,-0.024930,-0.018598,0.007270,-0.015350,0.012412,-0.00016,8.172534e-07,-0.000349,0.0,0.000786,0.0,-0.000933,0.002271,0.000717,0.019423,-0.000412,0.008217,0.003355,-0.000493,0.0,-0.001421,-0.002072,0.005604,-0.000234,-0.005001,0.002116,0.006154,-0.000487,-0.000678
48742,0.004294,0.000322,-0.003671,-0.007420,-0.00194,0.002295,-0.024930,-0.018598,0.007270,-0.015350,0.012412,-0.00016,8.172534e-07,-0.000349,0.0,0.000786,0.0,-0.000933,0.002271,0.000717,-0.001171,-0.000412,0.008217,-0.028858,-0.000493,0.0,-0.001421,-0.002072,0.005604,-0.000234,-0.005001,0.002116,0.006154,-0.000487,-0.000678


In [224]:
# Continuous variables
for c in df_numeric.columns:
    col = df_numeric[c]
    col_binned = pd.cut(col, bins=5, labels = [1,2,3,4,5])
    col_binned = pd.to_numeric(col_binned)
    cross = pd.crosstab(col_binned, y)
    for i in range(len(cross.index)):
        # if some bins have 0 event, use adjust WOE by adding 0.5
        if cross.iloc[i,0] == 0 or cross.iloc[i,1] == 0:
            non_default = (cross.iloc[i,0]+0.5)/cross.iloc[:,0].values.sum()
            default = (cross.iloc[i,1]+0.5)/cross.iloc[:,1].values.sum()
            woe = np.log(non_default/default)
        else:
            non_default = cross.iloc[i,0]/cross.iloc[:,0].values.sum()
            default = cross.iloc[i,1]/cross.iloc[:,1].values.sum()
            woe = np.log(non_default/default)
        col_binned[col_binned == cross.index[i]] = woe
    df_numeric = df_numeric.drop(c,axis=1)
    df_numeric = pd.concat((df_numeric, col_binned.rename('WOE_' + c)), axis=1)
df_numeric

Unnamed: 0,WOE_EXT_SOURCE_3,WOE_EXT_SOURCE_2,WOE_EXT_SOURCE_1,WOE_pct_balance_0_1_year,WOE_DAYS_ID_PUBLISH,WOE_DEF_30_CNT_SOCIAL_CIRCLE,WOE_AMT_ANNUITY,WOE_DAYS_LAST_PHONE_CHANGE,WOE_install_default_total,WOE_AMT_REQ_CREDIT_BUREAU_QRT,WOE_prev_GOODS_price_total,WOE_pct_balance_1_2_year,WOE_AMT_GOODS_PRICE,WOE_OWN_CAR_AGE,WOE_DAYS_REGISTRATION,WOE_>30d_default,WOE_CNT_CHILDREN,WOE_avg_DAYS_CREDIT,WOE_FLOORSMAX_AVG
0,0.013396,-0.006522,0.013006,0.001289,0.010383,-0.000033,0.000253,0.001964,0.000588,0.002565,0.00126,-0.000435,-0.001096,-0.005316,-0.004270,-0.00043,-0.000105,0.004458,0.007190
1,0.013396,-0.006522,0.009975,-0.012961,0.028604,-0.000033,0.000253,0.007968,0.000588,-0.006132,0.00126,-0.000435,-0.001096,-0.005316,-0.002167,-0.00043,-0.000105,0.004458,0.007190
2,-0.008848,-0.022879,0.009975,0.001289,-0.025292,-0.000033,0.000253,-0.014157,0.000588,0.002565,0.00126,-0.000435,-0.001096,-0.005316,-0.002167,-0.00043,-0.000105,0.031968,0.007190
3,0.012316,0.049536,0.013006,0.001289,0.028604,-0.000033,0.000253,0.001964,0.000588,0.002565,0.00126,-0.000435,-0.001096,-0.005316,-0.002167,-0.00043,-0.000105,0.003462,0.007190
4,0.013396,-0.011213,-0.040966,0.060138,0.010383,-0.000033,0.000253,0.001964,0.000588,-0.006132,0.00126,-0.000435,-0.001096,-0.005316,-0.004270,-0.00043,-0.000105,0.004458,0.007190
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,-0.008848,-0.011213,0.013006,0.001289,0.010383,-0.000033,0.000253,0.007968,0.000588,0.002565,0.00126,-0.000435,-0.001096,-0.005316,0.009491,-0.00043,-0.000105,0.004458,0.007190
48740,0.013396,0.049536,-0.040966,0.001289,-0.025292,-0.000033,-0.001379,-0.014157,0.000588,0.002565,0.00126,-0.000435,0.001290,-0.005316,-0.004270,-0.00043,-0.000105,0.004458,0.007190
48741,-0.012551,0.049536,0.013006,0.001289,0.028604,-0.000033,0.000253,0.007968,0.000588,0.002565,0.00126,-0.000435,-0.001096,-0.005316,-0.002167,-0.00043,-0.000105,0.004458,0.007190
48742,-0.012551,-0.022879,-0.026387,0.001289,-0.036935,-0.000033,0.000253,0.001964,0.000588,0.002565,0.00126,-0.000435,-0.001096,-0.005316,-0.002167,-0.00043,-0.000105,0.004458,0.007190


In [226]:
# Normalize the test set for logistic regression
test_logit = pd.concat((df_numeric, df_categ), axis=1)

sc = MinMaxScaler()
test_logit = pd.DataFrame(sc.fit_transform(test_logit), columns=test_logit.columns)
test_logit.head()

Unnamed: 0,WOE_EXT_SOURCE_3,WOE_EXT_SOURCE_2,WOE_EXT_SOURCE_1,WOE_pct_balance_0_1_year,WOE_DAYS_ID_PUBLISH,WOE_DEF_30_CNT_SOCIAL_CIRCLE,WOE_AMT_ANNUITY,WOE_DAYS_LAST_PHONE_CHANGE,WOE_install_default_total,WOE_AMT_REQ_CREDIT_BUREAU_QRT,WOE_prev_GOODS_price_total,WOE_pct_balance_1_2_year,WOE_AMT_GOODS_PRICE,WOE_OWN_CAR_AGE,WOE_DAYS_REGISTRATION,WOE_>30d_default,WOE_CNT_CHILDREN,WOE_avg_DAYS_CREDIT,WOE_FLOORSMAX_AVG,WOE_NAME_EDUCATION_TYPE_Higher education,WOE_CODE_GENDER_F,WOE_FLAG_DOCUMENT_3,WOE_FLAG_OWN_CAR_Y,WOE_REGION_RATING_CLIENT_W_CITY,WOE_prev_label_YIELD_mode,WOE_NAME_INCOME_TYPE_Working,WOE_NAME_FAMILY_STATUS_Married,WOE_WALLSMATERIAL_MODE_Panel,WOE_ORGANIZATION_TYPE_Self-employed,WOE_ORGANIZATION_TYPE_Business Entity Type 3,WOE_NAME_CONTRACT_TYPE_Cash loans,WOE_ORGANIZATION_TYPE_Construction,WOE_ORGANIZATION_TYPE_Transport: type 3,WOE_FLAG_DOCUMENT_16,WOE_FLAG_DOCUMENT_18,WOE_FLAG_DOCUMENT_13,WOE_OCCUPATION_TYPE_Security staff,WOE_REG_CITY_NOT_LIVE_CITY,WOE_ORGANIZATION_TYPE_Military,WOE_OCCUPATION_TYPE_Drivers,WOE_OCCUPATION_TYPE_Low-skill Laborers,WOE_OCCUPATION_TYPE_Laborers,WOE_OCCUPATION_TYPE_Sales staff,WOE_ORGANIZATION_TYPE_Industry: type 9,WOE_FLAG_DOCUMENT_14,WOE_WEEKDAY_APPR_PROCESS_START_SATURDAY,WOE_WEEKDAY_APPR_PROCESS_START_MONDAY,WOE_NAME_HOUSING_TYPE_House / apartment,WOE_ORGANIZATION_TYPE_Trade: type 3,WOE_FLAG_DOCUMENT_6,WOE_FLAG_WORK_PHONE,WOE_FLAG_PHONE,WOE_ORGANIZATION_TYPE_Police,WOE_ORGANIZATION_TYPE_Security Ministries
0,0.013396,-0.006522,0.013006,0.001289,0.010383,-3.3e-05,0.000253,0.001964,0.000588,0.002565,0.00126,-0.000435,-0.001096,-0.005316,-0.00427,-0.00043,-0.000105,0.004458,0.00719,0.004294,-0.000656,-0.003671,0.014601,-0.00194,0.000261,0.025304,0.009492,0.00727,0.002123,-0.043325,-0.00016,8.172534e-07,-0.000349,0.0,0.000786,0.0,-0.000933,0.002271,0.000717,-0.001171,-0.000412,0.008217,0.003355,-0.000493,0.0,-0.001421,-0.002072,-0.047887,-0.000234,-0.005001,0.002116,-0.017222,-0.000487,-0.000678
1,0.013396,-0.006522,0.009975,-0.012961,0.028604,-3.3e-05,0.000253,0.007968,0.000588,-0.006132,0.00126,-0.000435,-0.001096,-0.005316,-0.002167,-0.00043,-0.000105,0.004458,0.00719,0.004294,0.000322,-0.003671,-0.00742,-0.00194,0.000261,0.025304,0.009492,0.00727,0.002123,0.012412,-0.00016,8.172534e-07,-0.000349,0.0,0.000786,0.0,-0.000933,0.002271,0.000717,-0.001171,-0.000412,0.008217,0.003355,-0.000493,0.0,-0.001421,-0.002072,0.005604,-0.000234,-0.005001,0.002116,0.006154,-0.000487,-0.000678
2,-0.008848,-0.022879,0.009975,0.001289,-0.025292,-3.3e-05,0.000253,-0.014157,0.000588,0.002565,0.00126,-0.000435,-0.001096,-0.005316,-0.002167,-0.00043,-0.000105,0.031968,0.00719,-0.012421,0.000322,-0.003671,-0.00742,-0.00194,0.002295,0.025304,0.009492,0.00727,0.002123,0.012412,-0.00016,8.172534e-07,-0.000349,0.0,0.000786,0.0,-0.000933,0.002271,0.000717,-0.001171,-0.000412,0.008217,0.003355,-0.000493,0.0,-0.001421,-0.002072,0.005604,-0.000234,-0.005001,0.002116,0.006154,-0.000487,-0.000678
3,0.012316,0.049536,0.013006,0.001289,0.028604,-3.3e-05,0.000253,0.001964,0.000588,0.002565,0.00126,-0.000435,-0.001096,-0.005316,-0.002167,-0.00043,-0.000105,0.003462,0.00719,0.004294,-0.000656,-0.003671,-0.00742,0.01563,0.000261,-0.02493,0.009492,0.00727,0.002123,-0.043325,-0.00016,8.172534e-07,-0.000349,0.0,0.000786,0.0,-0.000933,0.002271,0.000717,-0.001171,-0.000412,-0.038008,0.003355,-0.000493,0.0,-0.001421,-0.002072,-0.047887,-0.000234,-0.005001,0.002116,0.006154,-0.000487,-0.000678
4,0.013396,-0.011213,-0.040966,0.060138,0.010383,-3.3e-05,0.000253,0.001964,0.000588,-0.006132,0.00126,-0.000435,-0.001096,-0.005316,-0.00427,-0.00043,-0.000105,0.004458,0.00719,0.004294,-0.000656,-0.003671,-0.00742,-0.00194,0.002295,0.025304,0.009492,0.00727,0.002123,0.012412,-0.00016,8.172534e-07,-0.000349,0.0,0.000786,0.0,-0.000933,0.002271,0.000717,-0.001171,-0.000412,0.008217,0.003355,-0.000493,0.0,-0.001421,0.009945,0.005604,-0.000234,-0.005001,-0.008217,0.006154,-0.000487,-0.000678


# Model Fitting and Selection
- We are gonna fit the base model, XGBoost model and logistic regression model
- hyperparameter tuning, k-folds cross-validation for model evaluation
- Chooose the best model to predict for the test set