# Data preprocessing

In [354]:
import os

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

## Obtaining base train and test dataframes

### Creating train and test dataframes

In [355]:
def get_dataframes(train_path='../../data/external/application_train.csv',
                   test_path='../../data/external/application_test.csv'):
  """Return a tuple containing train and test dataframes.

  Keyword arguments:
  train_path -- path of the CSV file holding the train applications
                (defaults to the project's external data folder)
  test_path -- path of the CSV file holding the test applications
               (defaults to the project's external data folder)
  """
  train = pd.read_csv(train_path)
  test = pd.read_csv(test_path)
  return train, test

In [356]:
# Load the raw train and test application tables from their CSV files.
(train, test) = get_dataframes()

In [357]:
# Preview the first rows of the raw train dataframe.
train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,...,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.018801,-9461,-637,-3648.0,-2120,,1,1,0,1,1,0,Laborers,1.0,2,2,WEDNESDAY,10,0,0,0,0,0,0,...,0.0205,0.0193,0.0,0.0,reg oper account,block of flats,0.0149,"Stone, brick",No,2.0,2.0,2.0,2.0,-1134.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,State servant,Higher education,Married,House / apartment,0.003541,-16765,-1188,-1186.0,-291,,1,1,0,1,1,0,Core staff,2.0,1,1,MONDAY,11,0,0,0,0,0,0,...,0.0787,0.0558,0.0039,0.01,reg oper account,block of flats,0.0714,Block,No,1.0,0.0,1.0,0.0,-828.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.010032,-19046,-225,-4260.0,-2531,26.0,1,1,1,1,1,0,Laborers,1.0,2,2,MONDAY,9,0,0,0,0,0,0,...,,,,,,,,,,0.0,0.0,0.0,0.0,-815.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.008019,-19005,-3039,-9833.0,-2437,,1,1,0,1,0,0,Laborers,2.0,2,2,WEDNESDAY,17,0,0,0,0,0,0,...,,,,,,,,,,2.0,0.0,2.0,0.0,-617.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.028663,-19932,-3038,-4311.0,-3458,,1,1,0,1,0,0,Core staff,1.0,2,2,THURSDAY,11,0,0,0,0,1,1,...,,,,,,,,,,0.0,0.0,0.0,0.0,-1106.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [358]:
# Preview the first rows of the raw test dataframe.
test.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,...,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,Unaccompanied,Working,Higher education,Married,House / apartment,0.01885,-19241,-2329,-5170.0,-812,,1,1,0,1,0,1,,2.0,2,2,TUESDAY,18,0,0,0,0,0,0,Kindergarten,...,,0.0514,,,,block of flats,0.0392,"Stone, brick",No,0.0,0.0,0.0,0.0,-1740.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.035792,-18064,-4469,-9118.0,-1623,,1,1,0,1,0,0,Low-skill Laborers,2.0,2,2,FRIDAY,9,0,0,0,0,0,0,Self-employed,...,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,,Working,Higher education,Married,House / apartment,0.019101,-20038,-4458,-2175.0,-3503,5.0,1,1,0,1,0,0,Drivers,2.0,2,2,MONDAY,14,0,0,0,0,0,0,Transport: type 3,...,,,,,,,,,,0.0,0.0,0.0,0.0,-856.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.026392,-13976,-1866,-2000.0,-4208,,1,1,0,1,1,0,Sales staff,4.0,2,2,WEDNESDAY,11,0,0,0,0,0,0,Business Entity Type 3,...,0.2446,0.3739,0.0388,0.0817,reg oper account,block of flats,0.37,Panel,No,0.0,0.0,0.0,0.0,-1805.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.010032,-13040,-2191,-4000.0,-4262,16.0,1,1,1,1,0,0,,3.0,2,2,FRIDAY,5,0,0,0,0,1,1,Business Entity Type 3,...,,,,,,,,,,0.0,0.0,0.0,0.0,-821.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,


### Moving target to last column in train dataset

In [359]:
def position_target_column(train):
  """Return a copy of the train dataframe with TARGET as last column.

  The dataframe passed in is left untouched (the previous implementation
  popped and re-inserted the column on the caller's object), so re-running
  the calling cell always starts from the same state.

  Keyword arguments:
  train -- the train dataframe (must contain a 'TARGET' column)

  Raises:
  KeyError -- if the dataframe has no 'TARGET' column
  """
  if 'TARGET' not in train.columns:
    raise KeyError('TARGET')
  feature_cols = [col for col in train.columns if col != 'TARGET']
  # Selecting with a list of labels builds a new dataframe in the requested order.
  return train[feature_cols + ['TARGET']]

In [360]:
# Put TARGET last so that the feature columns can be addressed as iloc[:, :-1] later on.
train = position_target_column(train)

### Dropping unused ID column

In [361]:
def drop_id_column(train, test):
  """Return the (train, test) pair with the SK_ID_CURR identifier column removed.

  Both inputs are left unchanged; new dataframes are returned.

  Keyword arguments:
  train -- the train dataframe
  test -- the test dataframe
  """
  id_column = 'SK_ID_CURR'
  return train.drop(columns=id_column), test.drop(columns=id_column)

In [362]:
# Remove the SK_ID_CURR identifier from both dataframes.
(train, test) = drop_id_column(train, test)

### Organizing test set columns based on train set column order

In [363]:
def reorder_test_columns(train, test):
  """Return the test dataframe with its columns laid out in train's feature order.

  The feature order is train's column order with TARGET left out.

  Keyword arguments:
  train -- the train dataframe
  test -- the test dataframe
  """
  feature_order = train.columns.drop('TARGET')
  return test.loc[:, feature_order]

In [364]:
# Give test the same feature column order as train (TARGET excluded).
test = reorder_test_columns(train, test)

## Taking care of missing data

In [365]:
def impute_train_missing_data(train):
  """
  Return tuple containing train dataframe with median imputed in place of missing numerical values
  and a Series with its numerical columns.

  Only floating-point feature columns are imputed: NumPy integer columns cannot hold NaN,
  and textual columns are left untouched. The last column (the target) is never modified.
  The dataframe passed in is not mutated; an imputed copy is returned.

  Keyword arguments:
  train -- the train dataframe, with the target as its last column

  Returns:
  (train, num_cols_train) -- the imputed dataframe and a boolean Series, indexed by
  feature column name, that is True for every imputed (floating-point) column.
  """
  train = train.copy()
  feature_dtypes = train.dtypes.iloc[:-1]
  # Explicit float check. The former `dtypes == np.number` comparison relied on NumPy
  # coercing the abstract np.number class to a concrete dtype, which is deprecated.
  num_cols_train = pd.Series(
    [pd.api.types.is_float_dtype(dtype) for dtype in feature_dtypes],
    index=feature_dtypes.index,
    dtype=bool,
  )
  float_cols = num_cols_train.index[num_cols_train.values]
  if len(float_cols) > 0:
    imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    # Column-wise assignment keeps the float dtype; going through an object ndarray of
    # all features would force pandas to cast every column back afterwards.
    train[float_cols] = imputer.fit_transform(train[float_cols])
  return train, num_cols_train

In [366]:
# Impute train and keep the numerical-column mask, which the scaling step reuses.
(train, num_cols_train) = impute_train_missing_data(train)

In [367]:
def impute_test_missing_data(test):
  """
  Return tuple containing test dataframe with median imputed in place of missing numerical values
  and a Series with its numerical columns.

  Only floating-point columns are imputed: NumPy integer columns cannot hold NaN, and
  textual columns are left untouched. The dataframe passed in is not mutated; an imputed
  copy is returned.

  NOTE(review): the medians are computed from the test set itself, not from the train
  set -- confirm this is intended before using the result for model evaluation.

  Keyword arguments:
  test -- the test dataframe (features only, no target column)

  Returns:
  (test, num_cols_test) -- the imputed dataframe and a boolean Series, indexed by
  column name, that is True for every imputed (floating-point) column.
  """
  test = test.copy()
  # Explicit float check. The former `dtypes == np.number` comparison relied on NumPy
  # coercing the abstract np.number class to a concrete dtype, which is deprecated.
  num_cols_test = pd.Series(
    [pd.api.types.is_float_dtype(dtype) for dtype in test.dtypes],
    index=test.columns,
    dtype=bool,
  )
  float_cols = num_cols_test.index[num_cols_test.values]
  if len(float_cols) > 0:
    imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    # Column-wise assignment keeps the float dtype; going through an object ndarray of
    # all features would force pandas to cast every column back afterwards.
    test[float_cols] = imputer.fit_transform(test[float_cols])
  return test, num_cols_test

In [368]:
# Impute test and keep the numerical-column mask, which the scaling step reuses.
(test, num_cols_test) = impute_test_missing_data(test)

### Computing the percentage of missing values per textual feature

In [369]:
def get_train_na_percentages(train):
  """
  Return a Series giving, for each train column that has at least one Na value,
  the fraction of rows where that column is Na.
  Must be called just after impute_train_missing_data().

  Keyword arguments:
  train -- the train dataframe
  """
  na_counts = train.isna().sum()
  # Keep only the columns that actually contain missing values.
  na_counts = na_counts[na_counts > 0]
  return na_counts / train.shape[0]

In [370]:
# Fraction of missing values per column; the bare name on the last line displays the Series.
na_cols_pctg_train = get_train_na_percentages(train)
na_cols_pctg_train

NAME_TYPE_SUITE        0.004201
OCCUPATION_TYPE        0.313455
FONDKAPREMONT_MODE     0.683862
HOUSETYPE_MODE         0.501761
WALLSMATERIAL_MODE     0.508408
EMERGENCYSTATE_MODE    0.473983
dtype: float64

### Dropping rows with missing textual feature values

In [371]:
def drop_textual_feat_na_rows(train, test):
  """Return the (train, test) pair with every row holding at least one Na value removed.

  Called after numerical imputation, so the remaining Na values are the textual ones.

  Keyword arguments:
  train -- the train dataframe
  test -- the test dataframe
  """
  complete_train, complete_test = (
    frame.dropna(axis='index', how='any') for frame in (train, test)
  )
  return complete_train, complete_test

In [372]:
# Drop every row that still has a missing value; rows are removed from test as well as train.
(train, test) = drop_textual_feat_na_rows(train, test)

## Feature scaling

In [373]:
def standardize_train(train, num_cols_train):
  """Return the train dataframe with standardized numerical features (not the encoded textual dimensions).

  The selected columns are rescaled with a StandardScaler fitted on them. The last column
  (the target) is never modified. The dataframe passed in is not mutated; a scaled copy
  is returned.

  Keyword arguments:
  train -- the train dataframe, with the target as its last column
  num_cols_train -- boolean mask (one entry per feature column, in column order)
                    flagging the columns to standardize
  """
  train = train.copy()
  feature_names = train.columns[:-1]
  scaled_cols = feature_names[np.asarray(num_cols_train, dtype=bool)]
  if len(scaled_cols) > 0:
    sc = StandardScaler()
    # Column-wise assignment keeps the float dtype; going through an object ndarray of
    # all features would force pandas to cast every column back afterwards.
    train[scaled_cols] = sc.fit_transform(train[scaled_cols])
  return train

In [374]:
# Standardize the train columns flagged in num_cols_train.
train = standardize_train(train, num_cols_train)

In [375]:
def standardize_test(test, num_cols_test):
  """Return the test dataframe with standardized numerical features (not the encoded textual dimensions).

  The selected columns are rescaled with a StandardScaler fitted on them. The dataframe
  passed in is not mutated; a scaled copy is returned.

  NOTE(review): the scaler is fitted on the test set itself, not on the train set --
  confirm this is intended before using the result for model evaluation.

  Keyword arguments:
  test -- the test dataframe (features only, no target column)
  num_cols_test -- boolean mask (one entry per column, in column order)
                   flagging the columns to standardize
  """
  test = test.copy()
  scaled_cols = test.columns[np.asarray(num_cols_test, dtype=bool)]
  if len(scaled_cols) > 0:
    sc = StandardScaler()
    # Column-wise assignment keeps the float dtype; going through an object ndarray of
    # all features would force pandas to cast every column back afterwards.
    test[scaled_cols] = sc.fit_transform(test[scaled_cols])
  return test

In [376]:
# Standardize the test columns flagged in num_cols_test (the scaler is fitted on test itself).
test = standardize_test(test, num_cols_test)

## Encoding categorical data

In [377]:
# Preview train after imputation, row dropping and scaling.
train.head()

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,...,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,TARGET
0,Cash loans,M,N,Y,0,0.112486,-0.545235,-0.2759,-0.569221,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,-0.253586,-9461,-637,0.359172,-2120,-0.115701,1,1,0,1,1,0,Laborers,-1.316009,2,2,WEDNESDAY,10,0,0,0,0,0,0,Business Entity Type 3,-2.860439,...,-0.847015,-0.176513,-0.391974,reg oper account,block of flats,-0.867653,"Stone, brick",No,0.23927,4.177291,0.247568,5.310591,-0.146298,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.073559,-0.059606,-0.160059,-0.290937,-0.396781,-0.425172,1
1,Cash loans,F,N,N,0,0.704502,1.526865,0.434724,1.412794,Family,State servant,Higher education,Married,House / apartment,-1.210008,-16765,-1188,1.090039,-291,-0.115701,1,1,0,1,1,0,Core staff,-0.223158,1,1,MONDAY,11,0,0,0,0,0,0,School,-1.308105,...,-0.525905,-0.087911,-0.234799,reg oper account,block of flats,-0.362014,Block,No,-0.141029,-0.301337,-0.136194,-0.261234,0.214322,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.073559,-0.059606,-0.160059,-0.290937,-0.396781,-1.004679,0
13,Cash loans,M,Y,N,1,0.309825,0.650662,-0.000257,0.312947,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,-0.390782,-14086,-3028,1.251233,-4911,1.728985,1,1,0,1,0,0,Drivers,0.869692,2,2,THURSDAY,13,0,0,0,0,0,0,Self-employed,0.016376,...,0.234202,-0.176513,-0.391974,reg oper account,block of flats,0.267126,Panel,No,-0.521328,-0.301337,-0.519956,-0.261234,1.185404,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.073559,-0.059606,-0.160059,-0.290937,-0.396781,-0.425172,0
14,Cash loans,F,N,Y,0,-0.005917,0.312391,0.246019,0.26712,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,-0.804813,-14583,-203,1.259545,-2056,-0.115701,1,1,0,1,0,0,Laborers,-0.223158,2,1,MONDAY,9,0,0,0,0,0,0,Transport: type 2,1.485128,...,2.363206,0.264224,1.214356,reg oper account,block of flats,2.409605,Panel,No,-0.521328,-0.301337,-0.519956,-0.261234,0.968561,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.073559,-0.059606,-0.160059,-0.290937,-0.396781,-0.425172,0
18,Revolving loans,F,N,Y,0,-0.676868,-1.127208,-1.363063,-1.061861,Other_A,Working,Secondary / secondary special,Widow,House / apartment,1.464901,-17718,-7804,-1.155698,-1259,-0.115701,1,1,0,1,1,0,Laborers,-1.316009,1,1,FRIDAY,13,0,0,0,0,0,0,Housing,0.016376,...,-0.757281,-0.176513,-0.391974,reg oper account,block of flats,-0.788004,"Stone, brick",No,2.521066,-0.301337,2.550139,-0.261234,0.908457,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.073559,-0.059606,-0.160059,-0.290937,-0.396781,-1.004679,0


In [378]:
# Preview test after imputation, row dropping and scaling.
test.head()

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,...,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
3,Cash loans,F,N,Y,2,1.059309,2.56226,1.02866,2.93528,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.196341,-13976,-1866,0.830175,-4208,-0.093046,1,1,0,1,1,0,Sales staff,1.946507,2,2,WEDNESDAY,11,0,0,0,0,0,0,Business Entity Type 3,0.151473,...,1.419629,2.110205,0.689932,0.805199,reg oper account,block of flats,2.168,Panel,No,-0.267779,-0.222635,-0.267861,-0.204852,-0.732162,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.040332,-0.032825,-0.053987,-0.077883,-0.729849,0.649564
5,Cash loans,F,Y,Y,0,0.647693,1.007165,0.177099,0.841447,Unaccompanied,State servant,Secondary / secondary special,Married,House / apartment,0.120865,-18604,-12009,-0.379934,-2027,0.054352,1,1,0,1,1,0,Drivers,-0.236016,2,2,MONDAY,15,0,0,0,0,0,0,Government,0.035074,...,0.938461,0.874028,0.080436,0.702445,not specified,block of flats,0.84078,Block,No,-0.267779,-0.222635,-0.267861,-0.204852,-0.621556,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.040332,-0.032825,-0.053987,-0.077883,0.749922,0.052319
15,Cash loans,M,Y,Y,0,-0.175538,-0.669483,-0.585425,-0.759719,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.5755,-13563,-1007,-0.263216,-4044,0.643943,1,1,0,1,0,1,Laborers,-0.236016,1,1,TUESDAY,15,0,0,0,0,0,0,Business Entity Type 3,-0.963045,...,-0.781894,-0.489856,-0.004714,-0.367065,reg oper account,block of flats,-0.587546,"Stone, brick",No,0.105273,-0.222635,0.108637,-0.204852,0.913653,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.040332,-0.032825,-0.053987,-0.077883,0.749922,0.649564
18,Cash loans,F,Y,Y,1,0.236077,-0.328294,-0.476987,-0.39022,Unaccompanied,Commercial associate,Higher education,Civil marriage,With parents,0.120865,-10962,-1883,1.389072,-1721,-0.240444,1,1,0,1,0,1,Managers,0.855246,2,2,THURSDAY,10,0,0,0,0,0,0,Bank,-0.994737,...,-0.153617,-0.172673,-0.179496,0.039609,reg oper account,block of flats,-0.285087,Panel,No,0.664851,-0.222635,0.673385,-0.204852,-0.523117,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.040332,-0.032825,-0.053987,-0.077883,3.709463,1.246809
20,Cash loans,F,N,Y,0,-0.916446,-0.849631,-0.73001,-0.759719,Unaccompanied,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-0.978692,-10507,-2780,-0.072997,-2729,-0.093046,1,1,0,1,1,0,Sales staff,-0.236016,2,2,TUESDAY,10,0,0,0,0,0,0,Business Entity Type 3,0.549215,...,-0.380409,-0.608382,-0.179496,-0.377196,reg oper account,block of flats,-0.533566,Panel,No,-0.267779,-0.222635,-0.267861,-0.204852,-1.276343,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.040332,-0.032825,-0.053987,-0.077883,0.749922,0.052319


### Encoding the independent variables

In [379]:
def get_textual_column_indexes(train, test):
  """Return a tuple of two ndarrays: the positions of the textual (object dtype)
  columns in train, and the positions of the textual columns in test.

  Keyword arguments:
  train -- the train dataframe
  test -- the test dataframe
  """
  def _object_column_positions(frame):
    # Names of the object-dtype columns, mapped back to their integer positions.
    object_cols = frame.select_dtypes(include='object').columns
    return frame.columns.get_indexer(object_cols)

  return _object_column_positions(train), _object_column_positions(test)

In [380]:
# Positions of the object-dtype columns in each dataframe, used by the one-hot encoding step.
(txt_indexes_train, txt_indexes_test) = get_textual_column_indexes(train, test)

In [381]:
# Display the textual column positions found in train.
txt_indexes_train

array([ 0,  1,  2,  3,  9, 10, 11, 12, 13, 26, 30, 38, 84, 85, 87, 88])

In [382]:
# Display the textual column positions found in test (expected to match train's).
txt_indexes_test

array([ 0,  1,  2,  3,  9, 10, 11, 12, 13, 26, 30, 38, 84, 85, 87, 88])

In [383]:
def one_hot_encode_train(train, txt_indexes_train):
  """Return the train dataframe with one-hot-encoded textual features.

  The columns at txt_indexes_train are replaced by their dummy columns, which are
  appended after the remaining columns; TARGET is then moved back to the last position.
  The dataframe passed in is not mutated (the previous implementation dropped its
  textual columns in place).

  Keyword arguments:
  train -- the train dataframe
  txt_indexes_train -- ndarray of train textual column indexes
  """
  # Encode and drop exactly the same columns, both identified by position.
  txt_cols = train.columns[txt_indexes_train]
  train_dummies = pd.get_dummies(train[txt_cols])
  encoded = pd.concat([train.drop(columns=txt_cols), train_dummies], axis=1)
  return position_target_column(encoded)

In [384]:
def one_hot_encode_test(test, txt_indexes_test):
  """Return the test dataframe with one-hot-encoded textual features.

  The columns at txt_indexes_test are replaced by their dummy columns, which are
  appended after the remaining columns. The dataframe passed in is not mutated
  (the previous implementation dropped its textual columns in place).

  Keyword arguments:
  test -- the test dataframe
  txt_indexes_test -- ndarray of test textual column indexes
  """
  # Encode and drop exactly the same columns, both identified by position.
  txt_cols = test.columns[txt_indexes_test]
  test_dummies = pd.get_dummies(test[txt_cols])
  return pd.concat([test.drop(columns=txt_cols), test_dummies], axis=1)

In [385]:
# Replace the textual train columns with their one-hot dummy columns.
train = one_hot_encode_train(train, txt_indexes_train)

In [386]:
# Replace the textual test columns with their one-hot dummy columns.
test = one_hot_encode_test(test, txt_indexes_test)

In [387]:
# Keep only the columns present in both dataframes (a category seen in just one of them
# yields a dummy column the other lacks). The inner join also removes TARGET when test
# has no such column, so it is saved first and appended back as the last train column.
target_col = train['TARGET']
(train, test) = train.align(test, axis=1, join='inner')
train = pd.concat((train, target_col), axis=1)

In [388]:
# Preview the fully preprocessed train dataframe (TARGET is the last column).
train.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,...,ORGANIZATION_TYPE_Police,ORGANIZATION_TYPE_Postal,ORGANIZATION_TYPE_Realtor,ORGANIZATION_TYPE_Religion,ORGANIZATION_TYPE_Restaurant,ORGANIZATION_TYPE_School,ORGANIZATION_TYPE_Security,ORGANIZATION_TYPE_Security Ministries,ORGANIZATION_TYPE_Self-employed,ORGANIZATION_TYPE_Services,ORGANIZATION_TYPE_Telecom,ORGANIZATION_TYPE_Trade: type 1,ORGANIZATION_TYPE_Trade: type 2,ORGANIZATION_TYPE_Trade: type 3,ORGANIZATION_TYPE_Trade: type 4,ORGANIZATION_TYPE_Trade: type 5,ORGANIZATION_TYPE_Trade: type 6,ORGANIZATION_TYPE_Trade: type 7,ORGANIZATION_TYPE_Transport: type 1,ORGANIZATION_TYPE_Transport: type 2,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_Transport: type 4,ORGANIZATION_TYPE_University,FONDKAPREMONT_MODE_not specified,FONDKAPREMONT_MODE_org spec account,FONDKAPREMONT_MODE_reg oper account,FONDKAPREMONT_MODE_reg oper spec account,HOUSETYPE_MODE_block of flats,HOUSETYPE_MODE_specific housing,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,TARGET
0,0,0.112486,-0.545235,-0.2759,-0.569221,-0.253586,-9461,-637,0.359172,-2120,-0.115701,1,1,0,1,1,0,-1.316009,2,2,10,0,0,0,0,0,0,-2.860439,-1.481998,-2.091392,-0.911054,-0.641563,-0.510159,-1.185245,-0.388537,-0.590027,-0.856223,-1.069352,-0.681235,-0.374192,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,1
1,0,0.704502,1.526865,0.434724,1.412794,-1.210008,-16765,-1188,1.090039,-291,-0.115701,1,1,0,1,1,0,-0.223158,1,1,11,0,0,0,0,0,0,-1.308105,0.465451,0.146465,-0.257777,-0.440969,0.197152,0.375793,0.251211,0.000458,-1.207692,0.372065,0.646658,-0.694858,...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0
13,1,0.309825,0.650662,-0.000257,0.312947,-0.390782,-14086,-3028,1.251233,-4911,1.728985,1,1,0,1,0,0,0.869692,2,2,13,0,0,0,0,0,0,0.016376,0.165504,1.473751,0.214748,0.115679,-0.049585,-0.164566,0.219362,0.590942,-0.154305,0.659795,0.912491,0.379841,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0
14,0,-0.005917,0.312391,0.246019,0.26712,-0.804813,-14583,-203,1.259545,-2056,-0.115701,1,1,0,1,0,0,-0.223158,2,1,9,0,0,0,0,0,0,1.485128,0.576079,0.146465,2.069065,0.569523,0.931878,1.996871,0.9962,2.362397,0.197163,2.965785,3.037247,1.489424,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0
18,0,-0.676868,-1.127208,-1.363063,-1.061861,1.464901,-17718,-7804,-1.155698,-1259,-0.115701,1,1,0,1,1,0,-1.316009,1,1,13,0,0,0,0,0,0,0.016376,0.921735,0.26772,-0.882611,-0.330643,0.361643,0.736033,-0.561629,-0.590027,-0.505774,-1.069352,-0.681235,-0.494945,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0


In [389]:
# Preview the fully preprocessed test dataframe.
test.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,...,ORGANIZATION_TYPE_Other,ORGANIZATION_TYPE_Police,ORGANIZATION_TYPE_Postal,ORGANIZATION_TYPE_Realtor,ORGANIZATION_TYPE_Religion,ORGANIZATION_TYPE_Restaurant,ORGANIZATION_TYPE_School,ORGANIZATION_TYPE_Security,ORGANIZATION_TYPE_Security Ministries,ORGANIZATION_TYPE_Self-employed,ORGANIZATION_TYPE_Services,ORGANIZATION_TYPE_Telecom,ORGANIZATION_TYPE_Trade: type 1,ORGANIZATION_TYPE_Trade: type 2,ORGANIZATION_TYPE_Trade: type 3,ORGANIZATION_TYPE_Trade: type 4,ORGANIZATION_TYPE_Trade: type 5,ORGANIZATION_TYPE_Trade: type 6,ORGANIZATION_TYPE_Trade: type 7,ORGANIZATION_TYPE_Transport: type 1,ORGANIZATION_TYPE_Transport: type 2,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_Transport: type 4,ORGANIZATION_TYPE_University,FONDKAPREMONT_MODE_not specified,FONDKAPREMONT_MODE_org spec account,FONDKAPREMONT_MODE_reg oper account,FONDKAPREMONT_MODE_reg oper spec account,HOUSETYPE_MODE_block of flats,HOUSETYPE_MODE_specific housing,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
3,2,1.059309,2.56226,1.02866,2.93528,0.196341,-13976,-1866,0.830175,-4208,-0.093046,1,1,0,1,1,0,1.946507,2,2,11,0,0,0,0,0,0,0.151473,-0.146208,0.659722,1.530614,1.335635,0.961734,1.840195,0.885624,1.658905,1.201917,0.857589,-1.225699,1.763018,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,1,0
5,0,0.647693,1.007165,0.177099,0.841447,0.120865,-18604,-12009,-0.379934,-2027,0.054352,1,1,0,1,1,0,-0.236016,2,2,15,0,0,0,0,0,0,0.035074,0.533212,-0.596879,0.972921,-1.021486,0.031184,0.021541,-0.004105,0.519938,-0.171541,0.57901,0.840149,1.303675,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0
15,0,-0.175538,-0.669483,-0.585425,-0.759719,0.5755,-13563,-1007,-0.263216,-4044,0.643943,1,1,0,1,0,1,-0.236016,1,1,15,0,0,0,0,0,0,-0.963045,0.849494,-1.334663,-0.805599,-0.676025,-1.892371,-3.73701,-0.403423,-0.619028,-0.171541,-0.812547,-0.450929,-0.440295,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0
18,1,0.236077,-0.328294,-0.476987,-0.39022,0.120865,-10962,-1883,1.389072,-1721,-0.240444,1,1,0,1,0,1,0.855246,2,2,10,0,0,0,0,0,0,-0.994737,-0.913251,-0.856024,-0.176451,-0.547568,0.405902,0.749002,-0.568141,0.235197,-0.514905,0.857589,1.098612,-0.849738,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,1,0
20,0,-0.916446,-0.849631,-0.73001,-0.759719,-0.978692,-10507,-2780,-0.072997,-2729,-0.093046,1,1,0,1,1,0,-0.236016,2,2,10,0,0,0,0,0,0,0.549215,0.374195,1.507179,-0.409985,-0.310608,-0.33729,-0.705921,-0.568141,-0.619028,-0.171541,-0.533968,-0.193085,-0.239412,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,1,0


## Exporting preprocessed data to CSV files

In [392]:
def export_dataframes_to_csv_files(train, test, output_dir='../../data/processed'):
  """Export train and test dataframes to CSV files in the output directory.

  The files are named processed_application_train.csv and
  processed_application_test.csv and are written without the index column.
  The output directory is created when it does not exist yet.

  Keyword arguments:
  train -- the train dataframe
  test -- the test dataframe
  output_dir -- directory receiving the two CSV files
                (defaults to ../../data/processed)
  """
  # to_csv does not create missing parent directories, so make sure the folder exists.
  os.makedirs(output_dir, exist_ok=True)
  train.to_csv(os.path.join(output_dir, 'processed_application_train.csv'), index=False)
  test.to_csv(os.path.join(output_dir, 'processed_application_test.csv'), index=False)

In [393]:
# Write both preprocessed dataframes to CSV files.
export_dataframes_to_csv_files(train, test)