# Data preprocessing

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

## Obtaining base train and test dataframes

### Creating train and test dataframes

In [2]:
def get_dataframes(train_path='../../data/external/application_train.csv',
                   test_path='../../data/external/application_test.csv'):
  """Return a tuple containing train and test dataframes.

  Both files are read with pandas.read_csv using its default options.

  Keyword arguments:
  train_path -- path of the train CSV file (defaults to the project's external data folder)
  test_path -- path of the test CSV file (defaults to the project's external data folder)
  """
  train = pd.read_csv(train_path)
  test = pd.read_csv(test_path)
  return train, test

In [3]:
# Load the raw train and test tables from their CSV files.
(train, test) = get_dataframes()

In [4]:
# Preview the first rows of the train dataframe.
train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Preview the first rows of the test dataframe (it has no TARGET column).
test.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,,,,,,


### Moving target to last column in train dataset

In [6]:
def position_target_column(train):
  """Move the TARGET column to the last position of the train dataframe.

  The dataframe is modified in place and the very same object is returned.

  Keyword arguments:
  train -- the train dataframe
  """
  target_values = train['TARGET']
  # Removing and re-adding the column appends it after all the other columns.
  del train['TARGET']
  train['TARGET'] = target_values
  return train

In [7]:
# Later steps slice the features with [:-1], so TARGET must be the last column.
train = position_target_column(train)

### Dropping unused ID column

In [8]:
def drop_id_column(train, test):
  """Return a (train, test) tuple of new dataframes without the SK_ID_CURR column.

  The input dataframes are left unchanged.

  Keyword arguments:
  train -- the train dataframe
  test -- the test dataframe
  """
  id_column = 'SK_ID_CURR'
  train_without_id = train.drop(columns=[id_column])
  test_without_id = test.drop(columns=[id_column])
  return train_without_id, test_without_id

In [9]:
# Remove the SK_ID_CURR identifier column from both dataframes.
(train, test) = drop_id_column(train, test)

### Organizing test set columns based on train set column order

In [10]:
def reorder_test_columns(train, test):
  """Return the test dataframe with its columns in the same order as the train features.

  The train feature columns are all the train columns except TARGET.

  Keyword arguments:
  train -- the train dataframe
  test -- the test dataframe
  """
  feature_columns = train.columns.drop('TARGET')
  return test[feature_columns]

In [11]:
# Align the test column order with the train features so that positional indexes match.
test = reorder_test_columns(train, test)

## Taking care of missing data

In [12]:
def impute_train_missing_data(train):
  """
  Return a tuple containing a copy of the train dataframe with the median imputed in place of
  missing numerical values, and a boolean Series flagging its numerical feature columns.

  Only floating-point feature columns are flagged: NumPy integer columns cannot hold NaN,
  so they never need imputation. The last column (the target) is never touched.
  The returned Series is indexed by feature column name (target excluded).

  Keyword arguments:
  train -- the train dataframe, with the target as last column
  """
  train = train.copy()  # leave the caller's dataframe untouched
  feature_dtypes = train.dtypes.iloc[:-1]
  # Explicit float check instead of `dtypes == np.number`: comparing a dtype with the
  # abstract np.number depends on NumPy coercing it to a concrete dtype, which is fragile.
  num_cols_train = feature_dtypes.apply(pd.api.types.is_float_dtype).astype(bool)
  num_col_names = feature_dtypes.index[num_cols_train.to_numpy()]
  if len(num_col_names) > 0:
    imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    # Impute column-wise rather than round-tripping the whole frame through an object
    # ndarray, so that every column keeps its original dtype.
    train[num_col_names] = imputer.fit_transform(train[num_col_names])
  return train, num_cols_train

In [13]:
# num_cols_train is kept because standardize_train() reuses it to select the columns to scale.
(train, num_cols_train) = impute_train_missing_data(train)

In [14]:
def impute_test_missing_data(test):
  """
  Return a tuple containing a copy of the test dataframe with the median imputed in place of
  missing numerical values, and a boolean Series flagging its numerical columns.

  Only floating-point columns are flagged: NumPy integer columns cannot hold NaN,
  so they never need imputation. The returned Series is indexed by column name.

  NOTE(review): the medians are computed on the test dataframe itself, not taken from the
  train dataframe -- confirm this is intended.

  Keyword arguments:
  test -- the test dataframe
  """
  test = test.copy()  # leave the caller's dataframe untouched
  column_dtypes = test.dtypes
  # Explicit float check instead of `dtypes == np.number`: comparing a dtype with the
  # abstract np.number depends on NumPy coercing it to a concrete dtype, which is fragile.
  num_cols_test = column_dtypes.apply(pd.api.types.is_float_dtype).astype(bool)
  num_col_names = column_dtypes.index[num_cols_test.to_numpy()]
  if len(num_col_names) > 0:
    imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    # Impute column-wise rather than round-tripping the whole frame through an object
    # ndarray, so that every column keeps its original dtype.
    test[num_col_names] = imputer.fit_transform(test[num_col_names])
  return test, num_cols_test

In [15]:
# num_cols_test is kept because standardize_test() reuses it to select the columns to scale.
(test, num_cols_test) = impute_test_missing_data(test)

### Computing the percentage of rows with missing values (NaN) for each text feature

In [16]:
def get_train_na_percentages(train):
  """
  Return a Series giving, for each train column that has at least one Na value,
  the fraction of its rows that are Na (between 0 and 1).
  Must be called just after impute_train_missing_data().

  Keyword arguments:
  train -- the train dataframe
  """
  na_counts = train.isna().sum()
  columns_with_na = train.columns[na_counts > 0]
  row_count = train.shape[0]
  return train[columns_with_na].isna().sum() / row_count

In [17]:
# Computed before the Na rows are dropped, because select_features_on_na() needs these percentages later.
na_cols_pctg_train = get_train_na_percentages(train)
na_cols_pctg_train

NAME_TYPE_SUITE        0.004201
OCCUPATION_TYPE        0.313455
FONDKAPREMONT_MODE     0.683862
HOUSETYPE_MODE         0.501761
WALLSMATERIAL_MODE     0.508408
EMERGENCYSTATE_MODE    0.473983
dtype: float64

### Dropping rows with missing values (NaN) in text features

In [18]:
def drop_textual_feat_na_rows(train, test):
  """Return a (train, test) tuple of dataframes without the rows that still contain Na values.

  Every row holding at least one Na value, in any column, is removed.

  Keyword arguments:
  train -- the train dataframe
  test -- the test dataframe
  """
  complete_train = train.dropna(axis='index', how='any')
  complete_test = test.dropna(axis='index', how='any')
  return complete_train, complete_test

In [19]:
# Drop the rows that still hold Na values after the numerical imputation.
(train, test) = drop_textual_feat_na_rows(train, test)

## Encoding categorical data

### Encoding the independent variables

In [20]:
def get_textual_column_indexes(train, test):
  """Return a tuple of two ndarrays: the positions of the object-dtype columns
  of the train dataframe and of the test dataframe.

  Keyword arguments:
  train -- the train dataframe
  test -- the test dataframe
  """
  def object_column_positions(frame):
    # Positions (not names) of the columns whose dtype is 'object'.
    object_columns = frame.select_dtypes('object').columns
    return frame.columns.get_indexer(object_columns)

  return object_column_positions(train), object_column_positions(test)

In [21]:
# Positions of the object-dtype (textual) columns, consumed by the label-encoding step.
(txt_indexes_train, txt_indexes_test) = get_textual_column_indexes(train, test)

In [22]:
def label_encode_train(train, txt_indexes_train):
  """Label-encode the textual features of the train dataframe and return it.

  Each listed column is encoded independently with a LabelEncoder fitted on that column.
  The last column of the dataframe is left out of the encoding.
  The dataframe is modified in place and the same object is returned.

  Keyword arguments:
  train -- the train dataframe
  txt_indexes_train -- ndarray of train textual column indexes
  """
  features = train.iloc[:, :-1].values
  encoder = LabelEncoder()
  for column_position in txt_indexes_train:
    # fit_transform refits the encoder, so a single instance can serve every column.
    encoded_column = encoder.fit_transform(features[:, column_position])
    features[:, column_position] = encoded_column
  train.iloc[:, :-1] = features
  return train

In [23]:
def label_encode_test(test, txt_indexes_test):
  """Label-encode the textual features of the test dataframe and return it.

  Each listed column is encoded independently with a LabelEncoder fitted on that
  test column. The dataframe is modified in place and the same object is returned.

  Keyword arguments:
  test -- the test dataframe
  txt_indexes_test -- ndarray of test textual column indexes
  """
  features = test.iloc[:, :].values
  encoder = LabelEncoder()
  for column_position in txt_indexes_test:
    # fit_transform refits the encoder, so a single instance can serve every column.
    encoded_column = encoder.fit_transform(features[:, column_position])
    features[:, column_position] = encoded_column
  test.iloc[:, :] = features
  return test

In [24]:
def label_encode_dataframes(train, test, txt_indexes_train, txt_indexes_test):
  """Return a tuple containing label-encoded copies of the train and test dataframes.

  A single vocabulary per column is shared by both dataframes, so that a given category
  is mapped to the same integer code in train and in test:
  - train categories are numbered in sorted order, starting at 0;
  - categories that only appear in test are numbered after the train ones, in sorted order;
  - Na values are encoded as -1.
  The input dataframes are not modified.

  Keyword arguments:
  train -- the train dataframe
  test -- the test dataframe
  txt_indexes_train -- ndarray of train textual column indexes
  txt_indexes_test -- ndarray of test textual column indexes
  """
  train = train.copy()
  test = test.copy()
  train_txt_cols = list(train.columns[txt_indexes_train])
  test_txt_cols = list(test.columns[txt_indexes_test])

  # Vocabulary learnt on train only: sorted distinct non-Na values of each textual column.
  train_vocabularies = {
    col: sorted(train[col].dropna().unique()) for col in train_txt_cols
  }

  # Encode test first, while the train columns still hold their raw values.
  for col in test_txt_cols:
    known = train_vocabularies.get(col, [])
    known_set = set(known)
    # Categories unseen in train get the next free codes instead of clashing with train codes.
    unseen = sorted(v for v in test[col].dropna().unique() if v not in known_set)
    codes = pd.Categorical(test[col], categories=known + unseen).codes
    test[col] = codes.astype('int64')

  for col in train_txt_cols:
    codes = pd.Categorical(train[col], categories=train_vocabularies[col]).codes
    train[col] = codes.astype('int64')

  return train, test

In [25]:
# Replace the textual categories of both dataframes with integer codes.
(train, test) = label_encode_dataframes(train, test, txt_indexes_train, txt_indexes_test)

## Feature scaling

In [26]:
def standardize_train(train, num_cols_train):
  """Standardize the numerical features of the train dataframe and return it.

  Only the columns flagged by num_cols_train are scaled, so the encoded textual
  dimensions and the last column are left as they are. The StandardScaler is fitted
  on the train data. The dataframe is modified in place and the same object is returned.

  Keyword arguments:
  train -- the train dataframe
  num_cols_train -- boolean mask flagging the numerical feature columns
  """
  features = train.iloc[:, :-1].values
  scaler = StandardScaler()
  scaled_block = scaler.fit_transform(features[:, num_cols_train])
  features[:, num_cols_train] = scaled_block
  train.iloc[:, :-1] = features
  return train

In [27]:
# Scale the columns flagged by the mask returned by impute_train_missing_data().
train = standardize_train(train, num_cols_train)

In [28]:
def standardize_test(test, num_cols_test):
  """Standardize the numerical features of the test dataframe and return it.

  Only the columns flagged by num_cols_test are scaled, so the encoded textual
  dimensions are left as they are. The StandardScaler is fitted on the test data
  itself. The dataframe is modified in place and the same object is returned.

  Keyword arguments:
  test -- the test dataframe
  num_cols_test -- boolean mask flagging the numerical columns
  """
  features = test.iloc[:, :].values
  scaler = StandardScaler()
  scaled_block = scaler.fit_transform(features[:, num_cols_test])
  features[:, num_cols_test] = scaled_block
  test.iloc[:, :] = features
  return test

In [29]:
# Scale the columns flagged by the mask returned by impute_test_missing_data().
test = standardize_test(test, num_cols_test)

## Feature selection

### Removing features with at least 50% missing values (percentage computed from the train set)

In [30]:
def select_features_on_na(train, test, na_cols_pctg_train):
  """Return a (train, test) tuple of dataframes without the columns whose Na fraction is at least 0.5.

  The columns to remove are the index labels of na_cols_pctg_train whose value is >= 0.5;
  the same columns are removed from both dataframes.

  Keyword arguments:
  train -- the train dataframe
  test -- the test dataframe
  na_cols_pctg_train -- a Series with the percentage of Na values per columns in train dataframe.
  """
  mostly_missing = na_cols_pctg_train >= 0.5
  columns_to_remove = na_cols_pctg_train.index[mostly_missing].tolist()
  reduced_train = train.drop(columns=columns_to_remove)
  reduced_test = test.drop(columns=columns_to_remove)
  return reduced_train, reduced_test

In [31]:
# Uses the Na percentages measured on train before the Na rows were dropped.
(train, test) = select_features_on_na(train, test, na_cols_pctg_train)

### Removing features whose most frequent value accounts for at least 80% of the rows

In [32]:
def select_features_on_mod_proba(train, test):
  """Return a (train, test) tuple of dataframes without the near-constant features.

  A feature is removed when one of its values accounts for at least 80% of its
  non-Na occurrences in the train dataframe. TARGET is never considered.
  The same columns are removed from both dataframes.

  Keyword arguments:
  train -- the train dataframe
  test -- the test dataframe
  """
  PROBABILITY_THRESHOLD = 0.8
  features = train.drop('TARGET', axis=1)
  dominated_columns = []
  for name in features.columns.tolist():
    # Relative frequency of each distinct value of the column.
    frequencies = features[name].value_counts(normalize=True)
    # At most one value can reach a share of 0.8, so a single check per column is enough.
    if (frequencies >= PROBABILITY_THRESHOLD).any():
      dominated_columns.append(name)
  reduced_train = train.drop(dominated_columns, axis=1)
  reduced_test = test.drop(dominated_columns, axis=1)
  return reduced_train, reduced_test

In [33]:
# Drop the features dominated by a single value (share measured on train only).
(train, test) = select_features_on_mod_proba(train, test)

## Exporting preprocessed data to CSV files

In [34]:
def export_dataframes_to_csv_files(train, test, output_dir='../../data/processed'):
  """Export train and test dataframes to CSV files in the output directory.

  The files are named processed_application_train.csv and processed_application_test.csv
  and are written without the dataframe index. The directory is created when missing.

  Keyword arguments:
  train -- the train dataframe
  test -- the test dataframe
  output_dir -- destination directory (defaults to ../../data/processed)
  """
  import os  # local import keeps this cell self-contained

  # to_csv does not create missing directories, so make sure the destination exists.
  os.makedirs(output_dir, exist_ok=True)
  train.to_csv(os.path.join(output_dir, 'processed_application_train.csv'), index=False)
  test.to_csv(os.path.join(output_dir, 'processed_application_test.csv'), index=False)

In [35]:
# Write the preprocessed train and test dataframes to CSV files.
export_dataframes_to_csv_files(train, test)













