In [64]:
import time

import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Helper functions

In [2]:
def change_dtypes(df):
    """
    Downcast column dtypes to reduce the memory size of a dataframe.

    Conversions applied per column:

    * object columns with at least one repeated value -> ``category``
    * float64 columns -> float32
    * int64 columns -> int32, but only when every value fits in int32;
      columns holding larger values are left as int64 instead of being
      silently wrapped around by the cast.

    The memory usage before and after the conversion is printed in MB.

    :param df: dataframe; its columns are replaced in place
    :return df: the same dataframe object, with downcast columns
    """
    memory = df.memory_usage().sum() / 10**6
    print("Memory usage before changing types %0.2f MB" % memory)

    n_rows = df.shape[0]
    int32_limits = np.iinfo(np.int32)

    for col in df.columns:
        series = df[col]
        dtype = series.dtype

        if (dtype == "object") and (series.nunique() < n_rows):
            df[col] = series.astype("category")

        # Compare against explicit 64-bit dtypes: the builtin ``int`` maps to
        # a platform-dependent numpy dtype, so ``dtype == int`` is not portable.
        elif dtype == np.float64:
            df[col] = series.astype(np.float32)

        elif dtype == np.int64:
            # An empty column has no min/max; it is trivially safe to downcast.
            fits_int32 = n_rows == 0 or (
                series.min() >= int32_limits.min
                and series.max() <= int32_limits.max
            )
            if fits_int32:
                df[col] = series.astype(np.int32)

    memory = df.memory_usage().sum() / 10 ** 6
    print("Memory usage after changing types %0.2f MB" % memory)
    return df


def load_csv(filename):
    """
    Read a CSV file and downcast its column dtypes.

    :param filename: path of the CSV file, passed to ``pd.read_csv``
    :return: the loaded dataframe after ``change_dtypes`` has been applied
    """
    return change_dtypes(pd.read_csv(filename))

In [17]:
def train_test_col_align(df_train, df_test, exclude_cols=None):
    """
    Give the train and test frames the same feature columns in the same order.

    The column order of ``df_train`` is the reference. Columns named in
    ``exclude_cols`` (for example a target that only exists in the training
    data) are moved to the end of the returned training frame and are not
    selected from ``df_test``.

    :param df_train: training dataframe
    :param df_test: test dataframe; it must contain every non-excluded
        column of ``df_train`` (pandas raises ``KeyError`` otherwise)
    :param exclude_cols: column label, or iterable of column labels, present
        in ``df_train`` that should not be looked up in ``df_test``
    :return: ``(df_train_aligned, df_test_aligned)``
    :raises AssertionError: if an excluded column is not in ``df_train``
    """
    if exclude_cols is None:
        exclude_cols = []
    elif isinstance(exclude_cols, str):
        # A bare string would otherwise be iterated character by character.
        exclude_cols = [exclude_cols]
    else:
        # Accept tuples, sets, Index objects, ...: the list concatenation
        # below requires a real list.
        exclude_cols = list(exclude_cols)

    cols_train = df_train.columns.to_list()

    for col in exclude_cols:
        # str() keeps the message buildable for non-string column labels.
        assert col in cols_train, str(col) + " is not in df_train"

    test_cols = [col for col in cols_train if col not in exclude_cols]
    return df_train[test_cols + exclude_cols], df_test[test_cols]
    

class OneHotEncoder(BaseEstimator, TransformerMixin):
    """
    One-hot encode the object/category columns of a dataframe.

    ``fit`` records the categorical columns of the training frame and the
    dummy columns ``pd.get_dummies`` produces for them. ``transform`` then
    returns the non-categorical columns followed by exactly those dummy
    columns, in the order seen during ``fit``, so that every transformed
    frame has the same encoded columns as the training frame.
    """

    def fit(self, train_df, y=None):
        """
        Learn the categorical columns and their dummy columns.

        :param train_df: training dataframe
        :param y: ignored; accepted so the encoder can be used with
            ``fit_transform(X, y)`` and inside scikit-learn pipelines
        :return: self
        """
        df_cat = train_df.select_dtypes(["object", "category"])
        self._cat_cols = df_cat.columns.to_list()

        if len(self._cat_cols) > 0:
            # Pin the dtype: the default dummy dtype differs between pandas
            # versions, while transform() always emits uint8.
            self._cat_cols_ohe = pd.get_dummies(df_cat, dtype=np.uint8).columns.to_list()
        else:
            self._cat_cols_ohe = []
        return self

    def transform(self, df):
        """
        One-hot encode ``df`` using the columns learned in ``fit``.

        Dummy columns that were not produced for the training frame are
        dropped; dummy columns from training that do not occur in ``df`` are
        added and filled with zeros.

        :param df: dataframe with the same categorical columns as the
            training frame
        :return: dataframe of the non-categorical columns of ``df`` followed
            by the uint8 dummy columns; ``df`` itself is returned unchanged
            when the training frame had no categorical columns
        :raises AssertionError: if the categorical columns of ``df`` differ
            from those of the training frame
        """
        if len(self._cat_cols) == 0:
            print("No cat cols in df_train, so do nothing.")
            return df

        df_cat = df.select_dtypes(["object", "category"])
        cat_cols = df_cat.columns.to_list()
        assert set(cat_cols) == set(self._cat_cols), "df does not have the same categorical cols as train_df"

        # one-hot encode
        df_cat = pd.get_dummies(df_cat, dtype=np.uint8)

        # A single reindex drops the dummy columns unknown to the training
        # frame, adds the ones missing from this frame as all-zero columns,
        # and puts everything in the training order. The cast keeps the newly
        # created zero columns uint8 like the rest.
        df_cat = df_cat.reindex(columns=self._cat_cols_ohe, fill_value=0).astype(np.uint8)

        cat_col_set = set(cat_cols)
        num_cols = [col for col in df.columns if col not in cat_col_set]
        df_num = df[num_cols]

        return pd.concat([df_num, df_cat], axis="columns")

# Load data

In [4]:
# Time the whole cell: both CSV files are read and downcast by load_csv.
time_start = time.time()

df_train = load_csv("data/data_/X_y_train.csv")
df_test = load_csv("data/data_/X_test.csv")
# Sanity checks: frame shapes and the total number of missing values.
print("df_train.shape", df_train.shape)
print("df_test.shape", df_test.shape)
print("df_train.isnull().sum().sum:", df_train.isnull().sum().sum())
print("df_test.isnull().sum().sum:", df_test.isnull().sum().sum())

time_end = time.time()
time_elapse = time_end - time_start  # elapsed wall-clock seconds
print("Elapsed Time", time_elapse)

Memory usage before changing types 3972.43 MB
Memory usage after changing types 1950.56 MB
Memory usage before changing types 629.29 MB
Memory usage after changing types 308.86 MB
df_train.shape (307511, 1648)
df_test.shape (48744, 1647)
df_train.isnull().sum().sum: 0
df_test.isnull().sum().sum: 0
Elapsed Time 809.6063630580902


In [34]:
# Separate the target and the row identifier from the feature columns.
# DataFrame.drop returns a new frame, so df_train / df_test stay untouched
# without duplicating the full frames with .copy() first.
# The target is copied so y_train does not share memory with df_train.
y_train = df_train["APPL_TARGET"].values.copy()
X_train = df_train.drop(["SK_ID_CURR", "APPL_TARGET"], axis="columns")

# Keep the test ids as a one-column frame.
sk_id_test = df_test[["SK_ID_CURR"]]
X_test = df_test.drop(["SK_ID_CURR"], axis="columns")
print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)

# TODO: go back and check the feature extraction step.
# This column is dropped here because of an error.
X_train = X_train.drop(["PRAP_AMT_DOWN_PAYMENT_IS_NONNEG_entropy"], axis="columns")
X_test = X_test.drop(["PRAP_AMT_DOWN_PAYMENT_IS_NONNEG_entropy"], axis="columns")
print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)

X_train.shape (307511, 1646)
X_test.shape (48744, 1646)
X_train.shape (307511, 1645)
X_test.shape (48744, 1645)


# Preprocessing

## One-hot encoding

In [35]:
# Learn the dummy columns from the training features only, then apply the
# same encoding to both frames.
ohe = OneHotEncoder().fit(X_train)
X_train, X_test = ohe.transform(X_train), ohe.transform(X_test)

# Put the test columns in the same order as the train columns.
X_train, X_test = train_test_col_align(X_train, X_test)
print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)

X_train.shape (307511, 1959)
X_test.shape (48744, 1959)


In [45]:
# Count how many columns of X_train have each dtype.
X_train.dtypes.value_counts()

float32    1524
uint8       359
int32        38
bool         38
dtype: int64

In [46]:
# Count how many columns of X_test have each dtype.
X_test.dtypes.value_counts()

float32    1524
uint8       359
int32        38
bool         38
dtype: int64

## Standardization

In [53]:
# Fit the scaler on the training features and apply it to both sets.
# NOTE(review): the scaler is fit on all of X_train, including the rows that
# the train/validation split further down holds out for validation.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape)

X_train.shape (307511, 1959)
X_test.shape (48744, 1959)


## Split into train and validation sets for model selection

In [66]:
# Hold out 20% of the training rows for validation, stratified on the target,
# with a fixed seed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=146,
)

tuple(part.shape for part in (X_train, y_train, X_val, y_val))

((246008, 1959), (246008,), (61503, 1959), (61503,))