In [1]:
import warnings
warnings.filterwarnings('ignore')

import time

import pandas as pd
import numpy as np
import scipy
from scipy import stats

import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin

# Helper functions

In [2]:
def change_dtypes(df):
    """
    Downcast column dtypes in place to reduce the memory size of ``df``.

    * ``object`` columns holding at least one repeated value become ``category``.
    * ``float64`` columns become ``float32``.
    * ``int64`` columns become ``int32``, but only when every value fits:
      an out-of-range integer cast would wrap around silently.

    The memory usage reported by ``DataFrame.memory_usage`` (in MB) is printed
    before and after the conversion.

    :param df: dataframe, modified in place
    :return df: the same dataframe object
    """
    memory = df.memory_usage().sum() / 10**6
    print("Memory usage before changing types %0.2f MB" % memory)

    int32_limits = np.iinfo(np.int32)
    n_rows = df.shape[0]

    for col in df.columns:
        dtype = df[col].dtype

        if dtype == "object":
            # a column of all-distinct strings gains nothing from category codes
            if df[col].nunique() < n_rows:
                df[col] = df[col].astype("category")

        elif dtype == np.float64:
            df[col] = df[col].astype(np.float32)

        # explicit np.int64: the builtin ``int`` maps to a platform-dependent width
        elif dtype == np.int64:
            fits_int32 = (n_rows == 0) or (
                df[col].min() >= int32_limits.min and df[col].max() <= int32_limits.max
            )
            if fits_int32:
                df[col] = df[col].astype(np.int32)

    memory = df.memory_usage().sum() / 10 ** 6
    print("Memory usage after changing types %0.2f MB" % memory)
    return df


def load_csv(filename):
    """
    Read a CSV file with pandas and pass the result through ``change_dtypes``.

    :param filename: path of the CSV file, handed to ``pd.read_csv``
    :return: the dataframe returned by ``change_dtypes``
    """
    return change_dtypes(pd.read_csv(filename))

In [3]:
def check_type(val):
    """
    Classify a value for the imputers' argument validation.

    :param val: any object
    :return: "string" for str instances, "number" for Python / NumPy numeric
             scalars (booleans excluded), "function" for callables, otherwise
             ``str(type(val))``
    """
    if isinstance(val, str):
        return "string"

    # np.issubsctype was removed in NumPy 2.0, so test the scalar directly.
    # bool is a subclass of int and has to be excluded explicitly; np.bool_
    # is not a subclass of np.number.
    if isinstance(val, (int, float, complex, np.number)) and not isinstance(val, bool):
        return "number"

    if callable(val):
        return "function"

    return str(type(val))


class NumColsImputer(BaseEstimator, TransformerMixin):
    """
    Fill missing values in a dataframe that contains only numeric columns.

    Impute values are computed in ``fit`` and kept only for the columns that
    contain at least one missing value in the fitted dataframe; ``transform``
    fills exactly those columns.
    """

    def __init__(self, specified_values=None, default="median"):
        """
        :param specified_values: dict {colname (str): val (float)}, impute values for some specific columns
        :param default: str, function or float, value or function used for the remaining columns.
                        A str is looked up as a no-argument dataframe method (e.g. "median"),
                        a function is applied to every column with ``DataFrame.apply``,
                        a number is used as is.
        """
        assert (specified_values is None) or isinstance(specified_values, 
                                                        dict), "specified_values must be None or dict"
        
        # BaseEstimator.get_params() (used by repr() and clone()) reads
        # attributes named exactly like the constructor parameters.
        self.specified_values = specified_values
        self.default = default
        
        self._specified_values = specified_values
        if (self._specified_values is not None) and (len(self._specified_values) > 0):
            for col, val in self._specified_values.items():
                assert check_type(val) == "number", "Impute value for " + col + " is not number."
        
        self._default = default
        self._default_type = check_type(self._default)
        if self._default_type not in ["number", "string", "function"]:
            raise ValueError("Unsupported stat type " + self._default_type)
    
    def _cal_imput_vals(self, df):
        """
        Compute one impute value per column of ``df``.

        :param df: dataframe with numeric columns only
        :return: dict {colname: impute value}; entries of ``specified_values`` override the default
        :raises ValueError: if ``df`` has object, category or bool columns
        """
        cat_cols = df.select_dtypes(["object", "category", "bool"]).columns.to_list()
        if len(cat_cols) > 0:
            raise ValueError("There are non-number columns: " + ", ".join(cat_cols))
        
        all_cols = df.columns.to_list()
        if self._default_type == "number":
            impute_values = {col: self._default for col in all_cols}
            
        elif self._default_type == "string":
            impute_values = getattr(df, self._default)()
        
        elif self._default_type == "function":
            impute_values = df.apply(self._default)
        
        else:
            raise ValueError("Unknown default imputer: " + str(self._default))
        
        # if it is a pandas series, turn it into dict
        impute_values = dict(impute_values)
        if (self._specified_values is None) or (len(self._specified_values) == 0):
            return impute_values
        
        for col in self._specified_values:
            impute_values[col] = self._specified_values[col]
            
        return impute_values
    
    def fit(self, df, y=None):
        """
        Learn the impute values from ``df``.

        :param df: dataframe with numeric columns only
        :param y: ignored, accepted for scikit-learn pipeline compatibility
        :return: self
        :raises ValueError: if an impute value for a column with missing data is NaN
        """
        impute_values = self._cal_imput_vals(df)
        
        cols_with_na = [col for col in df.columns if df[col].isnull().any()]
        self._impute_values = {col: impute_values[col] for col in cols_with_na}
        
        for k, v in self._impute_values.items():
            if np.isnan(v):
                raise ValueError("One of the impute_values is NaN: " + k)
        
        return self
    
    def transform(self, df):
        """
        :param df: dataframe
        :return: new dataframe with the fitted columns filled via ``DataFrame.fillna``
        """
        return df.fillna(self._impute_values)


class CatColsImputer(BaseEstimator, TransformerMixin):
    """
    Fill missing values in a dataframe that contains no numeric columns.

    Impute values are computed in ``fit`` and kept only for the columns that
    contain at least one missing value in the fitted dataframe. ``transform``
    fills those columns and returns them with ``category`` dtype.
    """

    def __init__(self, specified_values=None, default="missing_value"):
        """
        :param specified_values: dict {colname (str): val (str or function)}, 
                                 impute values for some specific columns; a function
                                 is called with the column and must return the impute value
        :param default: str, used for the remaining columns
        """
        assert (specified_values is None) or isinstance(specified_values, 
                                                        dict), "specified_values must be None or dict"
        
        # BaseEstimator.get_params() (used by repr() and clone()) reads
        # attributes named exactly like the constructor parameters.
        self.specified_values = specified_values
        self.default = default
        
        self._specified_values = specified_values
        if (self._specified_values is not None) and (len(self._specified_values) > 0):
            for col, val in self._specified_values.items():
                assert check_type(val) in ["string", 
                                           "function"], "Impute value for " + col + " is " + check_type(val)
        
        self._default = default
        assert check_type(self._default) == "string", "default must be string"
        
    def _cal_imput_vals(self, df):
        """
        Compute one impute value per column of ``df``.

        :param df: dataframe without numeric columns
        :return: dict {colname: impute value}
        :raises ValueError: if ``df`` has numeric columns
        """
        num_cols = df.select_dtypes(["number"]).columns.to_list()
        if len(num_cols) > 0:
            raise ValueError("There are number columns: " + ", ".join(num_cols))
        
        all_cols = df.columns.to_list()
        impute_values = {col: self._default for col in all_cols}
        if (self._specified_values is None) or (len(self._specified_values) == 0):
            return impute_values
        
        for col, val in self._specified_values.items():
            dtype = check_type(val)
            if dtype == "string":
                impute_values[col] = val
            
            elif dtype == "function":
                impute_values[col] = val(df[col])
            
            else:
                raise ValueError("Unknown imputer for " + col + ": " + str(val))
        return impute_values
    
    def fit(self, df, y=None):
        """
        Learn the impute values from ``df``.

        :param df: dataframe without numeric columns
        :param y: ignored, accepted for scikit-learn pipeline compatibility
        :return: self
        """
        impute_values = self._cal_imput_vals(df)
        
        cols_with_na = [col for col in df.columns if df[col].isnull().any()]
        self._impute_values = {col: impute_values[col] for col in cols_with_na}
        
        return self
    
    def transform(self, df):
        """
        :param df: dataframe containing the fitted columns
        :return: copy of ``df`` where every fitted column is filled and cast to category
        """
        df_new = df.copy()
        for col, val in self._impute_values.items():
            # go through object dtype so a fill value that is not yet a category can be inserted
            df_new[col] = df_new[col].astype("object").fillna(val).astype("category")
            
        return df_new


def get_colnames_from_regex(df, regex_strings):
    """
    Collect the column names of ``df`` matched by each regular expression.

    :param df: dataframe
    :param regex_strings: iterable of str, patterns handed to ``DataFrame.filter(regex=...)``
    :return: list of str, matches of the first pattern, then of the second, and so on
             (a column matched by several patterns appears several times)
    """
    return [
        name
        for pattern in regex_strings
        for name in df.filter(regex=pattern).columns.to_list()
    ]


class Imputer(BaseEstimator, TransformerMixin):
    """
    Base class that imputes numeric columns with ``NumColsImputer`` and
    object / category / bool columns with ``CatColsImputer``.

    It cannot be instantiated directly. A subclass overrides ``__init__``
    (without calling this one) and sets the configuration attributes below.
    """

    def __init__(self):
        # Attributes a subclass has to define:
        #   _regex_strings            list of regex str or None; numeric impute value for matching
        #                             columns is _spec_impt_regex_val_num
        #   _spec_impt_regex_val_num  number or None
        #   _spec_impt_vals_num       dict {colname: number}; these columns also get an "_ISNULL" flag
        #   _default_imput_vals_num   default for NumColsImputer, e.g. "median"
        #   _spec_impt_vals_cat       dict or None, specified values for CatColsImputer
        #   _default_imput_vals_cat   str, default for CatColsImputer, e.g. "missing_value"
        raise NotImplementedError("Not implemented")
    
    def fit(self, df_train, y=None):
        """
        Fit the numeric and the categorical imputer on ``df_train``.

        Columns matched by ``_regex_strings`` are added to ``_spec_impt_vals_num``
        (the dict is updated in place).

        :param df_train: dataframe
        :param y: ignored, accepted for scikit-learn pipeline compatibility
        :return: self
        """
        if self._regex_strings is not None:
            cols_imput_with_regex = get_colnames_from_regex(df_train, self._regex_strings)
            self._spec_impt_vals_num.update({col: self._spec_impt_regex_val_num 
                                             for col in cols_imput_with_regex})
        
        df_num = df_train.select_dtypes(["number"])
        self._imputer_num = NumColsImputer(specified_values=self._spec_impt_vals_num, 
                                           default=self._default_imput_vals_num)
        self._imputer_num.fit(df_num)
        
        df_cat = df_train.select_dtypes(["object", "category", "bool"])
        self._imputer_cat = CatColsImputer(specified_values=self._spec_impt_vals_cat, 
                                           default=self._default_imput_vals_cat)
        self._imputer_cat.fit(df_cat)
        
        return self
    
    def transform(self, df):
        """
        Impute ``df`` and add one boolean ``<col>_ISNULL`` column for every
        column listed in ``_spec_impt_vals_num``.

        :param df: dataframe
        :return: dataframe made of, in this order, the ``_ISNULL`` flags, the imputed
                 numeric columns and the imputed object / category / bool columns
        """
        dfs = []
        num_df = df.select_dtypes(["number"])
        
        # flags are taken before imputing, while the missing values are still visible
        flagged_cols = list(self._spec_impt_vals_num)
        if len(flagged_cols) > 0:
            isnull_df = num_df[flagged_cols].isnull().add_suffix("_ISNULL")
            dfs.append(isnull_df)
        
        num_df = self._imputer_num.transform(num_df)
        dfs.append(num_df)
        
        # cat
        cat_df = df.select_dtypes(["object", "category", "bool"])
        cat_df = self._imputer_cat.transform(cat_df)
        dfs.append(cat_df)
        
        return pd.concat(dfs, axis="columns")
    

class DefaultImputer(Imputer):
    """
    Imputer configuration without column-specific rules: numeric columns are
    filled with 0. and the other columns with "missing_value".
    """

    def __init__(self):
        # no regex-selected columns
        self._regex_strings, self._spec_impt_regex_val_num = None, None

        # numeric columns: nothing specified, constant default
        self._spec_impt_vals_num, self._default_imput_vals_num = {}, 0.

        # categorical columns: nothing specified, constant default
        self._spec_impt_vals_cat, self._default_imput_vals_cat = {}, "missing_value"

In [4]:
class CollinearColumnRemover(BaseEstimator, TransformerMixin):
    """
    Drop one column out of every pair of numeric / bool columns whose absolute
    correlation exceeds a threshold.
    """

    def __init__(self, threshold, col_regex=None):
        """
        :param threshold: float in [0, 1], if two columns have correlation greater than threshold
                          one of them will be removed
        :param col_regex: str, regular expression to select columns
        """
        # BaseEstimator.get_params() (used by repr() and clone()) reads
        # attributes named exactly like the constructor parameters.
        self.threshold = threshold
        self.col_regex = col_regex
        
        self._threshold = threshold
        self._col_regex = col_regex
    
    def _collinear_columns(self, df, threshold):
        """
        Find the columns to drop. The absolute correlation matrix is stored in ``self._corr_mat``.

        :param df: dataframe
        :param threshold: float, a later column is marked when its absolute correlation
                          with an earlier, still kept column is greater than this
        :return: list of column names to drop, in dataframe column order
        """
        if self._col_regex is None:
            df_sel = df.select_dtypes(["number", "bool"])
        else:
            df_sel = df.filter(regex=self._col_regex)
            df_sel = df_sel.select_dtypes(["number", "bool"])
        
        df_sel = df_sel.astype("float32")
        
        all_cols = df_sel.columns.to_list()
        ncols = len(all_cols)
        
        corr_mat = df_sel.corr().abs()
        self._corr_mat = corr_mat
        # positional access on the raw array avoids a label lookup per pair
        corr_values = corr_mat.to_numpy()
        
        collin_cols = set()
        for i in range(ncols - 1):
            if all_cols[i] in collin_cols:
                continue
            
            for j in range(i + 1, ncols):
                col_j = all_cols[j]
                if col_j in collin_cols:
                    continue
                
                # a NaN correlation (e.g. constant column) compares False and is kept
                if corr_values[i, j] > threshold:
                    collin_cols.add(col_j)
        
        return [col for col in all_cols if col in collin_cols]
    
    def fit(self, df, y=None):
        """
        :param df: dataframe
        :param y: ignored, accepted for scikit-learn pipeline compatibility
        :return: self
        """
        self._collin_cols = self._collinear_columns(df, self._threshold)
        return self
    
    def transform(self, df):
        """
        :param df: dataframe
        :return: ``df`` without the fitted collinear columns; fitted columns that are
                 missing from ``df`` are reported with a printed warning
        """
        all_cols = df.columns.to_list()
        nonexist_cols = [col for col in self._collin_cols if col not in all_cols]
        if len(nonexist_cols) > 0:
            print("WARNING: These collinear cols to be droped do not exist in df:", nonexist_cols)
            
        droped_col = [col for col in self._collin_cols if col in all_cols]
        print("Number of columns droped:", len(droped_col))
        return df.drop(droped_col, axis="columns")

    
class OneHotEncoder(BaseEstimator, TransformerMixin):
    """
    One-hot encode the object / category columns of a dataframe with
    ``pd.get_dummies``, keeping exactly the dummy columns seen at fit time
    (first level of every column dropped).
    """

    def fit(self, train_df, y=None):
        """
        Record the categorical columns of ``train_df`` and the dummy column
        names they produce with ``drop_first=True``.

        :param train_df: dataframe
        :param y: ignored, accepted for scikit-learn pipeline compatibility
        :return: self
        """
        df_cat = train_df.select_dtypes(["object", "category"])
        self._cat_cols = df_cat.columns.to_list()
        
        if len(self._cat_cols) > 0:
            self._cat_cols_ohe = pd.get_dummies(df_cat, drop_first=True).columns.to_list()
        else:
            self._cat_cols_ohe = []
        return self
    
    def transform(self, df):
        """
        :param df: dataframe with the same categorical columns as the fitted one
        :return: the non-categorical columns of ``df`` followed by the fitted dummy
                 columns (uint8), always in the order recorded by ``fit``;
                 ``df`` itself when no categorical column was fitted
        """
        if len(self._cat_cols) == 0:
            print("No cat cols in df_train, so do nothing.")
            return df
        
        df_cat = df.select_dtypes(["object", "category"])
        cat_cols = df_cat.columns.to_list()
        assert set(cat_cols) == set(self._cat_cols), "df does not have the same categorical cols as train_df"
        
        # one-hot encode; the dtype is pinned so the result does not depend on the pandas version
        df_cat = pd.get_dummies(df_cat, dtype=np.uint8)
        
        # One reindex drops the dummies unseen at fit time, adds the fitted ones that are
        # missing here as all-zero columns, and fixes the column order to the fitted order.
        df_cat = df_cat.reindex(columns=self._cat_cols_ohe, fill_value=0).astype(np.uint8)
        
        num_cols = [col for col in df.columns if col not in cat_cols]
        df_num = df[num_cols]
        
        return pd.concat([df_num, df_cat], axis="columns")
    

In [5]:
def flatten_multiindex_cols(columns):
    """
    Turn a column index into a list of flat string names.

    :param columns: pandas Index / MultiIndex
    :return: list of str, the parts of every entry of ``columns.to_flat_index()``
             converted to str and joined with "_"
    """
    flattened = []
    for parts in columns.to_flat_index():
        flattened.append("_".join(map(str, parts)))
    return flattened


def agg_num_cols(df, by_sers, stats):
    """
    Group a dataframe without object / category columns and aggregate every column.

    :param df: dataframe, must not contain object or category columns
    :param by_sers: list or tuple of Series used as group keys
    :param stats: list or tuple of aggregations handed to ``GroupBy.agg``
    :return: aggregated dataframe whose columns are flattened to "<col>_<stat>"
    :raises ValueError: if ``df`` has object or category columns
    """
    assert type(by_sers) in [list, tuple], "by_sers must be a list or tuple"
    assert type(stats) in [list, tuple], "stats must be a list or tuple"
    assert all(isinstance(key, pd.Series) for key in by_sers), "ser in by_sers must be Series"

    offending = list(df.select_dtypes(["object", "category"]).columns)
    if offending:
        raise ValueError("There are non-number cols: " + ", ".join(offending))

    aggregated = df.groupby(by_sers).agg(stats)
    aggregated.columns = flatten_multiindex_cols(aggregated.columns)
    return aggregated


def agg_cat_cols(df, by_sers, stats):
    """
    Group a dataframe without numeric columns and aggregate every column.

    :param df: dataframe, must not contain numeric columns
    :param by_sers: list or tuple of Series used as group keys
    :param stats: list or tuple of aggregations handed to ``GroupBy.agg``
    :return: aggregated dataframe whose columns are flattened to "<col>_<stat>"
    :raises ValueError: if ``df`` has numeric columns
    """
    assert type(by_sers) in [list, tuple], "by_sers must be a list or tuple"
    assert type(stats) in [list, tuple], "stats must be a list or tuple"
    assert all(isinstance(key, pd.Series) for key in by_sers), "ser in by_sers must be Series"

    offending = list(df.select_dtypes(["number"]).columns)
    if offending:
        raise ValueError("There are number cols: " + ", ".join(offending))

    aggregated = df.groupby(by_sers).agg(stats)
    aggregated.columns = flatten_multiindex_cols(aggregated.columns)
    return aggregated


class Aggregator(BaseEstimator, TransformerMixin):
    """
    Group a dataframe by key columns and aggregate the remaining columns with
    statistics chosen per column kind: numeric, bool, categorical and
    (optionally) one-hot encoded categorical.
    """
    
    def __init__(self, by_list_cols, 
                 num_stats, bool_stats, cat_stats, 
                 ohe_cat_stats=None,
                 ohe_cat_max_class=None,
                 iqr=False, minmax_range=False, mean_median_diff=False):
        """
        :param by_list_cols: list of str, cols by which the dataframe is grouped
        :param num_stats: list, aggregating functions for numerical columns
        :param bool_stats: list, aggregating functions for bool columns
        :param cat_stats: list, aggregating functions for category columns
        :param ohe_cat_stats: list, aggregating functions for category columns after one-hot encoded.
                              If None, ["mean", "sum"] is used.
        :param ohe_cat_max_class: int, category columns with at most ohe_cat_max_class classes 
                                 will be one-hot encoded before aggregating, 
                                 If None, no one-hot encoding will be done.
        :param iqr: bool, also aggregate numerical columns with the inter-quartile range
        :param minmax_range: bool, also aggregate numerical columns with max - min
        :param mean_median_diff: bool, also aggregate numerical columns with mean - median
        """
        # BaseEstimator.get_params() (used by repr() and clone()) reads
        # attributes named exactly like the constructor parameters.
        self.by_list_cols = by_list_cols
        self.num_stats = num_stats
        self.bool_stats = bool_stats
        self.cat_stats = cat_stats
        self.ohe_cat_stats = ohe_cat_stats
        self.ohe_cat_max_class = ohe_cat_max_class
        self.iqr = iqr
        self.minmax_range = minmax_range
        self.mean_median_diff = mean_median_diff
        
        self._by_list_cols = by_list_cols
        
        self._num_stats = num_stats
        self._bool_stats = bool_stats
        self._cat_stats = cat_stats
        
        # The ["mean", "sum"] fallback belongs to the statistics, not to the class
        # limit: a None limit has to stay None so that one-hot encoding is skipped.
        if ohe_cat_stats is None:
            self._ohe_cat_stats = ["mean", "sum"]
        else:
            self._ohe_cat_stats = ohe_cat_stats
        
        self._ohe_cat_max_class = ohe_cat_max_class
        
        self._iqr = iqr
        self._minmax_range = minmax_range
        self._mean_median_diff = mean_median_diff
    
    def _num_agg(self, df, by_sers):
        """Aggregate numerical columns with ``num_stats``."""
        agg_df = agg_num_cols(df, by_sers, stats=self._num_stats)
        return agg_df
    
    def _bool_agg(self, df, by_sers):
        """Aggregate bool columns with ``bool_stats``."""
        agg_df = agg_num_cols(df, by_sers, stats=self._bool_stats)
        return agg_df
    
    def _cat_agg(self, df, by_sers):
        """Aggregate categorical columns with ``cat_stats``."""
        agg_df =  agg_cat_cols(df, by_sers, stats=self._cat_stats)
        return agg_df
    
    def _ohe_cat_agg(self, df, by_sers):
        """Aggregate one-hot encoded columns with the one-hot statistics."""
        agg_df = agg_num_cols(df, by_sers, stats=self._ohe_cat_stats)
        return agg_df
    
    def _iqr_agg(self, num_df, by_sers):
        """Per group 0.75 quantile minus 0.25 quantile; columns get the "_iqr" suffix."""
        grouped = num_df.groupby(by_sers)
        iqr_df = grouped.quantile(0.75) - grouped.quantile(0.25)
        iqr_df.columns = [col + "_iqr" for col in iqr_df.columns]
        return iqr_df
    
    def _range_agg(self, num_df, by_sers):
        """Per group max minus min; columns get the "_range" suffix."""
        grouped = num_df.groupby(by_sers)
        range_df = grouped.max() - grouped.min()
        range_df.columns = [col + "_range" for col in range_df.columns]
        return range_df
    
    def _mm_diff_agg(self, num_df, by_sers):
        """Per group mean minus median; columns get the "_mm_diff" suffix."""
        grouped = num_df.groupby(by_sers)
        diff_df = grouped.mean() - grouped.median()
        diff_df.columns = [col + "_mm_diff" for col in diff_df.columns]
        return diff_df
    
    def _cat_cols_to_ohe(self, df_train):
        """
        Candidate one-hot column names "<col>_<class>" for every fitted categorical
        column with at most ``ohe_cat_max_class`` distinct values.

        :param df_train: dataframe containing the columns in ``self._cat_cols``
        :return: list of str, empty when ``ohe_cat_max_class`` is None
        """
        cat_cols = []
        if self._ohe_cat_max_class is None:
            return cat_cols
        
        for col in self._cat_cols:
            if df_train[col].nunique() <=  self._ohe_cat_max_class:
                for cl in df_train[col].unique():
                    cat_cols.append(col + "_" + str(cl))
        return cat_cols
        
    def fit(self, df_train, y=None):
        """
        Record the bool, categorical and numerical columns of ``df_train``
        (group keys excluded) and the one-hot columns to aggregate.

        :param df_train: dataframe containing the ``by_list_cols`` columns
        :param y: ignored, accepted for scikit-learn pipeline compatibility
        :return: self
        """
        df_train = df_train.drop(self._by_list_cols, axis="columns")
        
        self._ohe = OneHotEncoder()
        self._ohe.fit(df_train)
        
        self._bool_cols = df_train.select_dtypes(["bool"]).columns.to_list()
        self._cat_cols = df_train.select_dtypes(["category", "object"]).columns.to_list()
        self._num_cols = df_train.select_dtypes(["number"]).columns.to_list()
        
        # keep only the candidates the encoder really produces (it drops one class per column)
        all_cat_ohe_cols = set(self._ohe.transform(df_train).columns)
        self._cat_cols_ohe = [ohe_col for ohe_col in self._cat_cols_to_ohe(df_train)
                              if ohe_col in all_cat_ohe_cols]
                        
        return self
    
    def transform(self, df):
        """
        Aggregate ``df`` over the ``by_list_cols`` columns.

        :param df: dataframe containing the group keys and every fitted column
        :return: the aggregated dataframes concatenated along the columns
        :raises ValueError: if a fitted column or an expected one-hot column is missing
        """
        by_sers = [df[col] for col in self._by_list_cols]
        df = df.drop(self._by_list_cols, axis="columns")
        
        all_cols = df.columns.to_list()
        for col in self._bool_cols + self._cat_cols + self._num_cols:
            if col not in all_cols:
                raise ValueError(col + " exists in train but not in test")
        
        dfs = []
        
        # bool cols
        if len(self._bool_cols) > 0:
            df_bool = df[self._bool_cols]
            print("Aggregating bool df with shape:", df_bool.shape)
            df_bool = self._bool_agg(df_bool, by_sers)
            
            for col in df_bool.columns:
                if col.endswith("_mean") or col.endswith("_sum") or col.endswith("_entropy"):
                    df_bool[col] = df_bool[col].astype("float32")
                
                if col.endswith("_mode"):
                    df_bool[col] = df_bool[col].astype("category")
            dfs.append(df_bool)
        
        # categorical cols
        if len(self._cat_cols) > 0:
            df_cat = df[self._cat_cols]
            print("Aggregating cat df with shape:", df_cat.shape)
            df_cat = self._cat_agg(df_cat, by_sers)
            
            for col in df_cat.columns:
                if col.endswith("_mean") or col.endswith("_entropy"):
                    df_cat[col] = df_cat[col].astype("float32")
                
                if col.endswith("_mode"):
                    df_cat[col] = df_cat[col].astype("category")
            
            dfs.append(df_cat)
        
        # number cols
        df_num_raw = df[self._num_cols]
        if df_num_raw.shape[1] > 0:
            print("Aggregating num df with shape:", df_num_raw.shape)
            dfs.append(self._num_agg(df_num_raw, by_sers))
        
        # ohe cat cols
        if len(self._cat_cols_ohe) > 0:
            df_ohe = self._ohe.transform(df)
            for col in self._cat_cols_ohe:
                if col not in df_ohe.columns:
                    raise ValueError(col + " is not in cols of df_ohe")
                    
            df_ohe = df_ohe[self._cat_cols_ohe]
            print("Aggregating ohe cat df with shape:", df_ohe.shape)
            
            df_ohe = self._ohe_cat_agg(df_ohe, by_sers)
            dfs.append(df_ohe)
        
        # aggregate num cols with iqr, range and mean-median difference;
        # the fitted column list is used so the output columns match the fitted data
        df_num = df_num_raw
        print("df_num.shape for iqr, mimmax_range, mean_median_diff", df_num.shape)
            
        if self._iqr and df_num.shape[1] > 0:
            print("Aggregating num df with iqr")
            df_iqr = self._iqr_agg(df_num, by_sers)
            dfs.append(df_iqr)
        
        if self._minmax_range and df_num.shape[1] > 0:
            print("Aggregating num df with range")
            df_range = self._range_agg(df_num, by_sers)
            print("df_range.shape", df_range.shape)
            dfs.append(df_range)
        
        if self._mean_median_diff and df_num.shape[1] > 0:
            print("Aggregating num df with mean-median difference")
            df_diff = self._mm_diff_agg(df_num, by_sers)
            dfs.append(df_diff)
        
        return pd.concat(dfs, axis="columns")

In [6]:
def train_test_partition(df, matching_key, train_id_ser):
    """
    Split ``df`` by membership of a key column in a set of train ids.

    :param df: dataframe
    :param matching_key: str, column of ``df`` holding the ids
    :param train_id_ser: Series whose values are the train ids
    :return: (train, test), the rows whose id is in ``train_id_ser`` and all the other rows
    """
    in_train = df[matching_key].isin(train_id_ser.values)
    return df.loc[in_train, :], df.loc[~in_train, :]

In [7]:
def mode(ser):
    """
    Most frequent value of a Series, usable as a ``GroupBy.agg`` function.

    :param ser: Series
    :return: the first value returned by ``Series.mode`` (missing values are
             not counted), or NaN when the series has no non-missing value
    """
    modes = ser.mode()
    # Series.mode() is empty for an empty or all-NaN series
    if len(modes) == 0:
        return np.nan
    return modes.values[0]


def entropy(ser):
    """
    Entropy of the value distribution of a Series, usable as a ``GroupBy.agg`` function.

    :param ser: Series
    :return: ``scipy.stats.entropy`` of the relative value frequencies of ``ser``
    """
    return stats.entropy(ser.value_counts(normalize=True))

# Feature extraction from `application_[train|test]`

In [None]:
class ApplImputer(Imputer):
    """
    Imputer configuration for the application tables: a fixed value for the
    numeric columns listed or matched by regex below, the median for the other
    numeric columns and "missing_value" for the non-numeric ones.
    """

    def __init__(self):
        # numeric columns selected by name pattern, all filled with the same value
        self._spec_impt_regex_val_num = -1.
        self._regex_strings = [
            "^APARTMENTS_",
            "^BASEMENTAREA_",
            "^YEARS_B",
            "^COMMONAREA_",
            "^ELEVATORS_",
            "^ENTRANCES_",
            "^FLOORS",
            "^LANDAREA_",
            "^LIVING",
            "^NONLIVING",
            "AMT_REQ_CREDIT_BUREAU_",
        ]

        # numeric columns with an individual fill value
        self._spec_impt_vals_num = {
            "OWN_CAR_AGE": -1.,
            "EXT_SOURCE_1": 0.,
            "EXT_SOURCE_3": 0.,
            "TOTALAREA_MODE": -1.,
        }

        # defaults for every other column
        self._default_imput_vals_num = "median"
        self._default_imput_vals_cat = "missing_value"
        self._spec_impt_vals_cat = None
        

class ApplNewColsAdder(BaseEstimator, TransformerMixin):
    """
    Add derived columns to an application dataframe: log income, a flag for
    positive DAYS_EMPLOYED, and the credit-to-income / credit-to-goods ratios.
    """

    def fit(self, df_train, y=None):
        """
        Learn the replacement values for infinite ratios: a tenth of the smallest
        ratio (for -inf) and ten times the largest ratio (for +inf).

        :param df_train: dataframe with AMT_CREDIT, AMT_INCOME_TOTAL and AMT_GOODS_PRICE
        :param y: ignored, accepted for scikit-learn pipeline compatibility
        :return: self
        """
        credit_to_income = df_train["AMT_CREDIT"] / df_train["AMT_INCOME_TOTAL"]
        self._cti_min = credit_to_income.replace(-np.inf, np.nan).min() / 10.
        self._cti_max = credit_to_income.replace(np.inf, np.nan).max() * 10.
        
        credit_to_goods = df_train["AMT_CREDIT"] / df_train["AMT_GOODS_PRICE"]
        self._ctg_min = credit_to_goods.replace(-np.inf, np.nan).min() / 10.
        self._ctg_max = credit_to_goods.replace(np.inf, np.nan).max() * 10.
        
        # returning self is what makes fit(...).transform(...) and fit_transform work
        return self
    
    def transform(self, df):
        """
        :param df: dataframe with AMT_INCOME_TOTAL, DAYS_EMPLOYED, AMT_CREDIT and AMT_GOODS_PRICE
        :return: copy of ``df`` with AMT_INCOME_TOTAL_LOG, DAYS_EMPLOYED_POSITIVE,
                 CREDIT_TO_INCOME and CREDIT_TO_GOODS added; when the largest
                 DAYS_EMPLOYED of ``df`` is positive, that value is replaced by 100.
        """
        df_new = df.copy()
        df_new["AMT_INCOME_TOTAL_LOG"] = np.log(df_new["AMT_INCOME_TOTAL"])
        # the flag is taken before the replacement below
        df_new["DAYS_EMPLOYED_POSITIVE"] = df_new["DAYS_EMPLOYED"] > 0
        # NOTE(review): the replaced value is the maximum of the dataframe being
        # transformed, not a value learned in fit - confirm this is intended
        days_emp_max = df_new["DAYS_EMPLOYED"].max()
        if days_emp_max > 0:
            df_new["DAYS_EMPLOYED"] = df_new["DAYS_EMPLOYED"].replace({days_emp_max: 100.})
        
        df_new["CREDIT_TO_INCOME"] = df_new["AMT_CREDIT"] / df_new["AMT_INCOME_TOTAL"]
        df_new["CREDIT_TO_INCOME"] = df_new["CREDIT_TO_INCOME"].replace(-np.inf, self._cti_min)
        df_new["CREDIT_TO_INCOME"] = df_new["CREDIT_TO_INCOME"].replace(np.inf, self._cti_max)
        
        df_new["CREDIT_TO_GOODS"] = df_new["AMT_CREDIT"] / df_new["AMT_GOODS_PRICE"]
        df_new["CREDIT_TO_GOODS"] = df_new["CREDIT_TO_GOODS"].replace(-np.inf, self._ctg_min)
        df_new["CREDIT_TO_GOODS"] = df_new["CREDIT_TO_GOODS"].replace(np.inf, self._ctg_max)
        
        return df_new

In [None]:
# Load the raw application tables; load_csv also downcasts the column dtypes.
application_train = load_csv("data/download/application_train.csv")
application_test = load_csv("data/download/application_test.csv")

# Keep the SK_ID_CURR columns: appl_train_key is used by later cells to split
# the other tables into a train part and a test part.
appl_train_key = application_train["SK_ID_CURR"]
appl_test_key = application_test["SK_ID_CURR"]
print(application_train.shape, application_test.shape)

In [None]:
# Application features: impute -> drop collinear "_ISNULL" flags -> add derived columns.
# Every transformer is fitted on the train table only and then applied to both tables.
time_start = time.time()

# work on copies so the loaded tables stay untouched
appl_train = application_train.copy()
appl_test = application_test.copy()
print("appl_train.shape", appl_train.shape)
print("appl_test.shape", appl_test.shape)

imputer = ApplImputer()
imputer.fit(appl_train)
appl_train = imputer.transform(appl_train)
appl_test = imputer.transform(appl_test)
# number of missing values left after imputing
print("appl_train.isnull().sum().sum()", appl_train.isnull().sum().sum())
print("appl_test.isnull().sum().sum()", appl_test.isnull().sum().sum())


# only columns whose name ends with "_ISNULL" are checked for collinearity
remover = CollinearColumnRemover(0.95, col_regex="_ISNULL$")
remover.fit(appl_train)
appl_train = remover.transform(appl_train)
appl_test = remover.transform(appl_test)
print("appl_train.shape", appl_train.shape)
print("appl_test.shape", appl_test.shape)


adder = ApplNewColsAdder()
adder.fit(appl_train)
appl_train = adder.transform(appl_train)
appl_test = adder.transform(appl_test)
print("appl_train.shape", appl_train.shape)
print("appl_test.shape", appl_test.shape)


# writing the result to disk is switched off; change the condition to enable it
if False:
    appl_train.to_csv("data/data_/application_train.csv", index=False)
    appl_test.to_csv("data/data_/application_test.csv", index=False)

time_end = time.time()
time_elapse = time_end - time_start
print("Elapsed Time", time_elapse)


# Feature extraction from `bureau` data

## `bureau.csv`

In [None]:
class BureauImputer(Imputer):
    """
    Imputer configuration for the bureau table: a fixed value for the numeric
    columns listed below, the median for the other numeric columns and
    "missing_value" for the non-numeric ones.
    """

    def __init__(self):
        # no columns are selected by regex
        self._regex_strings, self._spec_impt_regex_val_num = None, None

        # numeric columns with an individual fill value
        self._spec_impt_vals_num = {
            "DAYS_ENDDATE_FACT": 100.,
            "AMT_CREDIT_MAX_OVERDUE": -1000.,
            "AMT_CREDIT_SUM_DEBT": 0.,
            "AMT_CREDIT_SUM_LIMIT": 0.,
            "AMT_ANNUITY": -1000.,
        }

        # defaults for every other column
        self._default_imput_vals_num = "median"
        self._default_imput_vals_cat = "missing_value"
        self._spec_impt_vals_cat = None


def drop_cols_bureau(df):
    """
    Remove the bureau columns that are mostly zero.

    :param df: dataframe containing CREDIT_DAY_OVERDUE, CNT_CREDIT_PROLONG and AMT_CREDIT_SUM_OVERDUE
    :return: new dataframe without these three columns
    """
    mostly_zero_cols = [
        "CREDIT_DAY_OVERDUE",
        "CNT_CREDIT_PROLONG",
        "AMT_CREDIT_SUM_OVERDUE",
    ]
    return df.drop(columns=mostly_zero_cols)


def add_new_cols_bureau(df):
    """
    Add derived columns to a bureau dataframe, in place.

    * ``<col>_ISPOSITIVE``: whether the value is greater than zero
    * ``<col>_TO_SUM``: the value divided by ``AMT_CREDIT_SUM + 1``

    :param df: bureau dataframe, modified in place
    :return: the same dataframe object
    """
    flag_sources = [
        "DAYS_CREDIT_ENDDATE",
        "AMT_CREDIT_MAX_OVERDUE",
        "AMT_CREDIT_SUM_DEBT",
        "AMT_CREDIT_SUM_LIMIT",
        "AMT_ANNUITY",
    ]
    for source in flag_sources:
        df[source + "_ISPOSITIVE"] = df[source] > 0

    # the + 1 keeps the ratio finite when AMT_CREDIT_SUM is zero
    denominator = df["AMT_CREDIT_SUM"] + 1
    ratio_sources = [
        "AMT_CREDIT_MAX_OVERDUE",
        "AMT_CREDIT_SUM_DEBT",
        "AMT_CREDIT_SUM_LIMIT",
    ]
    for source in ratio_sources:
        df[source + "_TO_SUM"] = df[source] / denominator

    return df


def bu_nearest_status(df):
    """CREDIT_ACTIVE of the row with the largest DAYS_CREDIT."""
    newest_first = df.sort_values(by=["DAYS_CREDIT"], ascending=False)
    return newest_first["CREDIT_ACTIVE"].iloc[0]

def bu_mode_status_three_nearest(df):
    """Most frequent CREDIT_ACTIVE among the three rows with the largest DAYS_CREDIT."""
    newest_first = df.sort_values(by=["DAYS_CREDIT"], ascending=False)
    recent_statuses = newest_first["CREDIT_ACTIVE"].iloc[: 3]
    return recent_statuses.mode().values[0]

def bu_mode_status_six_nearest(df):
    """Most frequent CREDIT_ACTIVE among the six rows with the largest DAYS_CREDIT."""
    newest_first = df.sort_values(by=["DAYS_CREDIT"], ascending=False)
    recent_statuses = newest_first["CREDIT_ACTIVE"].iloc[: 6]
    return recent_statuses.mode().values[0]


def add_new_agg_cols_bu(df):
    """
    Status features of the bureau table per SK_ID_CURR.

    :param df: bureau dataframe with SK_ID_CURR, DAYS_CREDIT and CREDIT_ACTIVE
    :return: dict {new column name: result of applying the matching helper to every SK_ID_CURR group}
    """
    per_client = df.groupby("SK_ID_CURR")
    return {
        "NEAREST_CREDIT_ACTIVE": per_client.apply(bu_nearest_status),
        "MODE_CREDIT_ACTIVE_THREE": per_client.apply(bu_mode_status_three_nearest),
        "MODE_CREDIT_ACTIVE_SIX": per_client.apply(bu_mode_status_six_nearest),
    }

In [None]:
# Load the bureau table and split it by SK_ID_CURR: rows whose id appears in
# appl_train_key go to the train part, every other row goes to the test part.
bureau = load_csv("data/download/bureau.csv")
print("bureau.shape", bureau.shape)

bureau_train, bureau_test = train_test_partition(bureau, "SK_ID_CURR", appl_train_key)

print("bureau_train.shape", bureau_train.shape)
print("bureau_test.shape", bureau_test.shape)

# keep the (SK_ID_CURR, SK_ID_BUREAU) pairs before SK_ID_BUREAU is dropped
bureau_train_keys = bureau_train[["SK_ID_CURR", "SK_ID_BUREAU"]]
bureau_test_keys = bureau_test[["SK_ID_CURR", "SK_ID_BUREAU"]]

bureau_train = bureau_train.drop(["SK_ID_BUREAU"], axis="columns")
bureau_test = bureau_test.drop(["SK_ID_BUREAU"], axis="columns")

bureau_train.head()

In [None]:
# Bureau features per SK_ID_CURR: drop columns -> impute -> add row-level columns
# -> aggregate -> add status columns -> impute again -> drop collinear columns.
# Every transformer is fitted on the train part only and applied to both parts.
time_start = time.time()

# work on copies so bureau_train / bureau_test stay untouched
bureau_agg_train = bureau_train.copy()
bureau_agg_test = bureau_test.copy()
print("bureau_agg_train.shape", bureau_agg_train.shape)
print("bureau_agg_test.shape", bureau_agg_test.shape)


# drop some cols
bureau_agg_train = drop_cols_bureau(bureau_agg_train)
bureau_agg_test = drop_cols_bureau(bureau_agg_test)
print("bureau_agg_train.shape", bureau_agg_train.shape)
print("bureau_agg_test.shape", bureau_agg_test.shape)


# impute missing values
imputer = BureauImputer()
imputer.fit(bureau_agg_train)
bureau_agg_train = imputer.transform(bureau_agg_train)
bureau_agg_test = imputer.transform(bureau_agg_test)
print("bureau_agg_train.isnull().sum().sum()", bureau_agg_train.isnull().sum().sum())
print("bureau_agg_test.isnull().sum().sum()", bureau_agg_test.isnull().sum().sum())
print("bureau_agg_train.shape", bureau_agg_train.shape)
print("bureau_agg_test.shape", bureau_agg_test.shape)


# add some cols
bureau_agg_train = add_new_cols_bureau(bureau_agg_train)
bureau_agg_test = add_new_cols_bureau(bureau_agg_test)
print("bureau_agg_train.shape", bureau_agg_train.shape)
print("bureau_agg_test.shape", bureau_agg_test.shape)


# new agg cols
# computed now because they need the row-level table; attached after the aggregation below
new_agg_cols_train = add_new_agg_cols_bu(bureau_agg_train)
new_agg_cols_test = add_new_agg_cols_bu(bureau_agg_test)


# aggregate over "SK_ID_CURR"
num_stats = ["sum", "mean", "median", "min", "max", "var"]
bool_stats = ["sum", "mean", mode, entropy]
cat_stats = ["count", "nunique", mode, entropy]
ohe_cat_stats = ["sum", "mean", "var"]

by_list_cols = ["SK_ID_CURR"]

aggregator = Aggregator(by_list_cols, num_stats, bool_stats, cat_stats,
                        ohe_cat_stats=ohe_cat_stats,
                        ohe_cat_max_class=10,
                        iqr=True, minmax_range=True, mean_median_diff=True)
aggregator.fit(bureau_agg_train)
bureau_agg_train = aggregator.transform(bureau_agg_train)
bureau_agg_test = aggregator.transform(bureau_agg_test)
print("bureau_agg_train.isnull().sum().sum()", bureau_agg_train.isnull().sum().sum())
print("bureau_agg_test.isnull().sum().sum()", bureau_agg_test.isnull().sum().sum())
print("bureau_agg_train.shape", bureau_agg_train.shape)
print("bureau_agg_test.shape", bureau_agg_test.shape)


# add new agg cols
# assigning a Series to a column aligns it on the index
for col in new_agg_cols_train:
    bureau_agg_train[col] = new_agg_cols_train[col]

for col in new_agg_cols_test:
    bureau_agg_test[col] = new_agg_cols_test[col]

print("bureau_agg_train.isnull().sum().sum()", bureau_agg_train.isnull().sum().sum())
print("bureau_agg_test.isnull().sum().sum()", bureau_agg_test.isnull().sum().sum())
print("bureau_agg_train.shape", bureau_agg_train.shape)
print("bureau_agg_test.shape", bureau_agg_test.shape)

# second imputation pass, in case the aggregation produced missing values
imputer = DefaultImputer()
imputer.fit(bureau_agg_train)
bureau_agg_train = imputer.transform(bureau_agg_train)
bureau_agg_test = imputer.transform(bureau_agg_test)
print("bureau_agg_train.isnull().sum().sum()", bureau_agg_train.isnull().sum().sum())
print("bureau_agg_test.isnull().sum().sum()", bureau_agg_test.isnull().sum().sum())


# remove collinear cols
remover = CollinearColumnRemover(0.95)
remover.fit(bureau_agg_train)
bureau_agg_train = remover.transform(bureau_agg_train)
bureau_agg_test = remover.transform(bureau_agg_test)
print("bureau_agg_train.shape", bureau_agg_train.shape)
print("bureau_agg_test.shape", bureau_agg_test.shape)

# reset index
# turns the group-key index back into a regular column
bureau_agg_train = bureau_agg_train.reset_index()
bureau_agg_test = bureau_agg_test.reset_index()
print("bureau_agg_train.shape", bureau_agg_train.shape)
print("bureau_agg_test.shape", bureau_agg_test.shape)


# writing the result to disk is switched off; change the condition to enable it
if False:
    bureau_agg_train.to_csv("data/data_/bureau_agg_train.csv", index=False)
    bureau_agg_test.to_csv("data/data_/bureau_agg_test.csv", index=False)


time_end = time.time()
time_elapse = time_end - time_start
print("Elapsed Time", time_elapse)

## `bureau_balance.csv`

In [None]:
def bb_nearest_status(df):
    """
    STATUS of the row with the largest MONTHS_BALANCE in the group
    :param df: dataframe with "MONTHS_BALANCE" and "STATUS" columns
    :return: STATUS value of the first row after sorting MONTHS_BALANCE descending
    """
    newest_first = df.sort_values(by=["MONTHS_BALANCE"], ascending=False)
    statuses = newest_first["STATUS"]
    return statuses.iloc[0]

def bb_mode_status_three_nearest(df):
    """
    most frequent STATUS among the three rows with the largest MONTHS_BALANCE
    :param df: dataframe with "MONTHS_BALANCE" and "STATUS" columns
    :return: first value returned by Series.mode() on those (up to) three statuses
    """
    newest_first = df.sort_values(by=["MONTHS_BALANCE"], ascending=False)
    recent = newest_first["STATUS"].head(3)
    modes = recent.mode()
    return modes.values[0]

def bb_mode_status_six_nearest(df):
    """
    most frequent STATUS among the six rows with the largest MONTHS_BALANCE
    :param df: dataframe with "MONTHS_BALANCE" and "STATUS" columns
    :return: first value returned by Series.mode() on those (up to) six statuses
    """
    newest_first = df.sort_values(by=["MONTHS_BALANCE"], ascending=False)
    recent = newest_first["STATUS"].head(6)
    modes = recent.mode()
    return modes.values[0]


def add_new_agg_cols_bb(df):
    """
    compute the recency-based STATUS features for every "SK_ID_BUREAU" group
    :param df: dataframe with "SK_ID_BUREAU", "MONTHS_BALANCE" and "STATUS" columns
    :return: dict {new column name (str): result of groupby("SK_ID_BUREAU").apply(...)}
    """
    # (new column name, per-group function), kept in the order the columns are produced
    feature_funcs = (
        ("NEAREST_STATUS", bb_nearest_status),
        ("MODE_STATUS_THREE_NEAREST", bb_mode_status_three_nearest),
        ("MODE_STATUS_SIX_NEAREST", bb_mode_status_six_nearest),
    )
    return {name: df.groupby("SK_ID_BUREAU").apply(func) for name, func in feature_funcs}

In [None]:
# Load the raw bureau balance table; load_csv also runs change_dtypes on it.
bureau_balance = load_csv("data/download/bureau_balance.csv")
print("bureau_balance.shape", bureau_balance.shape)

# Attach SK_ID_CURR via `bureau` (built in an earlier cell). Rows whose SK_ID_BUREAU
# has no match in `bureau` get a null SK_ID_CURR from the left merge and are dropped,
# after which the key is cast to int32.
bureau_balance = bureau_balance.merge(bureau[["SK_ID_CURR", "SK_ID_BUREAU"]], how="left", on="SK_ID_BUREAU")
bureau_balance = bureau_balance.dropna(subset=["SK_ID_CURR"])
bureau_balance["SK_ID_CURR"] = bureau_balance["SK_ID_CURR"].astype("int32")
print("bureau_balance.shape", bureau_balance.shape)

# train_test_partition / appl_train_key are defined elsewhere in the notebook;
# presumably this splits rows by whether SK_ID_CURR belongs to the train keys — verify.
# SK_ID_CURR is only needed for the split and is dropped again afterwards.
bureau_balance_train, bureau_balance_test = train_test_partition(bureau_balance, "SK_ID_CURR", appl_train_key)
bureau_balance_train = bureau_balance_train.drop(["SK_ID_CURR"], axis="columns")
bureau_balance_test = bureau_balance_test.drop(["SK_ID_CURR"], axis="columns")

print("bureau_balance_train.shape:", bureau_balance_train.shape)
print("bureau_balance_test.shape:", bureau_balance_test.shape)

bureau_balance_train.head()

In [None]:
# First aggregation level for bureau_balance: one row per SK_ID_BUREAU.
time_start = time.time()

# Work on copies so the partitioned frames from the previous cell stay untouched.
bureau_balance_agg_train = bureau_balance_train.copy()
bureau_balance_agg_test = bureau_balance_test.copy()
print("bureau_balance_agg_train.shape", bureau_balance_agg_train.shape)
print("bureau_balance_agg_test.shape", bureau_balance_agg_test.shape)


# recency-based STATUS features, computed on the row-level data before it is aggregated
new_agg_cols_train = add_new_agg_cols_bb(bureau_balance_agg_train)
new_agg_cols_test = add_new_agg_cols_bb(bureau_balance_agg_test)


# aggregate over "SK_ID_BUREAU"
# `mode` and `entropy` are callables defined elsewhere in the notebook.
num_stats = ["sum", "mean", "median", "min", "max", "var"]
bool_stats = ["sum", "mean", mode, entropy]
cat_stats = ["count", "nunique", mode, entropy]
ohe_cat_stats = ["sum", "mean", "var"]

by_list_cols = ["SK_ID_BUREAU"]

# Aggregator is defined elsewhere; it is fitted on train only and applied to both sets.
aggregator = Aggregator(by_list_cols, num_stats, bool_stats, cat_stats,
                        ohe_cat_stats=ohe_cat_stats,
                        ohe_cat_max_class=10,
                        iqr=True, minmax_range=True, mean_median_diff=True)

aggregator.fit(bureau_balance_agg_train)
bureau_balance_agg_train = aggregator.transform(bureau_balance_agg_train)
bureau_balance_agg_test = aggregator.transform(bureau_balance_agg_test)
print("bureau_balance_agg_train.isnull().sum().sum():", bureau_balance_agg_train.isnull().sum().sum())
print("bureau_balance_agg_test.isnull().sum().sum():", bureau_balance_agg_test.isnull().sum().sum())
print("bureau_balance_agg_train.shape", bureau_balance_agg_train.shape)
print("bureau_balance_agg_test.shape", bureau_balance_agg_test.shape)


# add new aggs cols
# Column assignment aligns on the index; this assumes the aggregated frames are
# indexed by SK_ID_BUREAU like the groupby results — TODO confirm against Aggregator.
for col in new_agg_cols_train:
    bureau_balance_agg_train[col] = new_agg_cols_train[col]

for col in new_agg_cols_test:
    bureau_balance_agg_test[col] = new_agg_cols_test[col]
print("bureau_balance_agg_train.isnull().sum().sum():", bureau_balance_agg_train.isnull().sum().sum())
print("bureau_balance_agg_test.isnull().sum().sum():", bureau_balance_agg_test.isnull().sum().sum())
print("bureau_balance_agg_train.shape", bureau_balance_agg_train.shape)
print("bureau_balance_agg_test.shape", bureau_balance_agg_test.shape)


# safety net: DefaultImputer (defined elsewhere) is fitted on train and applied to both
# sets; the null counts printed afterwards show whether anything is still missing
imputer = DefaultImputer()
imputer.fit(bureau_balance_agg_train)
bureau_balance_agg_train = imputer.transform(bureau_balance_agg_train)
bureau_balance_agg_test = imputer.transform(bureau_balance_agg_test)
print("bureau_balance_agg_train.isnull().sum().sum():", bureau_balance_agg_train.isnull().sum().sum())
print("bureau_balance_agg_test.isnull().sum().sum():", bureau_balance_agg_test.isnull().sum().sum())


# remove collinear columns
# 0.95 is presumably the correlation threshold — confirm in CollinearColumnRemover.
remover = CollinearColumnRemover(0.95)
remover.fit(bureau_balance_agg_train)
bureau_balance_agg_train = remover.transform(bureau_balance_agg_train)
bureau_balance_agg_test = remover.transform(bureau_balance_agg_test)

print("bureau_balance_agg_train.shape", bureau_balance_agg_train.shape)
print("bureau_balance_agg_test.shape", bureau_balance_agg_test.shape)

# move the index back into a regular column
bureau_balance_agg_train = bureau_balance_agg_train.reset_index()
bureau_balance_agg_test = bureau_balance_agg_test.reset_index()
print("bureau_balance_agg_train.shape", bureau_balance_agg_train.shape)
print("bureau_balance_agg_test.shape", bureau_balance_agg_test.shape)

# Persisting is switched off. NOTE(review): the next cell reads these *_tmp.csv files,
# so they must already exist on disk; set the condition to True to regenerate them.
if False:
    bureau_balance_agg_train.to_csv("data/data_/bureau_balance_agg_train_tmp.csv", index=False)
    bureau_balance_agg_test.to_csv("data/data_/bureau_balance_agg_test_tmp.csv", index=False)

# wall-clock duration of this cell, in seconds
time_end = time.time()
time_elapse = time_end - time_start
print("Elapsed Time", time_elapse)

In [None]:
# aggregate over "SK_ID_CURR"
# Second aggregation level for bureau_balance: one row per SK_ID_CURR.
time_start = time.time()


# Reload the SK_ID_BUREAU-level result from disk (written by the previous cell when
# its save block is enabled); load_csv re-runs change_dtypes on the loaded frames.
bureau_balance_agg_train = load_csv("data/data_/bureau_balance_agg_train_tmp.csv")
bureau_balance_agg_test = load_csv("data/data_/bureau_balance_agg_test_tmp.csv")
print("bureau_balance_agg_train.shape", bureau_balance_agg_train.shape)
print("bureau_balance_agg_test.shape", bureau_balance_agg_test.shape)

# Attach SK_ID_CURR through the key frames built in an earlier cell
# (presumably SK_ID_CURR / SK_ID_BUREAU pairs — verify where they are defined).
bureau_balance_agg_train = bureau_balance_agg_train.merge(bureau_train_keys, how="left", on="SK_ID_BUREAU")
bureau_balance_agg_test = bureau_balance_agg_test.merge(bureau_test_keys, how="left", on="SK_ID_BUREAU")
print("bureau_balance_agg_train.isnull().sum().sum():", bureau_balance_agg_train.isnull().sum().sum())
print("bureau_balance_agg_test.isnull().sum().sum():", bureau_balance_agg_test.isnull().sum().sum())
print("bureau_balance_agg_train.shape", bureau_balance_agg_train.shape)
print("bureau_balance_agg_test.shape", bureau_balance_agg_test.shape)

# SK_ID_BUREAU was only needed for the merge; drop it before aggregating.
bureau_balance_agg_train = bureau_balance_agg_train.drop(["SK_ID_BUREAU"], axis="columns")
bureau_balance_agg_test = bureau_balance_agg_test.drop(["SK_ID_BUREAU"], axis="columns")
print("bureau_balance_agg_train.shape", bureau_balance_agg_train.shape)
print("bureau_balance_agg_test.shape", bureau_balance_agg_test.shape)

# aggregate
# `mode` and `entropy` are callables defined elsewhere in the notebook.
num_stats = ["sum", "mean", "median", "min", "max", "var"]
bool_stats = ["sum", "mean", mode, entropy]
cat_stats = ["count", "nunique", mode, entropy]
ohe_cat_stats = ["sum", "mean", "var"]

by_list_cols = ["SK_ID_CURR"]

# Aggregator is defined elsewhere; it is fitted on train only and applied to both sets.
aggregator = Aggregator(by_list_cols, num_stats, bool_stats, cat_stats,
                        ohe_cat_stats=ohe_cat_stats,
                        ohe_cat_max_class=10,
                        iqr=True, minmax_range=True, mean_median_diff=True)

aggregator.fit(bureau_balance_agg_train)
bureau_balance_agg_train = aggregator.transform(bureau_balance_agg_train)
bureau_balance_agg_test = aggregator.transform(bureau_balance_agg_test)
print("bureau_balance_agg_train.isnull().sum().sum():", bureau_balance_agg_train.isnull().sum().sum())
print("bureau_balance_agg_test.isnull().sum().sum():", bureau_balance_agg_test.isnull().sum().sum())
print("bureau_balance_agg_train.shape", bureau_balance_agg_train.shape)
print("bureau_balance_agg_test.shape", bureau_balance_agg_test.shape)

# safety net: DefaultImputer (defined elsewhere) is fitted on train and applied to both
# sets; the null counts printed afterwards show whether anything is still missing
imputer = DefaultImputer()
imputer.fit(bureau_balance_agg_train)
bureau_balance_agg_train = imputer.transform(bureau_balance_agg_train)
bureau_balance_agg_test = imputer.transform(bureau_balance_agg_test)
print("bureau_balance_agg_train.isnull().sum().sum():", bureau_balance_agg_train.isnull().sum().sum())
print("bureau_balance_agg_test.isnull().sum().sum():", bureau_balance_agg_test.isnull().sum().sum())


# remove collinear columns
# 0.95 is presumably the correlation threshold — confirm in CollinearColumnRemover.
remover = CollinearColumnRemover(0.95)
remover.fit(bureau_balance_agg_train)
bureau_balance_agg_train = remover.transform(bureau_balance_agg_train)
bureau_balance_agg_test = remover.transform(bureau_balance_agg_test)

print("bureau_balance_agg_train.shape", bureau_balance_agg_train.shape)
print("bureau_balance_agg_test.shape", bureau_balance_agg_test.shape)

# move the index back into a regular column
bureau_balance_agg_train = bureau_balance_agg_train.reset_index()
bureau_balance_agg_test = bureau_balance_agg_test.reset_index()
print("bureau_balance_agg_train.shape", bureau_balance_agg_train.shape)
print("bureau_balance_agg_test.shape", bureau_balance_agg_test.shape)

# Persisting the final per-SK_ID_CURR features is switched off; set to True to write them.
if False:
    bureau_balance_agg_train.to_csv("data/data_/bureau_balance_agg_train.csv", index=False)
    bureau_balance_agg_test.to_csv("data/data_/bureau_balance_agg_test.csv", index=False)

# wall-clock duration of this cell, in seconds
time_end = time.time()
time_elapse = time_end - time_start
print("Elapsed Time", time_elapse)

# Feature extraction from `previous application` data

## `previous_application.csv`

In [None]:
class PrevApplImputer(Imputer):
    """
    imputation settings for `previous_application.csv`

    Only configuration attributes are set here; they are presumably read by the
    fit/transform logic of the `Imputer` base class defined elsewhere — verify there.
    """

    def __init__(self):
        # no regex-based rules for this table
        self._regex_strings = None
        self._spec_impt_regex_val_num = None

        # numeric columns with a dedicated fill value; every DAYS_* column uses 0.
        days_cols = ["DAYS_FIRST_DRAWING",
                     "DAYS_FIRST_DUE",
                     "DAYS_LAST_DUE_1ST_VERSION",
                     "DAYS_LAST_DUE",
                     "DAYS_TERMINATION"]
        numeric_fills = {"RATE_DOWN_PAYMENT": -1., "CNT_PAYMENT": -10.}
        numeric_fills.update(dict.fromkeys(days_cols, 0.))
        self._spec_impt_vals_num = numeric_fills
        # remaining numeric columns
        self._default_imput_vals_num = "median"

        # categorical columns: explicit entries plus the default placeholder
        placeholder = "missing_value"
        self._spec_impt_vals_cat = dict.fromkeys(["NAME_TYPE_SUITE", "NFLAG_INSURED_ON_APPROVAL"],
                                                 placeholder)
        self._default_imput_vals_cat = placeholder
    

def hour_period_bin(hours):
    """
    bin hours of the day into "morning", "afternoon" and "evening"
    :param hours: pandas Series of hour values
    :return: numpy object array of the same length; hours in (5, 12) map to "morning",
             hours in [12, 18) map to "afternoon", everything else maps to "evening"
    """
    hour_values = hours.values
    is_morning = (hour_values > 5) & (hour_values < 12)
    is_afternoon = (hour_values >= 12) & (hour_values < 18)

    # the two masks are disjoint, so the nesting order does not matter
    labels = np.where(is_morning, "morning", np.where(is_afternoon, "afternoon", "evening"))
    return labels.astype("object")


def add_new_cols_prev_appl(df):
    """
    add row-level features to the previous application data (the frame is modified in place)
    :param df: dataframe with the DAYS_* columns listed below, "HOUR_APPR_PROCESS_START",
               "AMT_APPLICATION", "AMT_DOWN_PAYMENT", "AMT_GOODS_PRICE" and "AMT_CREDIT"
    :return df: the same dataframe with the new columns added
    """
    # bool cols that flag whether each DAYS_* value is non-negative
    cols_is_nonneg = ["DAYS_FIRST_DRAWING", "DAYS_FIRST_DUE",
                      "DAYS_LAST_DUE_1ST_VERSION",
                      "DAYS_LAST_DUE", "DAYS_TERMINATION"]
    for col in cols_is_nonneg:
        df[col + "_IS_NONNEG"] = df[col] >= 0

    # Part of day of the application hour. This does not depend on the loop variable,
    # so it is computed once here instead of being recomputed on every iteration above.
    df["PERIOD_APPR_PROCESS_START"] = hour_period_bin(df["HOUR_APPR_PROCESS_START"])
    df["PERIOD_APPR_PROCESS_START"] = df["PERIOD_APPR_PROCESS_START"].astype("category")

    # ratios relative to the credit amount; +1 in the denominator avoids division by zero
    df["AMT_APPLICATION_TO_CREDIT"] = df["AMT_APPLICATION"] / (df["AMT_CREDIT"] + 1)
    df["AMT_DOWN_PAY_TO_CREDIT"] = df["AMT_DOWN_PAYMENT"] / (df["AMT_CREDIT"] + 1)
    df["AMT_GOODS_PRICE_TO_CREDIT"] = df["AMT_GOODS_PRICE"] / (df["AMT_CREDIT"] + 1)

    return df


def mean_n_nearest(df, sort_by, n, col):
    """
    mean of `col` over the first `n` rows after sorting by `sort_by` in descending order
    :param df: dataframe
    :param sort_by: column name or list of column names to sort by
    :param n: int, number of leading rows to keep
    :param col: str, column to average
    :return: mean of the selected values
    """
    ranked = df.sort_values(by=sort_by, ascending=False)
    top_values = ranked[col].head(n)
    return top_values.mean()

def min_n_nearest(df, sort_by, n, col):
    """
    minimum of `col` over the first `n` rows after sorting by `sort_by` in descending order
    :param df: dataframe
    :param sort_by: column name or list of column names to sort by
    :param n: int, number of leading rows to keep
    :param col: str, column to reduce
    :return: minimum of the selected values
    """
    ranked = df.sort_values(by=sort_by, ascending=False)
    top_values = ranked[col].head(n)
    return top_values.min()

def max_n_nearest(df, sort_by, n, col):
    """
    maximum of `col` over the first `n` rows after sorting by `sort_by` in descending order
    :param df: dataframe
    :param sort_by: column name or list of column names to sort by
    :param n: int, number of leading rows to keep
    :param col: str, column to reduce
    :return: maximum of the selected values
    """
    ranked = df.sort_values(by=sort_by, ascending=False)
    top_values = ranked[col].head(n)
    return top_values.max()


def add_new_agg_cols_prev_appl(df):
    """
    per-"SK_ID_CURR" mean/min/max of the credit ratio columns over the 3 and 6 rows
    with the largest "DAYS_DECISION"
    :param df: dataframe with "SK_ID_CURR", "DAYS_DECISION" and the ratio columns
    :return: dict {new column name (str): result of groupby("SK_ID_CURR").apply(...)}
    """
    sort_by = ["DAYS_DECISION"]
    ratio_cols = ("AMT_APPLICATION_TO_CREDIT", "AMT_DOWN_PAY_TO_CREDIT", "AMT_GOODS_PRICE_TO_CREDIT")
    window_sizes = (3, 6)
    # (column prefix, progress message, per-group reducer), in the order results are produced
    reducers = (("MEAN_", "Nearest mean for ", mean_n_nearest),
                ("MIN_", "Nearest min for ", min_n_nearest),
                ("MAX_", "Nearest max for ", max_n_nearest))

    results = {}
    for col in ratio_cols:
        for n in window_sizes:
            suffix = col + "_%d_NEAREST" % n
            for prefix, message, reducer in reducers:
                new_col = prefix + suffix
                print(message + new_col)
                # the lambda runs inside apply() right away, so capturing loop variables is safe
                results[new_col] = df.groupby("SK_ID_CURR").apply(
                    lambda grp: reducer(grp, sort_by, n, col))

    return results

In [None]:
# Load the raw previous application table; load_csv also runs change_dtypes on it.
previous_application = load_csv("data/download/previous_application.csv")
print("previous_application.shape", previous_application.shape)

# train_test_partition / appl_train_key are defined elsewhere in the notebook;
# presumably this splits rows by whether SK_ID_CURR belongs to the train keys — verify.
prev_appl_train, prev_appl_test = train_test_partition(previous_application, "SK_ID_CURR", appl_train_key)

print("prev_appl_train.shape", prev_appl_train.shape)
print("prev_appl_test.shape", prev_appl_test.shape)


# Key pairs kept for later cells, which merge them on SK_ID_PREV to recover SK_ID_CURR.
prev_appl_train_keys = prev_appl_train[["SK_ID_CURR", "SK_ID_PREV"]]
prev_appl_test_keys = prev_appl_test[["SK_ID_CURR", "SK_ID_PREV"]]

prev_appl_train.head()

In [None]:
# Build per-SK_ID_CURR features from previous_application.
time_start = time.time()

# Work on copies so the partitioned frames from the previous cell stay untouched.
prev_appl_agg_train = prev_appl_train.copy()
prev_appl_agg_test = prev_appl_test.copy()
print("prev_appl_agg_train.shape", prev_appl_agg_train.shape)
print("prev_appl_agg_test.shape", prev_appl_agg_test.shape)


# SK_ID_PREV is not used in this cell (the key pairs were saved earlier), so drop it.
prev_appl_agg_train = prev_appl_agg_train.drop(["SK_ID_PREV"], axis="columns")
prev_appl_agg_test = prev_appl_agg_test.drop(["SK_ID_PREV"], axis="columns")
print("prev_appl_agg_train.shape", prev_appl_agg_train.shape)
print("prev_appl_agg_test.shape", prev_appl_agg_test.shape)


# drop cols with high percentage of null
cols_to_drop = ["RATE_INTEREST_PRIMARY", "RATE_INTEREST_PRIVILEGED"]
prev_appl_agg_train = prev_appl_agg_train.drop(cols_to_drop, axis="columns")
prev_appl_agg_test = prev_appl_agg_test.drop(cols_to_drop, axis="columns")
print("prev_appl_agg_train.shape", prev_appl_agg_train.shape)
print("prev_appl_agg_test.shape", prev_appl_agg_test.shape)


# impute missing values (imputer is fitted on train only and applied to both sets)
imputer = PrevApplImputer()
imputer.fit(prev_appl_agg_train)
prev_appl_agg_train = imputer.transform(prev_appl_agg_train)
prev_appl_agg_test = imputer.transform(prev_appl_agg_test)

print("prev_appl_agg_train.isnull().sum().sum()", prev_appl_agg_train.isnull().sum().sum())
print("prev_appl_agg_test.isnull().sum().sum()", prev_appl_agg_test.isnull().sum().sum())
print("prev_appl_agg_train.shape", prev_appl_agg_train.shape)
print("prev_appl_agg_test.shape", prev_appl_agg_test.shape)

# add new row-level cols (sign flags, part of day, credit ratios)
prev_appl_agg_train = add_new_cols_prev_appl(prev_appl_agg_train)
prev_appl_agg_test = add_new_cols_prev_appl(prev_appl_agg_test)
print("prev_appl_agg_train.shape", prev_appl_agg_train.shape)
print("prev_appl_agg_test.shape", prev_appl_agg_test.shape)


# recency-based features, computed on the row-level data before it is aggregated
new_agg_cols_train = add_new_agg_cols_prev_appl(prev_appl_agg_train)
new_agg_cols_test = add_new_agg_cols_prev_appl(prev_appl_agg_test)


# aggregate over "SK_ID_CURR"
# `mode` and `entropy` are callables defined elsewhere in the notebook.
num_stats = ["sum", "mean", "median", "min", "max", "var"]
bool_stats = ["sum", "mean", mode, entropy]
cat_stats = ["count", "nunique", mode, entropy]
ohe_cat_stats = ["sum", "mean", "var"]

by_list_cols = ["SK_ID_CURR"]

# Aggregator is defined elsewhere; it is fitted on train only and applied to both sets.
aggregator = Aggregator(by_list_cols, num_stats, bool_stats, cat_stats,
                        ohe_cat_stats=ohe_cat_stats,
                        ohe_cat_max_class=10,
                        iqr=True, minmax_range=True, mean_median_diff=True)

aggregator.fit(prev_appl_agg_train)
prev_appl_agg_train = aggregator.transform(prev_appl_agg_train)
prev_appl_agg_test = aggregator.transform(prev_appl_agg_test)

print("prev_appl_agg_train.isnull().sum().sum()", prev_appl_agg_train.isnull().sum().sum())
print("prev_appl_agg_test.isnull().sum().sum()", prev_appl_agg_test.isnull().sum().sum())
print("prev_appl_agg_train.shape", prev_appl_agg_train.shape)
print("prev_appl_agg_test.shape", prev_appl_agg_test.shape)


# add new agg cols
# Column assignment aligns on the index; this assumes the aggregated frames are
# indexed by SK_ID_CURR like the groupby results — TODO confirm against Aggregator.
for col in new_agg_cols_train:
    prev_appl_agg_train[col] = new_agg_cols_train[col]

for col in new_agg_cols_test:
    prev_appl_agg_test[col] = new_agg_cols_test[col]

print("prev_appl_agg_train.isnull().sum().sum()", prev_appl_agg_train.isnull().sum().sum())
print("prev_appl_agg_test.isnull().sum().sum()", prev_appl_agg_test.isnull().sum().sum())
print("prev_appl_agg_train.shape", prev_appl_agg_train.shape)
print("prev_appl_agg_test.shape", prev_appl_agg_test.shape)


# safety net: DefaultImputer (defined elsewhere) is fitted on train and applied to both
# sets; the null counts printed afterwards show whether anything is still missing
imputer = DefaultImputer()
imputer.fit(prev_appl_agg_train)
prev_appl_agg_train = imputer.transform(prev_appl_agg_train)
prev_appl_agg_test = imputer.transform(prev_appl_agg_test)
print("prev_appl_agg_train.isnull().sum().sum()", prev_appl_agg_train.isnull().sum().sum())
print("prev_appl_agg_test.isnull().sum().sum()", prev_appl_agg_test.isnull().sum().sum())


# remove collinear columns
# 0.95 is presumably the correlation threshold — confirm in CollinearColumnRemover.
remover = CollinearColumnRemover(0.95)
remover.fit(prev_appl_agg_train)
prev_appl_agg_train = remover.transform(prev_appl_agg_train)
prev_appl_agg_test = remover.transform(prev_appl_agg_test)
print("prev_appl_agg_train.shape", prev_appl_agg_train.shape)
print("prev_appl_agg_test.shape", prev_appl_agg_test.shape)

# move the index back into a regular column
prev_appl_agg_train = prev_appl_agg_train.reset_index()
prev_appl_agg_test = prev_appl_agg_test.reset_index()

# Persisting the final per-SK_ID_CURR features is switched off; set to True to write them.
if False:
    prev_appl_agg_train.to_csv("data/data_/previous_application_agg_train.csv", index=False)
    prev_appl_agg_test.to_csv("data/data_/previous_application_agg_test.csv", index=False)

# wall-clock duration of this cell, in seconds
time_end = time.time()
time_elapse = time_end - time_start
print("Elapsed Time", time_elapse)

## `POS_CASH_balance.csv`

In [None]:
class PCBalImputer(Imputer):
    """
    imputation settings for `POS_CASH_balance.csv`

    Only configuration attributes are set here; they are presumably read by the
    fit/transform logic of the `Imputer` base class defined elsewhere — verify there.
    """

    def __init__(self):
        # no regex-based rules for this table
        self._regex_strings = None
        self._spec_impt_regex_val_num = None

        # both instalment counters share the same dedicated fill value
        instalment_cols = ["CNT_INSTALMENT", "CNT_INSTALMENT_FUTURE"]
        self._spec_impt_vals_num = dict.fromkeys(instalment_cols, -1.)
        # remaining numeric columns
        self._default_imput_vals_num = "median"

        # no column-specific categorical values, only the default placeholder
        self._spec_impt_vals_cat = None
        self._default_imput_vals_cat = "missing_value"


def add_new_agg_cols_pc_bal(df):
    """
    per-"SK_ID_PREV" mean/min/max of the instalment and DPD columns over the 3 and 6 rows
    with the largest "MONTHS_BALANCE"
    :param df: dataframe with "SK_ID_PREV", "MONTHS_BALANCE" and the columns listed below
    :return: dict {new column name (str): result of groupby("SK_ID_PREV").apply(...)}
    """
    sort_by = ["MONTHS_BALANCE"]
    source_cols = ("CNT_INSTALMENT", "CNT_INSTALMENT_FUTURE", "SK_DPD", "SK_DPD_DEF")
    window_sizes = (3, 6)
    # (column prefix, progress message, per-group reducer), in the order results are produced
    reducers = (("MEAN_", "Nearest mean for ", mean_n_nearest),
                ("MIN_", "Nearest min for ", min_n_nearest),
                ("MAX_", "Nearest max for ", max_n_nearest))

    results = {}
    for col in source_cols:
        for n in window_sizes:
            suffix = col + "_%d_NEAREST" % n
            for prefix, message, reducer in reducers:
                new_col = prefix + suffix
                print(message + new_col)
                # the lambda runs inside apply() right away, so capturing loop variables is safe
                results[new_col] = df.groupby("SK_ID_PREV").apply(
                    lambda grp: reducer(grp, sort_by, n, col))

    return results

In [None]:
# Load the raw POS/cash balance table; load_csv also runs change_dtypes on it.
POS_CASH_balance = load_csv("data/download/POS_CASH_balance.csv")
print("POS_CASH_balance.shape", POS_CASH_balance.shape)

# Replace the table's own SK_ID_CURR with the one recorded in previous_application
# for the same SK_ID_PREV.
POS_CASH_balance = POS_CASH_balance.drop(["SK_ID_CURR"], axis="columns")
POS_CASH_balance = POS_CASH_balance.merge(previous_application[["SK_ID_PREV", "SK_ID_CURR"]], 
                                          how="left", on="SK_ID_PREV")
print("POS_CASH_balance.shape", POS_CASH_balance.shape)

# drop rows whose "SK_ID_PREV" is not present in previous_application
# (the left merge leaves their SK_ID_CURR null), then cast the key to int32
POS_CASH_balance = POS_CASH_balance.dropna(subset=["SK_ID_CURR"])
POS_CASH_balance["SK_ID_CURR"] = POS_CASH_balance["SK_ID_CURR"].astype("int32")
print("POS_CASH_balance.shape", POS_CASH_balance.shape)

# train_test_partition / appl_train_key are defined elsewhere in the notebook;
# presumably this splits rows by whether SK_ID_CURR belongs to the train keys — verify.
# SK_ID_CURR is only needed for the split and is dropped again afterwards.
PC_bal_train, PC_bal_test = train_test_partition(POS_CASH_balance, "SK_ID_CURR", appl_train_key)
PC_bal_train = PC_bal_train.drop(["SK_ID_CURR"], axis="columns")
PC_bal_test = PC_bal_test.drop(["SK_ID_CURR"], axis="columns")

print("PC_bal_train.shape:", PC_bal_train.shape)
print("PC_bal_test.shape:", PC_bal_test.shape)

PC_bal_train.head()

In [None]:
# First aggregation level for POS_CASH_balance: one row per SK_ID_PREV.
time_start = time.time()

# Work on copies so the partitioned frames from the previous cell stay untouched.
PC_bal_agg_train = PC_bal_train.copy()
PC_bal_agg_test = PC_bal_test.copy()

# impute missing values (imputer is fitted on train only and applied to both sets)
imputer = PCBalImputer()
imputer.fit(PC_bal_agg_train)
PC_bal_agg_train = imputer.transform(PC_bal_agg_train)
PC_bal_agg_test = imputer.transform(PC_bal_agg_test)
print("PC_bal_agg_train.shape:", PC_bal_agg_train.shape)
print("PC_bal_agg_test.shape:", PC_bal_agg_test.shape)

# recency-based features; they sort by MONTHS_BALANCE, so compute them before it is dropped
new_agg_cols_train = add_new_agg_cols_pc_bal(PC_bal_agg_train)
new_agg_cols_test = add_new_agg_cols_pc_bal(PC_bal_agg_test)

# MONTHS_BALANCE itself is excluded from the generic aggregation below.
PC_bal_agg_train = PC_bal_agg_train.drop(["MONTHS_BALANCE"], axis="columns")
PC_bal_agg_test = PC_bal_agg_test.drop(["MONTHS_BALANCE"], axis="columns")


# aggregate over "SK_ID_PREV"
# `mode` and `entropy` are callables defined elsewhere in the notebook.
num_stats = ["sum", "mean", "median", "min", "max", "var"]
bool_stats = ["sum", "mean", mode, entropy]
cat_stats = ["count", "nunique", mode, entropy]
ohe_cat_stats = ["sum", "mean", "var"]

by_list_cols = ["SK_ID_PREV"]

# Aggregator is defined elsewhere; it is fitted on train only and applied to both sets.
aggregator = Aggregator(by_list_cols, num_stats, bool_stats, cat_stats,
                        ohe_cat_stats=ohe_cat_stats,
                        ohe_cat_max_class=10,
                        iqr=True, minmax_range=True, mean_median_diff=True)

aggregator.fit(PC_bal_agg_train)
PC_bal_agg_train = aggregator.transform(PC_bal_agg_train)
PC_bal_agg_test = aggregator.transform(PC_bal_agg_test)

print("PC_bal_agg_train.isnull().sum().sum()", PC_bal_agg_train.isnull().sum().sum())
print("PC_bal_agg_test.isnull().sum().sum()", PC_bal_agg_test.isnull().sum().sum())
print("PC_bal_agg_train.shape:", PC_bal_agg_train.shape)
print("PC_bal_agg_test.shape:", PC_bal_agg_test.shape)


# add new agg cols
# Column assignment aligns on the index; this assumes the aggregated frames are
# indexed by SK_ID_PREV like the groupby results — TODO confirm against Aggregator.
for col in new_agg_cols_train:
    PC_bal_agg_train[col] = new_agg_cols_train[col]

for col in new_agg_cols_test:
    PC_bal_agg_test[col] = new_agg_cols_test[col]
print("PC_bal_agg_train.shape:", PC_bal_agg_train.shape)
print("PC_bal_agg_test.shape:", PC_bal_agg_test.shape)


# safety net: DefaultImputer (defined elsewhere) is fitted on train and applied to both
# sets; the null counts printed afterwards show whether anything is still missing
imputer = DefaultImputer()
imputer.fit(PC_bal_agg_train)
PC_bal_agg_train = imputer.transform(PC_bal_agg_train)
PC_bal_agg_test = imputer.transform(PC_bal_agg_test)
print("PC_bal_agg_train.isnull().sum().sum()", PC_bal_agg_train.isnull().sum().sum())
print("PC_bal_agg_test.isnull().sum().sum()", PC_bal_agg_test.isnull().sum().sum())


# remove collinear columns
# 0.95 is presumably the correlation threshold — confirm in CollinearColumnRemover.
remover = CollinearColumnRemover(0.95)
remover.fit(PC_bal_agg_train)
PC_bal_agg_train = remover.transform(PC_bal_agg_train)
PC_bal_agg_test = remover.transform(PC_bal_agg_test)

# reset index (moves the index back into a regular column)
PC_bal_agg_train = PC_bal_agg_train.reset_index()
PC_bal_agg_test = PC_bal_agg_test.reset_index()
print("PC_bal_agg_train.shape:", PC_bal_agg_train.shape)
print("PC_bal_agg_test.shape:", PC_bal_agg_test.shape)


# Persisting is switched off. NOTE(review): the next cell reads these *_tmp.csv files,
# so they must already exist on disk; set the condition to True to regenerate them.
if False:
    PC_bal_agg_train.to_csv("data/data_/POS_CASH_balance_agg_train_tmp.csv", index=False)
    PC_bal_agg_test.to_csv("data/data_/POS_CASH_balance_agg_test_tmp.csv", index=False)

# wall-clock duration of this cell, in seconds
time_end = time.time()
time_elapse = time_end - time_start
print("Elapsed Time", time_elapse)

In [None]:
# aggregate over SK_ID_CURR
# Second aggregation level for POS_CASH_balance: one row per SK_ID_CURR.
time_start = time.time()

# Reload the SK_ID_PREV-level result from disk (written by the previous cell when its
# save block is enabled).
# this turns [0, 1] into bool
PC_bal_agg_train = load_csv("data/data_/POS_CASH_balance_agg_train_tmp.csv")
PC_bal_agg_test = load_csv("data/data_/POS_CASH_balance_agg_test_tmp.csv")
print("PC_bal_agg_train.shape:", PC_bal_agg_train.shape)
print("PC_bal_agg_test.shape:", PC_bal_agg_test.shape)


# Attach SK_ID_CURR through the SK_ID_CURR / SK_ID_PREV key pairs saved when
# previous_application was partitioned.
PC_bal_agg_train = PC_bal_agg_train.merge(prev_appl_train_keys, how="left", on="SK_ID_PREV")
PC_bal_agg_test = PC_bal_agg_test.merge(prev_appl_test_keys, how="left", on="SK_ID_PREV")
print("PC_bal_agg_train.isnull().sum().sum()", PC_bal_agg_train.isnull().sum().sum())
print("PC_bal_agg_test.isnull().sum().sum()", PC_bal_agg_test.isnull().sum().sum())
print("PC_bal_agg_train.shape:", PC_bal_agg_train.shape)
print("PC_bal_agg_test.shape:", PC_bal_agg_test.shape)


# SK_ID_PREV was only needed for the merge; drop it before aggregating.
PC_bal_agg_train = PC_bal_agg_train.drop(["SK_ID_PREV"], axis="columns")
PC_bal_agg_test = PC_bal_agg_test.drop(["SK_ID_PREV"], axis="columns")
print("PC_bal_agg_train.shape:", PC_bal_agg_train.shape)
print("PC_bal_agg_test.shape:", PC_bal_agg_test.shape)


# aggregate
# `mode` and `entropy` are callables defined elsewhere in the notebook.
num_stats = ["sum", "mean", "median", "min", "max", "var"]
bool_stats = ["sum", "mean", mode, entropy]
cat_stats = ["count", "nunique", mode, entropy]
ohe_cat_stats = ["sum", "mean", "var"]

by_list_cols = ["SK_ID_CURR"]

# Aggregator is defined elsewhere; it is fitted on train only and applied to both sets.
aggregator = Aggregator(by_list_cols, num_stats, bool_stats, cat_stats,
                        ohe_cat_stats=ohe_cat_stats,
                        ohe_cat_max_class=10,
                        iqr=True, minmax_range=True, mean_median_diff=True)

aggregator.fit(PC_bal_agg_train)
PC_bal_agg_train = aggregator.transform(PC_bal_agg_train)
PC_bal_agg_test = aggregator.transform(PC_bal_agg_test)
print("PC_bal_agg_train.isnull().sum().sum()", PC_bal_agg_train.isnull().sum().sum())
print("PC_bal_agg_test.isnull().sum().sum()", PC_bal_agg_test.isnull().sum().sum())
print("PC_bal_agg_train.shape:", PC_bal_agg_train.shape)
print("PC_bal_agg_test.shape:", PC_bal_agg_test.shape)


# safety net: DefaultImputer (defined elsewhere) is fitted on train and applied to both
# sets; the null counts printed afterwards show whether anything is still missing
imputer = DefaultImputer()
imputer.fit(PC_bal_agg_train)
PC_bal_agg_train = imputer.transform(PC_bal_agg_train)
PC_bal_agg_test = imputer.transform(PC_bal_agg_test)
print("PC_bal_agg_train.isnull().sum().sum()", PC_bal_agg_train.isnull().sum().sum())
print("PC_bal_agg_test.isnull().sum().sum()", PC_bal_agg_test.isnull().sum().sum())


# remove collinear columns
# 0.95 is presumably the correlation threshold — confirm in CollinearColumnRemover.
remover = CollinearColumnRemover(0.95)
remover.fit(PC_bal_agg_train)
PC_bal_agg_train = remover.transform(PC_bal_agg_train)
PC_bal_agg_test = remover.transform(PC_bal_agg_test)
print("PC_bal_agg_train.shape:", PC_bal_agg_train.shape)
print("PC_bal_agg_test.shape:", PC_bal_agg_test.shape)


# reset index (moves the index back into a regular column)
PC_bal_agg_train = PC_bal_agg_train.reset_index()
PC_bal_agg_test = PC_bal_agg_test.reset_index()
print("PC_bal_agg_train.shape:", PC_bal_agg_train.shape)
print("PC_bal_agg_test.shape:", PC_bal_agg_test.shape)


# Persisting the final per-SK_ID_CURR features is switched off; set to True to write them.
if False:
    PC_bal_agg_train.to_csv("data/data_/POS_CASH_balance_agg_train.csv", index=False)
    PC_bal_agg_test.to_csv("data/data_/POS_CASH_balance_agg_test.csv", index=False)

# wall-clock duration of this cell, in seconds
time_end = time.time()
time_elapse = time_end - time_start
print("Elapsed Time", time_elapse)

## `installments_payments.csv`

In [None]:
class InstalPayImputer(BaseEstimator, TransformerMixin):
    """
    imputer for the payment columns of `installments_payments.csv`

    Missing "DAYS_ENTRY_PAYMENT" values are replaced by a value 10 below the smallest
    one seen during fit; missing "AMT_PAYMENT" values are replaced by 0.
    """

    def __init__(self):
        pass

    def fit(self, df_train, y=None):
        """
        learn the fill value for "DAYS_ENTRY_PAYMENT"
        :param df_train: dataframe with a "DAYS_ENTRY_PAYMENT" column
        :param y: ignored, accepted only so that the signature follows the scikit-learn
                  fit(X, y=None) convention (e.g. when a target is passed through a Pipeline)
        :return self: the fitted imputer
        """
        # 10 below the training minimum, so imputed rows sort before every observed value
        self._days_entry_payment_impute = df_train["DAYS_ENTRY_PAYMENT"].min() - 10
        return self

    def transform(self, df):
        """
        fill the missing payment values; the columns of `df` are overwritten in place
        :param df: dataframe with "DAYS_ENTRY_PAYMENT" and "AMT_PAYMENT" columns
        :return df: the same dataframe with both columns filled
        """
        df["DAYS_ENTRY_PAYMENT"] = df["DAYS_ENTRY_PAYMENT"].fillna(self._days_entry_payment_impute)
        df["AMT_PAYMENT"] = df["AMT_PAYMENT"].fillna(0.)
        return df


def add_new_cols_inst_pay(df):
    """
    add payment-vs-instalment features (the frame is modified in place)
    :param df: dataframe with "DAYS_ENTRY_PAYMENT", "DAYS_INSTALMENT", "AMT_PAYMENT"
               and "AMT_INSTALMENT" columns
    :return df: the same dataframe with the five new columns added
    """
    # difference in days between the actual payment and the scheduled instalment
    days_diff = df["DAYS_ENTRY_PAYMENT"] - df["DAYS_INSTALMENT"]
    df["DAYS_INSTAL_PAY_DIFF"] = days_diff
    df["DAYS_INSTAL_PAY_DIFF_ISPOSITIVE"] = df["DAYS_INSTAL_PAY_DIFF"] > 0

    # difference between the amount paid and the amount due
    amount_diff = df["AMT_PAYMENT"] - df["AMT_INSTALMENT"]
    df["AMT_INSTAL_PAY_DIFF"] = amount_diff
    df["AMT_INSTAL_PAY_DIFF_ISPOSITIVE"] = df["AMT_INSTAL_PAY_DIFF"] > 0

    # paid / due ratio with +1 on both sides, limited to the range [0, 10]
    paid_plus_one = df["AMT_PAYMENT"] + 1
    due_plus_one = df["AMT_INSTALMENT"] + 1
    df["AMT_PAY_INSTAL_RATIO"] = np.clip(paid_plus_one / due_plus_one, 0., 10.)
    return df


def add_new_agg_cols_inst_pay(df):
    """
    per-"SK_ID_PREV" mean/min/max of the payment difference/ratio columns over the 3 and 6
    rows with the largest "DAYS_INSTALMENT"
    :param df: dataframe with "SK_ID_PREV", "DAYS_INSTALMENT" and the columns listed below
    :return: dict {new column name (str): result of groupby("SK_ID_PREV").apply(...)}
    """
    sort_by = ["DAYS_INSTALMENT"]
    source_cols = ("DAYS_INSTAL_PAY_DIFF", "AMT_INSTAL_PAY_DIFF", "AMT_PAY_INSTAL_RATIO")
    window_sizes = (3, 6)
    # (column prefix, progress message, per-group reducer), in the order results are produced
    reducers = (("MEAN_", "Nearest mean for ", mean_n_nearest),
                ("MIN_", "Nearest min for ", min_n_nearest),
                ("MAX_", "Nearest max for ", max_n_nearest))

    results = {}
    for col in source_cols:
        for n in window_sizes:
            suffix = col + "_%d_NEAREST" % n
            for prefix, message, reducer in reducers:
                new_col = prefix + suffix
                print(message + new_col)
                # the lambda runs inside apply() right away, so capturing loop variables is safe
                results[new_col] = df.groupby("SK_ID_PREV").apply(
                    lambda grp: reducer(grp, sort_by, n, col))

    return results

In [None]:
# Load the raw instalment payments table; load_csv also runs change_dtypes on it.
installments_payments = load_csv("data/download/installments_payments.csv")
print("installments_payments.shape", installments_payments.shape)

# Replace the table's own SK_ID_CURR with the one recorded in previous_application
# for the same SK_ID_PREV.
installments_payments = installments_payments.drop(["SK_ID_CURR"], axis="columns")
installments_payments = installments_payments.merge(previous_application[["SK_ID_PREV", "SK_ID_CURR"]],
                                                   how="left", on="SK_ID_PREV")

# drop rows whose "SK_ID_PREV" is not present in previous_application
# (the left merge leaves their SK_ID_CURR null), then cast the key to int32
installments_payments = installments_payments.dropna(subset=["SK_ID_CURR"])
installments_payments["SK_ID_CURR"] = installments_payments["SK_ID_CURR"].astype("int32")
print("installments_payments.shape", installments_payments.shape)

# train_test_partition / appl_train_key are defined elsewhere in the notebook;
# presumably this splits rows by whether SK_ID_CURR belongs to the train keys — verify.
# SK_ID_CURR is only needed for the split and is dropped again afterwards.
inst_pay_train, inst_pay_test = train_test_partition(installments_payments, "SK_ID_CURR", appl_train_key)
inst_pay_train = inst_pay_train.drop(["SK_ID_CURR"], axis="columns")
inst_pay_test = inst_pay_test.drop(["SK_ID_CURR"], axis="columns")
print("inst_pay_train.shape:", inst_pay_train.shape)
print("inst_pay_test.shape:", inst_pay_test.shape)

inst_pay_train.head()

In [None]:
# First aggregation level for installments_payments: one row per SK_ID_PREV.
time_start = time.time()

# Work on copies so the partitioned frames from the previous cell stay untouched
# (InstalPayImputer.transform and add_new_cols_inst_pay both write into the frame they get).
inst_pay_agg_train = inst_pay_train.copy()
inst_pay_agg_test = inst_pay_test.copy()
print("inst_pay_agg_train.shape:", inst_pay_agg_train.shape)
print("inst_pay_agg_test.shape:", inst_pay_agg_test.shape)


# impute (null counts are printed before and after; imputer is fitted on train only)
print("inst_pay_agg_train.isnull().sum().sum()", inst_pay_agg_train.isnull().sum().sum())
print("inst_pay_agg_test.isnull().sum().sum()", inst_pay_agg_test.isnull().sum().sum())
imputer = InstalPayImputer()
imputer.fit(inst_pay_agg_train)
inst_pay_agg_train = imputer.transform(inst_pay_agg_train)
inst_pay_agg_test = imputer.transform(inst_pay_agg_test)
print("inst_pay_agg_train.isnull().sum().sum()", inst_pay_agg_train.isnull().sum().sum())
print("inst_pay_agg_test.isnull().sum().sum()", inst_pay_agg_test.isnull().sum().sum())
print("inst_pay_agg_train.shape:", inst_pay_agg_train.shape)
print("inst_pay_agg_test.shape:", inst_pay_agg_test.shape)


# add row-level cols (payment-vs-instalment differences, sign flags and ratio)
inst_pay_agg_train = add_new_cols_inst_pay(inst_pay_agg_train)
inst_pay_agg_test = add_new_cols_inst_pay(inst_pay_agg_test)
print("inst_pay_agg_train.shape:", inst_pay_agg_train.shape)
print("inst_pay_agg_test.shape:", inst_pay_agg_test.shape)


# recency-based features; they sort by DAYS_INSTALMENT, so compute them before it is dropped
new_agg_cols_train = add_new_agg_cols_inst_pay(inst_pay_agg_train)
new_agg_cols_test = add_new_agg_cols_inst_pay(inst_pay_agg_test)

# DAYS_INSTALMENT itself is excluded from the generic aggregation below.
inst_pay_agg_train = inst_pay_agg_train.drop(["DAYS_INSTALMENT"], axis="columns")
inst_pay_agg_test = inst_pay_agg_test.drop(["DAYS_INSTALMENT"], axis="columns")


# aggregate over "SK_ID_PREV"
# `mode` and `entropy` are callables defined elsewhere in the notebook.
num_stats = ["sum", "mean", "median", "min", "max", "var"]
bool_stats = ["sum", "mean", mode, entropy]
cat_stats = ["count", "nunique", mode, entropy]
ohe_cat_stats = ["sum", "mean", "var"]

by_list_cols = ["SK_ID_PREV"]

# Aggregator is defined elsewhere; it is fitted on train only and applied to both sets.
aggregator = Aggregator(by_list_cols, num_stats, bool_stats, cat_stats,
                        ohe_cat_stats=ohe_cat_stats,
                        ohe_cat_max_class=10,
                        iqr=True, minmax_range=True, mean_median_diff=True)

aggregator.fit(inst_pay_agg_train)
inst_pay_agg_train = aggregator.transform(inst_pay_agg_train)
inst_pay_agg_test = aggregator.transform(inst_pay_agg_test)
print("inst_pay_agg_train.isnull().sum().sum():", inst_pay_agg_train.isnull().sum().sum())
print("inst_pay_agg_test.isnull().sum().sum():", inst_pay_agg_test.isnull().sum().sum())
print("inst_pay_agg_train.shape:", inst_pay_agg_train.shape)
print("inst_pay_agg_test.shape:", inst_pay_agg_test.shape)


# add new agg cols
# Column assignment aligns on the index; this assumes the aggregated frames are
# indexed by SK_ID_PREV like the groupby results — TODO confirm against Aggregator.
for col in new_agg_cols_train:
    inst_pay_agg_train[col] = new_agg_cols_train[col]
    
for col in new_agg_cols_test:
    inst_pay_agg_test[col] = new_agg_cols_test[col]

print("inst_pay_agg_train.shape:", inst_pay_agg_train.shape)
print("inst_pay_agg_test.shape:", inst_pay_agg_test.shape)


# safety net: DefaultImputer (defined elsewhere) is fitted on train and applied to both
# sets; the null counts printed afterwards show whether anything is still missing
imputer = DefaultImputer()
imputer.fit(inst_pay_agg_train)
inst_pay_agg_train = imputer.transform(inst_pay_agg_train)
inst_pay_agg_test = imputer.transform(inst_pay_agg_test)
print("inst_pay_agg_train.isnull().sum().sum()", inst_pay_agg_train.isnull().sum().sum())
print("inst_pay_agg_test.isnull().sum().sum()", inst_pay_agg_test.isnull().sum().sum())


# remove collinear columns
# 0.95 is presumably the correlation threshold — confirm in CollinearColumnRemover.
remover = CollinearColumnRemover(0.95)
remover.fit(inst_pay_agg_train)
inst_pay_agg_train = remover.transform(inst_pay_agg_train)
inst_pay_agg_test = remover.transform(inst_pay_agg_test)
print("inst_pay_agg_train.shape:", inst_pay_agg_train.shape)
print("inst_pay_agg_test.shape:", inst_pay_agg_test.shape)

# reset index (moves the index back into a regular column)
inst_pay_agg_train = inst_pay_agg_train.reset_index()
inst_pay_agg_test = inst_pay_agg_test.reset_index()
print("inst_pay_agg_train.shape:", inst_pay_agg_train.shape)
print("inst_pay_agg_test.shape:", inst_pay_agg_test.shape)

# Persisting is switched off. NOTE(review): the next cell reads these *_tmp.csv files,
# so they must already exist on disk; set the condition to True to regenerate them.
if False:
    inst_pay_agg_train.to_csv("data/data_/installments_payments_agg_train_tmp.csv", index=False)
    inst_pay_agg_test.to_csv("data/data_/installments_payments_agg_test_tmp.csv", index=False)

# wall-clock duration of this cell, in seconds
time_end = time.time()
time_elapse = time_end - time_start
print("Elapsed Time", time_elapse)

In [None]:
# aggregate over "SK_ID_CURR"
# Second aggregation stage for installments_payments: reload the per-"SK_ID_PREV"
# aggregates from the *_tmp.csv files, merge the key tables on "SK_ID_PREV",
# then aggregate again with "SK_ID_CURR" as the group key.
time_start = time.time()


# NOTE: the previous cell only writes these files when its `if False:` guard is
# switched on, so they must already exist on disk from an earlier run.
inst_pay_agg_train = load_csv("data/data_/installments_payments_agg_train_tmp.csv")
inst_pay_agg_test = load_csv("data/data_/installments_payments_agg_test_tmp.csv")
print("inst_pay_agg_train.shape:", inst_pay_agg_train.shape)
print("inst_pay_agg_test.shape:", inst_pay_agg_test.shape)


# prev_appl_*_keys are defined elsewhere in the notebook; presumably they map
# SK_ID_PREV to the SK_ID_CURR that the aggregation below groups by -- TODO confirm.
# The null counts printed afterwards help reveal rows the left merge could not match.
inst_pay_agg_train = inst_pay_agg_train.merge(prev_appl_train_keys, how="left", on="SK_ID_PREV")
inst_pay_agg_test = inst_pay_agg_test.merge(prev_appl_test_keys, how="left", on="SK_ID_PREV")
print("inst_pay_agg_train.isnull().sum().sum():", inst_pay_agg_train.isnull().sum().sum())
print("inst_pay_agg_test.isnull().sum().sum():", inst_pay_agg_test.isnull().sum().sum())
print("inst_pay_agg_train.shape:", inst_pay_agg_train.shape)
print("inst_pay_agg_test.shape:", inst_pay_agg_test.shape)

# SK_ID_PREV was only needed as the join key; drop it so it is not aggregated as a feature.
inst_pay_agg_train = inst_pay_agg_train.drop(["SK_ID_PREV"], axis="columns")
inst_pay_agg_test = inst_pay_agg_test.drop(["SK_ID_PREV"], axis="columns")
print("inst_pay_agg_train.shape:", inst_pay_agg_train.shape)
print("inst_pay_agg_test.shape:", inst_pay_agg_test.shape)


# aggregate
# Statistic lists handed to Aggregator (defined elsewhere in this notebook); judging by
# the variable names they apply to numeric, boolean, categorical and one-hot-encoded
# categorical columns respectively. `mode` and `entropy` are notebook helpers.
num_stats = ["sum", "mean", "median", "min", "max", "var"]
bool_stats = ["sum", "mean", mode, entropy]
cat_stats = ["count", "nunique", mode, entropy]
ohe_cat_stats = ["sum", "mean", "var"]

by_list_cols = ["SK_ID_CURR"]

aggregator = Aggregator(by_list_cols, num_stats, bool_stats, cat_stats,
                        ohe_cat_stats=ohe_cat_stats,
                        ohe_cat_max_class=10,
                        iqr=True, minmax_range=True, mean_median_diff=True)

# Fit on the training split only, then apply the same fitted aggregator to both splits.
aggregator.fit(inst_pay_agg_train)
inst_pay_agg_train = aggregator.transform(inst_pay_agg_train)
inst_pay_agg_test = aggregator.transform(inst_pay_agg_test)
print("inst_pay_agg_train.isnull().sum().sum():", inst_pay_agg_train.isnull().sum().sum())
print("inst_pay_agg_test.isnull().sum().sum():", inst_pay_agg_test.isnull().sum().sum())
print("inst_pay_agg_train.shape:", inst_pay_agg_train.shape)
print("inst_pay_agg_test.shape:", inst_pay_agg_test.shape)


# just in case
# Safety net for nulls left after aggregation; DefaultImputer is defined elsewhere
# and is fitted on train only.
imputer = DefaultImputer()
imputer.fit(inst_pay_agg_train)
inst_pay_agg_train = imputer.transform(inst_pay_agg_train)
inst_pay_agg_test = imputer.transform(inst_pay_agg_test)
print("inst_pay_agg_train.isnull().sum().sum()", inst_pay_agg_train.isnull().sum().sum())
print("inst_pay_agg_test.isnull().sum().sum()", inst_pay_agg_test.isnull().sum().sum())


# remove collinear columns
# 0.95 is presumably the correlation threshold above which a column is dropped -- TODO confirm
# against CollinearColumnRemover. The columns to drop are decided on train and reused for test.
remover = CollinearColumnRemover(0.95)
remover.fit(inst_pay_agg_train)
inst_pay_agg_train = remover.transform(inst_pay_agg_train)
inst_pay_agg_test = remover.transform(inst_pay_agg_test)
print("inst_pay_agg_train.shape:", inst_pay_agg_train.shape)
print("inst_pay_agg_test.shape:", inst_pay_agg_test.shape)


# reset index
# Moves the index back into a regular column so it survives to_csv(index=False) below;
# presumably the index is the SK_ID_CURR group key -- TODO confirm.
inst_pay_agg_train = inst_pay_agg_train.reset_index()
inst_pay_agg_test = inst_pay_agg_test.reset_index()
print("inst_pay_agg_train.shape:", inst_pay_agg_train.shape)
print("inst_pay_agg_test.shape:", inst_pay_agg_test.shape)

# Persisting is switched off by default; set the condition to True to (re)write the
# final files that the "Load `installments_payments`" cell reads.
if False:
    inst_pay_agg_train.to_csv("data/data_/installments_payments_agg_train.csv", index=False)
    inst_pay_agg_test.to_csv("data/data_/installments_payments_agg_test.csv", index=False)

time_end = time.time()
time_elapse = time_end - time_start
print("Elapsed Time", time_elapse)

## `credit_card_balance.csv`

In [None]:
class CCBalImputer(Imputer):
    """
    Imputation settings used for the credit card balance table.

    Only configuration lives here; how the attributes are consumed is decided by
    the `Imputer` base class defined elsewhere in this notebook.
    """

    def __init__(self):
        # numeric columns that get an explicit fill value of 0.
        zero_filled_num_cols = (
            "AMT_DRAWINGS_ATM_CURRENT",
            "AMT_DRAWINGS_OTHER_CURRENT",
            "AMT_DRAWINGS_POS_CURRENT",
            "AMT_INST_MIN_REGULARITY",
            "AMT_PAYMENT_CURRENT",
            "CNT_DRAWINGS_ATM_CURRENT",
            "CNT_DRAWINGS_OTHER_CURRENT",
            "CNT_DRAWINGS_POS_CURRENT",
            "CNT_INSTALMENT_MATURE_CUM",
        )

        # no regex-based rules for this table
        self._regex_strings = None
        self._spec_impt_regex_val_num = None

        # numeric settings: per-column values plus the fallback value
        self._spec_impt_vals_num = dict.fromkeys(zero_filled_num_cols, 0.)
        self._default_imput_vals_num = 0.

        # categorical settings: no per-column values, one fallback label
        self._spec_impt_vals_cat = None
        self._default_imput_vals_cat = "missing_value"
        

def add_new_cols_cc_bal(df):
    """
    Add row-level ratio features to a credit card balance dataframe.

    The dataframe is modified in place and also returned. Denominators that can
    be zero are guarded either by replacing 0 with 1 or by adding 1.

    :param df: dataframe with the AMT_* balance, payment, credit limit and drawing columns
    :return df: the same dataframe with five new columns appended
    """
    payment_total = df["AMT_PAYMENT_TOTAL_CURRENT"]
    balance = df["AMT_BALANCE"]
    # a zero balance is replaced by 1 so the ratio stays finite
    df["AMT_PAYMENT_TO_BALANCE"] = payment_total / balance.replace(0., 1.)

    # the +1 on the credit limit avoids dividing by zero
    limit_plus_one = df["AMT_CREDIT_LIMIT_ACTUAL"] + 1
    df["AMT_BALANCE_TO_CREDIT_LIMIT"] = (balance + 1) / limit_plus_one

    drawing_total = (df["AMT_DRAWINGS_ATM_CURRENT"] + df["AMT_DRAWINGS_CURRENT"] +
                     df["AMT_DRAWINGS_OTHER_CURRENT"] + df["AMT_DRAWINGS_POS_CURRENT"])
    df["AMT_DRAWING_TOTAL"] = drawing_total
    df["AMT_DRAWING_TOTAL_TO_CREDIT_LIMIT"] = drawing_total / limit_plus_one

    # months without any drawing use 1 as the denominator
    df["AMT_PAYMENT_TOTAL_TO_DRAWING_TOTAL"] = payment_total / drawing_total.replace(0., 1.)
    return df


def add_new_agg_cols_cc_bal(df):
    """
    Compute mean / min / max "n nearest" statistics per "SK_ID_PREV".

    For every source column and every n, the rows of each "SK_ID_PREV" group are
    handed to mean_n_nearest, min_n_nearest and max_n_nearest (helpers defined
    elsewhere in this notebook) together with the sort column "MONTHS_BALANCE".

    :param df: row-level credit card balance dataframe
    :return results: dict {new column name (str): result of the groupby-apply}
    """
    sort_by = ["MONTHS_BALANCE"]
    source_cols = ["AMT_PAYMENT_TO_BALANCE", "AMT_BALANCE_TO_CREDIT_LIMIT",
                   "AMT_DRAWING_TOTAL_TO_CREDIT_LIMIT", "AMT_PAYMENT_TOTAL_TO_DRAWING_TOTAL",
                   "SK_DPD"]
    window_sizes = [3, 6]

    # (prefix of the new column, word used in the progress message, helper to apply)
    reducers = [("MEAN_", "mean", mean_n_nearest),
                ("MIN_", "min", min_n_nearest),
                ("MAX_", "max", max_n_nearest)]

    results = {}
    for col in source_cols:
        for n in window_sizes:
            suffix = "_%d_NEAREST" % n
            for prefix, label, reducer in reducers:
                new_col = prefix + col + suffix
                print("Nearest " + label + " for " + new_col)
                # apply runs immediately, so the lambda sees the current col and n
                results[new_col] = df.groupby("SK_ID_PREV").apply(
                    lambda group, fn=reducer: fn(group, sort_by, n, col))

    return results

In [None]:
# Load the raw credit card balance table (load_csv also downcasts the column dtypes).
credit_card_balance = load_csv("data/download/credit_card_balance.csv")
print("credit_card_balance.shape", credit_card_balance.shape)

# Replace the table's own SK_ID_CURR with the one recorded in previous_application,
# looked up through SK_ID_PREV.
credit_card_balance = credit_card_balance.drop(["SK_ID_CURR"], axis="columns")
credit_card_balance = credit_card_balance.merge(previous_application[["SK_ID_PREV", "SK_ID_CURR"]],
                                               how="left", on="SK_ID_PREV")

# Rows whose SK_ID_PREV had no match in previous_application have a null SK_ID_CURR
# after the left merge; drop them, then cast the key to int32.
credit_card_balance = credit_card_balance.dropna(subset=["SK_ID_CURR"])
credit_card_balance["SK_ID_CURR"] = credit_card_balance["SK_ID_CURR"].astype("int32")
print("credit_card_balance.shape", credit_card_balance.shape)

# train_test_partition and appl_train_key are defined elsewhere in the notebook;
# presumably rows are split by whether their SK_ID_CURR belongs to the training
# applications -- TODO confirm. SK_ID_CURR is dropped again once the split is done.
cc_bal_train, cc_bal_test = train_test_partition(credit_card_balance, "SK_ID_CURR", appl_train_key)
cc_bal_train = cc_bal_train.drop(["SK_ID_CURR"], axis="columns")
cc_bal_test = cc_bal_test.drop(["SK_ID_CURR"], axis="columns")

print("cc_bal_train.shape:", cc_bal_train.shape)
print("cc_bal_test.shape:", cc_bal_test.shape)

cc_bal_train.head()

In [None]:
# First aggregation stage for credit_card_balance: impute, add row-level ratio
# features, aggregate per "SK_ID_PREV" and attach the "n nearest" statistics.
time_start = time.time()

# Work on copies so cc_bal_train / cc_bal_test stay untouched and the cell can be re-run.
cc_bal_agg_train = cc_bal_train.copy()
cc_bal_agg_test = cc_bal_test.copy()
print("cc_bal_agg_train.shape", cc_bal_agg_train.shape)
print("cc_bal_agg_test.shape", cc_bal_agg_test.shape)


# impute missing values
# Fitted on train only and applied to both splits.
imputer = CCBalImputer()
imputer.fit(cc_bal_agg_train)
cc_bal_agg_train = imputer.transform(cc_bal_agg_train)
cc_bal_agg_test = imputer.transform(cc_bal_agg_test)
print("cc_bal_agg_train.isnull().sum().sum():", cc_bal_agg_train.isnull().sum().sum())
print("cc_bal_agg_test.isnull().sum().sum():", cc_bal_agg_test.isnull().sum().sum())
print("cc_bal_agg_train.shape", cc_bal_agg_train.shape)
print("cc_bal_agg_test.shape", cc_bal_agg_test.shape)


# add new cols
# Row-level ratio features; add_new_cols_cc_bal mutates its argument and returns it.
cc_bal_agg_train = add_new_cols_cc_bal(cc_bal_agg_train)
cc_bal_agg_test = add_new_cols_cc_bal(cc_bal_agg_test)


# remove collinear columns
# 0.95 is presumably the correlation threshold -- TODO confirm against CollinearColumnRemover.
remover = CollinearColumnRemover(0.95)
remover.fit(cc_bal_agg_train)
cc_bal_agg_train = remover.transform(cc_bal_agg_train)
cc_bal_agg_test = remover.transform(cc_bal_agg_test)
print("cc_bal_agg_train.shape", cc_bal_agg_train.shape)
print("cc_bal_agg_test.shape", cc_bal_agg_test.shape)


# new agg cols
# Computed from the row-level data now, attached after the aggregation further down.
# NOTE(review): add_new_agg_cols_cc_bal reads the four ratio columns added above plus
# "SK_DPD", "MONTHS_BALANCE" and "SK_ID_PREV"; if the remover dropped any of them this
# call fails -- verify.
new_agg_cols_train = add_new_agg_cols_cc_bal(cc_bal_agg_train)
new_agg_cols_test = add_new_agg_cols_cc_bal(cc_bal_agg_test)
print("cc_bal_agg_train.shape", cc_bal_agg_train.shape)
print("cc_bal_agg_test.shape", cc_bal_agg_test.shape)


# aggregate over "SK_ID_PREV"
# Statistic lists handed to Aggregator (defined elsewhere in this notebook); judging by
# the variable names they apply to numeric, boolean, categorical and one-hot-encoded
# categorical columns respectively. `mode` and `entropy` are notebook helpers.
num_stats = ["sum", "mean", "median", "min", "max", "var"]
bool_stats = ["sum", "mean", mode, entropy]
cat_stats = ["count", "nunique", mode, entropy]
ohe_cat_stats = ["sum", "mean", "var"]

by_list_cols = ["SK_ID_PREV"]

aggregator = Aggregator(by_list_cols, num_stats, bool_stats, cat_stats,
                        ohe_cat_stats=ohe_cat_stats,
                        ohe_cat_max_class=10,
                        iqr=True, minmax_range=True, mean_median_diff=True)

# Fit on the training split only, then apply the same fitted aggregator to both splits.
aggregator.fit(cc_bal_agg_train)
cc_bal_agg_train = aggregator.transform(cc_bal_agg_train)
cc_bal_agg_test = aggregator.transform(cc_bal_agg_test)
print("cc_bal_agg_train.isnull().sum().sum():", cc_bal_agg_train.isnull().sum().sum())
print("cc_bal_agg_test.isnull().sum().sum():", cc_bal_agg_test.isnull().sum().sum())
print("cc_bal_agg_train.shape", cc_bal_agg_train.shape)
print("cc_bal_agg_test.shape", cc_bal_agg_test.shape)


# add new agg cols
# Column assignment aligns on the index, so this assumes the aggregated frames are
# indexed by SK_ID_PREV like the groupby results -- TODO confirm.
for col in new_agg_cols_train:
    cc_bal_agg_train[col] = new_agg_cols_train[col]

for col in new_agg_cols_test:
    cc_bal_agg_test[col] = new_agg_cols_test[col]
print("cc_bal_agg_train.shape", cc_bal_agg_train.shape)
print("cc_bal_agg_test.shape", cc_bal_agg_test.shape)


# just in case
# Safety net for nulls left after aggregation; DefaultImputer is defined elsewhere
# and is fitted on train only.
imputer = DefaultImputer()
imputer.fit(cc_bal_agg_train)
cc_bal_agg_train = imputer.transform(cc_bal_agg_train)
cc_bal_agg_test = imputer.transform(cc_bal_agg_test)
print("cc_bal_agg_train.isnull().sum().sum():", cc_bal_agg_train.isnull().sum().sum())
print("cc_bal_agg_test.isnull().sum().sum():", cc_bal_agg_test.isnull().sum().sum())


# remove collinear columns
# Second pass, this time over the aggregated features.
remover = CollinearColumnRemover(0.95)
remover.fit(cc_bal_agg_train)
cc_bal_agg_train = remover.transform(cc_bal_agg_train)
cc_bal_agg_test = remover.transform(cc_bal_agg_test)
print("cc_bal_agg_train.shape", cc_bal_agg_train.shape)
print("cc_bal_agg_test.shape", cc_bal_agg_test.shape)


# reset index
# Moves the index back into a regular column so it survives to_csv(index=False) below;
# the next cell merges on "SK_ID_PREV" after reloading these files.
cc_bal_agg_train = cc_bal_agg_train.reset_index()
cc_bal_agg_test = cc_bal_agg_test.reset_index()
print("cc_bal_agg_train.shape", cc_bal_agg_train.shape)
print("cc_bal_agg_test.shape", cc_bal_agg_test.shape)

# Persisting is switched off by default; set the condition to True to (re)write the
# intermediate files that the next cell loads.
if False:
    cc_bal_agg_train.to_csv("data/data_/credit_card_balance_agg_train_tmp.csv", index=False)
    cc_bal_agg_test.to_csv("data/data_/credit_card_balance_agg_test_tmp.csv", index=False)

time_end = time.time()
time_elapse = time_end - time_start
print("Elapsed Time", time_elapse)

In [None]:
# aggregate over "SK_ID_CURR"
# Second aggregation stage for credit_card_balance: reload the per-"SK_ID_PREV"
# aggregates from the *_tmp.csv files, merge the key tables on "SK_ID_PREV",
# then aggregate again with "SK_ID_CURR" as the group key.
time_start = time.time()


# NOTE: the previous cell only writes these files when its `if False:` guard is
# switched on, so they must already exist on disk from an earlier run.
cc_bal_agg_train = load_csv("data/data_/credit_card_balance_agg_train_tmp.csv")
cc_bal_agg_test = load_csv("data/data_/credit_card_balance_agg_test_tmp.csv")
print("cc_bal_agg_train.shape", cc_bal_agg_train.shape)
print("cc_bal_agg_test.shape", cc_bal_agg_test.shape)

# prev_appl_*_keys are defined elsewhere in the notebook; presumably they map
# SK_ID_PREV to the SK_ID_CURR that the aggregation below groups by -- TODO confirm.
# The null counts printed afterwards help reveal rows the left merge could not match.
cc_bal_agg_train = cc_bal_agg_train.merge(prev_appl_train_keys, how="left", on="SK_ID_PREV")
cc_bal_agg_test = cc_bal_agg_test.merge(prev_appl_test_keys, how="left", on="SK_ID_PREV")
print("cc_bal_agg_train.isnull().sum().sum():", cc_bal_agg_train.isnull().sum().sum())
print("cc_bal_agg_test.isnull().sum().sum():", cc_bal_agg_test.isnull().sum().sum())
print("cc_bal_agg_train.shape", cc_bal_agg_train.shape)
print("cc_bal_agg_test.shape", cc_bal_agg_test.shape)


# SK_ID_PREV was only needed as the join key; drop it so it is not aggregated as a feature.
cc_bal_agg_train = cc_bal_agg_train.drop(["SK_ID_PREV"], axis="columns")
cc_bal_agg_test = cc_bal_agg_test.drop(["SK_ID_PREV"], axis="columns")
print("cc_bal_agg_train.shape", cc_bal_agg_train.shape)
print("cc_bal_agg_test.shape", cc_bal_agg_test.shape)

# aggregate
# Statistic lists handed to Aggregator (defined elsewhere in this notebook); judging by
# the variable names they apply to numeric, boolean, categorical and one-hot-encoded
# categorical columns respectively. `mode` and `entropy` are notebook helpers.
num_stats = ["sum", "mean", "median", "min", "max", "var"]
bool_stats = ["sum", "mean", mode, entropy]
cat_stats = ["count", "nunique", mode, entropy]
ohe_cat_stats = ["sum", "mean", "var"]

by_list_cols = ["SK_ID_CURR"]

aggregator = Aggregator(by_list_cols, num_stats, bool_stats, cat_stats,
                        ohe_cat_stats=ohe_cat_stats,
                        ohe_cat_max_class=10,
                        iqr=True, minmax_range=True, mean_median_diff=True)

# Fit on the training split only, then apply the same fitted aggregator to both splits.
aggregator.fit(cc_bal_agg_train)
cc_bal_agg_train = aggregator.transform(cc_bal_agg_train)
cc_bal_agg_test = aggregator.transform(cc_bal_agg_test)
print("cc_bal_agg_train.isnull().sum().sum():", cc_bal_agg_train.isnull().sum().sum())
print("cc_bal_agg_test.isnull().sum().sum():", cc_bal_agg_test.isnull().sum().sum())
print("cc_bal_agg_train.shape", cc_bal_agg_train.shape)
print("cc_bal_agg_test.shape", cc_bal_agg_test.shape)


# in case
# Safety net for nulls left after aggregation; DefaultImputer is defined elsewhere
# and is fitted on train only.
imputer = DefaultImputer()
imputer.fit(cc_bal_agg_train)
cc_bal_agg_train = imputer.transform(cc_bal_agg_train)
cc_bal_agg_test = imputer.transform(cc_bal_agg_test)
print("cc_bal_agg_train.isnull().sum().sum():", cc_bal_agg_train.isnull().sum().sum())
print("cc_bal_agg_test.isnull().sum().sum():", cc_bal_agg_test.isnull().sum().sum())


# remove collinear columns
# 0.95 is presumably the correlation threshold -- TODO confirm against CollinearColumnRemover.
# The columns to drop are decided on train and reused for test.
remover = CollinearColumnRemover(0.95)
remover.fit(cc_bal_agg_train)
cc_bal_agg_train = remover.transform(cc_bal_agg_train)
cc_bal_agg_test = remover.transform(cc_bal_agg_test)
print("cc_bal_agg_train.shape", cc_bal_agg_train.shape)
print("cc_bal_agg_test.shape", cc_bal_agg_test.shape)


# reset index
# Moves the index back into a regular column so it survives to_csv(index=False) below;
# presumably the index is the SK_ID_CURR group key -- TODO confirm.
cc_bal_agg_train = cc_bal_agg_train.reset_index()
cc_bal_agg_test = cc_bal_agg_test.reset_index()
print("cc_bal_agg_train.shape", cc_bal_agg_train.shape)
print("cc_bal_agg_test.shape", cc_bal_agg_test.shape)

# Persisting is switched off by default; set the condition to True to (re)write the
# final files that the "Load `credit_card_balance`" cell reads.
if False:
    cc_bal_agg_train.to_csv("data/data_/credit_card_balance_agg_train.csv", index=False)
    cc_bal_agg_test.to_csv("data/data_/credit_card_balance_agg_test.csv", index=False)

time_end = time.time()
time_elapse = time_end - time_start
print("Elapsed Time", time_elapse)

# Merge all dataframes

In [8]:
def train_test_col_align(df_train, df_test, exclude_cols=None):
    """
    Give the test dataframe the same columns, in the same order, as the train dataframe.

    :param df_train: dataframe whose columns define the layout
    :param df_test: dataframe that must contain every non-excluded train column
    :param exclude_cols: list of train-only columns (e.g. the target); they are moved
                         to the end of the train dataframe and left out of the test one
    :return: tuple (train dataframe, test dataframe) with aligned columns
    """
    exclude_cols = [] if exclude_cols is None else exclude_cols
    train_columns = df_train.columns.to_list()

    # every excluded column has to exist in the train dataframe
    for name in exclude_cols:
        assert name in train_columns, name + " is not in df_train"

    shared_cols = list(filter(lambda name: name not in exclude_cols, train_columns))

    aligned_train = df_train[shared_cols + exclude_cols]
    aligned_test = df_test[shared_cols]
    return aligned_train, aligned_test


def add_prefix_to_cols(df, prefix, exclude_cols=None):
    """
    Return a copy of the dataframe with a prefix put in front of its column names.

    :param df: dataframe, left unchanged
    :param prefix: str, text prepended to each column name
    :param exclude_cols: list of column names that keep their original name
    :return: copy of df with renamed columns
    """
    untouched = [] if exclude_cols is None else exclude_cols

    renamed = df.copy()
    renamed.columns = [name if name in untouched else prefix + name
                       for name in renamed.columns.to_list()]
    return renamed


def merge_dataframes(df_left, dfs_right, 
                     left_prefix, right_prefixes, 
                     on_col="SK_ID_CURR"):
    """
    Left-merge several dataframes onto df_left, prefixing the columns of each one.

    For every right dataframe a boolean column "<prefix>MISSING_ROW" is appended that
    is True for the rows of df_left that had no matching key in that right dataframe.

    :param df_left: dataframe that defines the rows of the result
    :param dfs_right: list of dataframes merged one after another onto df_left
    :param left_prefix: str, prefix for the columns of df_left (on_col is kept as is)
    :param right_prefixes: list of str, one prefix per dataframe in dfs_right
    :param on_col: str, key column present in every dataframe
    :return result: merged dataframe
    """
    assert isinstance(dfs_right, list), "dfs_right must be a list"
    assert isinstance(right_prefixes, list), "right_prefixes must be a list"
    assert len(dfs_right) == len(right_prefixes), "dfs_right and right_prefixes must have the same len"

    # temporary column that carries pandas' merge indicator; removed again after each merge
    indicator_col = "__MERGE_SOURCE__"

    result = add_prefix_to_cols(df_left, left_prefix, exclude_cols=[on_col])

    for df, prefix in zip(dfs_right, right_prefixes):
        print("Merging with " + prefix)
        df = add_prefix_to_cols(df, prefix, exclude_cols=[on_col])
        result = result.merge(df, how="left", on=on_col, indicator=indicator_col)

        # add mask column to tell which rows in df are missing.
        # The merge indicator is used instead of checking the first prefixed column for
        # nulls: that check mislabels rows when the column holds genuine nulls or when one
        # prefix is the start of another, and it fails when df only has the key column.
        result[prefix + "MISSING_ROW"] = result.pop(indicator_col) == "left_only"

    return result


class MergeMissingImputer(BaseEstimator, TransformerMixin):
    """
    Fill the nulls of a merged dataframe with per-column values learned from train.

    Numeric columns are filled with their training median, every other column with
    the value returned by the notebook's `mode` helper (defined elsewhere).
    Only columns that contain nulls in the training dataframe get an impute value.
    """

    def fit(self, df_train, y=None):
        """
        Learn one impute value for every column of df_train that contains nulls.

        :param df_train: training dataframe
        :param y: ignored, accepted for compatibility with the scikit-learn API
        :return self: the fitted imputer, so that fit(...).transform(...) and the
                      inherited fit_transform work
        """
        cols = df_train.columns.to_list()
        cols_num = df_train.select_dtypes(["number"]).columns.to_list()
        cols_cat = [col for col in cols if col not in cols_num]

        self._impute_values = {}
        for col in cols_num:
            if df_train[col].isnull().sum() > 0:
                self._impute_values[col] = df_train[col].median()

        for col in cols_cat:
            if df_train[col].isnull().sum() > 0:
                self._impute_values[col] = mode(df_train[col])

        return self

    def transform(self, df):
        """
        Return a copy of df with the learned impute values filled in.

        :param df: dataframe containing every column that received an impute value in fit
        :return df: imputed copy, the input is left unchanged
        :raises AttributeError: re-raised unchanged after printing the offending column
        """
        df = df.copy()
        for col, val in self._impute_values.items():
            try:
                df[col] = df[col].fillna(val)
            except AttributeError:
                print("problem with " + col)
                # bare raise keeps the original message and traceback
                raise
        return df

### Load `application`

In [9]:
# Load the preprocessed application tables (load_csv also downcasts the column dtypes).
application_train = load_csv("data/data_/application_train.csv")
application_test = load_csv("data/data_/application_test.csv")

print("application_train.shape:", application_train.shape)
print("application_test.shape:", application_test.shape)

# Put the test columns in the train column order; "TARGET" exists only in train
# and is moved to the end of the train dataframe.
application_train, application_test = train_test_col_align(application_train, application_test, 
                                                           exclude_cols=["TARGET"])
print("application_train.shape:", application_train.shape)
print("application_test.shape:", application_test.shape)

Memory usage before changing types 311.82 MB
Memory usage after changing types 143.31 MB
Memory usage before changing types 49.04 MB
Memory usage after changing types 22.53 MB
application_train.shape: (307511, 139)
application_test.shape: (48744, 138)
application_train.shape: (307511, 139)
application_test.shape: (48744, 138)


In [10]:
# Preview the first rows of the training applications after column alignment.
application_train.head()

Unnamed: 0,OWN_CAR_AGE_ISNULL,EXT_SOURCE_1_ISNULL,EXT_SOURCE_3_ISNULL,TOTALAREA_MODE_ISNULL,APARTMENTS_AVG_ISNULL,BASEMENTAREA_AVG_ISNULL,YEARS_BUILD_AVG_ISNULL,COMMONAREA_AVG_ISNULL,ELEVATORS_AVG_ISNULL,LANDAREA_AVG_ISNULL,...,ORGANIZATION_TYPE,FONDKAPREMONT_MODE,HOUSETYPE_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,AMT_INCOME_TOTAL_LOG,DAYS_EMPLOYED_POSITIVE,CREDIT_TO_INCOME,CREDIT_TO_GOODS,TARGET
0,True,False,False,False,False,False,False,False,False,False,...,Business Entity Type 3,reg oper account,block of flats,"Stone, brick",No,12.218495,False,2.007889,1.158397,1
1,True,False,True,False,False,False,False,False,False,False,...,School,reg oper account,block of flats,Block,No,12.506177,False,4.79075,1.145199,0
2,False,True,False,True,True,True,True,True,True,True,...,Government,missing_value,missing_value,missing_value,missing_value,11.119883,False,2.0,1.0,0
3,True,True,True,True,True,True,True,True,True,True,...,Business Entity Type 3,missing_value,missing_value,missing_value,missing_value,11.81303,False,2.316167,1.052803,0
4,True,True,True,True,True,True,True,True,True,True,...,Religion,missing_value,missing_value,missing_value,missing_value,11.707669,False,4.222222,1.0,0


In [11]:
# Preview the first rows of the test applications after column alignment.
application_test.head()

Unnamed: 0,OWN_CAR_AGE_ISNULL,EXT_SOURCE_1_ISNULL,EXT_SOURCE_3_ISNULL,TOTALAREA_MODE_ISNULL,APARTMENTS_AVG_ISNULL,BASEMENTAREA_AVG_ISNULL,YEARS_BUILD_AVG_ISNULL,COMMONAREA_AVG_ISNULL,ELEVATORS_AVG_ISNULL,LANDAREA_AVG_ISNULL,...,WEEKDAY_APPR_PROCESS_START,ORGANIZATION_TYPE,FONDKAPREMONT_MODE,HOUSETYPE_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,AMT_INCOME_TOTAL_LOG,DAYS_EMPLOYED_POSITIVE,CREDIT_TO_INCOME,CREDIT_TO_GOODS
0,True,False,False,False,False,False,True,True,True,True,...,TUESDAY,Kindergarten,missing_value,block of flats,"Stone, brick",No,11.81303,False,4.213333,1.264
1,True,False,False,True,True,True,True,True,True,True,...,FRIDAY,Self-employed,missing_value,missing_value,missing_value,missing_value,11.502875,False,2.250182,1.2376
2,False,True,False,True,True,True,True,True,True,True,...,MONDAY,Transport: type 3,missing_value,missing_value,missing_value,missing_value,12.218495,False,3.275378,1.0528
3,True,False,False,False,False,False,False,False,False,False,...,WEDNESDAY,Business Entity Type 3,reg oper account,block of flats,Panel,No,12.660328,False,5.0,1.0
4,False,False,True,True,True,True,True,True,True,True,...,FRIDAY,Business Entity Type 3,missing_value,missing_value,missing_value,missing_value,12.100712,False,3.475,1.0


### Load `bureau`

In [12]:
# Load the aggregated bureau features for both splits (load_csv also downcasts dtypes).
bureau_train = load_csv("data/data_/bureau_agg_train.csv")
bureau_test = load_csv("data/data_/bureau_agg_test.csv")
print("bureau_train.shape:", bureau_train.shape)
print("bureau_test.shape:", bureau_test.shape)

# Restrict and reorder the test columns to match the train columns.
bureau_train, bureau_test = train_test_col_align(bureau_train, bureau_test)
print("bureau_train.shape:", bureau_train.shape)
print("bureau_test.shape:", bureau_test.shape)

Memory usage before changing types 318.82 MB
Memory usage after changing types 155.99 MB
Memory usage before changing types 51.21 MB
Memory usage after changing types 25.05 MB
bureau_train.shape: (263491, 160)
bureau_test.shape: (42320, 160)
bureau_train.shape: (263491, 160)
bureau_test.shape: (42320, 160)


### Load `bureau_balance`

In [13]:
# Load the aggregated bureau_balance features for both splits (load_csv also downcasts dtypes).
bureau_balance_train = load_csv("data/data_/bureau_balance_agg_train.csv")
bureau_balance_test = load_csv("data/data_/bureau_balance_agg_test.csv")
print("bureau_balance_train.shape", bureau_balance_train.shape)
print("bureau_balance_test.shape", bureau_balance_test.shape)

# Restrict and reorder the test columns to match the train columns.
bureau_balance_train, bureau_balance_test = train_test_col_align(bureau_balance_train, bureau_balance_test)
print("bureau_balance_train.shape", bureau_balance_train.shape)
print("bureau_balance_test.shape", bureau_balance_test.shape)

Memory usage before changing types 216.93 MB
Memory usage after changing types 107.36 MB
Memory usage before changing types 99.52 MB
Memory usage after changing types 49.25 MB
bureau_balance_train.shape (92231, 294)
bureau_balance_test.shape (42311, 294)
bureau_balance_train.shape (92231, 294)
bureau_balance_test.shape (42311, 294)


### Load `previous_application`

In [14]:
# Load the aggregated previous_application features for both splits
# (load_csv also downcasts dtypes).
previous_application_train = load_csv("data/data_/previous_application_agg_train.csv")
previous_application_test = load_csv("data/data_/previous_application_agg_test.csv")
print("previous_application_train.shape", previous_application_train.shape)
print("previous_application_test.shape", previous_application_test.shape)

# Restrict and reorder the test columns to match the train columns.
previous_application_train, previous_application_test = train_test_col_align(previous_application_train, 
                                                                             previous_application_test)

print("previous_application_train.shape", previous_application_train.shape)
print("previous_application_test.shape", previous_application_test.shape)

Memory usage before changing types 785.85 MB
Memory usage after changing types 379.84 MB
Memory usage before changing types 129.06 MB
Memory usage after changing types 62.39 MB
previous_application_train.shape (291057, 348)
previous_application_test.shape (47800, 348)
previous_application_train.shape (291057, 348)
previous_application_test.shape (47800, 348)


### Load `POS_CASH_balance`

In [15]:
# Load the aggregated POS_CASH_balance features for both splits (load_csv also downcasts dtypes).
POS_CASH_balance_train = load_csv("data/data_/POS_CASH_balance_agg_train.csv")
POS_CASH_balance_test = load_csv("data/data_/POS_CASH_balance_agg_test.csv")
print("POS_CASH_balance_train.shape", POS_CASH_balance_train.shape)
print("POS_CASH_balance_test.shape", POS_CASH_balance_test.shape)

# Restrict and reorder the test columns to match the train columns.
POS_CASH_balance_train, POS_CASH_balance_test = train_test_col_align(POS_CASH_balance_train, POS_CASH_balance_test)
print("POS_CASH_balance_train.shape", POS_CASH_balance_train.shape)
print("POS_CASH_balance_test.shape", POS_CASH_balance_test.shape)

Memory usage before changing types 1003.81 MB
Memory usage after changing types 501.33 MB
Memory usage before changing types 165.78 MB
Memory usage after changing types 82.79 MB
POS_CASH_balance_train.shape (286967, 439)
POS_CASH_balance_test.shape (47392, 439)
POS_CASH_balance_train.shape (286967, 439)
POS_CASH_balance_test.shape (47392, 439)


### Load `installments_payments`

In [16]:
# Load the aggregated installments_payments features for both splits
# (load_csv also downcasts dtypes).
installments_payments_train = load_csv("data/data_/installments_payments_agg_train.csv")
installments_payments_test = load_csv("data/data_/installments_payments_agg_test.csv")
print("installments_payments_train.shape", installments_payments_train.shape)
print("installments_payments_test.shape", installments_payments_test.shape)

# Restrict and reorder the test columns to match the train columns.
installments_payments_train, installments_payments_test = train_test_col_align(installments_payments_train, 
                                                                               installments_payments_test)
print("installments_payments_train.shape", installments_payments_train.shape)
print("installments_payments_test.shape", installments_payments_test.shape)

Memory usage before changing types 1394.36 MB
Memory usage after changing types 697.47 MB
Memory usage before changing types 228.99 MB
Memory usage after changing types 114.55 MB
installments_payments_train.shape (289406, 604)
installments_payments_test.shape (47529, 604)
installments_payments_train.shape (289406, 604)
installments_payments_test.shape (47529, 604)


### Load `credit_card_balance`

In [17]:
# Load the aggregated credit_card_balance features for both splits
# (load_csv also downcasts dtypes).
credit_card_balance_train = load_csv("data/data_/credit_card_balance_agg_train.csv")
credit_card_balance_test = load_csv("data/data_/credit_card_balance_agg_test.csv")
print("credit_card_balance_train.shape", credit_card_balance_train.shape)
print("credit_card_balance_test.shape", credit_card_balance_test.shape)

# Restrict and reorder the test columns to match the train columns.
credit_card_balance_train, credit_card_balance_test = train_test_col_align(credit_card_balance_train, 
                                                                           credit_card_balance_test)
print("credit_card_balance_train.shape", credit_card_balance_train.shape)
print("credit_card_balance_test.shape", credit_card_balance_test.shape)

Memory usage before changing types 331.30 MB
Memory usage after changing types 165.53 MB
Memory usage before changing types 61.69 MB
Memory usage after changing types 30.83 MB
credit_card_balance_train.shape (77934, 534)
credit_card_balance_test.shape (14513, 534)
credit_card_balance_train.shape (77934, 534)
credit_card_balance_test.shape (14513, 534)


### Merge

In [18]:
# Left-merge every aggregated table onto the applications (key "SK_ID_CURR", the
# default of merge_dataframes). Each table gets its own column prefix, and the same
# prefixes are used for train and test so the column names line up.
merge_train = merge_dataframes(application_train, 
                               [bureau_train, bureau_balance_train, 
                                previous_application_train, POS_CASH_balance_train,
                                installments_payments_train, credit_card_balance_train], 
                               "APPL_", ["BURE_", "BUBA_", "PRAP_", "POBA_", "INPA_", "CCBA_"])

merge_test = merge_dataframes(application_test, 
                              [bureau_test, bureau_balance_test, 
                               previous_application_test, POS_CASH_balance_test,
                               installments_payments_test, credit_card_balance_test], 
                              "APPL_", ["BURE_", "BUBA_", "PRAP_", "POBA_", "INPA_", "CCBA_"])

print("merge_train.shape", merge_train.shape)
print("merge_test.shape", merge_test.shape)

# Applications without a match in a right-hand table get nulls from the left merge.
print("merge_train.isnull().sum().sum()", merge_train.isnull().sum().sum())
print("merge_test.isnull().sum().sum()", merge_test.isnull().sum().sum())

# Impute values are learned on the merged train frame only and applied to both splits.
imputer = MergeMissingImputer()
imputer.fit(merge_train)
merge_train = imputer.transform(merge_train)
merge_test = imputer.transform(merge_test)

print("merge_train.isnull().sum().sum()", merge_train.isnull().sum().sum())
print("merge_test.isnull().sum().sum()", merge_test.isnull().sum().sum())
print("merge_train.shape", merge_train.shape)
print("merge_test.shape", merge_test.shape)


# Drop collinear columns across the merged tables; 0.95 is presumably the correlation
# threshold -- TODO confirm against CollinearColumnRemover. Fitted on train, applied to both.
remover = CollinearColumnRemover(0.95)
remover.fit(merge_train)
merge_train = remover.transform(merge_train)
merge_test = remover.transform(merge_test)
print("merge_train.shape", merge_train.shape)
print("merge_test.shape", merge_test.shape)

# Final column alignment; the target is now called "APPL_TARGET" because of the
# prefix added during the merge, and it ends up as the last train column.
merge_train, merge_test = train_test_col_align(merge_train, merge_test, exclude_cols=["APPL_TARGET"])
print("merge_train.shape", merge_train.shape)
print("merge_test.shape", merge_test.shape)

Merging with BURE_
Merging with BUBA_
Merging with PRAP_
Merging with POBA_
Merging with INPA_
Merging with CCBA_
Merging with BURE_
Merging with BUBA_
Merging with PRAP_
Merging with POBA_
Merging with INPA_
Merging with CCBA_
merge_train.shape (307511, 2518)
merge_test.shape (48744, 2517)
merge_train.isnull().sum().sum() 218065886
merge_test.isnull().sum().sum() 22803797
merge_train.isnull().sum().sum() 0
merge_test.isnull().sum().sum() 0
merge_train.shape (307511, 2518)
merge_test.shape (48744, 2517)
Number of columns droped: 69
Number of columns droped: 69
merge_train.shape (307511, 2449)
merge_test.shape (48744, 2448)
merge_train.shape (307511, 2449)
merge_test.shape (48744, 2448)


In [19]:
# Persist the merged feature tables; the train file also contains the "APPL_TARGET" column.
merge_train.to_csv("data/data_/X_y_train.csv", index=False)
merge_test.to_csv("data/data_/X_test.csv", index=False)