In [1]:
import warnings
warnings.filterwarnings('ignore')

import time

import pandas as pd
import numpy as np
import scipy
from scipy import stats

import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin

# Helper functions

In [2]:
def change_dtypes(df):
    """
    Downcast column dtypes to reduce the memory size of ``df``.

    Each column is handled by the first rule that matches:
      * object columns with at least one repeated value -> ``category``
      * columns whose set of unique values is exactly {0, 1} -> ``bool``
      * ``float`` (float64) columns -> ``float32``
      * ``int`` columns -> ``int32``, but only when every value fits in int32

    The dataframe is modified in place and also returned. Memory usage before
    and after the conversion is printed.

    :param df: dataframe
    :return df: dataframe (the same object that was passed in)
    """
    memory = df.memory_usage().sum() / 10**6
    print("Memory usage before changing types %0.2f MB" % memory)

    int32_info = np.iinfo(np.int32)

    for col in df.columns:
        if (df[col].dtype == "object") and (df[col].nunique() < df.shape[0]):
            df[col] = df[col].astype("category")

        elif set(df[col].unique()) == set([0, 1]):
            df[col] = df[col].astype(bool)

        elif df[col].dtype == float:
            df[col] = df[col].astype(np.float32)

        elif df[col].dtype == int:
            # An integer-to-integer cast wraps around silently on overflow, which would
            # corrupt large values (e.g. big identifiers); keep the wide dtype in that case.
            if (df[col].min() >= int32_info.min) and (df[col].max() <= int32_info.max):
                df[col] = df[col].astype(np.int32)

    memory = df.memory_usage().sum() / 10 ** 6
    print("Memory usage after changing types %0.2f MB" % memory)
    return df


def load_csv(filename):
    """
    Read a CSV file into a dataframe and downcast its column types.

    :param filename: path handed to ``pd.read_csv``
    :return: dataframe returned by ``change_dtypes``
    """
    return change_dtypes(pd.read_csv(filename))

In [3]:
def check_type(val):
    """
    Classify a value for the imputers.

    :param val: any object
    :return: "string" for str, "number" for Python or numpy numeric scalars
             (bool excluded), "function" for callables, otherwise ``str(type(val))``
    """
    if isinstance(val, str):
        return "string"

    # np.issubsctype was removed in NumPy 2.0, so the check is spelled with isinstance.
    # bool is a subclass of int in Python but is not treated as a number here.
    if isinstance(val, (int, float, complex, np.number)) and not isinstance(val, bool):
        return "number"

    if callable(val):
        return "function"

    return str(type(val))


class NumColsImputer(BaseEstimator, TransformerMixin):
    """
    Fill missing values of an all-numeric dataframe.

    Impute values are computed in ``fit``; only the columns that contain at least one
    missing value in the dataframe given to ``fit`` are filled by ``transform``.
    """

    def __init__(self, specified_values=None, default="median"):
        """
        :param specified_values: dict {colname (str): val (float)}, impute values for some specific columns
        :param default: str, function or float, value or function used for the remaining columns.
                        A str is the name of a DataFrame method called without arguments (e.g. "median"),
                        a function is applied to every column through ``DataFrame.apply``.
        """
        assert (specified_values is None) or isinstance(
            specified_values, dict), "specified_values must be None or dict"

        # BaseEstimator reads the constructor parameters back through attributes that
        # carry the argument names (get_params / clone / repr), so expose them as well.
        self.specified_values = specified_values
        self.default = default

        self._specified_values = specified_values
        if self._specified_values is not None:
            for col, val in self._specified_values.items():
                assert check_type(val) == "number", "Impute value for " + col + " is not number."

        self._default = default
        self._default_type = check_type(self._default)
        if self._default_type not in ["number", "string", "function"]:
            raise ValueError("Unsupported stat type " + self._default_type)

    def _cal_imput_vals(self, df):
        """
        Compute one impute value per column of ``df``.

        :param df: dataframe that must not contain object, category or bool columns
        :return: dict {colname: impute value}; entries of ``specified_values`` override the default
        :raises ValueError: if ``df`` has non-number columns
        """
        cat_cols = df.select_dtypes(["object", "category", "bool"]).columns.to_list()
        if len(cat_cols) > 0:
            raise ValueError("There are non-number columns: " + ", ".join(cat_cols))

        all_cols = df.columns.to_list()
        if self._default_type == "number":
            impute_values = {col: self._default for col in all_cols}

        elif self._default_type == "string":
            # e.g. "median" -> df.median(), a Series indexed by column name
            impute_values = getattr(df, self._default)()

        elif self._default_type == "function":
            impute_values = df.apply(self._default)

        else:
            # __init__ rejects every other type; fail loudly instead of returning None.
            raise ValueError("Unsupported stat type " + self._default_type)

        impute_values = dict(impute_values)
        if self._specified_values is None:
            return impute_values

        for col in self._specified_values:
            impute_values[col] = self._specified_values[col]

        return impute_values

    def fit(self, df, y=None):
        """
        Learn the impute values for the columns of ``df`` that contain missing values.

        :param df: all-numeric dataframe
        :param y: ignored, accepted so that ``fit_transform(X, y)`` and pipelines work
        :return: self
        :raises ValueError: if one of the impute values is NaN
        """
        impute_values = self._cal_imput_vals(df)

        cols_with_na = [col for col in df.columns if df[col].isnull().any()]
        self._impute_values = {col: impute_values[col] for col in cols_with_na}

        for k, v in self._impute_values.items():
            if np.isnan(v):
                raise ValueError("One of the impute_values is NaN: " + k)

        return self

    def transform(self, df):
        """
        :param df: dataframe
        :return: new dataframe with the learned impute values filled in
        """
        return df.fillna(self._impute_values)


class CatColsImputer(BaseEstimator, TransformerMixin):
    """
    Fill missing values of a dataframe without number columns.

    Only the columns that contain at least one missing value in the dataframe given
    to ``fit`` are filled by ``transform``; those columns come back as ``category``.
    """

    def __init__(self, specified_values=None, default="missing_value"):
        """
        :param specified_values: dict {colname (str): val (str or function)},
                                 impute values for some specific columns; a function
                                 is called with the column (Series) during fit
        :param default: str, used for the remaining columns
        """
        assert (specified_values is None) or isinstance(
            specified_values, dict), "specified_values must be None or dict"

        # BaseEstimator reads the constructor parameters back through attributes that
        # carry the argument names (get_params / clone / repr), so expose them as well.
        self.specified_values = specified_values
        self.default = default

        self._specified_values = specified_values
        if self._specified_values is not None:
            for col, val in self._specified_values.items():
                assert check_type(val) in ["string",
                                           "function"], "Impute value for " + col + " is " + check_type(val)

        self._default = default
        assert check_type(self._default) == "string", "default must be string"

    def _cal_imput_vals(self, df):
        """
        Compute one impute value per column of ``df``.

        :param df: dataframe that must not contain number columns
        :return: dict {colname: impute value}
        :raises ValueError: if ``df`` has number columns or a specified value has an unsupported type
        """
        num_cols = df.select_dtypes(["number"]).columns.to_list()
        if len(num_cols) > 0:
            raise ValueError("There are number columns: " + ", ".join(num_cols))

        all_cols = df.columns.to_list()
        impute_values = {col: self._default for col in all_cols}
        if self._specified_values is None:
            return impute_values

        for col, val in self._specified_values.items():
            dtype = check_type(val)
            if dtype == "string":
                impute_values[col] = val

            elif dtype == "function":
                impute_values[col] = val(df[col])

            else:
                # __init__ asserts against this; fail loudly instead of returning None.
                raise ValueError("Impute value for " + col + " is " + dtype)
        return impute_values

    def fit(self, df, y=None):
        """
        Learn the impute values for the columns of ``df`` that contain missing values.

        :param df: dataframe without number columns
        :param y: ignored, accepted so that ``fit_transform(X, y)`` and pipelines work
        :return: self
        """
        impute_values = self._cal_imput_vals(df)

        cols_with_na = [col for col in df.columns if df[col].isnull().any()]
        self._impute_values = {col: impute_values[col] for col in cols_with_na}

        return self

    def transform(self, df):
        """
        :param df: dataframe
        :return: copy of ``df`` in which every learned column is filled and cast to category
        """
        df_new = df.copy()
        for col, val in self._impute_values.items():
            # go through object first so that the fill value need not be an existing category
            df_new[col] = df_new[col].astype("object").fillna(val).astype("category")

        return df_new


class CollinearColumnRemover(BaseEstimator, TransformerMixin):
    """
    Drop columns that are highly correlated with an earlier column.

    ``fit`` scans the selected columns from left to right; whenever the absolute
    correlation between a kept column and a later column exceeds the threshold,
    the later column is marked for removal. ``transform`` drops the marked columns.
    """

    def __init__(self, threshold, col_regex=None):
        """
        :param threshold: float in [0, 1], if two columns have correlation greater than threshold
                          one of them will be removed
        :param col_regex: str, regular expression to select columns
        """
        # BaseEstimator reads the constructor parameters back through attributes that
        # carry the argument names (get_params / clone / repr), so expose them as well.
        self.threshold = threshold
        self.col_regex = col_regex

        self._threshold = threshold
        self._col_regex = col_regex

    def _collinear_columns_NOTUSED(self, df, threshold):
        """
        Older pairwise variant, not called by ``fit``.

        Unlike ``_collinear_columns`` it compares the signed correlation (no absolute
        value) and computes every pair separately with ``corrwith``.
        """
        if self._col_regex is None:
            df_sel = df.select_dtypes(["number", "bool"])
        else:
            df_sel = df.filter(regex=self._col_regex)

        all_cols = df_sel.columns.to_list()
        ncols = len(all_cols)

        collin_cols = []
        for i in range(ncols-1):
            col_i = all_cols[i]
            if col_i in collin_cols:
                continue

            for j in range(i + 1, ncols):
                col_j = all_cols[j]
                if col_j in collin_cols:
                    continue

                corr = df_sel[[col_i]].corrwith(df_sel[col_j]).values[0]
                if corr > threshold:
                    collin_cols.append(col_j)

        collin_cols = list(set(collin_cols))
        return collin_cols

    def _collinear_columns(self, df, threshold):
        """
        :param df: dataframe
        :param threshold: float, absolute correlation above which the later column is dropped
        :return: list of column names to drop, in the order they appear in ``df``
        """
        if self._col_regex is None:
            df_sel = df.select_dtypes(["number", "bool"])
        else:
            df_sel = df.filter(regex=self._col_regex)

        all_cols = df_sel.columns.to_list()
        ncols = len(all_cols)

        corr_mat = df_sel.corr().abs()
        # a set keeps the two membership tests of the double loop O(1)
        dropped = set()
        for i in range(ncols - 1):
            col_i = all_cols[i]
            if col_i in dropped:
                continue

            for j in range(i + 1, ncols):
                col_j = all_cols[j]
                if col_j in dropped:
                    continue

                # NaN correlations compare False and therefore never trigger a drop
                if corr_mat.loc[col_i, col_j] > threshold:
                    dropped.add(col_j)

        # column order instead of arbitrary set order, so the result is deterministic
        return [col for col in all_cols if col in dropped]

    def fit(self, df, y=None):
        """
        :param df: training dataframe
        :param y: ignored, accepted so that ``fit_transform(X, y)`` and pipelines work
        :return: self
        """
        self._collin_cols = self._collinear_columns(df, self._threshold)
        return self

    def transform(self, df):
        """
        :param df: dataframe
        :return: ``df`` without the collinear columns found by ``fit``; columns that are
                 absent from ``df`` are reported with a printed warning and skipped
        """
        all_cols = set(df.columns.to_list())
        nonexist_cols = [col for col in self._collin_cols if col not in all_cols]
        if len(nonexist_cols) > 0:
            print("WARNING: These collinear cols to be droped do not exist in df:", nonexist_cols)

        droped_col = [col for col in self._collin_cols if col in all_cols]
        return df.drop(droped_col, axis="columns")
    

class OneHotEncoder(BaseEstimator, TransformerMixin):
    """
    One-hot encode the object/category columns of a dataframe.

    The dummy columns are fixed by ``fit``; ``transform`` always returns exactly those
    dummy columns, in the order seen during fit, after the non-categorical columns.
    """

    def fit(self, train_df, y=None):
        """
        :param train_df: training dataframe
        :param y: ignored, accepted so that ``fit_transform(X, y)`` and pipelines work
        :return: self
        """
        df_cat = train_df.select_dtypes(["object", "category"])
        self._cat_cols = df_cat.columns.to_list()
        # explicit dtype: the default dummy dtype differs between pandas versions
        self._cat_cols_ohe = pd.get_dummies(df_cat, dtype=np.uint8).columns.to_list()
        return self

    def transform(self, df):
        """
        :param df: dataframe with the same categorical columns as the training dataframe
        :return: dataframe made of the non-categorical columns followed by the uint8 dummy columns
        """
        df_cat = df.select_dtypes(["object", "category"])
        cat_cols = df_cat.columns.to_list()
        assert set(cat_cols) == set(self._cat_cols), "df does not have the same categorical cols as train_df"

        # one-hot encode
        df_cat = pd.get_dummies(df_cat, dtype=np.uint8)

        # Align with the training dummies in one step: classes that only exist in df are
        # dropped, classes that are missing in df become all-zero columns, and the column
        # order matches the one seen during fit.
        df_cat = df_cat.reindex(columns=self._cat_cols_ohe, fill_value=0).astype(np.uint8)

        num_cols = [col for col in df.columns if col not in cat_cols]
        df_num = df[num_cols]

        return pd.concat([df_num, df_cat], axis="columns")

In [4]:
def get_colnames_from_regex(df, regex_strings):
    """
    Collect the column names of ``df`` matched by each regular expression.

    :param df: dataframe
    :param regex_strings: iterable of regex strings passed to ``df.filter(regex=...)``
    :return: list of column names, pattern by pattern (a column matched by several
             patterns appears several times)
    """
    return [
        name
        for pattern in regex_strings
        for name in df.filter(regex=pattern).columns.to_list()
    ]


class Imputer(BaseEstimator, TransformerMixin):
    """
    Base class that imputes number and categorical columns of one table.

    It cannot be instantiated directly. A subclass defines its own ``__init__``
    (without calling this one) and sets:
      * ``_regex_strings``: list of column-name regexes or None
      * ``_spec_impt_regex_val_num``: impute value for the number columns matched by the regexes
      * ``_spec_impt_vals_num``: dict {colname: impute value} for number columns, or None
      * ``_default_imput_vals_num``: default for the other number columns (e.g. "median")
      * ``_spec_impt_vals_cat``: dict {colname: impute value} for categorical columns, or None
      * ``_default_imput_vals_cat``: default for the other categorical columns (e.g. "missing_value")
    """

    def __init__(self):
        raise NotImplementedError("Not implemented")

    def fit(self, df_train, y=None):
        """
        Fit a NumColsImputer on the number columns and a CatColsImputer on the
        object/category/bool columns of ``df_train``.

        Columns matched by ``_regex_strings`` are added to ``_spec_impt_vals_num``.

        :param df_train: training dataframe
        :param y: ignored, accepted so that ``fit_transform(X, y)`` and pipelines work
        :return: self
        """
        if self._spec_impt_vals_num is None:
            # lets the regex update below and transform() work without special cases
            self._spec_impt_vals_num = {}

        if self._regex_strings is not None:
            cols_imput_with_regex = get_colnames_from_regex(df_train, self._regex_strings)
            self._spec_impt_vals_num.update({col: self._spec_impt_regex_val_num
                                             for col in cols_imput_with_regex})

        df_num = df_train.select_dtypes(["number"])
        self._imputer_num = NumColsImputer(specified_values=self._spec_impt_vals_num,
                                           default=self._default_imput_vals_num)
        self._imputer_num.fit(df_num)

        df_cat = df_train.select_dtypes(["object", "category", "bool"])
        self._imputer_cat = CatColsImputer(specified_values=self._spec_impt_vals_cat,
                                           default=self._default_imput_vals_cat)
        self._imputer_cat.fit(df_cat)

        return self

    def transform(self, df):
        """
        :param df: dataframe
        :return: dataframe made of the imputed number columns, one boolean ``<col>_ISNULL``
                 flag per column of ``_spec_impt_vals_num`` (taken before imputation),
                 and the imputed categorical columns
        """
        num_df = df.select_dtypes(["number"])

        # Build the flags in a single expression rather than adding columns to a slice
        # of num_df; this also copes with an empty set of specified columns.
        spec_cols = list(self._spec_impt_vals_num or {})
        isnull_df = num_df[spec_cols].isnull().add_suffix("_ISNULL")

        num_df = self._imputer_num.transform(num_df)

        # cat
        cat_df = df.select_dtypes(["object", "category", "bool"])
        cat_df = self._imputer_cat.transform(cat_df)

        return pd.concat([num_df, isnull_df, cat_df], axis="columns")

In [16]:
def flatten_multiindex_cols(columns):
    """
    Turn a MultiIndex of columns into flat names.

    :param columns: column index exposing ``to_flat_index()``
    :return: list of str, the levels of every column joined with "_"
    """
    flat_names = []
    for levels in columns.to_flat_index():
        flat_names.append("_".join(map(str, levels)))
    return flat_names


def agg_num_cols(df, by_sers, stats):
    """
    Group ``df`` by the given series and aggregate every column with every statistic.

    :param df: dataframe without object/category columns
    :param by_sers: list or tuple of Series used as group keys
    :param stats: list or tuple of aggregations accepted by ``groupby(...).agg``
    :return: aggregated dataframe whose columns are named ``<col>_<stat>``
    :raises ValueError: if ``df`` contains object or category columns
    """
    for arg, message in ((by_sers, "by_sers must be a list or tuple"),
                         (stats, "stats must be a list or tuple")):
        assert type(arg) in (list, tuple), message

    assert all(isinstance(ser, pd.Series) for ser in by_sers), "ser in by_sers must be Series"

    non_number_cols = list(df.select_dtypes(["object", "category"]).columns)
    if non_number_cols:
        raise ValueError("There are non-number cols: " + ", ".join(non_number_cols))

    aggregated = df.groupby(by_sers).agg(stats)
    aggregated.columns = flatten_multiindex_cols(aggregated.columns)
    return aggregated


def agg_cat_cols(df, by_sers, stats):
    """
    Group ``df`` by the given series and aggregate every column with every statistic.

    :param df: dataframe without number columns
    :param by_sers: list or tuple of Series used as group keys
    :param stats: list or tuple of aggregations accepted by ``groupby(...).agg``
    :return: aggregated dataframe whose columns are named ``<col>_<stat>``
    :raises ValueError: if ``df`` contains number columns
    """
    for arg, message in ((by_sers, "by_sers must be a list or tuple"),
                         (stats, "stats must be a list or tuple")):
        assert type(arg) in (list, tuple), message

    assert all(isinstance(ser, pd.Series) for ser in by_sers), "ser in by_sers must be Series"

    number_cols = list(df.select_dtypes(["number"]).columns)
    if number_cols:
        raise ValueError("There are number cols: " + ", ".join(number_cols))

    aggregated = df.groupby(by_sers).agg(stats)
    aggregated.columns = flatten_multiindex_cols(aggregated.columns)
    return aggregated


class Aggregator:
    """
    Aggregate the rows of a child table per group key.

    Boolean, number and category columns are aggregated separately with their own
    list of statistics; optionally the categorical columns are one-hot encoded and
    aggregated as numbers, and inter-quartile range, min-max range and mean-median
    difference of the number columns are appended.
    """

    def __init__(self, by_list_cols,
                 num_stats, bool_stats, cat_stats,
                 one_hot_encode_cat,
                 iqr=False, minmax_range=False, mean_median_diff=False):
        """
        :param by_list_cols: list of column names to group by
        :param num_stats: list of aggregations for number columns
        :param bool_stats: list of aggregations for bool columns
        :param cat_stats: list of aggregations for category columns
        :param one_hot_encode_cat: bool, one-hot encode categoricals and aggregate them with num_stats
        :param iqr: bool, add ``<col>_iqr`` columns
        :param minmax_range: bool, add ``<col>_range`` columns
        :param mean_median_diff: bool, add ``<col>_mm_diff`` columns
        """
        self._by_list_cols = by_list_cols

        self._num_stats = num_stats
        self._bool_stats = bool_stats
        self._cat_stats = cat_stats

        self._one_hot_encode_cat = one_hot_encode_cat

        self._iqr = iqr
        self._minmax_range = minmax_range
        self._mean_median_diff = mean_median_diff

    def _num_agg(self, df, by_sers):
        """Aggregate number columns with ``num_stats``."""
        agg_df = agg_num_cols(df, by_sers, stats=self._num_stats)
        return agg_df

    def _bool_agg(self, df, by_sers):
        """Aggregate bool columns with ``bool_stats``."""
        agg_df = agg_num_cols(df, by_sers, stats=self._bool_stats)
        return agg_df

    def _cat_agg(self, df, by_sers):
        """Aggregate category columns with ``cat_stats``."""
        agg_df = agg_cat_cols(df, by_sers, stats=self._cat_stats)
        return agg_df

    def _iqr_agg(self, num_df, by_sers):
        """Per-group difference between the 0.75 and 0.25 quantiles."""
        grouped = num_df.groupby(by_sers)
        iqr_df = grouped.quantile(0.75) - grouped.quantile(0.25)
        iqr_df.columns = [col + "_iqr" for col in iqr_df.columns]
        return iqr_df

    def _range_agg(self, num_df, by_sers):
        """Per-group difference between max and min."""
        grouped = num_df.groupby(by_sers)
        range_df = grouped.max() - grouped.min()
        range_df.columns = [col + "_range" for col in range_df.columns]
        return range_df

    def _mm_diff_agg(self, num_df, by_sers):
        """Per-group difference between mean and median."""
        grouped = num_df.groupby(by_sers)
        diff_df = grouped.mean() - grouped.median()
        diff_df.columns = [col + "_mm_diff" for col in diff_df.columns]
        return diff_df

    def fit(self, df_train):
        """
        Fit the one-hot encoder on the training table.

        :param df_train: training dataframe, including the group-by columns
        :return: self
        """
        self._ohe = OneHotEncoder()
        # transform() encodes the frame without the group-by columns, so the encoder must
        # be fitted on the same set of columns; otherwise a categorical key would make
        # the encoder's column check fail.
        self._ohe.fit(df_train.drop(self._by_list_cols, axis="columns"))
        return self

    def transform(self, df):
        """
        :param df: dataframe containing the group-by columns
        :return: dataframe indexed by the group keys with all aggregated columns side by side
        """
        by_sers = [df[col] for col in self._by_list_cols]
        df = df.drop(self._by_list_cols, axis="columns")

        dfs = []

        df_bool = df.select_dtypes(["bool"])
        if df_bool.shape[1] > 0:
            print("Aggregating bool df with shape:", df_bool.shape)
            df_bool = self._bool_agg(df_bool, by_sers)
            dfs.append(df_bool)

        # one-hot encode
        if self._one_hot_encode_cat:
            print("One-hot encoding for categoricals and treating the results as numericals")
            df_num = self._ohe.transform(df)
            df_num = df_num.select_dtypes(["number"])
        else:
            print("Do not one-hot encode categoricals")
            df_num = df.select_dtypes(["number"])

        if df_num.shape[1] > 0:
            print("Aggregating num df with shape:", df_num.shape)
            df_num = self._num_agg(df_num, by_sers)
            dfs.append(df_num)

        df_cat = df.select_dtypes(["category"])
        if df_cat.shape[1] > 0:
            print("Aggregating cat df with shape:", df_cat.shape)
            df_cat = self._cat_agg(df_cat, by_sers)
            dfs.append(df_cat)

        # aggregate num cols with iqr, range and mean-median difference
        # (taken from the original frame, i.e. without the one-hot dummies)
        df_num = df.select_dtypes(["number"])

        if self._iqr:
            print("Aggregating num df with iqr")
            df_iqr = self._iqr_agg(df_num, by_sers)
            dfs.append(df_iqr)

        if self._minmax_range:
            print("Aggregating num df with range")
            df_range = self._range_agg(df_num, by_sers)
            dfs.append(df_range)

        if self._mean_median_diff:
            print("Aggregating num df with mean-median difference")
            df_diff = self._mm_diff_agg(df_num, by_sers)
            dfs.append(df_diff)

        return pd.concat(dfs, axis="columns")

In [6]:
def train_test_partition(df, matching_key, train_id_ser):
    """
    Split ``df`` by membership of a key column in the training ids.

    :param df: dataframe
    :param matching_key: name of the column of ``df`` holding the ids
    :param train_id_ser: Series of training ids
    :return: (train, test) where train holds the rows whose id is in ``train_id_ser``
             and test holds all the other rows
    """
    train_mask = df[matching_key].isin(train_id_ser.values)
    return df.loc[train_mask, :], df.loc[~train_mask, :]

In [7]:
def mode(ser):
    """
    Most frequent value of a Series, usable as a groupby aggregation.

    :param ser: Series
    :return: the first mode reported by ``Series.mode``, or NaN when the series
             has no mode (empty or all missing values)
    """
    modes = ser.mode()
    if modes.empty:
        # Series.mode drops missing values, so an all-NaN group has no mode to index
        return np.nan
    return modes.iloc[0]


def entropy(ser):
    """
    Entropy of the value distribution of a Series, usable as a groupby aggregation.

    :param ser: Series
    :return: ``scipy.stats.entropy`` of the relative value frequencies
    """
    return stats.entropy(ser.value_counts(normalize=True))

# Load data

In [12]:
# Load the credit card balance table with downcast dtypes; the trailing
# expression displays its shape as the cell output.
credit_card_balance = load_csv("data/download/credit_card_balance.csv")
credit_card_balance.shape

Memory usage before changing types 706.62 MB
Memory usage after changing types 341.79 MB


(3840312, 23)

# Feature extraction from `application_[train|test]`

In [8]:
class ApplImputer(Imputer):
    """
    Imputation settings for the application tables.

    Number columns whose name matches one of the regexes, plus OWN_CAR_AGE and
    TOTALAREA_MODE, are filled with -1; EXT_SOURCE_1 and EXT_SOURCE_3 with 0;
    the other number columns with the median and categoricals with "missing_value".
    """

    def __init__(self):
        # Imputer.__init__ is intentionally not called: it only raises NotImplementedError.

        # number columns
        self._default_imput_vals_num = "median"
        self._spec_impt_regex_val_num = -1.
        self._regex_strings = [
            "^APARTMENTS_", "^BASEMENTAREA_", "^YEARS_B", "^COMMONAREA_",
            "^ELEVATORS_", "^ENTRANCES_", "^FLOORS", "^LANDAREA_", "^LIVING",
            "^NONLIVING", "AMT_REQ_CREDIT_BUREAU_",
        ]
        self._spec_impt_vals_num = {
            "OWN_CAR_AGE": -1.,
            "EXT_SOURCE_1": 0.,
            "EXT_SOURCE_3": 0.,
            "TOTALAREA_MODE": -1.,
        }

        # categorical columns
        self._default_imput_vals_cat = "missing_value"
        self._spec_impt_vals_cat = None
        

class ApplNewColsAdder(BaseEstimator, TransformerMixin):
    """
    Add engineered columns to the application tables.

    ``fit`` learns replacement values for infinite credit ratios from the training
    table; ``transform`` adds AMT_INCOME_TOTAL_LOG, DAYS_EMPLOYED_POSITIVE,
    CREDIT_TO_INCOME and CREDIT_TO_GOODS and rewrites the positive maximum of
    DAYS_EMPLOYED to 100.
    """

    @staticmethod
    def _ratio_bounds(ratio):
        """Return (finite min / 10, finite max * 10) of a ratio Series."""
        lower = ratio.replace(-np.inf, np.nan).min() / 10.
        upper = ratio.replace(np.inf, np.nan).max() * 10.
        return lower, upper

    def fit(self, df_train, y=None):
        """
        :param df_train: training dataframe with AMT_CREDIT, AMT_INCOME_TOTAL and AMT_GOODS_PRICE
        :param y: ignored, accepted so that ``fit_transform(X, y)`` and pipelines work
        :return: self
        """
        credit_to_income = df_train["AMT_CREDIT"] / df_train["AMT_INCOME_TOTAL"]
        self._cti_min, self._cti_max = self._ratio_bounds(credit_to_income)

        credit_to_goods = df_train["AMT_CREDIT"] / df_train["AMT_GOODS_PRICE"]
        self._ctg_min, self._ctg_max = self._ratio_bounds(credit_to_goods)

        return self

    def transform(self, df):
        """
        :param df: dataframe
        :return: copy of ``df`` with the engineered columns added
        """
        df_new = df.copy()
        df_new["AMT_INCOME_TOTAL_LOG"] = np.log(df_new["AMT_INCOME_TOTAL"])
        df_new["DAYS_EMPLOYED_POSITIVE"] = df_new["DAYS_EMPLOYED"] > 0
        # NOTE(review): the value to replace is the maximum of the frame being transformed,
        # not a value learned in fit — presumably a placeholder code; confirm against the data.
        days_emp_max = df_new["DAYS_EMPLOYED"].max()
        if days_emp_max > 0:
            df_new["DAYS_EMPLOYED"] = df_new["DAYS_EMPLOYED"].replace({days_emp_max: 100.})

        # infinite ratios (division by zero) are replaced by the bounds learned in fit
        df_new["CREDIT_TO_INCOME"] = df_new["AMT_CREDIT"] / df_new["AMT_INCOME_TOTAL"]
        df_new["CREDIT_TO_INCOME"] = df_new["CREDIT_TO_INCOME"].replace(-np.inf, self._cti_min)
        df_new["CREDIT_TO_INCOME"] = df_new["CREDIT_TO_INCOME"].replace(np.inf, self._cti_max)

        df_new["CREDIT_TO_GOODS"] = df_new["AMT_CREDIT"] / df_new["AMT_GOODS_PRICE"]
        df_new["CREDIT_TO_GOODS"] = df_new["CREDIT_TO_GOODS"].replace(-np.inf, self._ctg_min)
        df_new["CREDIT_TO_GOODS"] = df_new["CREDIT_TO_GOODS"].replace(np.inf, self._ctg_max)

        return df_new

In [9]:
# Load both application tables with downcast dtypes.
application_train = load_csv("data/download/application_train.csv")
application_test = load_csv("data/download/application_test.csv")

# Keep the SK_ID_CURR columns; the train ids are used later to split the child tables.
appl_train_key = application_train["SK_ID_CURR"]
appl_test_key = application_test["SK_ID_CURR"]
print(application_train.shape, application_test.shape)

Memory usage before changing types 300.13 MB
Memory usage after changing types 104.87 MB
Memory usage before changing types 47.18 MB
Memory usage after changing types 18.19 MB
(307511, 122) (48744, 121)


In [10]:
time_start = time.time()

# Work on copies so the loaded tables stay untouched.
appl_train = application_train.copy()
appl_test = application_test.copy()

# Every transformer below is fitted on the train table only and applied to both tables.

# 1) impute missing values (this also adds the *_ISNULL indicator columns)
imputer = ApplImputer()
imputer.fit(appl_train)
appl_train = imputer.transform(appl_train)
appl_test = imputer.transform(appl_test)

# 2) drop indicator columns that are almost perfectly correlated with another indicator
remover = CollinearColumnRemover(0.99, col_regex="_ISNULL$")
remover.fit(appl_train)
appl_train = remover.transform(appl_train)
appl_test = remover.transform(appl_test)

# 3) add the engineered columns
adder = ApplNewColsAdder()
adder.fit(appl_train)
appl_train = adder.transform(appl_train)
appl_test = adder.transform(appl_test)

print("appl_train.shape", appl_train.shape)
print("appl_test.shape", appl_test.shape)

# Writing the results to disk is switched off; change the condition to re-export.
if False:
    appl_train.to_csv("data/data_/application_train.csv", index=False)
    appl_test.to_csv("data/data_/application_test.csv", index=False)

time_end = time.time()
time_elapse = time_end - time_start
print("Elapsed Time", time_elapse)

appl_train.shape (307511, 145)
appl_test.shape (48744, 144)
Elapsed Time 30.59755253791809


# Feature extraction from `bureau` data

## `bureau.csv`

In [10]:
class BureauImputer(Imputer):
    """
    Imputation settings for the bureau table.

    Five number columns get fixed impute values, the other number columns the
    median, and categoricals "missing_value". No regex-based selection is used.
    """

    def __init__(self):
        # Imputer.__init__ is intentionally not called: it only raises NotImplementedError.

        # no regex-selected columns for this table
        self._regex_strings = None
        self._spec_impt_regex_val_num = None

        # number columns
        self._default_imput_vals_num = "median"
        self._spec_impt_vals_num = {
            "DAYS_ENDDATE_FACT": 100.,
            "AMT_CREDIT_MAX_OVERDUE": -1000.,
            "AMT_CREDIT_SUM_DEBT": 0.,
            "AMT_CREDIT_SUM_LIMIT": 0.,
            "AMT_ANNUITY": -1000.,
        }

        # categorical columns
        self._default_imput_vals_cat = "missing_value"
        self._spec_impt_vals_cat = None

        
def bureau_add_cols(bu_df):
    """
    Add boolean "is positive" flags for two day-offset columns of the bureau table.

    :param bu_df: dataframe with DAYS_CREDIT_ENDDATE and DAYS_CREDIT_UPDATE
    :return: copy of ``bu_df`` with ``<col>_ISPOSITIVE`` appended for both columns
    """
    flagged = bu_df.copy()
    for source_col in ("DAYS_CREDIT_ENDDATE", "DAYS_CREDIT_UPDATE"):
        flagged[source_col + "_ISPOSITIVE"] = flagged[source_col] > 0
    return flagged

In [11]:
bureau = load_csv("data/download/bureau.csv")
print("bureau.shape", bureau.shape)

# Rows whose SK_ID_CURR is among the train application ids go to the train part,
# every other row goes to the test part.
bureau_train, bureau_test = train_test_partition(bureau, "SK_ID_CURR", appl_train_key)

print("bureau_train.shape", bureau_train.shape)
print("bureau_test.shape", bureau_test.shape)

# Keep the id pairs of both parts.
bureau_train_keys = bureau_train[["SK_ID_CURR", "SK_ID_BUREAU"]]
bureau_test_keys = bureau_test[["SK_ID_CURR", "SK_ID_BUREAU"]]

bureau_train.head()

Memory usage before changing types 233.43 MB
Memory usage after changing types 101.27 MB
bureau.shape (1716428, 17)
bureau_train.shape (1465325, 17)
bureau_test.shape (251103, 17)


Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,


In [13]:
time_start = time.time()

# Work on copies so the partitioned bureau tables stay untouched.
bureau_agg_train = bureau_train.copy()
bureau_agg_test = bureau_test.copy()

# 1) impute missing values (fitted on train, applied to both parts)
imputer = BureauImputer()
imputer.fit(bureau_agg_train)
bureau_agg_train = imputer.transform(bureau_agg_train)
bureau_agg_test = imputer.transform(bureau_agg_test)

# 2) add the *_ISPOSITIVE flags
bureau_agg_train = bureau_add_cols(bureau_agg_train)
bureau_agg_test = bureau_add_cols(bureau_agg_test)

print("bureau_agg_train.shape", bureau_agg_train.shape)
print("bureau_agg_test.shape", bureau_agg_test.shape)


# Statistics per column kind; mode and entropy are the helper functions defined above.
num_stats = ["count", "mean", "median", "min", "max", "var"]
bool_stats = ["count", "mean", "var", mode, entropy]
cat_stats = ["count", "nunique", mode, entropy]


# 3) aggregate per SK_ID_CURR; SK_ID_BUREAU is dropped first so it is not aggregated
by_list_cols = ["SK_ID_CURR"]
bureau_agg_train = bureau_agg_train.drop(["SK_ID_BUREAU"], axis="columns")
bureau_agg_test = bureau_agg_test.drop(["SK_ID_BUREAU"], axis="columns")

aggregator = Aggregator(by_list_cols, num_stats, bool_stats, cat_stats,
                        one_hot_encode_cat=True,
                        iqr=True, minmax_range=True, mean_median_diff=True)
aggregator.fit(bureau_agg_train)
bureau_agg_train = aggregator.transform(bureau_agg_train)
bureau_agg_test = aggregator.transform(bureau_agg_test)

# Turn the group key from the index back into a regular column.
bureau_agg_train = bureau_agg_train.reset_index()
bureau_agg_test = bureau_agg_test.reset_index()

print("bureau_agg_train.shape", bureau_agg_train.shape)
print("bureau_agg_test.shape", bureau_agg_test.shape)

# Aggregations that produced missing values are set to 0.
bureau_agg_train = bureau_agg_train.fillna(0.)
bureau_agg_test = bureau_agg_test.fillna(0.)


# 4) drop aggregated columns that are almost perfectly correlated with an earlier one
remover = CollinearColumnRemover(0.99)
remover.fit(bureau_agg_train)
bureau_agg_train = remover.transform(bureau_agg_train)
bureau_agg_test = remover.transform(bureau_agg_test)

print("bureau_agg_train.shape", bureau_agg_train.shape)
print("bureau_agg_test.shape", bureau_agg_test.shape)

# Writing the results to disk is switched off; change the condition to re-export.
if False:
    bureau_agg_train.to_csv("data/data_/bureau_agg_train.csv", index=False)
    bureau_agg_test.to_csv("data/data_/bureau_agg_test.csv", index=False)


time_end = time.time()
time_elapse = time_end - time_start
print("Elapsed Time", time_elapse)

bureau_agg_train.shape (1465325, 24)
bureau_agg_test.shape (251103, 24)
Aggregating bool df with shape: (1465325, 7)
Aggregating num df with shape: (1465325, 35)
Aggregating cat df with shape: (1465325, 3)
Aggregating num df with iqr
Aggregating num df with range
Aggregating num df with mean-median difference
Aggregating bool df with shape: (251103, 7)
Aggregating num df with shape: (251103, 35)
Aggregating cat df with shape: (251103, 3)
Aggregating num df with iqr
Aggregating num df with range
Aggregating num df with mean-median difference
bureau_agg_train.shape (263491, 363)
bureau_agg_test.shape (42320, 363)
bureau_agg_train.shape (263491, 293)
bureau_agg_test.shape (42320, 293)
Elapsed Time 3633.656004190445


In [41]:
# Remove the aggregated bureau frames from the kernel namespace.
del bureau_agg_train, bureau_agg_test

## `bureau_balance.csv`

In [12]:
def nearest_status(df):
    """
    STATUS of the row with the largest MONTHS_BALANCE.

    :param df: dataframe with MONTHS_BALANCE and STATUS columns
    :return: STATUS value of the first row after sorting by MONTHS_BALANCE descending
    """
    newest_first = df.sort_values(by=["MONTHS_BALANCE"], ascending=False)
    status_col = newest_first["STATUS"]
    return status_col.iloc[0]

def mode_status_five_nearest(df):
    """
    Most frequent STATUS among the five rows with the largest MONTHS_BALANCE.

    :param df: dataframe with MONTHS_BALANCE and STATUS columns
    :return: first mode of STATUS over the first five rows after sorting by
             MONTHS_BALANCE descending
    """
    newest_first = df.sort_values(by=["MONTHS_BALANCE"], ascending=False)
    recent_statuses = newest_first["STATUS"].iloc[: 5]
    most_common = recent_statuses.mode()
    return most_common.values[0]

def mode_status_ten_nearest(df):
    statuses = df.sort_values(by=["MONTHS_BALANCE"], ascending=False)["STATUS"].iloc[: 10]
    return statuses.mode().values[0]

In [13]:
# Load the monthly bureau balance records.
bureau_balance = load_csv("data/download/bureau_balance.csv")
print("bureau_balance.shape", bureau_balance.shape)

# Attach SK_ID_CURR through SK_ID_BUREAU, then keep only the rows that received one.
bureau_balance = (
    bureau_balance
    .merge(bureau[["SK_ID_CURR", "SK_ID_BUREAU"]], how="left", on="SK_ID_BUREAU")
    .dropna(subset=["SK_ID_CURR"])
)
bureau_balance["SK_ID_CURR"] = bureau_balance["SK_ID_CURR"].astype("int32")
print("bureau_balance.shape", bureau_balance.shape)

# Split on SK_ID_CURR; the column is dropped again right after the split.
bureau_balance_train, bureau_balance_test = train_test_partition(
    bureau_balance, "SK_ID_CURR", appl_train_key
)
bureau_balance_train = bureau_balance_train.drop(columns=["SK_ID_CURR"])
bureau_balance_test = bureau_balance_test.drop(columns=["SK_ID_CURR"])

print("bureau_balance_train.shape:", bureau_balance_train.shape)
print("bureau_balance_test.shape:", bureau_balance_test.shape)

bureau_balance_train.head()

Memory usage before changing types 655.20 MB
Memory usage after changing types 245.70 MB
bureau_balance.shape (27299925, 3)
bureau_balance.shape (24179741, 4)
bureau_balance_train.shape: (14701612, 3)
bureau_balance_test.shape: (9478129, 3)


Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C


In [15]:
# Stage 1 of the bureau_balance features: collapse the monthly rows to one row per SK_ID_BUREAU
# and write the result to the *_tmp.csv files at the end of the cell.
time_start = time.time()

# Work on copies so bureau_balance_train / bureau_balance_test are left unchanged.
bureau_balance_agg_train = bureau_balance_train.copy()
bureau_balance_agg_test = bureau_balance_test.copy()
print("bureau_balance_agg_train.shape", bureau_balance_agg_train.shape)
print("bureau_balance_agg_test.shape", bureau_balance_agg_test.shape)


# Status features per SK_ID_BUREAU group. They are computed here, while the frames still hold
# the raw monthly rows: the same names are rebound to the aggregator output further down.
NEAREST_STATUS_train = bureau_balance_agg_train.groupby("SK_ID_BUREAU").apply(nearest_status)
MODE_STATUS_FIVE_NEAREST_train = bureau_balance_agg_train.groupby("SK_ID_BUREAU").apply(mode_status_five_nearest)
MODE_STATUS_TEN_NEAREST_train = bureau_balance_agg_train.groupby("SK_ID_BUREAU").apply(mode_status_ten_nearest)

NEAREST_STATUS_test = bureau_balance_agg_test.groupby("SK_ID_BUREAU").apply(nearest_status)
MODE_STATUS_FIVE_NEAREST_test = bureau_balance_agg_test.groupby("SK_ID_BUREAU").apply(mode_status_five_nearest)
MODE_STATUS_TEN_NEAREST_test = bureau_balance_agg_test.groupby("SK_ID_BUREAU").apply(mode_status_ten_nearest)

# Statistics requested from the Aggregator for numeric, boolean and categorical columns.
# `mode` and `entropy` are callables defined elsewhere in the notebook.
num_stats = ["count", "mean", "min"]
bool_stats = []
cat_stats = ["count", "nunique", mode, entropy]

# aggregate over "SK_ID_BUREAU"
by_list_cols = ["SK_ID_BUREAU"]
aggregator = Aggregator(by_list_cols, num_stats, bool_stats, cat_stats,
                        one_hot_encode_cat=False,
                        iqr=True, minmax_range=True, mean_median_diff=True)
# Fit on the train split only, then apply the fitted aggregator to both splits.
aggregator.fit(bureau_balance_agg_train)
bureau_balance_agg_train = aggregator.transform(bureau_balance_agg_train)
bureau_balance_agg_test = aggregator.transform(bureau_balance_agg_test)
print("bureau_balance_agg_train.isnull().sum().sum():", bureau_balance_agg_train.isnull().sum().sum())
print("bureau_balance_agg_test.isnull().sum().sum():", bureau_balance_agg_test.isnull().sum().sum())
print("bureau_balance_agg_train.shape", bureau_balance_agg_train.shape)
print("bureau_balance_agg_test.shape", bureau_balance_agg_test.shape)


# Add the status features as columns. Column assignment aligns the Series on the frame's index,
# so this relies on the aggregator output being indexed by SK_ID_BUREAU — TODO confirm against Aggregator.
bureau_balance_agg_train["NEAREST_STATUS"] = NEAREST_STATUS_train
bureau_balance_agg_train["MODE_STATUS_FIVE_NEAREST"] = MODE_STATUS_FIVE_NEAREST_train
bureau_balance_agg_train["MODE_STATUS_TEN_NEAREST"] = MODE_STATUS_TEN_NEAREST_train

bureau_balance_agg_test["NEAREST_STATUS"] = NEAREST_STATUS_test
bureau_balance_agg_test["MODE_STATUS_FIVE_NEAREST"] = MODE_STATUS_FIVE_NEAREST_test
bureau_balance_agg_test["MODE_STATUS_TEN_NEAREST"] = MODE_STATUS_TEN_NEAREST_test
print("bureau_balance_agg_train.shape", bureau_balance_agg_train.shape)
print("bureau_balance_agg_test.shape", bureau_balance_agg_test.shape)

# remove collinear columns
# (the remover is fitted on the train split with 0.99 as its argument and then applied to both splits)
remover = CollinearColumnRemover(0.99)
remover.fit(bureau_balance_agg_train)
bureau_balance_agg_train = remover.transform(bureau_balance_agg_train)
bureau_balance_agg_test = remover.transform(bureau_balance_agg_test)

print("bureau_balance_agg_train.shape", bureau_balance_agg_train.shape)
print("bureau_balance_agg_test.shape", bureau_balance_agg_test.shape)

# Move the index back into regular columns so it is written out by to_csv(index=False) below.
bureau_balance_agg_train = bureau_balance_agg_train.reset_index()
bureau_balance_agg_test = bureau_balance_agg_test.reset_index()
print("bureau_balance_agg_train.shape", bureau_balance_agg_train.shape)
print("bureau_balance_agg_test.shape", bureau_balance_agg_test.shape)

# Always-on switch: write the intermediate per-SK_ID_BUREAU result to disk.
# The same two paths are read back with load_csv by the SK_ID_CURR aggregation cell.
if True:
    bureau_balance_agg_train.to_csv("data/data_/bureau_balance_agg_train_tmp.csv", index=False)
    bureau_balance_agg_test.to_csv("data/data_/bureau_balance_agg_test_tmp.csv", index=False)

# Wall-clock duration of this cell, in seconds.
time_end = time.time()
time_elapse = time_end - time_start
print("Elapsed Time", time_elapse)

bureau_balance_agg_train.shape (14701612, 3)
bureau_balance_agg_test.shape (9478129, 3)
Do not one-hot encode categoricals
Aggregating num df with shape: (14701612, 1)
Aggregating cat df with shape: (14701612, 1)
Do not one-hot encode categoricals
Aggregating num df with iqr
Aggregating num df with range
Aggregating num df with mean-median difference
Do not one-hot encode categoricals
Aggregating num df with shape: (9478129, 1)
Aggregating cat df with shape: (9478129, 1)
Do not one-hot encode categoricals
Aggregating num df with iqr
Aggregating num df with range
Aggregating num df with mean-median difference
bureau_balance_agg_train.isnull().sum().sum(): 0
bureau_balance_agg_test.isnull().sum().sum(): 0
bureau_balance_agg_train.shape (523515, 10)
bureau_balance_agg_test.shape (250839, 10)
bureau_balance_agg_train.shape (523515, 13)
bureau_balance_agg_test.shape (250839, 13)
bureau_balance_agg_train.shape (523515, 10)
bureau_balance_agg_test.shape (250839, 10)
bureau_balance_agg_train.s

In [14]:
# aggregate over "SK_ID_CURR"
# Stage 2 of the bureau_balance features: start from the per-SK_ID_BUREAU rows saved to the
# *_tmp.csv files and aggregate them per SK_ID_CURR.
time_start = time.time()

# load_csv runs change_dtypes, which converts columns whose values are exactly {0, 1} into bool;
# that is acceptable here
bureau_balance_agg_train = load_csv("data/data_/bureau_balance_agg_train_tmp.csv")
bureau_balance_agg_test = load_csv("data/data_/bureau_balance_agg_test_tmp.csv")
print("bureau_balance_agg_train.shape", bureau_balance_agg_train.shape)
print("bureau_balance_agg_test.shape", bureau_balance_agg_test.shape)

# Left-merge on SK_ID_BUREAU so every aggregated row is kept. bureau_train_keys / bureau_test_keys
# come from an earlier cell and presumably hold the (SK_ID_CURR, SK_ID_BUREAU) pairs — TODO confirm.
# The null counts printed next show whether any row was left without a match.
bureau_balance_agg_train = bureau_balance_agg_train.merge(bureau_train_keys, how="left", on="SK_ID_BUREAU")
bureau_balance_agg_test = bureau_balance_agg_test.merge(bureau_test_keys, how="left", on="SK_ID_BUREAU")
print("bureau_balance_agg_train.isnull().sum().sum():", bureau_balance_agg_train.isnull().sum().sum())
print("bureau_balance_agg_test.isnull().sum().sum():", bureau_balance_agg_test.isnull().sum().sum())
print("bureau_balance_agg_train.shape", bureau_balance_agg_train.shape)
print("bureau_balance_agg_test.shape", bureau_balance_agg_test.shape)

# SK_ID_BUREAU was only needed as the merge key; drop it before aggregating.
bureau_balance_agg_train = bureau_balance_agg_train.drop(["SK_ID_BUREAU"], axis="columns")
bureau_balance_agg_test = bureau_balance_agg_test.drop(["SK_ID_BUREAU"], axis="columns")
print("bureau_balance_agg_train.shape", bureau_balance_agg_train.shape)
print("bureau_balance_agg_test.shape", bureau_balance_agg_test.shape)

# aggregate
# Statistics requested from the Aggregator for numeric, boolean and categorical columns.
# `mode` and `entropy` are callables defined elsewhere in the notebook.
num_stats = ["count", "mean", "median", "min"]
bool_stats = ["count", "mean", "var", mode, entropy]
cat_stats = ["count", "nunique", mode, entropy]

by_list_cols = ["SK_ID_CURR"]
aggregator = Aggregator(by_list_cols, num_stats, bool_stats, cat_stats,
                        one_hot_encode_cat=True,
                        iqr=True, minmax_range=True, mean_median_diff=True)
# Fit on the train split only, then apply the fitted aggregator to both splits.
aggregator.fit(bureau_balance_agg_train)
bureau_balance_agg_train = aggregator.transform(bureau_balance_agg_train)
bureau_balance_agg_test = aggregator.transform(bureau_balance_agg_test)
print("bureau_balance_agg_train.isnull().sum().sum():", bureau_balance_agg_train.isnull().sum().sum())
print("bureau_balance_agg_test.isnull().sum().sum():", bureau_balance_agg_test.isnull().sum().sum())
print("bureau_balance_agg_train.shape", bureau_balance_agg_train.shape)
print("bureau_balance_agg_test.shape", bureau_balance_agg_test.shape)

# Replace any NaN left in the aggregated frames with 0. before the collinearity filter.
bureau_balance_agg_train = bureau_balance_agg_train.fillna(0.)
bureau_balance_agg_test = bureau_balance_agg_test.fillna(0.)

# remove collinear columns
# (the remover is fitted on the train split with 0.99 as its argument and then applied to both splits)
remover = CollinearColumnRemover(0.99)
remover.fit(bureau_balance_agg_train)
bureau_balance_agg_train = remover.transform(bureau_balance_agg_train)
bureau_balance_agg_test = remover.transform(bureau_balance_agg_test)

print("bureau_balance_agg_train.shape", bureau_balance_agg_train.shape)
print("bureau_balance_agg_test.shape", bureau_balance_agg_test.shape)

# Move the index back into regular columns so it would be written out by to_csv(index=False) below.
bureau_balance_agg_train = bureau_balance_agg_train.reset_index()
bureau_balance_agg_test = bureau_balance_agg_test.reset_index()
print("bureau_balance_agg_train.shape", bureau_balance_agg_train.shape)
print("bureau_balance_agg_test.shape", bureau_balance_agg_test.shape)

# Disabled switch: change to True to write the final per-SK_ID_CURR features to disk.
if False:
    bureau_balance_agg_train.to_csv("data/data_/bureau_balance_agg_train.csv", index=False)
    bureau_balance_agg_test.to_csv("data/data_/bureau_balance_agg_test.csv", index=False)

# Wall-clock duration of this cell, in seconds.
time_end = time.time()
time_elapse = time_end - time_start
print("Elapsed Time", time_elapse)

Memory usage before changing types 180.09 MB
Memory usage after changing types 63.35 MB
Memory usage before changing types 86.29 MB
Memory usage after changing types 30.35 MB
bureau_balance_agg_train.shape (523515, 43)
bureau_balance_agg_test.shape (250839, 43)
bureau_balance_agg_train.isnull().sum().sum(): 0
bureau_balance_agg_test.isnull().sum().sum(): 0
bureau_balance_agg_train.shape (523515, 44)
bureau_balance_agg_test.shape (250839, 44)
bureau_balance_agg_train.shape (523515, 43)
bureau_balance_agg_test.shape (250839, 43)
Aggregating bool df with shape: (523515, 13)
Aggregating num df with shape: (523515, 57)
Aggregating cat df with shape: (523515, 4)
Aggregating num df with iqr
Aggregating num df with range
Aggregating num df with mean-median difference
Aggregating bool df with shape: (250839, 13)
Aggregating num df with shape: (250839, 57)
Aggregating cat df with shape: (250839, 4)
Aggregating num df with iqr
Aggregating num df with range
Aggregating num df with mean-median diff

In [28]:
# Remove the aggregated bureau_balance frames from the kernel namespace before the next section.
del bureau_balance_agg_train
del bureau_balance_agg_test

# Feature extraction from `previous application` data

## `previous_application.csv`

In [11]:
class PrevApplImputer(Imputer):
    """
    Imputation settings for `previous_application.csv`.

    The constructor only stores configuration attributes; how they are consumed is defined
    by the `Imputer` base class (not shown here).
    """
    def __init__(self):
        # regex-related settings are left unset
        self._regex_strings = None
        self._spec_impt_regex_val_num = None

        # per-column values for numeric columns; the DAYS_* columns all share 0.
        days_cols = ["DAYS_FIRST_DRAWING",
                     "DAYS_FIRST_DUE",
                     "DAYS_LAST_DUE_1ST_VERSION",
                     "DAYS_LAST_DUE",
                     "DAYS_TERMINATION"]
        num_values = {"RATE_DOWN_PAYMENT": -1., "CNT_PAYMENT": -10.}
        num_values.update(dict.fromkeys(days_cols, 0.))
        self._spec_impt_vals_num = num_values
        # presumably the fallback for the remaining numeric columns — verify against Imputer
        self._default_imput_vals_num = "median"

        # per-column values for categorical columns, plus the categorical fallback label
        missing_label = "missing_value"
        self._spec_impt_vals_cat = {col: missing_label
                                    for col in ("NAME_TYPE_SUITE", "NFLAG_INSURED_ON_APPROVAL")}
        self._default_imput_vals_cat = missing_label
    

def hour_period_bin(hours):
    """
    bin hours into "morning", "afternoon" or "evening"
    :param hours: pandas Series of hour values (its `.values` array is used)
    :return labels: numpy object array with one label per input value;
                    "morning" for 5 < hour < 12, "afternoon" for 12 <= hour < 18, "evening" otherwise
    """
    hour_values = hours.values

    # every entry starts as "evening"; the two masks below overwrite the daytime hours
    labels = np.full(len(hour_values), "evening", dtype="object")

    is_morning = (hour_values > 5) & (hour_values < 12)
    is_afternoon = (hour_values >= 12) & (hour_values < 18)
    labels[is_morning] = "morning"
    labels[is_afternoon] = "afternoon"

    return labels

In [13]:
# Load the previous applications and split them into train / test on SK_ID_CURR.
previous_application = load_csv("data/download/previous_application.csv")
print("previous_application.shape", previous_application.shape)

prev_appl_train, prev_appl_test = train_test_partition(
    previous_application, "SK_ID_CURR", appl_train_key
)

print("prev_appl_train.shape", prev_appl_train.shape)
print("prev_appl_test.shape", prev_appl_test.shape)


# Keep the (SK_ID_CURR, SK_ID_PREV) pairs of each split.
prev_appl_train_keys, prev_appl_test_keys = (
    prev_appl_train[["SK_ID_CURR", "SK_ID_PREV"]],
    prev_appl_test[["SK_ID_CURR", "SK_ID_PREV"]],
)

prev_appl_train.head()

Memory usage before changing types 494.38 MB
Memory usage after changing types 162.02 MB
previous_application.shape (1670214, 37)
prev_appl_train.shape (1413701, 37)
prev_appl_test.shape (256513, 37)


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.430054,17145.0,17145.0,0.0,17145.0,SATURDAY,15,...,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615234,607500.0,679671.0,,607500.0,THURSDAY,11,...,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735352,112500.0,136444.5,,112500.0,TUESDAY,11,...,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335938,450000.0,470790.0,,450000.0,MONDAY,7,...,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.394531,337500.0,404055.0,,337500.0,THURSDAY,9,...,XNA,24.0,high,Cash Street: high,,,,,,


In [119]:
# Build the per-SK_ID_CURR aggregate features from previous_application.
# The timer must be stored in `time_start`: that is the name read by the elapsed-time
# computation at the end of this cell.
time_start = time.time()

# Re-partition from the untouched `previous_application` frame: prev_appl_train / prev_appl_test
# are overwritten further down, so starting from the raw frame keeps this cell re-runnable.
prev_appl_train, prev_appl_test = train_test_partition(previous_application, "SK_ID_CURR", appl_train_key)

print("prev_appl_train.shape", prev_appl_train.shape)
print("prev_appl_test.shape", prev_appl_test.shape)


# Keep the (SK_ID_CURR, SK_ID_PREV) pairs before SK_ID_PREV is dropped.
prev_appl_train_keys = prev_appl_train[["SK_ID_CURR", "SK_ID_PREV"]]
prev_appl_test_keys = prev_appl_test[["SK_ID_CURR", "SK_ID_PREV"]]

prev_appl_train = prev_appl_train.drop(["SK_ID_PREV"], axis="columns")
prev_appl_test = prev_appl_test.drop(["SK_ID_PREV"], axis="columns")

print("prev_appl_train.shape", prev_appl_train.shape)
print("prev_appl_test.shape", prev_appl_test.shape)


# drop cols with high percentage of null
cols_to_drop = ["RATE_INTEREST_PRIMARY", "RATE_INTEREST_PRIVILEGED"]
prev_appl_train = prev_appl_train.drop(cols_to_drop, axis="columns")
prev_appl_test = prev_appl_test.drop(cols_to_drop, axis="columns")

print("prev_appl_train.shape", prev_appl_train.shape)
print("prev_appl_test.shape", prev_appl_test.shape)


# impute missing values (imputer is fitted on the train split only and applied to both splits)
imputer = PrevApplImputer()
imputer.fit(prev_appl_train)
prev_appl_train = imputer.transform(prev_appl_train)
prev_appl_test = imputer.transform(prev_appl_test)

print("prev_appl_train.isnull().sum().sum()", prev_appl_train.isnull().sum().sum())
print("prev_appl_test.isnull().sum().sum()", prev_appl_test.isnull().sum().sum())
print("prev_appl_train.shape", prev_appl_train.shape)
print("prev_appl_test.shape", prev_appl_test.shape)


# add bool cols to identify if values are non-negative
# NOTE(review): PrevApplImputer lists 0. as the value for these DAYS_* columns, so rows that were
# imputed would also be flagged as non-negative — confirm this is intended.
cols_is_nonneg = ["DAYS_FIRST_DRAWING", "DAYS_FIRST_DUE", "DAYS_LAST_DUE_1ST_VERSION", 
                    "DAYS_LAST_DUE", "DAYS_TERMINATION"]
for col in cols_is_nonneg:
    prev_appl_train[col + "_IS_NONNEG"] = prev_appl_train[col] >= 0
    prev_appl_test[col + "_IS_NONNEG"] = prev_appl_test[col] >= 0

print("prev_appl_train.shape", prev_appl_train.shape)
print("prev_appl_test.shape", prev_appl_test.shape)

# bin the application hour into a morning / afternoon / evening categorical column
prev_appl_train["PERIOD_APPR_PROCESS_START"] = hour_period_bin(prev_appl_train["HOUR_APPR_PROCESS_START"])
prev_appl_test["PERIOD_APPR_PROCESS_START"] = hour_period_bin(prev_appl_test["HOUR_APPR_PROCESS_START"])
prev_appl_train["PERIOD_APPR_PROCESS_START"] = prev_appl_train["PERIOD_APPR_PROCESS_START"].astype("category")
prev_appl_test["PERIOD_APPR_PROCESS_START"] = prev_appl_test["PERIOD_APPR_PROCESS_START"].astype("category")

print("prev_appl_train.shape", prev_appl_train.shape)
print("prev_appl_test.shape", prev_appl_test.shape)


# aggregate over "SK_ID_CURR"
# `mode` and `entropy` are callables defined elsewhere in the notebook.
num_stats = ["count", "mean", "median", "min", "max", "var"]
bool_stats = ["count", "mean", mode, entropy]
cat_stats = ["count", "nunique", mode, entropy]

by_list_cols = ["SK_ID_CURR"]

aggregator = Aggregator(by_list_cols, num_stats, bool_stats, cat_stats,
                        one_hot_encode_cat=True,
                        iqr=True, minmax_range=True, mean_median_diff=True)
# fit on the train split only, then apply the fitted aggregator to both splits
aggregator.fit(prev_appl_train)
prev_appl_train = aggregator.transform(prev_appl_train)
prev_appl_test = aggregator.transform(prev_appl_test)

print("prev_appl_train.isnull().sum().sum()", prev_appl_train.isnull().sum().sum())
print("prev_appl_test.isnull().sum().sum()", prev_appl_test.isnull().sum().sum())
print("prev_appl_train.shape", prev_appl_train.shape)
print("prev_appl_test.shape", prev_appl_test.shape)

# aggregating by var sometimes gives nan
prev_appl_train = prev_appl_train.fillna(0.)
prev_appl_test = prev_appl_test.fillna(0.)

# remove collinear columns
# (the remover is fitted on the train split with 0.99 as its argument and then applied to both splits)
remover = CollinearColumnRemover(0.99)
remover.fit(prev_appl_train)
prev_appl_train = remover.transform(prev_appl_train)
prev_appl_test = remover.transform(prev_appl_test)
print("prev_appl_train.shape", prev_appl_train.shape)
print("prev_appl_test.shape", prev_appl_test.shape)

# move the index back into regular columns so it would be written out by to_csv(index=False) below
prev_appl_train = prev_appl_train.reset_index()
prev_appl_test = prev_appl_test.reset_index()

# disabled switch: change to True to write the aggregated features to disk
if False:
    prev_appl_train.to_csv("data/data_/previous_application_agg_train.csv", index=False)
    prev_appl_test.to_csv("data/data_/previous_application_agg_test.csv", index=False)

# wall-clock duration of this cell, in seconds
time_end = time.time()
time_elapse = time_end - time_start
print("Elapsed Time", time_elapse)

prev_appl_train.shape (1413701, 37)
prev_appl_test.shape (256513, 37)
prev_appl_train.shape (1413701, 36)
prev_appl_test.shape (256513, 36)
prev_appl_train.shape (1413701, 34)
prev_appl_test.shape (256513, 34)
prev_appl_train.isnull().sum().sum() 0
prev_appl_test.isnull().sum().sum() 0
prev_appl_train.shape (1413701, 41)
prev_appl_test.shape (256513, 41)
prev_appl_train.shape (1413701, 46)
prev_appl_test.shape (256513, 46)
prev_appl_train.shape (1413701, 47)
prev_appl_test.shape (256513, 47)
Aggregating bool df with shape: (1413701, 13)
Aggregating num df with shape: (1413701, 164)
Aggregating cat df with shape: (1413701, 17)
Aggregating num df with iqr
Aggregating num df with range
Aggregating num df with mean-median difference
Aggregating bool df with shape: (256513, 13)
Aggregating num df with shape: (256513, 164)
Aggregating cat df with shape: (256513, 17)
Aggregating num df with iqr
Aggregating num df with range
Aggregating num df with mean-median difference
prev_appl_train.isnull

In [132]:
# Remove the previous_application frames from the kernel namespace before the next section.
del prev_appl_train
del prev_appl_test

## `POS_CASH_balance.csv`

## `installments_payments.csv`