In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import scipy

import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin

# Helper functions

In [3]:
def change_dtypes(df):
    """
    change types of columns to reduce memory size
    :param df: dataframe
    :return df: dataframe
    """
    memory = df.memory_usage().sum() / 10**6
    print("Memory usage before changing types %0.2f MB" % memory)

    for col in df.columns:
        if (df[col].dtype == "object") and (df[col].nunique() < df.shape[0]):
            df[col] = df[col].astype("category")

        elif set(df[col].unique()) == set([0, 1]):
            df[col] = df[col].astype(bool)

        elif df[col].dtype == float:
            df[col] = df[col].astype(np.float32)

        elif df[col].dtype == int:
            df[col] = df[col].astype(np.int32)

    memory = df.memory_usage().sum() / 10 ** 6
    print("Memory usage after changing types %0.2f MB" % memory)
    return df


def load_csv(filename):
    df = pd.read_csv(filename)
    df = change_dtypes(df)
    return df

In [130]:
def check_type(val):
    if type(val) == str:
        return "string"
    
    if np.issubsctype(type(val), np.number):
        return "number"
    
    if callable(val):
        return "function"
    
    return str(type(val))


class NumColsImputer(BaseEstimator, TransformerMixin):
    def __init__(self, specified_values=None, default="median"):
        """
        :param specified_values: dict {colname (str): val (float)}, impute values for some specific columns
        :param default: str, function or float, value or function used for the remaining columns
        """
        assert (specified_values is None) or isinstance(specified_values, 
                                                        dict), "specified_values must be None or dict"
        
        self._specified_values = specified_values
        if self._specified_values is not None:
            for col, val in self._specified_values.items():
                assert check_type(val) == "number", "Impute value for " + col + " is not number."
        
        self._default = default
        self._default_type = check_type(self._default)
        if self._default_type not in ["number", "string", "function"]:
            raise ValueError("Unsupported stat type " + self._default_type)
    
    def _cal_imput_vals(self, df):
        cat_cols = df.select_dtypes(["object", "category", "bool"]).columns.to_list()
        if len(cat_cols) > 0:
            raise ValueError("There are non-number columns: " + ", ".join(cat_cols))
        
        all_cols = df.columns.to_list()
        if self._default_type == "number":
            impute_values = {col: self._default for col in all_cols}
            
        elif self._default_type == "string":
            impute_values = getattr(df, self._default)()
        
        elif self._default_type == "function":
            impute_values = df.apply(self._default)
        
        else:
            return None
            
        impute_values = dict(impute_values)
        if self._specified_values is None:
            return impute_values
        
        for col in self._specified_values:
            impute_values[col] = self._specified_values[col]
            
        return impute_values
    
    def fit(self, df):
        impute_values = self._cal_imput_vals(df)
        
        cols_with_na = [col for col in df.columns if df[col].isnull().any()]
        self._impute_values = {col: impute_values[col] for col in cols_with_na}
        
        for k, v in self._impute_values.items():
            if np.isnan(v):
                raise ValueError("One of the impute_values is NaN: " + k)
        
        return self
    
    def transform(self, df):
        return df.fillna(self._impute_values)


class CatColsImputer(BaseEstimator, TransformerMixin):
    def __init__(self, specified_values=None, default="missing_value"):
        """
        :param specified_values: dict {colname (str): val (str, float, function)}, 
                                 impute values for some specific columns
        :param default: str, used for the remaining columns
        """
        assert (specified_values is None) or isinstance(specified_values, 
                                                        dict), "specified_values must be None or dict"
        
        self._specified_values = specified_values
        if self._specified_values is not None:
            for col, val in self._specified_values.items():
                assert check_type(val) in ["string", 
                                           "function"], "Impute value for " + col + " is " + check_type(val)
        
        self._default = default
        assert check_type(self._default) == "string", "default must be string"
        
        
    def _cal_imput_vals(self, df):
        num_cols = df.select_dtypes(["number"]).columns.to_list()
        if len(num_cols) > 0:
            raise ValueError("There are number columns: " + ", ".join(num_cols))
        
        all_cols = df.columns.to_list()
        impute_values = {col: self._default for col in all_cols}
        if self._specified_values is None:
            return impute_values
        
        for col, val in self._specified_values.items():
            dtype = check_type(val)
            if dtype == "string":
                impute_values[col] = val
            
            elif dtype == "function":
                impute_values[col] = val(df[col])
            
            else:
                return None
        return impute_values
    
    def fit(self, df):
        impute_values = self._cal_imput_vals(df)
        
        cols_with_na = [col for col in df.columns if df[col].isnull().any()]
        self._impute_values = {col: impute_values[col] for col in cols_with_na}
        
        return self
    
    def transform(self, df):
        df_new = df.copy()
        for col, val in self._impute_values.items():
            df_new[col] = df_new[col].astype("object").fillna(val).astype("category")
            
        return df_new

    
class CollinearColumnRemover(BaseEstimator, TransformerMixin):
    def __init__(self, threshold, col_regex=None):
        """
        :param threshold: float in [0, 1], if two columns have correlation greater than threshold
                          one of them will be removed
        :param col_regex: str, regular expression to select columns
        """
        self._threshold = threshold
        self._col_regex = col_regex
    
    def _collinear_columns(self, df, threshold):
        if self._col_regex is None:
            df_sel = df.select_dtypes(["number", "bool"])
        else:
            df_sel = df.filter(regex=self._col_regex)
        
        all_cols = df_sel.columns.to_list()
        ncols = len(all_cols)
        
        collin_cols = []
        for i in range(ncols-1):
            col_i = all_cols[i]
            if col_i in collin_cols:
                continue
            
            for j in range(i + 1, ncols):
                col_j = all_cols[j]
                if col_j in collin_cols:
                    continue
                
                corr = df_sel[[col_i]].corrwith(df_sel[col_j]).values[0]
                if corr > threshold:
                    collin_cols.append(col_j)
        
        collin_cols = list(set(collin_cols))
        return collin_cols
    
    def _collinear_columns_NOTUSED(self, df, threshold):
        if self._col_regex is None:
            df_sel = df.select_dtypes(["number", "bool"])
        else:
            df_sel = df.filter(regex=self._col_regex)
        
        corr_matr = df_sel.corr().abs()
        upper_matr = corr_matr.where(np.triu(np.ones(corr_matr.shape), k=1).astype(np.bool))
        collin_cols = [col for col in upper_matr.columns if (upper_matr[col] > threshold).any()]
        return collin_cols
    
    def fit(self, df):
        self._collin_cols = self._collinear_columns(df, self._threshold)
        return self
    
    def transform(self, df):
        all_cols = df.columns.to_list()
        nonexist_cols = [col for col in self._collin_cols if col not in all_cols]
        if len(nonexist_cols) > 0:
            print("WARNING: These collinear cols to be droped do not exist in df:", nonexist_cols)
            
        droped_col = [col for col in self._collin_cols if col in all_cols]
        return df.drop(droped_col, axis="columns")

In [5]:
def flatten_multiindex_cols(columns):
    fat_cols = ["_".join([str(c) for c in flat_col]) for flat_col in columns.to_flat_index()]
    return fat_cols


def agg_num_cols(df, by_sers, stats):
    assert type(by_sers) in [list, tuple], "by_sers must be a list or tuple"
    assert type(stats) in [list, tuple], "stats must be a list or tuple"
    
    for ser in by_sers:
        assert isinstance(ser, pd.Series), "ser in by_sers must be Series"
        
    cat_cols = df.select_dtypes(["object", "bool", "category"]).columns.to_list()
    if len(cat_cols) > 0:
        raise ValueError("There are non-number cols: " + ", ".join(cat_cols))
    
    df_agg = df.groupby(by_sers).agg(stats)
    df_agg.columns = flatten_multiindex_cols(df_agg.columns)
    df_agg = df_agg.reset_index()
    
    return df_agg


def agg_cat_cols(df, by_sers, stats):
    assert type(by_sers) in [list, tuple], "by_sers must be a list or tuple"
    assert type(stats) in [list, tuple], "stats must be a list or tuple"
    
    for ser in by_sers:
        assert isinstance(ser, pd.Series), "ser in by_sers must be Series"
        
    num_cols = df.select_dtypes(["number"]).columns.to_list()
    if len(num_cols) > 0:
        raise ValueError("There are number cols: " + ", ".join(num_cols))
    
    df_agg = df.groupby(by_sers).agg(stats)
    df_agg.columns = flatten_multiindex_cols(df_agg.columns)
    df_agg = df_agg.reset_index()
    
    return df_agg

In [6]:
def mode(ser):
    return ser.mode().values[0]

In [144]:
def get_colnames_from_regex(df, regex_strings):
    cols = []
    for regex_str in regex_strings:
        cols.extend(df.filter(regex=regex_str).columns.to_list())
    return cols

def impute_application(appl_df):
    regex_strings = ["^APARTMENTS_", "^BASEMENTAREA_", "^YEARS_B", "^COMMONAREA_", 
                     "^ELEVATORS_", "^ENTRANCES_", "^FLOORS", "^LANDAREA_", "^LIVING", 
                     "^NONLIVING", "AMT_REQ_CREDIT_BUREAU_"]
    
    cols_imput_with_m1 = get_colnames_from_regex(application_train, regex_strings)
    spec_impt_vals_num = {col: -1. for col in cols_imput_with_m1}
    
    spec_impt_vals_num.update({"OWN_CAR_AGE": -1.,
                           "EXT_SOURCE_1": 0.,
                           "EXT_SOURCE_3": 0.,
                           "TOTALAREA_MODE": -1.})
    
    df_num = appl_df.select_dtypes(["number"])
    
    df_isnull = df_num[["SK_ID_CURR"]]
    for col in spec_impt_vals_num:
        df_isnull[col + "_ISNULL"] = df_num[col].isnull()
    df_isnull.drop(["SK_ID_CURR"], axis="columns")
    
    imputer_num = NumColsImputer(specified_values=spec_impt_vals_num, default="median")
    df_num = imputer_num.fit_transform(df_num)
    
    df_cat = appl_df.select_dtypes(["object", "category", "bool"])
    imputer_cat = CatColsImputer(specified_values=None, default="missing_value")
    df_cat = imputer_cat.fit_transform(df_cat)
    
    
    return pd.concat([df_num, df_isnull, df_cat], axis="columns")

# Load data

In [8]:
application_train = load_csv("data/download/application_train.csv")
application_test = load_csv("data/download/application_test.csv")

print(application_train.shape, application_test.shape)

appl_train_key = application_train["SK_ID_CURR"]
appl_test_key = application_test["SK_ID_CURR"]

Memory usage before changing types 300.13 MB
Memory usage after changing types 104.87 MB
Memory usage before changing types 47.18 MB
Memory usage after changing types 18.19 MB
(307511, 122) (48744, 121)


In [9]:
bureau = load_csv("data/download/bureau.csv")
print(bureau.shape)

bureau_key = bureau["SK_ID_BUREAU"]

Memory usage before changing types 233.43 MB
Memory usage after changing types 101.27 MB
(1716428, 17)


In [10]:
bureau_balance = load_csv("data/download/bureau_balance.csv")
bureau_balance.shape

Memory usage before changing types 655.20 MB
Memory usage after changing types 245.70 MB


(27299925, 3)

In [11]:
previous_application = load_csv("data/download/previous_application.csv")
print(previous_application.shape)

previous_application_key = previous_application["SK_ID_PREV"]

Memory usage before changing types 494.38 MB
Memory usage after changing types 162.02 MB
(1670214, 37)


In [12]:
POS_CASH_balance = load_csv("data/download/POS_CASH_balance.csv")
POS_CASH_balance.shape

Memory usage before changing types 640.09 MB
Memory usage after changing types 290.04 MB


(10001358, 8)

In [13]:
installments_payments = load_csv("data/download/installments_payments.csv")
installments_payments.shape

Memory usage before changing types 870.75 MB
Memory usage after changing types 435.37 MB


(13605401, 8)

In [14]:
credit_card_balance = load_csv("data/download/credit_card_balance.csv")
credit_card_balance.shape

Memory usage before changing types 706.62 MB
Memory usage after changing types 341.79 MB


(3840312, 23)

# Feature extraction from `application_[train|test]`

In [15]:


"""
df = application_train.copy()

df["DAYS_EMPLOYED_POSITIVE"] = df["DAYS_EMPLOYED"] > 0
df["DAYS_EMPLOYED"] = df["DAYS_EMPLOYED"].replace({days_emp_max: np.nan})
df["AMT_INCOME_TOTAL_LOG"] = np.log(df["AMT_INCOME_TOTAL"])

df["CREDIT_TO_INCOME"] = df["AMT_CREDIT"] / df["AMT_INCOME_TOTAL"]
df["CREDIT_TO_GOODS"] = df["AMT_CREDIT"] / df["AMT_GOODS_PRICE"]
"""


'\ndf = application_train.copy()\n\ndf["DAYS_EMPLOYED_POSITIVE"] = df["DAYS_EMPLOYED"] > 0\ndf["DAYS_EMPLOYED"] = df["DAYS_EMPLOYED"].replace({days_emp_max: np.nan})\ndf["AMT_INCOME_TOTAL_LOG"] = np.log(df["AMT_INCOME_TOTAL"])\n\ndf["CREDIT_TO_INCOME"] = df["AMT_CREDIT"] / df["AMT_INCOME_TOTAL"]\ndf["CREDIT_TO_GOODS"] = df["AMT_CREDIT"] / df["AMT_GOODS_PRICE"]\n'

In [145]:
df_imput = impute_application(application_train)
df_imput.shape

(307511, 175)

In [107]:
df_imput.filter(like="ISNULL").shape

(307511, 52)

In [104]:
df_imput.filter(regex="\s*_ISNULL").shape

(307511, 52)

In [125]:
remover = CollinearColumnRemover(0.99, col_regex="_ISNULL$")
remover.fit(df_imput)
#df_imput_remove_collin = remover.transform(df_imput)

CollinearColumnRemover(threshold=None)

In [126]:
len(remover._collin_cols)

33

In [127]:
df_imput_remove_collin = remover.transform(df_imput)
df_imput_remove_collin.shape

(307511, 142)

In [128]:
application_train.shape

(307511, 122)

In [129]:
df_imput_remove_collin.filter(like="ISNULL").corr()

Unnamed: 0,APARTMENTS_AVG_ISNULL,BASEMENTAREA_AVG_ISNULL,YEARS_BEGINEXPLUATATION_AVG_ISNULL,YEARS_BUILD_AVG_ISNULL,COMMONAREA_AVG_ISNULL,ELEVATORS_AVG_ISNULL,ENTRANCES_AVG_ISNULL,FLOORSMAX_AVG_ISNULL,FLOORSMIN_AVG_ISNULL,LANDAREA_AVG_ISNULL,LIVINGAPARTMENTS_AVG_ISNULL,LIVINGAREA_AVG_ISNULL,NONLIVINGAPARTMENTS_AVG_ISNULL,NONLIVINGAREA_AVG_ISNULL,AMT_REQ_CREDIT_BUREAU_HOUR_ISNULL,OWN_CAR_AGE_ISNULL,EXT_SOURCE_1_ISNULL,EXT_SOURCE_3_ISNULL,TOTALAREA_MODE_ISNULL
APARTMENTS_AVG_ISNULL,1.0,0.837753,0.951014,0.706118,0.659789,0.924016,0.960503,0.968785,0.687961,0.823466,0.683329,0.934788,0.67079,0.894478,0.026708,-0.032045,0.038879,0.018508,0.949334
BASEMENTAREA_AVG_ISNULL,0.837753,1.0,0.818425,0.776638,0.746792,0.853312,0.843455,0.835734,0.761455,0.861864,0.756031,0.832559,0.74685,0.865329,0.024601,-0.026163,0.04226,0.017627,0.81264
YEARS_BEGINEXPLUATATION_AVG_ISNULL,0.951014,0.818425,1.0,0.688479,0.639607,0.90762,0.959797,0.969179,0.667815,0.803016,0.661539,0.961579,0.645643,0.874321,0.027406,-0.034728,0.039953,0.018989,0.987385
YEARS_BUILD_AVG_ISNULL,0.706118,0.776638,0.688479,1.0,0.906573,0.736926,0.71136,0.704121,0.953324,0.758223,0.936403,0.703409,0.917868,0.744372,0.021487,-0.017572,0.039504,0.016514,0.684372
COMMONAREA_AVG_ISNULL,0.659789,0.746792,0.639607,0.906573,1.0,0.692634,0.660235,0.652434,0.911239,0.731342,0.928458,0.658111,0.925983,0.71547,0.019676,-0.015068,0.038829,0.014193,0.634045
ELEVATORS_AVG_ISNULL,0.924016,0.853312,0.90762,0.736926,0.692634,1.0,0.928486,0.925196,0.723273,0.831589,0.714786,0.900765,0.701809,0.904721,0.026398,-0.02654,0.03899,0.017953,0.90205
ENTRANCES_AVG_ISNULL,0.960503,0.843455,0.959797,0.71136,0.660235,0.928486,1.0,0.984628,0.691585,0.827011,0.683406,0.945228,0.667019,0.898523,0.027438,-0.032912,0.040701,0.018905,0.956619
FLOORSMAX_AVG_ISNULL,0.968785,0.835734,0.969179,0.704121,0.652434,0.925196,0.984628,1.0,0.683718,0.821578,0.675704,0.949255,0.659307,0.892988,0.027783,-0.033116,0.040154,0.01919,0.967783
FLOORSMIN_AVG_ISNULL,0.687961,0.761455,0.667815,0.953324,0.911239,0.723273,0.691585,0.683718,1.0,0.743227,0.940838,0.683306,0.927862,0.731412,0.019761,-0.016219,0.039809,0.014846,0.663563
LANDAREA_AVG_ISNULL,0.823466,0.861864,0.803016,0.758223,0.731342,0.831589,0.827011,0.821578,0.743227,1.0,0.741722,0.818916,0.730936,0.85173,0.025466,-0.028871,0.039959,0.019148,0.798168


In [140]:
df_imput.select_dtypes(["category"]).shape

(307511, 16)

In [146]:
pd.get_dummies(df_imput[["NAME_CONTRACT_TYPE"]]).corr()

Unnamed: 0,NAME_CONTRACT_TYPE_Cash loans,NAME_CONTRACT_TYPE_Revolving loans
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0
...,...,...
307506,1,0
307507,1,0
307508,1,0
307509,1,0
