In [228]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import scipy
from scipy import stats

import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin

# Helper functions

In [2]:
def change_dtypes(df):
    """
    change types of columns to reduce memory size
    :param df: dataframe
    :return df: dataframe
    """
    memory = df.memory_usage().sum() / 10**6
    print("Memory usage before changing types %0.2f MB" % memory)

    for col in df.columns:
        if (df[col].dtype == "object") and (df[col].nunique() < df.shape[0]):
            df[col] = df[col].astype("category")

        elif set(df[col].unique()) == set([0, 1]):
            df[col] = df[col].astype(bool)

        elif df[col].dtype == float:
            df[col] = df[col].astype(np.float32)

        elif df[col].dtype == int:
            df[col] = df[col].astype(np.int32)

    memory = df.memory_usage().sum() / 10 ** 6
    print("Memory usage after changing types %0.2f MB" % memory)
    return df


def load_csv(filename):
    df = pd.read_csv(filename)
    df = change_dtypes(df)
    return df

In [3]:
def check_type(val):
    if type(val) == str:
        return "string"
    
    if np.issubsctype(type(val), np.number):
        return "number"
    
    if callable(val):
        return "function"
    
    return str(type(val))


class NumColsImputer(BaseEstimator, TransformerMixin):
    def __init__(self, specified_values=None, default="median"):
        """
        :param specified_values: dict {colname (str): val (float)}, impute values for some specific columns
        :param default: str, function or float, value or function used for the remaining columns
        """
        assert (specified_values is None) or isinstance(specified_values, 
                                                        dict), "specified_values must be None or dict"
        
        self._specified_values = specified_values
        if self._specified_values is not None:
            for col, val in self._specified_values.items():
                assert check_type(val) == "number", "Impute value for " + col + " is not number."
        
        self._default = default
        self._default_type = check_type(self._default)
        if self._default_type not in ["number", "string", "function"]:
            raise ValueError("Unsupported stat type " + self._default_type)
    
    def _cal_imput_vals(self, df):
        cat_cols = df.select_dtypes(["object", "category", "bool"]).columns.to_list()
        if len(cat_cols) > 0:
            raise ValueError("There are non-number columns: " + ", ".join(cat_cols))
        
        all_cols = df.columns.to_list()
        if self._default_type == "number":
            impute_values = {col: self._default for col in all_cols}
            
        elif self._default_type == "string":
            impute_values = getattr(df, self._default)()
        
        elif self._default_type == "function":
            impute_values = df.apply(self._default)
        
        else:
            return None
            
        impute_values = dict(impute_values)
        if self._specified_values is None:
            return impute_values
        
        for col in self._specified_values:
            impute_values[col] = self._specified_values[col]
            
        return impute_values
    
    def fit(self, df):
        impute_values = self._cal_imput_vals(df)
        
        cols_with_na = [col for col in df.columns if df[col].isnull().any()]
        self._impute_values = {col: impute_values[col] for col in cols_with_na}
        
        for k, v in self._impute_values.items():
            if np.isnan(v):
                raise ValueError("One of the impute_values is NaN: " + k)
        
        return self
    
    def transform(self, df):
        return df.fillna(self._impute_values)


class CatColsImputer(BaseEstimator, TransformerMixin):
    def __init__(self, specified_values=None, default="missing_value"):
        """
        :param specified_values: dict {colname (str): val (str, float, function)}, 
                                 impute values for some specific columns
        :param default: str, used for the remaining columns
        """
        assert (specified_values is None) or isinstance(specified_values, 
                                                        dict), "specified_values must be None or dict"
        
        self._specified_values = specified_values
        if self._specified_values is not None:
            for col, val in self._specified_values.items():
                assert check_type(val) in ["string", 
                                           "function"], "Impute value for " + col + " is " + check_type(val)
        
        self._default = default
        assert check_type(self._default) == "string", "default must be string"
        
        
    def _cal_imput_vals(self, df):
        num_cols = df.select_dtypes(["number"]).columns.to_list()
        if len(num_cols) > 0:
            raise ValueError("There are number columns: " + ", ".join(num_cols))
        
        all_cols = df.columns.to_list()
        impute_values = {col: self._default for col in all_cols}
        if self._specified_values is None:
            return impute_values
        
        for col, val in self._specified_values.items():
            dtype = check_type(val)
            if dtype == "string":
                impute_values[col] = val
            
            elif dtype == "function":
                impute_values[col] = val(df[col])
            
            else:
                return None
        return impute_values
    
    def fit(self, df):
        impute_values = self._cal_imput_vals(df)
        
        cols_with_na = [col for col in df.columns if df[col].isnull().any()]
        self._impute_values = {col: impute_values[col] for col in cols_with_na}
        
        return self
    
    def transform(self, df):
        df_new = df.copy()
        for col, val in self._impute_values.items():
            df_new[col] = df_new[col].astype("object").fillna(val).astype("category")
            
        return df_new

    
class CollinearColumnRemover(BaseEstimator, TransformerMixin):
    def __init__(self, threshold, col_regex=None):
        """
        :param threshold: float in [0, 1], if two columns have correlation greater than threshold
                          one of them will be removed
        :param col_regex: str, regular expression to select columns
        """
        self._threshold = threshold
        self._col_regex = col_regex
    
    def _collinear_columns(self, df, threshold):
        if self._col_regex is None:
            df_sel = df.select_dtypes(["number", "bool"])
        else:
            df_sel = df.filter(regex=self._col_regex)
        
        all_cols = df_sel.columns.to_list()
        ncols = len(all_cols)
        
        collin_cols = []
        for i in range(ncols-1):
            col_i = all_cols[i]
            if col_i in collin_cols:
                continue
            
            for j in range(i + 1, ncols):
                col_j = all_cols[j]
                if col_j in collin_cols:
                    continue
                
                corr = df_sel[[col_i]].corrwith(df_sel[col_j]).values[0]
                if corr > threshold:
                    collin_cols.append(col_j)
        
        collin_cols = list(set(collin_cols))
        return collin_cols
    
    def _collinear_columns_NOTUSED(self, df, threshold):
        if self._col_regex is None:
            df_sel = df.select_dtypes(["number", "bool"])
        else:
            df_sel = df.filter(regex=self._col_regex)
        
        corr_matr = df_sel.corr().abs()
        upper_matr = corr_matr.where(np.triu(np.ones(corr_matr.shape), k=1).astype(np.bool))
        collin_cols = [col for col in upper_matr.columns if (upper_matr[col] > threshold).any()]
        return collin_cols
    
    def fit(self, df):
        self._collin_cols = self._collinear_columns(df, self._threshold)
        return self
    
    def transform(self, df):
        all_cols = df.columns.to_list()
        nonexist_cols = [col for col in self._collin_cols if col not in all_cols]
        if len(nonexist_cols) > 0:
            print("WARNING: These collinear cols to be droped do not exist in df:", nonexist_cols)
            
        droped_col = [col for col in self._collin_cols if col in all_cols]
        return df.drop(droped_col, axis="columns")


class OneHotEncoder(BaseEstimator, TransformerMixin):
    def fit(self, train_df):
        df_cat = train_df.select_dtypes(["object", "category"])
        self._cat_cols = df_cat.columns.to_list()
        self._cat_cols_ohe = pd.get_dummies(df_cat).columns.to_list()
    
    def transform(self, df):
        df_cat = df.select_dtypes(["object", "category"])
        cat_cols = df_cat.columns.to_list()
        assert set(cat_cols) == set(self._cat_cols), "df does not have the same categorical cols as train_df"
        
        # one-hot encode
        df_cat = pd.get_dummies(df_cat)
        # drop redundant classes which my be present in test_df
        for col in df_cat.columns:
            if col not in self._cat_cols_ohe:
                df_cat = df_cat.drop([col], axis="columns")
        
        # if some some colums are lacking in test but present in train, make them will all zero 
        cat_cols_ohe = df_cat.columns.to_list()
        for col in self._cat_cols_ohe:
            if col not in cat_cols_ohe:
                df_cat[col] = 0
                df_cat[col] = df_cat[col].astype(np.uint8)
        
        num_cols = [col for col in df.columns if col not in cat_cols]
        df_num = df[num_cols]
        
        return pd.concat([df_num, df_cat], axis="columns")

In [None]:
def get_colnames_from_regex(df, regex_strings):
    cols = []
    for regex_str in regex_strings:
        cols.extend(df.filter(regex=regex_str).columns.to_list())
    return cols


class Imputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        raise NotImplementedError("Not implemented")
        
        self._regex_strings = None
        self._spec_impt_regex_val_num = None
        
        self._spec_impt_vals_num = None
        self._default_imput_vals_num = "median"
        
        self._spec_impt_vals_cat = None
        self._default_imput_vals_cat = "missing_value"
    
    def fit(self, df_train):
        if self._regex_strings is not None:
            cols_imput_with_regex = get_colnames_from_regex(df_train, self._regex_strings)
            self._spec_impt_vals_num.update({col: self._spec_impt_regex_val_num 
                                             for col in cols_imput_with_regex})
        
        df_num = df_train.select_dtypes(["number"])
        self._imputer_num = NumColsImputer(specified_values=self._spec_impt_vals_num, 
                                           default=self._default_imput_vals_num)
        self._imputer_num.fit(df_num)
        
        df_cat = df_train.select_dtypes(["object", "category", "bool"])
        self._imputer_cat = CatColsImputer(specified_values=self._spec_impt_vals_cat, 
                                           default=self._default_imput_vals_cat)
        self._imputer_cat.fit(df_cat)
    
    def transform(self, df):
        num_df = df.select_dtypes(["number"])
        
        some_col = list(self._spec_impt_vals_num.keys())[0]
        isnull_df = num_df[[some_col]]
        for col in self._spec_impt_vals_num:
            isnull_df[col + "_ISNULL"] = num_df[col].isnull()
            
        isnull_df = isnull_df.drop([some_col], axis="columns")
        num_df = self._imputer_num.transform(num_df)
        
        # cat
        cat_df = df.select_dtypes(["object", "category", "bool"])
        cat_df = self._imputer_cat.transform(cat_df)
        
        return pd.concat([num_df, isnull_df, cat_df], axis="columns")

In [238]:
def flatten_multiindex_cols(columns):
    fat_cols = ["_".join([str(c) for c in flat_col]) for flat_col in columns.to_flat_index()]
    return fat_cols


def agg_num_cols(df, by_sers, stats):
    assert type(by_sers) in [list, tuple], "by_sers must be a list or tuple"
    assert type(stats) in [list, tuple], "stats must be a list or tuple"
    
    for ser in by_sers:
        assert isinstance(ser, pd.Series), "ser in by_sers must be Series"
        
    cat_cols = df.select_dtypes(["object", "category"]).columns.to_list()
    if len(cat_cols) > 0:
        raise ValueError("There are non-number cols: " + ", ".join(cat_cols))
    
    df_agg = df.groupby(by_sers).agg(stats)
    df_agg.columns = flatten_multiindex_cols(df_agg.columns)
    
    return df_agg


def agg_cat_cols(df, by_sers, stats):
    assert type(by_sers) in [list, tuple], "by_sers must be a list or tuple"
    assert type(stats) in [list, tuple], "stats must be a list or tuple"
    
    for ser in by_sers:
        assert isinstance(ser, pd.Series), "ser in by_sers must be Series"
        
    num_cols = df.select_dtypes(["number"]).columns.to_list()
    if len(num_cols) > 0:
        raise ValueError("There are number cols: " + ", ".join(num_cols))
    
    df_agg = df.groupby(by_sers).agg(stats)
    df_agg.columns = flatten_multiindex_cols(df_agg.columns)
    
    return df_agg


class Aggregator:
    
    def __init(self, by_sers, 
               num_stats, bool_stats, cat_stats, 
               iqr=True, minmax_range=True, mean_median_diff=True):
        
        self._by_sers = by_sers
        
        self._num_stats = num_stats
        self._bool_stats = bool_stats
        self._cat_stats = cat_stats
        
        self._iqr = iqr
        self._minmax_range = minmax_range
        self._mean_median_diff = mean_median_diff
    
    def _select_num(self, df):
        return df.select_dtypes(["number"])
    
    def _select_bool(self, df):
        return df.select_dtypes(["bool"])
    
    def _select_cat(self, df):
        return df.select_dtypes(["category", "object"])
    
    
    def _num_agg(self, num_df):
        num_agg_df = agg_num_cols(num_df, self._by_sers, stats=self._num_stats)
        return num_agg_df
    
    def _bool_agg(self, bool_df):
        bool_agg_df = agg_num_cols(bool_df, self._by_sers, stats=self._bool_stats)
    
    def _cat_agg(self, cat_df):
        cat_agg_df =  agg_cat_cols(cat_df, self._by_sers, stats=self._cat_stats)
        return cat_agg_df
    
    
    def _iqr(self, num_df):
        grouped = num_df.groupby(self._by_sers)
        iqr_df = grouped.quantile(0.75) - grouped.quantile(0.25)
        iqr_df.columns = [col + "_iqr" for col in iqr_df.columns]
        return iqr_df
    
    def _range(self, num_df):
        grouped = num_df.groupby(self._by_sers)
        range_df = grouped.max() - grouped.min()
        range_df.columns = [col + "_range" for col in range_df.columns]
        return range_df
    
    def _mean_median_diff(self, num_df):
        grouped = num_df.groupby(self._by_sers)
        diff_df = grouped.mean() - grouped.median()
        diff_df.columns = [col + "_mm_diff" for col in diff_df.columns]
        return diff_df
    
    
    def fit(self, df_train):
        self._ohe = OneHotEncoder()
        self._ohe.fit(df_train)
        return self
    
    def transform(self, df):
        dfs = []
        
        df_bool = self._select_bool(df)
        if df_bool.shape[1] > 0:
            print("Aggregating bool df with shape:", df_bool.shape)
            df_bool = self._bool_agg(df_bool)
            dfs.append(df_bool)
        
        df_num = self._ohe.transform(df)
        df_num = self._select_num(df_num)
        if df_num.shape[1] > 0:
            print("Aggregating num df with shape:", df_num.shape)
            df_num = self._num_agg(df_num)
            dfs.append(df_num)
        
        df_cat = self._select_cat(df)
        if df_cat.shape[1] > 0:
            print("Aggregating cat df with shape:" df_cat.shape)
            df_cat = self._cat_agg(df_cat)
            dfs.append(df_cat)
        
        if self._iqr:
            print("Aggregating num df with iqr")
            df_iqr = self._iqr(df_num)
            dfs.append(df_iqr)
        
        if self._minmax_range:
            print("Aggregating num df with range")
            df_range = self._range(df_num)
            dfs.append(df_range)
        
        if self._mean_median_diff:
            print("Aggregating num df with mean-median difference")
            df_diff = self._mean_median_diff(df_num)
            dfs.append(df_diff)

In [80]:
def train_test_partition(df, matching_key, train_id_ser):
    is_train = df[matching_key].isin(train_id_ser.values)
    
    train = df.loc[is_train, :]
    test = df.loc[~is_train, :]
    return train, test

In [229]:
def minmax_range(x):
    return x.max() - x.min()


def mean_median_diff(x):
    return x.mean() - x.median()


def iqr(x):
    l, h = np.quantile(x.dropna(), [0.25, 0.75])
    return h - l


def mode(ser):
    return ser.mode().values[0]


def entropy(ser):
    pk = ser.value_counts(normalize=True)
    return stats.entropy(pk)

# Load data

In [7]:
bureau_balance = load_csv("data/download/bureau_balance.csv")
bureau_balance.shape

Memory usage before changing types 655.20 MB
Memory usage after changing types 245.70 MB


(27299925, 3)

In [11]:
previous_application = load_csv("data/download/previous_application.csv")
print(previous_application.shape)

Memory usage before changing types 494.38 MB
Memory usage after changing types 162.02 MB
(1670214, 37)


In [8]:
POS_CASH_balance = load_csv("data/download/POS_CASH_balance.csv")
POS_CASH_balance.shape

Memory usage before changing types 640.09 MB
Memory usage after changing types 290.04 MB


(10001358, 8)

In [9]:
installments_payments = load_csv("data/download/installments_payments.csv")
installments_payments.shape

Memory usage before changing types 870.75 MB
Memory usage after changing types 435.37 MB


(13605401, 8)

In [18]:
credit_card_balance = load_csv("data/download/credit_card_balance.csv")
credit_card_balance.shape

Memory usage before changing types 706.62 MB
Memory usage after changing types 341.79 MB


(3840312, 23)

# Feature extraction from `application_[train|test]`

In [102]:
class ApplImputer(Imputer):
    def __init__(self):
        self._regex_strings = ["^APARTMENTS_", "^BASEMENTAREA_", "^YEARS_B", "^COMMONAREA_", 
                               "^ELEVATORS_", "^ENTRANCES_", "^FLOORS", "^LANDAREA_", "^LIVING", 
                               "^NONLIVING", "AMT_REQ_CREDIT_BUREAU_"]
        self._spec_impt_regex_val_num = -1.
        
        self._spec_impt_vals_num = {"OWN_CAR_AGE": -1.,
                                    "EXT_SOURCE_1": 0.,
                                    "EXT_SOURCE_3": 0.,
                                    "TOTALAREA_MODE": -1.}
        self._default_imput_vals_num = "median"
        
        self._spec_impt_vals_cat = None
        self._default_imput_vals_cat = "missing_value"
        

class ApplNewColsAdder(BaseEstimator, TransformerMixin):
    def fit(self, df_train):
        credit_to_income = df_train["AMT_CREDIT"] / df_train["AMT_INCOME_TOTAL"]
        self._cti_min = credit_to_income.replace(-np.inf, np.nan).min() / 10.
        self._cti_max = credit_to_income.replace(np.inf, np.nan).max() * 10.
        
        credit_to_goods = df_train["AMT_CREDIT"] / df_train["AMT_GOODS_PRICE"]
        self._ctg_min = credit_to_goods.replace(-np.inf, np.nan).min() / 10.
        self._ctg_max = credit_to_goods.replace(np.inf, np.nan).max() * 10.
    
    def transform(self, df):
        df_new = df.copy()
        df_new["AMT_INCOME_TOTAL_LOG"] = np.log(df_new["AMT_INCOME_TOTAL"])
        df_new["DAYS_EMPLOYED_POSITIVE"] = df_new["DAYS_EMPLOYED"] > 0
        days_emp_max = df_new["DAYS_EMPLOYED"].max()
        if days_emp_max > 0:
            df_new["DAYS_EMPLOYED"] = df_new["DAYS_EMPLOYED"].replace({days_emp_max: 100.})
        
        df_new["CREDIT_TO_INCOME"] = df_new["AMT_CREDIT"] / df_new["AMT_INCOME_TOTAL"]
        df_new["CREDIT_TO_INCOME"] = df_new["CREDIT_TO_INCOME"].replace(-np.inf, self._cti_min)
        df_new["CREDIT_TO_INCOME"] = df_new["CREDIT_TO_INCOME"].replace(np.inf, self._cti_max)
        
        df_new["CREDIT_TO_GOODS"] = df_new["AMT_CREDIT"] / df_new["AMT_GOODS_PRICE"]
        df_new["CREDIT_TO_GOODS"] = df_new["CREDIT_TO_GOODS"].replace(-np.inf, self._ctg_min)
        df_new["CREDIT_TO_GOODS"] = df_new["CREDIT_TO_GOODS"].replace(np.inf, self._ctg_max)
        
        return df_new

In [100]:
application_train = load_csv("data/download/application_train.csv")
application_test = load_csv("data/download/application_test.csv")

appl_train_key = application_train["SK_ID_CURR"]
appl_test_key = application_test["SK_ID_CURR"]
print(application_train.shape, application_test.shape)

Memory usage before changing types 300.13 MB
Memory usage after changing types 104.87 MB
Memory usage before changing types 47.18 MB
Memory usage after changing types 18.19 MB
(307511, 122) (48744, 121)


In [101]:
appl_train = application_train.copy()
appl_test = application_test.copy()

imputer = ApplImputer()
imputer.fit(appl_train)
appl_train = imputer.transform(appl_train)
appl_test = imputer.transform(appl_test)

remover = CollinearColumnRemover(0.99, col_regex="_ISNULL$")
remover.fit(appl_train)
appl_train = remover.transform(appl_train)
appl_test = remover.transform(appl_test)

adder = ApplNewColsAdder()
adder.fit(appl_train)
appl_train = adder.transform(appl_train)
appl_test = adder.transform(appl_test)

print("appl_train.shape", appl_train.shape)
print("appl_test.shape", appl_test.shape)


appl_train.to_csv("data/data_/application_train.csv", index=False)
appl_test.to_csv("data/data_/application_test.csv", index=False)

del appl_train, appl_test

appl_train.shape (307511, 145)
appl_test.shape (48744, 144)


# Feature extraction from `bureau` data

## `bureau.csv`

In [123]:
class BureauImputer(Imputer):
    def __init__(self):
        self._regex_strings = None
        self._spec_impt_regex_val_num = None
        
        self._spec_impt_vals_num = {"DAYS_ENDDATE_FACT": 100.,
                                    "AMT_CREDIT_MAX_OVERDUE": -1000.,
                                   "AMT_CREDIT_SUM_DEBT": 0.,
                                   "AMT_CREDIT_SUM_LIMIT": 0.,
                                   "AMT_ANNUITY": -1000.}
        self._default_imput_vals_num = "median"
        
        self._spec_impt_vals_cat = None
        self._default_imput_vals_cat = "missing_value"

        
def bureau_add_cols(bu_df):
    df = bu_df.copy()
    df["DAYS_CREDIT_ENDDATE_ISPOSITIVE"] = df["DAYS_CREDIT_ENDDATE"] > 0
    df["DAYS_CREDIT_UPDATE_ISPOSITIVE"] = df["DAYS_CREDIT_UPDATE"] > 0
    return df

In [65]:
bureau = load_csv("data/download/bureau.csv")
print(bureau.shape)
bureau.head()

Memory usage before changing types 233.43 MB
Memory usage after changing types 101.27 MB
(1716428, 17)


Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,


In [81]:
bureau_train, bureau_test = train_test_partition(bureau, "SK_ID_CURR", appl_train_key)

print("bureau_train.shape", bureau_train.shape)
print("bureau_test.shape", bureau_test.shape)

bureau_train_key = bureau_train["SK_ID_BUREAU"]
bureau_test_key = bureau_test["SK_ID_BUREAU"]

bureau_train.shape (1465325, 17)
bureau_test.shape (251103, 17)


In [124]:
bu_train = bureau_train.copy()
bu_test = bureau_test.copy()

imputer = BureauImputer()
imputer.fit(bu_train)
bu_train = imputer.transform(bu_train)
bu_test = imputer.transform(bu_test)

bu_train = bureau_add_cols(bu_train)
bu_test = bureau_add_cols(bu_test)

bu_train.shape, bu_test.shape

((1465325, 24), (251103, 24))

In [125]:
bu_train.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,...,DAYS_ENDDATE_FACT_ISNULL,AMT_CREDIT_MAX_OVERDUE_ISNULL,AMT_CREDIT_SUM_DEBT_ISNULL,AMT_CREDIT_SUM_LIMIT_ISNULL,AMT_ANNUITY_ISNULL,CREDIT_ACTIVE,CREDIT_CURRENCY,CREDIT_TYPE,DAYS_CREDIT_ENDDATE_ISPOSITIVE,DAYS_CREDIT_UPDATE_ISPOSITIVE
0,215354,5714462,-497,0,-153.0,-153.0,-1000.0,0,91323.0,0.0,...,False,True,False,True,True,Closed,currency 1,Consumer credit,False,False
1,215354,5714463,-208,0,1075.0,100.0,-1000.0,0,225000.0,171342.0,...,True,True,False,True,True,Active,currency 1,Credit card,True,False
2,215354,5714464,-203,0,528.0,100.0,-1000.0,0,464323.5,0.0,...,True,True,True,True,True,Active,currency 1,Consumer credit,True,False
3,215354,5714465,-203,0,-334.0,100.0,-1000.0,0,90000.0,0.0,...,True,True,True,True,True,Active,currency 1,Credit card,False,False
4,215354,5714466,-629,0,1197.0,100.0,77674.5,0,2700000.0,0.0,...,True,False,True,True,True,Active,currency 1,Consumer credit,True,False


In [126]:
bu_train.columns

Index(['SK_ID_CURR', 'SK_ID_BUREAU', 'DAYS_CREDIT', 'CREDIT_DAY_OVERDUE',
       'DAYS_CREDIT_ENDDATE', 'DAYS_ENDDATE_FACT', 'AMT_CREDIT_MAX_OVERDUE',
       'CNT_CREDIT_PROLONG', 'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT',
       'AMT_CREDIT_SUM_LIMIT', 'AMT_CREDIT_SUM_OVERDUE', 'DAYS_CREDIT_UPDATE',
       'AMT_ANNUITY', 'DAYS_ENDDATE_FACT_ISNULL',
       'AMT_CREDIT_MAX_OVERDUE_ISNULL', 'AMT_CREDIT_SUM_DEBT_ISNULL',
       'AMT_CREDIT_SUM_LIMIT_ISNULL', 'AMT_ANNUITY_ISNULL', 'CREDIT_ACTIVE',
       'CREDIT_CURRENCY', 'CREDIT_TYPE', 'DAYS_CREDIT_ENDDATE_ISPOSITIVE',
       'DAYS_CREDIT_UPDATE_ISPOSITIVE'],
      dtype='object')

In [254]:
by_sers = [bu_train["SK_ID_CURR"]]

bu_num = bu_train.select_dtypes(["number"]).drop(["SK_ID_CURR", "SK_ID_BUREAU"], axis="columns")
bu_num.head()

Unnamed: 0,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,-497,0,-153.0,-153.0,-1000.0,0,91323.0,0.0,0.0,0.0,-131,-1000.0
1,-208,0,1075.0,100.0,-1000.0,0,225000.0,171342.0,0.0,0.0,-20,-1000.0
2,-203,0,528.0,100.0,-1000.0,0,464323.5,0.0,0.0,0.0,-16,-1000.0
3,-203,0,-334.0,100.0,-1000.0,0,90000.0,0.0,0.0,0.0,-16,-1000.0
4,-629,0,1197.0,100.0,77674.5,0,2700000.0,0.0,0.0,0.0,-21,-1000.0


In [262]:
bu_num_agg = bu_num.groupby(by_sers).quantile(q=0.75) - bu_num.groupby(by_sers).quantile(q=0.25)

In [263]:
bu_num_agg.head()

Unnamed: 0_level_0,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
100002,519.25,0.0,1010.5,982.75,1860.625,0.0,95542.07666,0.0,0.0,0.0,907.5,0.0
100003,1140.75,0.0,1002.5,618.5,0.0,0.0,226823.625,0.0,202500.0,0.0,529.5,0.0
100004,459.0,0.0,106.5,150.5,500.0,0.0,18.898438,0.0,0.0,0.0,150.0,0.0
100007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100008,509.5,0.0,662.0,564.0,500.0,0.0,86235.75,120028.5,0.0,0.0,505.5,0.0


In [267]:
stats=("count", "mean", "median", "var", "min", "max", minmax_range, iqr, mean_median_diff)
stats=("count", "mean", "median", "var", "min", "max")
bu_num_agg = agg_num_cols(bu_tmp, by_sers, stats=stats)

In [268]:
bu_num_agg.head()

Unnamed: 0,SK_ID_CURR,DAYS_CREDIT_count,DAYS_CREDIT_mean,DAYS_CREDIT_median,DAYS_CREDIT_var,DAYS_CREDIT_min,DAYS_CREDIT_max,CREDIT_DAY_OVERDUE_count,CREDIT_DAY_OVERDUE_mean,CREDIT_DAY_OVERDUE_median,...,DAYS_CREDIT_UPDATE_median,DAYS_CREDIT_UPDATE_var,DAYS_CREDIT_UPDATE_min,DAYS_CREDIT_UPDATE_max,AMT_ANNUITY_count,AMT_ANNUITY_mean,AMT_ANNUITY_median,AMT_ANNUITY_var,AMT_ANNUITY_min,AMT_ANNUITY_max
0,100002,8,-874.0,-1042.5,186150.0,-1437,-103,8,0.0,0.0,...,-402.5,268865.553571,-1185,-7,8,-125.0,0.0,125000.0,-1000.0,0.0
1,100003,4,-1400.75,-1205.5,827783.583333,-2586,-606,4,0.0,0.0,...,-545.0,824562.0,-2131,-43,4,-1000.0,-1000.0,0.0,-1000.0,-1000.0
2,100004,2,-867.0,-867.0,421362.0,-1326,-408,2,0.0,0.0,...,-532.0,45000.0,-682,-382,2,-1000.0,-1000.0,0.0,-1000.0,-1000.0
3,100007,1,-1149.0,-1149.0,,-1149,-1149,1,0.0,0.0,...,-783.0,,-783,-783,1,-1000.0,-1000.0,,-1000.0,-1000.0
4,100008,3,-757.333333,-1097.0,346120.333333,-1097,-78,3,0.0,0.0,...,-790.0,279561.0,-1027,-16,3,-1000.0,-1000.0,0.0,-1000.0,-1000.0


In [243]:
by_sers = [bu_train["SK_ID_CURR"]]
bu_bool = bu_train.select_dtypes(["bool"])
bu_bool.head()

Unnamed: 0,DAYS_ENDDATE_FACT_ISNULL,AMT_CREDIT_MAX_OVERDUE_ISNULL,AMT_CREDIT_SUM_DEBT_ISNULL,AMT_CREDIT_SUM_LIMIT_ISNULL,AMT_ANNUITY_ISNULL,DAYS_CREDIT_ENDDATE_ISPOSITIVE,DAYS_CREDIT_UPDATE_ISPOSITIVE
0,False,True,False,True,True,False,False
1,True,True,False,True,True,True,False
2,True,True,True,True,True,True,False
3,True,True,True,True,True,False,False
4,True,False,True,True,True,True,False


In [246]:
bu_bool.shape

(1465325, 7)

In [251]:
bu_bool_agg = agg_num_cols(bu_bool, by_sers, stats=("mean", "count", "var")).fillna(0)

In [252]:
bu_bool_agg.shape

(263491, 22)

In [253]:
bu_bool_agg.isnull().mean()

SK_ID_CURR                              0.0
DAYS_ENDDATE_FACT_ISNULL_mean           0.0
DAYS_ENDDATE_FACT_ISNULL_count          0.0
DAYS_ENDDATE_FACT_ISNULL_var            0.0
AMT_CREDIT_MAX_OVERDUE_ISNULL_mean      0.0
AMT_CREDIT_MAX_OVERDUE_ISNULL_count     0.0
AMT_CREDIT_MAX_OVERDUE_ISNULL_var       0.0
AMT_CREDIT_SUM_DEBT_ISNULL_mean         0.0
AMT_CREDIT_SUM_DEBT_ISNULL_count        0.0
AMT_CREDIT_SUM_DEBT_ISNULL_var          0.0
AMT_CREDIT_SUM_LIMIT_ISNULL_mean        0.0
AMT_CREDIT_SUM_LIMIT_ISNULL_count       0.0
AMT_CREDIT_SUM_LIMIT_ISNULL_var         0.0
AMT_ANNUITY_ISNULL_mean                 0.0
AMT_ANNUITY_ISNULL_count                0.0
AMT_ANNUITY_ISNULL_var                  0.0
DAYS_CREDIT_ENDDATE_ISPOSITIVE_mean     0.0
DAYS_CREDIT_ENDDATE_ISPOSITIVE_count    0.0
DAYS_CREDIT_ENDDATE_ISPOSITIVE_var      0.0
DAYS_CREDIT_UPDATE_ISPOSITIVE_mean      0.0
DAYS_CREDIT_UPDATE_ISPOSITIVE_count     0.0
DAYS_CREDIT_UPDATE_ISPOSITIVE_var       0.0
dtype: float64

In [264]:
l = ["a", "b", "c"]
l.pop("a")

TypeError: 'str' object cannot be interpreted as an integer

In [274]:
(bu_num.groupby(by_sers).agg("max") - bu_num.groupby(by_sers).agg("min"))

Unnamed: 0_level_0,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
100002,1334,0,1852.0,1285.0,6043.64502,0,4.500000e+05,245781.0,31988.564453,0.0,1178,1000.0
100003,1980,0,3650.0,2231.0,0.00000,0,7.877520e+05,0.0,810000.000000,0.0,2088,0.0
100004,918,0,213.0,301.0,1000.00000,0,3.779688e+01,0.0,0.000000,0.0,300,0.0
100007,0,0,0.0,0.0,0.00000,0,0.000000e+00,0.0,0.000000,0.0,0,0.0
100008,1019,0,1324.0,1128.0,1000.00000,0,1.724715e+05,240057.0,0.000000,0.0,1011,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
456247,2195,0,12916.0,2398.0,23027.50000,1,3.085488e+06,2193390.0,0.000000,0.0,2283,27275.5
456249,2230,0,3862.0,2625.0,19945.00000,0,7.215525e+05,163071.0,0.000000,0.0,2486,0.0
456253,206,0,1302.0,894.0,0.00000,0,1.890000e+06,1624797.0,0.000000,0.0,696,59369.5
456254,0,0,0.0,0.0,0.00000,0,0.000000e+00,0.0,0.000000,0.0,0,0.0


In [278]:
group.mean() - group.median()

Unnamed: 0_level_0,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
100002,168.500000,0.0,-11.250000,-18.625000,675.643127,0.000000,54001.445312,30722.625000,3998.570557,0.0,-97.375000,-125.000000
100003,-195.250000,0.0,-64.500000,-217.500000,0.000000,0.000000,161773.875000,0.000000,202500.000000,0.0,-271.000000,0.000000
100004,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
100007,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
100008,339.666667,0.0,400.666656,217.333313,333.333313,0.000000,50443.500000,80019.000000,0.000000,0.0,179.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
456247,-267.181818,0.0,1860.818237,-259.818176,4041.946289,0.090909,327738.687500,199399.093750,0.000000,0.0,-268.818182,3599.227295
456249,12.923077,0.0,17.769287,-12.076904,2748.846191,0.000000,35450.468750,12543.922852,0.000000,0.0,-155.538462,0.000000
456253,51.500000,0.0,181.500000,0.000000,0.000000,0.000000,315000.000000,363440.250000,0.000000,0.0,-99.750000,-14842.375000
456254,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000


In [277]:
group.quantile?

In [281]:
bu_num.select_dtypes("category").shape[-1]

0