In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

# Helper functions

In [4]:
def print_long_string(string, indent=0, max_words_per_line=10):
    """
    :param string: str
    :param indent: int
    :param max_words_per_line: int
    :return: None
    """
    words = [" "*indent]
    for i, word in enumerate(string.split()):
        words.append(word)
        if (i+1) % max_words_per_line == 0:
            words.append("\n" + " "*indent)
    print(" ".join(words))
    return None


def print_col_desc(df, table_name, col_desc_df):
    """
    To print out the information of columns of df. This information is taken from col_desc_df
    :param df: dataframe
    :param table_name: str
    :param col_desc_table:
    :return: None
    """
    nrows = df.shape[0]

    for i, col in enumerate(df.columns):
        mask = (col_desc_df["Table"] == table_name) & (col_desc_df["Row"] == col)
        print("Column Number:", i)
        print("Column Name:", col)
        print("Description:")

        if len(col_desc_df.loc[mask, :]) == 0:
            print(" " * 10 + "No Description, Maybe the column name does not match.")
        else:
            print_long_string(col_desc_df.loc[mask, "Description"].iloc[0], indent=10)
            print("Special:", col_desc_df.loc[mask, "Special"].iloc[0])

        print("Type:", df[col].dtype)
        n_null = np.sum(df[col].isnull())
        print("Number of NULL(s):", n_null)
        print("Percent of NULL(s): %.0.2f" % (n_null / nrows * 100))

        if (df[col].dtype == np.object) or (df[col].dtype == np.int):
            nunique = df[col].nunique(dropna=False)
            print("Number of Unique Values:", nunique)
            if nunique <= 20:
                print_long_string(", ".join([str(s) for s in df[col].unique()]),
                                  indent=28, max_words_per_line=5)
            else:
                print_long_string(", ".join([str(s) for s in df[col].unique()[:20]]) + " ...",
                                  indent=28, max_words_per_line=5)

        if np.issubdtype(df[col].dtype, np.number):
            print("Min:", df[col].min())
            print("Max:", df[col].max())

        print("-" * 50 + "\n")
    return None


def change_dtypes(df):
    """
    change types of columns to reduce memory size
    :param df: dataframe
    :return df: dataframe
    """
    memory = df.memory_usage().sum() / 10**6
    print("Memory usage before changing types %0.2f MB" % memory)

    for col in df.columns:
        if (df[col].dtype == "object") and (df[col].nunique() < df.shape[0]):
            df[col] = df[col].astype("category")

        elif set(df[col].unique()) == set([0, 1]):
            df[col] = df[col].astype(bool)

        elif df[col].dtype == float:
            df[col] = df[col].astype(np.float32)

        elif df[col].dtype == int:
            df[col] = df[col].astype(np.int32)

    memory = df.memory_usage().sum() / 10 ** 6
    print("Memory usage after changing types %0.2f MB" % memory)
    return df

In [3]:
col_desc = pd.read_csv("data/data_/HomeCredit_columns_description.csv", encoding="ISO-8859-1")

# `application_train.csv`

This is the main table, broken into two files for Train (with TARGET) and Test (without TARGET).

Static data for all applications. One row represents one loan in our data sample.

In [5]:
application_train = pd.read_csv("data/data_/application_train.csv")
application_train = change_dtypes(application_train)
print("application_train.shape:", application_train.shape)
application_train.head()

Memory usage before changing types 300.13 MB
Memory usage after changing types 104.87 MB
application_train.shape: (307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,True,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,False,False,False,False,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,False,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,False,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,False,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,False,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,False,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,False,False,False,False,,,,,,
4,100007,False,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,False,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
application_train.dtypes

In [None]:
application_train["SK_ID_CURR"].is_unique

In [None]:
application_train["CNT_CHILDREN"].dtype

In [None]:
_input_output.print_col_desc(application_train, "application_{train|test}.csv", col_desc)

In [None]:
application_train.shape

# `bureau.csv`

All client's previous credits provided by other financial institutions that were reported to Credit Bureau (for clients who have a loan in our sample).

For every loan in our sample, there are as many rows as number of credits the client had in Credit Bureau before the application date.

In [None]:
bureau = pd.read_csv("data/download/bureau.csv")
print("bureau.shape:", bureau.shape)
bureau.head()

In [None]:
bureau["SK_ID_CURR"].is_unique

In [None]:
bureau["SK_ID_BUREAU"].is_unique

In [None]:
print_col_desc(bureau, "bureau.csv", col_desc)

# Description of columns in table `bureau_balance.csv`

Monthly balances of previous credits in Credit Bureau.

This table has one row for each month of history of every previous credit reported to Credit Bureau – i.e the table has (#loans in sample * # of relative previous credits * # of months where we have some history observable for the previous credits) rows.

In [None]:
bureau_balance = pd.read_csv("data/download/bureau_balance.csv")
print("bureau_balance.shape:", bureau_balance.shape)
bureau_balance.head()

In [None]:
bureau_balance["SK_ID_BUREAU"].is_unique

In [None]:
print_col_desc(bureau_balance, "bureau_balance.csv", col_desc)

# Description of columns in table `previous_application.csv`

All previous applications for Home Credit loans of clients who have loans in our sample.

There is one row for each previous application related to loans in our data sample.

In [None]:
previous_application = pd.read_csv("data/download/previous_application.csv")
print("previous_application.shape:", previous_application.shape)
previous_application.head()

In [None]:
previous_application["SK_ID_PREV"].is_unique

In [None]:
previous_application["SK_ID_CURR"].is_unique

In [None]:
print_col_desc(previous_application, "previous_application.csv", col_desc)

# Description of columns in table `POS_CASH_balance.csv`

Monthly balance snapshots of previous POS (point of sales) and cash loans that the applicant had with Home Credit.

This table has one row for each month of history of every previous credit in Home Credit (consumer credit and cash loans) related to loans in our sample – i.e. the table has (#loans in sample * # of relative previous credits * # of months in which we have some history observable for the previous credits) rows.

In [None]:
POS_CASH_balance = pd.read_csv("data/download/POS_CASH_balance.csv")
print("POS_CASH_balance.shape:", POS_CASH_balance.shape)
POS_CASH_balance.head()

In [None]:
[id_col for id_col in POS_CASH_balance.columns.values if id_col.startswith("SK_ID")]

In [None]:
POS_CASH_balance["SK_ID_PREV"].is_unique

In [None]:
POS_CASH_balance["SK_ID_CURR"].is_unique

In [None]:
POS_CASH_balance.groupby(by=["SK_ID_PREV"])["SK_ID_CURR"].count().head()

In [None]:
tmp = POS_CASH_balance.groupby(by=["SK_ID_PREV"])["SK_ID_CURR"].nunique()
print(tmp.head())

# so each "SK_ID_PREV" is corresponding to a unique "SK_ID_CURR"
tmp[tmp > 1]

In [None]:
print_col_desc(POS_CASH_balance, "POS_CASH_balance.csv", col_desc)

# Description of columns in table `credit_card_balance.csv`

Monthly balance snapshots of previous credit cards that the applicant has with Home Credit.

This table has one row for each month of history of every previous credit in Home Credit (consumer credit and cash loans) related to loans in our sample – i.e. the table has (#loans in sample * # of relative previous credit cards * # of months where we have some history observable for the previous credit card) rows.

In [None]:
credit_card_balance = pd.read_csv("data/download/credit_card_balance.csv")
print("credit_card_balance.shape:", credit_card_balance.shape)
credit_card_balance.head()

In [None]:
credit_card_balance["SK_ID_PREV"].is_unique

In [None]:
credit_card_balance["SK_ID_CURR"].is_unique

In [None]:
credit_card_balance.groupby(by=["SK_ID_PREV"])["SK_ID_CURR"].count().head()

In [None]:
tmp = credit_card_balance.groupby(by=["SK_ID_PREV"])["SK_ID_CURR"].nunique()
print(tmp.head())

# so each "SK_ID_PREV" is corresponding to a unique "SK_ID_CURR"
tmp[tmp > 1]

In [None]:
print_col_desc(credit_card_balance, "credit_card_balance.csv", col_desc)

# Description of columns in table  `installments_payments.csv`

Repayment history for the previously disbursed credits in Home Credit related to the loans in our sample.

There is a) one row for every payment that was made plus b) one row each for missed payment.

One row is equivalent to one payment of one installment OR one installment corresponding to one payment of one previous Home Credit credit related to loans in our sample.

In [None]:
installments_payments = pd.read_csv("data/download/installments_payments.csv")
print("installments_payments.shape:", installments_payments.shape)
installments_payments.head()

In [None]:
installments_payments["SK_ID_PREV"].is_unique

In [None]:
installments_payments["SK_ID_CURR"].is_unique

In [None]:
installments_payments.groupby(by=["SK_ID_PREV"])["SK_ID_CURR"].count().head()

In [None]:
tmp = installments_payments.groupby(by=["SK_ID_PREV"])["SK_ID_CURR"].nunique()
print(tmp.head())

# so each "SK_ID_PREV" is corresponding to a unique "SK_ID_CURR"
tmp[tmp > 1]

In [None]:
print_col_desc(installments_payments, "installments_payments.csv", col_desc)