In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

# Helper functions

In [2]:
def print_long_string(string, indent=0, max_words_per_line=10):
    """
    :param string: str
    :param indent: int
    :param max_words_per_line: int
    :return: None
    """
    words = [" "*indent]
    for i, word in enumerate(string.split()):
        words.append(word)
        if (i+1) % max_words_per_line == 0:
            words.append("\n" + " "*indent)
    print(" ".join(words))
    return None


def print_col_desc(df, table_name, col_desc_df):
    """
    To print out the information of columns of df. This information is taken from col_desc_df
    :param df: dataframe
    :param table_name: str
    :param col_desc_table:
    :return: None
    """
    nrows = df.shape[0]
    all_cols = df.columns.to_list()
    cat_cols = df.select_dtypes(["category", "object", "bool"]).columns.to_list()
    num_cols = [col for col in all_cols if col not in cat_cols]

    for i, col in enumerate(df.columns):
        mask = (col_desc_df["Table"] == table_name) & (col_desc_df["Row"] == col)
        print("Column Number:", i)
        print("Column Name:", col)
        print("Description:")

        if len(col_desc_df.loc[mask, :]) == 0:
            print(" " * 10 + "No Description, Maybe the column name does not match.")
        else:
            print_long_string(col_desc_df.loc[mask, "Description"].iloc[0], indent=10)
            print("Special:", col_desc_df.loc[mask, "Special"].iloc[0])

        print("Type:", df[col].dtype)
        n_null = np.sum(df[col].isnull())
        print("Number of rows:", nrows)
        print("Number of NULL(s):", n_null)
        print("Percent of NULL(s): %.2f%%" % (n_null / nrows * 100))

        nunique = df[col].nunique(dropna=False)
        print("Number of Unique Values:", nunique)
        if nunique <= 20:
            print_long_string(", ".join([str(s) for s in df[col].unique()]),
                                indent=28, max_words_per_line=5)
        else:
            print_long_string(", ".join([str(s) for s in df[col].unique()[:20]]) + " ...",
                                indent=28, max_words_per_line=5)

        if col in num_cols:
            print("Min:", df[col].min())
            print("Max:", df[col].max())
        else:
            print("Mode: ", df[col].mode().values)

        print("-" * 50 + "\n")
    return None


def change_dtypes(df):
    """
    change types of columns to reduce memory size
    :param df: dataframe
    :return df: dataframe
    """
    memory = df.memory_usage().sum() / 10**6
    print("Memory usage before changing types %0.2f MB" % memory)

    for col in df.columns:
        if (df[col].dtype == "object") and (df[col].nunique() < df.shape[0]):
            df[col] = df[col].astype("category")

        elif set(df[col].unique()) == set([0, 1]):
            df[col] = df[col].astype(bool)

        elif df[col].dtype == float:
            df[col] = df[col].astype(np.float32)

        elif df[col].dtype == int:
            df[col] = df[col].astype(np.int32)

    memory = df.memory_usage().sum() / 10 ** 6
    print("Memory usage after changing types %0.2f MB" % memory)
    return df

In [3]:
col_desc = pd.read_csv("data/download/HomeCredit_columns_description.csv", encoding="ISO-8859-1")

# Understanding columns of dataframes

## `application_train.csv`

This is the main table, broken into two files for Train (with TARGET) and Test (without TARGET).

Static data for all applications. One row represents one loan in our data sample.

In [4]:
application_train = pd.read_csv("data/download/application_train.csv")
application_train = change_dtypes(application_train)
print("application_train.shape:", application_train.shape)
application_train.head()

Memory usage before changing types 300.13 MB
Memory usage after changing types 104.87 MB
application_train.shape: (307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,True,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,False,False,False,False,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,False,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,False,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,False,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,False,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,False,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,False,False,False,False,,,,,,
4,100007,False,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,False,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
application_train.shape

(307511, 122)

In [6]:
application_train.dtypes

SK_ID_CURR                       int32
TARGET                            bool
NAME_CONTRACT_TYPE            category
CODE_GENDER                   category
FLAG_OWN_CAR                  category
                                ...   
AMT_REQ_CREDIT_BUREAU_DAY      float32
AMT_REQ_CREDIT_BUREAU_WEEK     float32
AMT_REQ_CREDIT_BUREAU_MON      float32
AMT_REQ_CREDIT_BUREAU_QRT      float32
AMT_REQ_CREDIT_BUREAU_YEAR     float32
Length: 122, dtype: object

In [7]:
application_train["SK_ID_CURR"].is_unique

True

In [8]:
print_col_desc(application_train, "application_{train|test}.csv", col_desc)

Column Number: 0
Column Name: SK_ID_CURR
Description:
           ID of loan in our sample
Special: nan
Type: int32
Number of rows: 307511
Number of NULL(s): 0
Percent of NULL(s): 0.00%
Number of Unique Values: 307511
                             100002, 100003, 100004, 100006, 100007, 
                             100008, 100009, 100010, 100011, 100012, 
                             100014, 100015, 100016, 100017, 100018, 
                             100019, 100020, 100021, 100022, 100023 
                             ...
Min: 100002
Max: 456255
--------------------------------------------------

Column Number: 1
Column Name: TARGET
Description:
           Target variable (1 - client with payment difficulties: he/she had 
           late payment more than X days on at least one 
           of the first Y installments of the loan in our 
           sample, 0 - all other cases)
Special: nan
Type: bool
Number of rows: 307511
Number of NULL(s): 0
Percent of NULL(s): 0.00%
Number of Unique

Number of Unique Values: 2
                             False, True
Mode:  [False]
--------------------------------------------------

Column Number: 28
Column Name: OCCUPATION_TYPE
Description:
           What kind of occupation does the client have
Special: nan
Type: category
Number of rows: 307511
Number of NULL(s): 96391
Percent of NULL(s): 31.35%
Number of Unique Values: 19
                             Laborers, Core staff, Accountants, Managers, 
                             nan, Drivers, Sales staff, Cleaning 
                             staff, Cooking staff, Private service 
                             staff, Medicine staff, Security staff, 
                             High skill tech staff, Waiters/barmen 
                             staff, Low-skill Laborers, Realty agents, 
                             Secretaries, IT staff, HR staff 
                            
Mode:  ['Laborers']
Categories (18, object): ['Accountants', 'Cleaning staff', 'Cooking staff', 'Core staff',

Min: 0.0
Max: 1.0
--------------------------------------------------

Column Number: 53
Column Name: LANDAREA_AVG
Description:
           Normalized information about building where the client lives, What is 
           average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment 
           size, common area, living area, age of building, number of 
           elevators, number of entrances, state of the building, number of 
           floor
Special: normalized
Type: float32
Number of rows: 307511
Number of NULL(s): 182590
Percent of NULL(s): 59.38%
Number of Unique Values: 3528
                             0.0369, 0.013, nan, 0.0135, 0.0931, 
                             0.1758, 0.0279, 0.0534, 0.0898, 0.2371, 
                             0.0861, 0.0498, 0.0265, 0.0872, 0.1569, 
                             0.038, 0.0888, 0.0566, 0.1404, 0.0142 
                             ...
Min: 0.0
Max: 1.0
--------------------------------------------------

Column Number: 54
Co

                             0.0144, 0.0497, nan, 0.0587, 0.1153, 
                             0.0018, 0.0121, 0.0, 0.0214, 0.001, 
                             0.019, 0.0057, 0.0143, 0.083, 0.0533, 
                             0.0388, 0.0029, 0.0006, 0.0097, 0.0131 
                             ...
Min: 0.0
Max: 1.0
--------------------------------------------------

Column Number: 63
Column Name: ELEVATORS_MODE
Description:
           Normalized information about building where the client lives, What is 
           average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment 
           size, common area, living area, age of building, number of 
           elevators, number of entrances, state of the building, number of 
           floor
Special: normalized
Type: float32
Number of rows: 307511
Number of NULL(s): 163891
Percent of NULL(s): 53.30%
Number of Unique Values: 27
                             0.0, 0.0806, nan, 0.1611, 0.4028, 
                             0

                             0.025, 0.0968, nan, 0.0833, 0.1489, 
                             0.3529, 0.0281, 0.0729, 0.0916, 0.1457, 
                             0.1447, 0.0167, 0.152, 0.0125, 0.3706, 
                             0.0937, 0.0958, 0.0822, 0.1322, 0.2207 
                             ...
Min: 0.0
Max: 1.0
--------------------------------------------------

Column Number: 73
Column Name: BASEMENTAREA_MEDI
Description:
           Normalized information about building where the client lives, What is 
           average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment 
           size, common area, living area, age of building, number of 
           elevators, number of entrances, state of the building, number of 
           floor
Special: normalized
Type: float32
Number of rows: 307511
Number of NULL(s): 179943
Percent of NULL(s): 58.52%
Number of Unique Values: 3773
                             0.0369, 0.0529, nan, 0.0973, 0.1335, 
                  

Min: 0.0
Max: 1.0
--------------------------------------------------

Column Number: 83
Column Name: LIVINGAREA_MEDI
Description:
           Normalized information about building where the client lives, What is 
           average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment 
           size, common area, living area, age of building, number of 
           elevators, number of entrances, state of the building, number of 
           floor
Special: normalized
Type: float32
Number of rows: 307511
Number of NULL(s): 154350
Percent of NULL(s): 50.19%
Number of Unique Values: 5282
                             0.0193, 0.0558, nan, 0.0792, 0.1422, 
                             0.3842, 0.0295, 0.063, 0.0889, 0.151, 
                             0.1239, 0.0682, 0.0096, 0.1437, 0.0152, 
                             0.262, 0.0613, 0.0648, 0.0592, 0.0938 
                             ...
Min: 0.0
Max: 1.0
--------------------------------------------------

Column Number: 84


                             2.0, 1.0, 0.0, 4.0, 8.0, 
                             10.0, nan, 7.0, 3.0, 6.0, 
                             5.0, 12.0, 9.0, 13.0, 11.0, 
                             14.0, 21.0, 15.0, 22.0, 16.0 
                             ...
Min: 0.0
Max: 344.0
--------------------------------------------------

Column Number: 94
Column Name: DEF_60_CNT_SOCIAL_CIRCLE
Description:
           How many observation of client's social surroundings defaulted on 60 
           (days past due) DPD
Special: nan
Type: float32
Number of rows: 307511
Number of NULL(s): 1021
Percent of NULL(s): 0.33%
Number of Unique Values: 10
                             2.0, 0.0, 1.0, nan, 3.0, 
                             5.0, 4.0, 7.0, 24.0, 6.0 
                            
Min: 0.0
Max: 24.0
--------------------------------------------------

Column Number: 95
Column Name: DAYS_LAST_PHONE_CHANGE
Description:
           How many days before application did client change phone
Special: nan


In [9]:
application_test = pd.read_csv("data/download/application_test.csv")
application_test = change_dtypes(application_test)

Memory usage before changing types 47.18 MB
Memory usage after changing types 18.19 MB


In [10]:
nrows_train_test = application_train.shape[0] + application_test.shape[0]
nrows_train_test

356255

## `bureau.csv`

All client's previous credits provided by other financial institutions that were reported to Credit Bureau (for clients who have a loan in our sample).

For every loan in our sample, there are as many rows as number of credits the client had in Credit Bureau before the application date.

In [11]:
bureau = pd.read_csv("data/download/bureau.csv")
bureau = change_dtypes(bureau)
print("bureau.shape:", bureau.shape)
bureau.head()

Memory usage before changing types 233.43 MB
Memory usage after changing types 101.27 MB
bureau.shape: (1716428, 17)


Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,


In [12]:
bureau["SK_ID_CURR"].is_unique

False

In [13]:
bureau["SK_ID_CURR"].nunique()

305811

In [14]:
# there are about 24% of applications do not have bureau data
bureau["SK_ID_CURR"].nunique() / nrows_train_test

0.8584047943186762

In [15]:
bureau["SK_ID_BUREAU"].is_unique

True

In [16]:
print_col_desc(bureau, "bureau.csv", col_desc)

Column Number: 0
Column Name: SK_ID_CURR
Description:
           ID of loan in our sample - one loan in 
           our sample can have 0,1,2 or more related previous credits 
           in credit bureau
Special: hashed
Type: int32
Number of rows: 1716428
Number of NULL(s): 0
Percent of NULL(s): 0.00%
Number of Unique Values: 305811
                             215354, 162297, 402440, 238881, 222183, 
                             426155, 136226, 400486, 435112, 452585, 
                             389599, 242993, 303740, 234931, 311918, 
                             119939, 388421, 419892, 387080, 293201 
                             ...
Min: 100001
Max: 456255
--------------------------------------------------

Column Number: 1
Column Name: SK_ID_BUREAU
Description:
          No Description, Maybe the column name does not match.
Type: int32
Number of rows: 1716428
Number of NULL(s): 0
Percent of NULL(s): 0.00%
Number of Unique Values: 1716428
                             5714462, 571

                             -131, -20, -16, -21, -31, 
                             -22, -1710, -840, -690, -706, 
                             -185, -2601, -984, -4, -7, 
                             -71, -694, -210, -24, -2559 
                             ...
Min: -41947
Max: 372
--------------------------------------------------

Column Number: 16
Column Name: AMT_ANNUITY
Description:
           Annuity of the Credit Bureau credit
Special: nan
Type: float32
Number of rows: 1716428
Number of NULL(s): 1226791
Percent of NULL(s): 71.47%
Number of Unique Values: 40322
                             nan, 0.0, 2691.0, 24462.0, 8181.0, 
                             8061.21, 13131.0, 40522.004, 26550.0, 67995.0, 
                             43668.0, 15321.87, 10491.885, 27000.0, 8245.395, 
                             12162.15, 3337.11, 16200.0, 14301.0, 13396.5 
                             ...
Min: 0.0
Max: 118453420.0
--------------------------------------------------



## `bureau_balance.csv`

Monthly balances of previous credits in Credit Bureau.

This table has one row for each month of history of every previous credit reported to Credit Bureau – i.e the table has (#loans in sample * # of relative previous credits * # of months where we have some history observable for the previous credits) rows.

In [18]:
bureau_balance = pd.read_csv("data/download/bureau_balance.csv")
print("bureau_balance.shape:", bureau_balance.shape)
bureau_balance.head()

bureau_balance.shape: (27299925, 3)


Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C


In [19]:
bureau_balance["SK_ID_BUREAU"].is_unique

False

In [20]:
# only half of the applications in bureau have month records in bureau_balance
bureau_balance["SK_ID_BUREAU"].nunique() / bureau.shape[0]

0.4762186354452386

In [21]:
print_col_desc(bureau_balance, "bureau_balance.csv", col_desc)

Column Number: 0
Column Name: SK_ID_BUREAU
Description:
          No Description, Maybe the column name does not match.
Type: int64
Number of rows: 27299925
Number of NULL(s): 0
Percent of NULL(s): 0.00%
Number of Unique Values: 817395
                             5715448, 5715449, 5715451, 5715452, 5715453, 
                             5715454, 5715455, 5715456, 5715457, 5715458, 
                             5715459, 5715460, 5715461, 5715478, 5715479, 
                             5715480, 5715521, 5715529, 5715530, 5715531 
                             ...
Min: 5001709
Max: 6842888
--------------------------------------------------

Column Number: 1
Column Name: MONTHS_BALANCE
Description:
           Month of balance relative to application date (-1 means the 
           freshest balance date)
Special: time only relative to the application
Type: int64
Number of rows: 27299925
Number of NULL(s): 0
Percent of NULL(s): 0.00%
Number of Unique Values: 97
                             0,

## `previous_application.csv`

All previous applications for Home Credit loans of clients who have loans in our sample.

There is one row for each previous application related to loans in our data sample.

In [22]:
previous_application = pd.read_csv("data/download/previous_application.csv")
previous_application = change_dtypes(previous_application)
print("previous_application.shape:", previous_application.shape)
previous_application.head()

Memory usage before changing types 494.38 MB
Memory usage after changing types 162.02 MB
previous_application.shape: (1670214, 37)


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.430054,17145.0,17145.0,0.0,17145.0,SATURDAY,15,...,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615234,607500.0,679671.0,,607500.0,THURSDAY,11,...,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735352,112500.0,136444.5,,112500.0,TUESDAY,11,...,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335938,450000.0,470790.0,,450000.0,MONDAY,7,...,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.394531,337500.0,404055.0,,337500.0,THURSDAY,9,...,XNA,24.0,high,Cash Street: high,,,,,,


In [23]:
previous_application["SK_ID_PREV"].is_unique

True

In [24]:
previous_application["SK_ID_CURR"].is_unique

False

In [27]:
# about 5% of current applications are compleltely new, having no previous applications at Home Credit.
previous_application["SK_ID_CURR"].nunique() / nrows_train_test

0.9511641941867482

In [28]:
print_col_desc(previous_application, "previous_application.csv", col_desc)

Column Number: 0
Column Name: SK_ID_PREV
Description:
          No Description, Maybe the column name does not match.
Type: int32
Number of rows: 1670214
Number of NULL(s): 0
Percent of NULL(s): 0.00%
Number of Unique Values: 1670214
                             2030495, 2802425, 2523466, 2819243, 1784265, 
                             1383531, 2315218, 1656711, 2367563, 2579447, 
                             1715995, 2257824, 2330894, 1397919, 2273188, 
                             1232483, 2163253, 1285768, 2393109, 1173070 
                             ...
Min: 1000001
Max: 2845382
--------------------------------------------------

Column Number: 1
Column Name: SK_ID_CURR
Description:
           ID of loan in our sample
Special: hashed
Type: int32
Number of rows: 1670214
Number of NULL(s): 0
Percent of NULL(s): 0.00%
Number of Unique Values: 338857
                             271877, 108129, 122040, 176158, 202054, 
                             199383, 175704, 296299, 342292, 3343

                             0.86733615, nan, 0.83509517, 0.5687104, 0.8451374, 
                             0.852537, 0.71564484, 0.6379493, 0.8208245, 0.4244186, 
                             0.8324524, 0.6448203, 0.5137421, 0.5428118, 0.3731501, 
                             0.8065539, 0.5021142, 0.6374207, 0.7251586, 0.7806554 
                             ...
Min: 0.3731501
Max: 1.0
--------------------------------------------------

Column Number: 15
Column Name: NAME_CASH_LOAN_PURPOSE
Description:
           Purpose of the cash loan
Special: nan
Type: category
Number of rows: 1670214
Number of NULL(s): 0
Percent of NULL(s): 0.00%
Number of Unique Values: 25
                             XAP, XNA, Repairs, Everyday expenses, 
                             Car repairs, Building a house 
                             or an annex, Other, Journey, 
                             Purchase of electronic equipment, Medicine, 
                             Payments on other loans, Urgent 
   

Min: 0.0
Max: 84.0
--------------------------------------------------

Column Number: 29
Column Name: NAME_YIELD_GROUP
Description:
           Grouped interest rate into small medium and high of the 
           previous application
Special: grouped
Type: category
Number of rows: 1670214
Number of NULL(s): 0
Percent of NULL(s): 0.00%
Number of Unique Values: 5
                             middle, low_action, high, low_normal, XNA 
                            
Mode:  ['XNA']
Categories (5, object): ['XNA', 'high', 'low_action', 'low_normal', 'middle']
--------------------------------------------------

Column Number: 30
Column Name: PRODUCT_COMBINATION
Description:
           Detailed product combination of the previous application
Special: nan
Type: category
Number of rows: 1670214
Number of NULL(s): 346
Percent of NULL(s): 0.02%
Number of Unique Values: 18
                             POS mobile with interest, Cash 
                             X-Sell: low, Cash X-Sell: high, 
        

In [33]:
application_train["NAME_CONTRACT_TYPE"].unique()

['Cash loans', 'Revolving loans']
Categories (2, object): ['Cash loans', 'Revolving loans']

In [43]:
previous_application["NAME_CONTRACT_TYPE"].unique()

['Consumer loans', 'Cash loans', 'Revolving loans', 'XNA']
Categories (4, object): ['Consumer loans', 'Cash loans', 'Revolving loans', 'XNA']

In [40]:
(previous_application["NAME_CONTRACT_TYPE"] == "Consumer loans").mean()

0.4365614226679934

In [39]:
previous_application[previous_application["NAME_CONTRACT_TYPE"] == "Consumer loans"]

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.430054,17145.0,17145.0,0.0,17145.0,SATURDAY,15,...,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
13,1397919,321676,Consumer loans,7654.859863,53779.5,57564.0,0.0,53779.5,SUNDAY,15,...,Consumer electronics,8.0,low_action,POS household without interest,365243.0,-378.0,-168.0,-168.0,-163.0,1.0
14,2273188,270658,Consumer loans,9644.219727,26550.0,27252.0,0.0,26550.0,SATURDAY,10,...,Consumer electronics,3.0,middle,POS household with interest,365243.0,-693.0,-633.0,-633.0,-627.0,0.0
15,1232483,151612,Consumer loans,21307.455078,126490.5,119853.0,12649.5,126490.5,TUESDAY,7,...,Industry,6.0,low_normal,POS other with interest,365243.0,-668.0,-518.0,-518.0,-512.0,0.0
16,2163253,154602,Consumer loans,4187.339844,26955.0,27297.0,1350.0,26955.0,SATURDAY,12,...,Consumer electronics,8.0,high,POS household with interest,365243.0,-1440.0,-1230.0,-1230.0,-1226.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1670202,2205099,157707,Consumer loans,4334.354980,22207.5,23854.5,0.0,22207.5,THURSDAY,10,...,Clothing,6.0,middle,POS industry with interest,365243.0,-948.0,-798.0,-858.0,-850.0,0.0
1670203,2844282,383898,Consumer loans,,14791.5,14791.5,0.0,14791.5,TUESDAY,13,...,Connectivity,,XNA,POS mobile with interest,,,,,,
1670209,2300464,352015,Consumer loans,14704.290039,267295.5,311400.0,0.0,267295.5,WEDNESDAY,12,...,Furniture,30.0,low_normal,POS industry with interest,365243.0,-508.0,362.0,-358.0,-351.0,0.0
1670210,2357031,334635,Consumer loans,6622.020020,87750.0,64291.5,29250.0,87750.0,TUESDAY,15,...,Furniture,12.0,middle,POS industry with interest,365243.0,-1604.0,-1274.0,-1304.0,-1297.0,0.0


## `POS_CASH_balance.csv`

Monthly balance snapshots of previous POS (point of sales) and cash loans that the applicant had with Home Credit.

This table has one row for each month of history of every previous credit in Home Credit (consumer credit and cash loans) related to loans in our sample – i.e. the table has (#loans in sample * # of relative previous credits * # of months in which we have some history observable for the previous credits) rows.

In [44]:
POS_CASH_balance = pd.read_csv("data/download/POS_CASH_balance.csv")
POS_CASH_balance = change_dtypes(POS_CASH_balance)
print("POS_CASH_balance.shape:", POS_CASH_balance.shape)
POS_CASH_balance.head()

Memory usage before changing types 640.09 MB
Memory usage after changing types 290.04 MB
POS_CASH_balance.shape: (10001358, 8)


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0


In [45]:
[id_col for id_col in POS_CASH_balance.columns.values if id_col.startswith("SK_ID")]

['SK_ID_PREV', 'SK_ID_CURR']

In [46]:
POS_CASH_balance["SK_ID_PREV"].is_unique

False

In [47]:
POS_CASH_balance["SK_ID_CURR"].is_unique

False

In [50]:
# about 44% of previous applications do not have monthly data in POS_CASH_balance
POS_CASH_balance["SK_ID_PREV"].nunique() / previous_application.shape[0]

0.5606018150967481

In [48]:
POS_CASH_balance.groupby(by=["SK_ID_PREV"])["SK_ID_CURR"].count().head()

SK_ID_PREV
1000001     3
1000002     5
1000003     4
1000004     8
1000005    11
Name: SK_ID_CURR, dtype: int64

In [51]:
tmp = POS_CASH_balance.groupby(by=["SK_ID_PREV"])["SK_ID_CURR"].nunique()
print(tmp.head())

# so each "SK_ID_PREV" is corresponding to a unique "SK_ID_CURR"
tmp[tmp > 1]

SK_ID_PREV
1000001    1
1000002    1
1000003    1
1000004    1
1000005    1
Name: SK_ID_CURR, dtype: int64


Series([], Name: SK_ID_CURR, dtype: int64)

In [52]:
print_col_desc(POS_CASH_balance, "POS_CASH_balance.csv", col_desc)

Column Number: 0
Column Name: SK_ID_PREV
Description:
          No Description, Maybe the column name does not match.
Type: int32
Number of rows: 10001358
Number of NULL(s): 0
Percent of NULL(s): 0.00%
Number of Unique Values: 936325
                             1803195, 1715348, 1784872, 1903291, 2341044, 
                             2207092, 1110516, 1387235, 1220500, 2371489, 
                             2328294, 2023570, 1258390, 1627166, 1711625, 
                             2374177, 1716688, 1846127, 1911764, 2645133 
                             ...
Min: 1000001
Max: 2843499
--------------------------------------------------

Column Number: 1
Column Name: SK_ID_CURR
Description:
           ID of loan in our sample
Special: nan
Type: int32
Number of rows: 10001358
Number of NULL(s): 0
Percent of NULL(s): 0.00%
Number of Unique Values: 337252
                             182943, 367990, 397406, 269225, 334279, 
                             342166, 204376, 153211, 112740, 274851

In [65]:
group = POS_CASH_balance.groupby(["SK_ID_PREV"])
count = 0
for name, df in group:
    print(df[["MONTHS_BALANCE"]].sort_values(by="MONTHS_BALANCE", ascending=False).head())
    count += 1
    if count > 5:
        break

         MONTHS_BALANCE
45995                -8
8470736              -9
6030662             -10
         MONTHS_BALANCE
2792302             -50
2658581             -51
2549591             -52
3346102             -53
4467804             -54
         MONTHS_BALANCE
7085917              -1
9852178              -2
6496374              -3
6584537              -4
         MONTHS_BALANCE
4078063             -22
5534254             -23
4978909             -24
5674882             -25
514734              -26
         MONTHS_BALANCE
7852623             -46
6387329             -47
2703957             -48
229709              -49
8657924             -50
         MONTHS_BALANCE
7452784              -1
7240168              -2
9660900              -3
9396411              -4
6185246              -5


In [63]:
POS_CASH_balance.sort_values?

## `credit_card_balance.csv`

Monthly balance snapshots of previous credit cards that the applicant has with Home Credit.

This table has one row for each month of history of every previous credit in Home Credit (consumer credit and cash loans) related to loans in our sample – i.e. the table has (#loans in sample * # of relative previous credit cards * # of months where we have some history observable for the previous credit card) rows.

In [66]:
credit_card_balance = pd.read_csv("data/download/credit_card_balance.csv")
credit_card_balance = change_dtypes(credit_card_balance)
print("credit_card_balance.shape:", credit_card_balance.shape)
credit_card_balance.head()

Memory usage before changing types 706.62 MB
Memory usage after changing types 341.79 MB
credit_card_balance.shape: (3840312, 23)


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,2562384,378907,-6,56.970001,135000,0.0,877.5,0.0,877.5,1700.324951,...,0.0,0.0,0.0,1,0.0,1.0,35.0,Active,0,0
1,2582071,363914,-1,63975.554688,45000,2250.0,2250.0,0.0,0.0,2250.0,...,64875.554688,64875.554688,1.0,1,0.0,0.0,69.0,Active,0,0
2,1740877,371185,-7,31815.224609,450000,0.0,0.0,0.0,0.0,2250.0,...,31460.085938,31460.085938,0.0,0,0.0,0.0,30.0,Active,0,0
3,1389973,337855,-4,236572.109375,225000,2250.0,2250.0,0.0,0.0,11795.759766,...,233048.96875,233048.96875,1.0,1,0.0,0.0,10.0,Active,0,0
4,1891521,126868,-1,453919.46875,450000,0.0,11547.0,0.0,11547.0,22924.890625,...,453919.46875,453919.46875,0.0,1,0.0,1.0,101.0,Active,0,0


In [67]:
credit_card_balance["SK_ID_PREV"].is_unique

False

In [68]:
credit_card_balance["SK_ID_CURR"].is_unique

False

In [70]:
# Only 6% of previous applications having records in credit_card_balance.csv
credit_card_balance["SK_ID_PREV"].nunique() / previous_application.shape[0]

0.06245127869841829

In [69]:
credit_card_balance.groupby(by=["SK_ID_PREV"])["SK_ID_CURR"].count().head()

SK_ID_PREV
1000018     5
1000030     8
1000031    16
1000035     5
1000077    11
Name: SK_ID_CURR, dtype: int64

In [71]:
tmp = credit_card_balance.groupby(by=["SK_ID_PREV"])["SK_ID_CURR"].nunique()
print(tmp.head())

# so each "SK_ID_PREV" is corresponding to a unique "SK_ID_CURR"
tmp[tmp > 1]

SK_ID_PREV
1000018    1
1000030    1
1000031    1
1000035    1
1000077    1
Name: SK_ID_CURR, dtype: int64


Series([], Name: SK_ID_CURR, dtype: int64)

In [72]:
print_col_desc(credit_card_balance, "credit_card_balance.csv", col_desc)

Column Number: 0
Column Name: SK_ID_PREV
Description:
          No Description, Maybe the column name does not match.
Type: int32
Number of rows: 3840312
Number of NULL(s): 0
Percent of NULL(s): 0.00%
Number of Unique Values: 104307
                             2562384, 2582071, 1740877, 1389973, 1891521, 
                             2646502, 1079071, 2095912, 2181852, 1235299, 
                             1108284, 2740914, 1985699, 1441883, 1864742, 
                             2037377, 1816636, 1039039, 2016842, 1189095 
                             ...
Min: 1000018
Max: 2843496
--------------------------------------------------

Column Number: 1
Column Name: SK_ID_CURR
Description:
           ID of loan in our sample
Special: hashed
Type: int32
Number of rows: 3840312
Number of NULL(s): 0
Percent of NULL(s): 0.00%
Number of Unique Values: 103558
                             378907, 363914, 371185, 337855, 126868, 
                             380010, 171320, 118650, 367360, 20388

Number of Unique Values: 1195807
                             0.0, 60175.08, 26926.426, 224949.28, 443044.4, 
                             80519.04, 345433.88, 44735.31, 285376.4, 192793.28, 
                             92237.35, 121430.7, 128518.02, 68954.805, 440329.06, 
                             43650.855, 67341.87, 85086.945, 448552.12, 146858.0 
                             ...
Min: -423305.8
Max: 1472316.8
--------------------------------------------------

Column Number: 13
Column Name: AMT_RECIVABLE
Description:
           Amount receivable on the previous credit
Special: nan
Type: float32
Number of rows: 3840312
Number of NULL(s): 0
Percent of NULL(s): 0.00%
Number of Unique Values: 1338857
                             0.0, 64875.555, 31460.086, 233048.97, 453919.47, 
                             82773.31, 351881.16, 47962.125, 286831.56, 197224.69, 
                             94224.06, 127891.94, 131294.34, 71554.45, 451429.2, 
                             44068.23, 687

##  `installments_payments.csv`

Repayment history for the previously disbursed credits in Home Credit related to the loans in our sample.

There is a) one row for every payment that was made plus b) one row each for missed payment.

One row is equivalent to one payment of one installment OR one installment corresponding to one payment of one previous Home Credit credit related to loans in our sample.

In [73]:
installments_payments = pd.read_csv("data/download/installments_payments.csv")
installments_payments = change_dtypes(installments_payments)
print("installments_payments.shape:", installments_payments.shape)
installments_payments.head()

Memory usage before changing types 870.75 MB
Memory usage after changing types 435.37 MB
installments_payments.shape: (13605401, 8)


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.359863,6948.359863
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525024,1716.525024
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.130859,24350.130859
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.040039,2160.584961


In [74]:
installments_payments["SK_ID_PREV"].is_unique

False

In [75]:
installments_payments["SK_ID_CURR"].is_unique

False

In [78]:
# about 40% of previous applications do not have records in installments_payments
installments_payments["SK_ID_PREV"].nunique() / previous_application.shape[0]

0.5973797369678376

In [79]:
installments_payments.groupby(by=["SK_ID_PREV"])["SK_ID_CURR"].count().head()

SK_ID_PREV
1000001     2
1000002     4
1000003     3
1000004     7
1000005    11
Name: SK_ID_CURR, dtype: int64

In [80]:
tmp = installments_payments.groupby(by=["SK_ID_PREV"])["SK_ID_CURR"].nunique()
print(tmp.head())

# so each "SK_ID_PREV" is corresponding to a unique "SK_ID_CURR"
tmp[tmp > 1]

SK_ID_PREV
1000001    1
1000002    1
1000003    1
1000004    1
1000005    1
Name: SK_ID_CURR, dtype: int64


Series([], Name: SK_ID_CURR, dtype: int64)

In [81]:
print_col_desc(installments_payments, "installments_payments.csv", col_desc)

Column Number: 0
Column Name: SK_ID_PREV
Description:
          No Description, Maybe the column name does not match.
Type: int32
Number of rows: 13605401
Number of NULL(s): 0
Percent of NULL(s): 0.00%
Number of Unique Values: 997752
                             1054186, 1330831, 2085231, 2452527, 2714724, 
                             1137312, 2234264, 1818599, 2723183, 1413990, 
                             1782554, 2558880, 1570206, 1723268, 1594684, 
                             1456258, 1053286, 1410565, 2391610, 2329072 
                             ...
Min: 1000001
Max: 2843499
--------------------------------------------------

Column Number: 1
Column Name: SK_ID_CURR
Description:
           ID of loan in our sample
Special: hashed
Type: int32
Number of rows: 13605401
Number of NULL(s): 0
Percent of NULL(s): 0.00%
Number of Unique Values: 339587
                             161674, 151639, 193053, 199697, 167756, 
                             164489, 184693, 111420, 112102, 109