In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

In [2]:
# Column-description lookup table used by print_col_desc below, which reads its
# "Table", "Row", "Description" and "Special" columns.
# The encoding is passed explicitly (presumably the file is not valid UTF-8 — TODO confirm).
col_desc = pd.read_csv("data/download/HomeCredit_columns_description.csv", 
                       encoding="ISO-8859-1")

In [3]:
def print_long_string(string, indent=0, max_line_len=10):
    """Print ``string`` wrapped onto indented lines of a fixed number of words.

    The text is split on whitespace and re-joined with single spaces; after
    every ``max_line_len`` words (a word count, not a character count) a line
    break followed by ``indent`` spaces is inserted.  Because the indent itself
    is joined to the words with a space, each printed line starts with
    ``indent + 1`` spaces, and a space precedes every inserted line break.
    """
    pad = " " * indent
    line_break = "\n" + pad

    pieces = [pad]
    for position, token in enumerate(string.split(), start=1):
        # Close the current line once it holds ``max_line_len`` words.
        pieces += [token, line_break] if position % max_line_len == 0 else [token]

    print(" ".join(pieces))

In [4]:
def print_col_desc(table, table_name, col_desc_table):
    """Print a per-column profile of ``table`` next to its documented description.

    For every column the report shows, in order: its position and name, the
    description and "Special" note looked up in ``col_desc_table``, the dtype,
    the number of nulls, the unique values (object/string and integer columns
    only, capped at the first 20) and the min/max (numeric columns only).

    Parameters
    ----------
    table : pandas.DataFrame
        The data to profile.
    table_name : str
        Value matched against ``col_desc_table["Table"]``.
    col_desc_table : pandas.DataFrame
        Lookup table with "Table", "Row", "Description" and "Special" columns;
        "Row" is matched against the column names of ``table``.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    dtype_checks = pd.api.types

    for i, col in enumerate(table.columns):
        series = table[col]
        dtype = series.dtype
        mask = (col_desc_table["Table"] == table_name) & (col_desc_table["Row"] == col)
        matches = col_desc_table.loc[mask, :]

        print("Column Number:", i)
        print("Column Name:", col)
        print("Description:")

        if len(matches) == 0:
            print(" "*10 + "No Description, Maybe the column name does not match.")
        else:
            # str() keeps a missing (NaN) description from crashing the wrapper.
            print_long_string(str(matches["Description"].iloc[0]), indent=10)
            print("Special:", matches["Special"].iloc[0])

        print("Type:", dtype)
        print("Number of NULL(s):", series.isnull().sum())

        # The removed NumPy aliases ``np.object`` / ``np.int`` are replaced by
        # dtype predicates.  These also accept every integer width (``np.int``
        # only matched the platform default) and pandas string dtypes.
        if (dtype_checks.is_object_dtype(dtype)
                or dtype_checks.is_string_dtype(dtype)
                or dtype_checks.is_integer_dtype(dtype)):
            # unique() keeps NaN as a value, so its length equals
            # nunique(dropna=False); computing it once avoids a second pass.
            values = series.unique()
            nunique = len(values)
            print("Number of Unique Values:", nunique)
            shown = ", ".join([str(s) for s in values[:20]])
            if nunique > 20:
                shown += " ..."
            print_long_string(shown, indent=28, max_line_len=5)

        # Booleans are excluded explicitly so they keep being skipped, as they
        # were by the original ``np.issubdtype(dtype, np.number)`` test.
        if dtype_checks.is_numeric_dtype(dtype) and not dtype_checks.is_bool_dtype(dtype):
            print("Min:", series.min())
            print("Max:", series.max())

        print("-"*50 + "\n")
    return None

# `application_train.csv`

This is the main table, broken into two files for Train (with TARGET) and Test (without TARGET).

Static data for all applications. One row represents one loan in our data sample.

In [5]:
# Load the main training table (one row per loan, includes TARGET),
# report its dimensions and preview the first rows.
application_train = pd.read_csv("data/download/application_train.csv")
print("application_train.shape:", application_train.shape)
application_train.head()

application_train.shape: (307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Check whether SK_ID_CURR identifies each row (each loan) uniquely.
application_train["SK_ID_CURR"].is_unique

True

In [7]:
# The table name must equal the value stored in col_desc["Table"]; the literal
# "application_{train|test}.csv" is the key that matches here (descriptions are found).
print_col_desc(application_train, "application_{train|test}.csv", col_desc)

Column Number: 0
Column Name: SK_ID_CURR
Description:
           ID of loan in our sample
Special: nan
Type: int64
Number of NULL(s): 0
Number of Unique Values: 307511
                             100002, 100003, 100004, 100006, 100007, 
                             100008, 100009, 100010, 100011, 100012, 
                             100014, 100015, 100016, 100017, 100018, 
                             100019, 100020, 100021, 100022, 100023 
                             ...
Min: 100002
Max: 456255
--------------------------------------------------

Column Number: 1
Column Name: TARGET
Description:
           Target variable (1 - client with payment difficulties: he/she had 
           late payment more than X days on at least one 
           of the first Y installments of the loan in our 
           sample, 0 - all other cases)
Special: nan
Type: int64
Number of NULL(s): 0
Number of Unique Values: 2
                             1, 0
Min: 0
Max: 1
--------------------------------------

Max: 1
--------------------------------------------------

Column Number: 23
Column Name: FLAG_EMP_PHONE
Description:
           Did client provide work phone (1=YES, 0=NO)
Special: nan
Type: int64
Number of NULL(s): 0
Number of Unique Values: 2
                             1, 0
Min: 0
Max: 1
--------------------------------------------------

Column Number: 24
Column Name: FLAG_WORK_PHONE
Description:
           Did client provide home phone (1=YES, 0=NO)
Special: nan
Type: int64
Number of NULL(s): 0
Number of Unique Values: 2
                             0, 1
Min: 0
Max: 1
--------------------------------------------------

Column Number: 25
Column Name: FLAG_CONT_MOBILE
Description:
           Was mobile phone reachable (1=YES, 0=NO)
Special: nan
Type: int64
Number of NULL(s): 0
Number of Unique Values: 2
                             1, 0
Min: 0
Max: 1
--------------------------------------------------

Column Number: 26
Column Name: FLAG_PHONE
Description:
           Did client pro

Number of NULL(s): 163891
Min: 0.0
Max: 1.0
--------------------------------------------------

Column Number: 50
Column Name: ENTRANCES_AVG
Description:
           Normalized information about building where the client lives, What is 
           average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment 
           size, common area, living area, age of building, number of 
           elevators, number of entrances, state of the building, number of 
           floor
Special: normalized
Type: float64
Number of NULL(s): 154828
Min: 0.0
Max: 1.0
--------------------------------------------------

Column Number: 51
Column Name: FLOORSMAX_AVG
Description:
           Normalized information about building where the client lives, What is 
           average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment 
           size, common area, living area, age of building, number of 
           elevators, number of entrances, state of the building, number of 
   

Number of NULL(s): 208642
Min: 0.0
Max: 1.0
--------------------------------------------------

Column Number: 67
Column Name: LANDAREA_MODE
Description:
           Normalized information about building where the client lives, What is 
           average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment 
           size, common area, living area, age of building, number of 
           elevators, number of entrances, state of the building, number of 
           floor
Special: normalized
Type: float64
Number of NULL(s): 182590
Min: 0.0
Max: 1.0
--------------------------------------------------

Column Number: 68
Column Name: LIVINGAPARTMENTS_MODE
Description:
           Normalized information about building where the client lives, What is 
           average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment 
           size, common area, living area, age of building, number of 
           elevators, number of entrances, state of the building, number

Number of NULL(s): 210295
Number of Unique Values: 5
                             reg oper account, nan, org 
                             spec account, reg oper spec 
                             account, not specified
--------------------------------------------------

Column Number: 87
Column Name: HOUSETYPE_MODE
Description:
           Normalized information about building where the client lives, What is 
           average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment 
           size, common area, living area, age of building, number of 
           elevators, number of entrances, state of the building, number of 
           floor
Special: normalized
Type: object
Number of NULL(s): 154297
Number of Unique Values: 4
                             block of flats, nan, terraced 
                             house, specific housing
--------------------------------------------------

Column Number: 88
Column Name: TOTALAREA_MODE
Description:
           Normalized i

                             0, 1
Min: 0
Max: 1
--------------------------------------------------

Column Number: 115
Column Name: FLAG_DOCUMENT_21
Description:
           Did client provide document 21
Special: nan
Type: int64
Number of NULL(s): 0
Number of Unique Values: 2
                             0, 1
Min: 0
Max: 1
--------------------------------------------------

Column Number: 116
Column Name: AMT_REQ_CREDIT_BUREAU_HOUR
Description:
           Number of enquiries to Credit Bureau about the client one 
           hour before application
Special: nan
Type: float64
Number of NULL(s): 41519
Min: 0.0
Max: 4.0
--------------------------------------------------

Column Number: 117
Column Name: AMT_REQ_CREDIT_BUREAU_DAY
Description:
           Number of enquiries to Credit Bureau about the client one 
           day before application (excluding one hour before application)
Special: nan
Type: float64
Number of NULL(s): 41519
Min: 0.0
Max: 9.0
---------------------------------------

# `bureau.csv`

All client's previous credits provided by other financial institutions that were reported to Credit Bureau (for clients who have a loan in our sample).

For every loan in our sample, there are as many rows as number of credits the client had in Credit Bureau before the application date.

In [8]:
# Load the Credit Bureau table (previous credits reported by other institutions),
# report its dimensions and preview the first rows.
bureau = pd.read_csv("data/download/bureau.csv")
print("bureau.shape:", bureau.shape)
bureau.head()

bureau.shape: (1716428, 17)


Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,


In [9]:
# A loan can have several bureau credits, so SK_ID_CURR may repeat in this table.
bureau["SK_ID_CURR"].is_unique

False

In [10]:
# Check whether SK_ID_BUREAU identifies each bureau credit (row) uniquely.
bureau["SK_ID_BUREAU"].is_unique

True

In [11]:
# NOTE(review): SK_ID_BUREAU is reported with "No Description" — its name apparently
# does not match any "Row" entry for this table in col_desc; verify against the CSV.
print_col_desc(bureau, "bureau.csv", col_desc)

Column Number: 0
Column Name: SK_ID_CURR
Description:
           ID of loan in our sample - one loan in 
           our sample can have 0,1,2 or more related previous credits 
           in credit bureau
Special: hashed
Type: int64
Number of NULL(s): 0
Number of Unique Values: 305811
                             215354, 162297, 402440, 238881, 222183, 
                             426155, 136226, 400486, 435112, 452585, 
                             389599, 242993, 303740, 234931, 311918, 
                             119939, 388421, 419892, 387080, 293201 
                             ...
Min: 100001
Max: 456255
--------------------------------------------------

Column Number: 1
Column Name: SK_ID_BUREAU
Description:
          No Description, Maybe the column name does not match.
Type: int64
Number of NULL(s): 0
Number of Unique Values: 1716428
                             5714462, 5714463, 5714464, 5714465, 5714466, 
                             5714467, 5714468, 5714469, 5714470, 5

# Description of columns in table `bureau_balance.csv`

Monthly balances of previous credits in Credit Bureau.

This table has one row for each month of history of every previous credit reported to Credit Bureau – i.e. the table has (#loans in sample * # of relative previous credits * # of months where we have some history observable for the previous credits) rows.

In [12]:
# Load the monthly balance history of the bureau credits,
# report its dimensions and preview the first rows.
bureau_balance = pd.read_csv("data/download/bureau_balance.csv")
print("bureau_balance.shape:", bureau_balance.shape)
bureau_balance.head()

bureau_balance.shape: (27299925, 3)


Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C


In [13]:
# One row per month of history per credit, so SK_ID_BUREAU may repeat in this table.
bureau_balance["SK_ID_BUREAU"].is_unique

False

In [14]:
# NOTE(review): SK_ID_BUREAU is reported with "No Description" — its name apparently
# does not match any "Row" entry for this table in col_desc; verify against the CSV.
print_col_desc(bureau_balance, "bureau_balance.csv", col_desc)

Column Number: 0
Column Name: SK_ID_BUREAU
Description:
          No Description, Maybe the column name does not match.
Type: int64
Number of NULL(s): 0
Number of Unique Values: 817395
                             5715448, 5715449, 5715451, 5715452, 5715453, 
                             5715454, 5715455, 5715456, 5715457, 5715458, 
                             5715459, 5715460, 5715461, 5715478, 5715479, 
                             5715480, 5715521, 5715529, 5715530, 5715531 
                             ...
Min: 5001709
Max: 6842888
--------------------------------------------------

Column Number: 1
Column Name: MONTHS_BALANCE
Description:
           Month of balance relative to application date (-1 means the 
           freshest balance date)
Special: time only relative to the application
Type: int64
Number of NULL(s): 0
Number of Unique Values: 97
                             0, -1, -2, -3, -4, 
                             -5, -6, -7, -8, -9, 
                             -10, 

# Description of columns in table `previous_application.csv`

All previous applications for Home Credit loans of clients who have loans in our sample.

There is one row for each previous application related to loans in our data sample.

In [15]:
# Load the previous Home Credit applications (one row per previous application),
# report its dimensions and preview the first rows.
previous_application = pd.read_csv("data/download/previous_application.csv")
print("previous_application.shape:", previous_application.shape)
previous_application.head()

previous_application.shape: (1670214, 37)


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,...,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,...,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,...,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,...,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,...,XNA,24.0,high,Cash Street: high,,,,,,


In [16]:
# Check whether SK_ID_PREV identifies each previous application (row) uniquely.
previous_application["SK_ID_PREV"].is_unique

True

In [17]:
# A current loan can relate to several previous applications, so SK_ID_CURR may repeat.
previous_application["SK_ID_CURR"].is_unique

False

In [18]:
# NOTE(review): SK_ID_PREV is reported with "No Description" — its name apparently
# does not match any "Row" entry for this table in col_desc; verify against the CSV.
print_col_desc(previous_application, "previous_application.csv", col_desc)

Column Number: 0
Column Name: SK_ID_PREV
Description:
          No Description, Maybe the column name does not match.
Type: int64
Number of NULL(s): 0
Number of Unique Values: 1670214
                             2030495, 2802425, 2523466, 2819243, 1784265, 
                             1383531, 2315218, 1656711, 2367563, 2579447, 
                             1715995, 2257824, 2330894, 1397919, 2273188, 
                             1232483, 2163253, 1285768, 2393109, 1173070 
                             ...
Min: 1000001
Max: 2845382
--------------------------------------------------

Column Number: 1
Column Name: SK_ID_CURR
Description:
           ID of loan in our sample
Special: hashed
Type: int64
Number of NULL(s): 0
Number of Unique Values: 338857
                             271877, 108129, 122040, 176158, 202054, 
                             199383, 175704, 296299, 342292, 334349, 
                             447712, 161140, 258628, 321676, 270658, 
                         

Number of NULL(s): 820405
Number of Unique Values: 8
                             nan, Unaccompanied, Spouse, partner, Family, 
                             Children, Other_B, Other_A, Group of 
                             people
--------------------------------------------------

Column Number: 21
Column Name: NAME_CLIENT_TYPE
Description:
           Was the client old or new client when applying for 
           the previous application
Special: nan
Type: object
Number of NULL(s): 0
Number of Unique Values: 4
                             Repeater, New, Refreshed, XNA
--------------------------------------------------

Column Number: 22
Column Name: NAME_GOODS_CATEGORY
Description:
           What kind of goods did the client apply for in 
           the previous application
Special: nan
Type: object
Number of NULL(s): 0
Number of Unique Values: 28
                             Mobile, XNA, Consumer Electronics, Construction 
                             Materials, Auto Accessories, Ph

# Description of columns in table `POS_CASH_balance.csv`

Monthly balance snapshots of previous POS (point of sales) and cash loans that the applicant had with Home Credit.

This table has one row for each month of history of every previous credit in Home Credit (consumer credit and cash loans) related to loans in our sample – i.e. the table has (#loans in sample * # of relative previous credits * # of months in which we have some history observable for the previous credits) rows.

In [19]:
# Load the monthly balance snapshots of previous POS and cash loans,
# report its dimensions and preview the first rows.
POS_CASH_balance = pd.read_csv("data/download/POS_CASH_balance.csv")
print("POS_CASH_balance.shape:", POS_CASH_balance.shape)
POS_CASH_balance.head()

POS_CASH_balance.shape: (10001358, 8)


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0


In [20]:
# List the key columns of this table, i.e. those whose name starts with "SK_ID".
[id_col for id_col in POS_CASH_balance.columns.values if id_col.startswith("SK_ID")]

['SK_ID_PREV', 'SK_ID_CURR']

In [21]:
# One row per month of history per previous credit, so SK_ID_PREV may repeat.
POS_CASH_balance["SK_ID_PREV"].is_unique

False

In [22]:
# Check whether SK_ID_CURR is unique in this table.
POS_CASH_balance["SK_ID_CURR"].is_unique

False

In [23]:
# Rows per SK_ID_PREV (count of non-null SK_ID_CURR values in each group).
POS_CASH_balance.groupby(by=["SK_ID_PREV"])["SK_ID_CURR"].count().head()

SK_ID_PREV
1000001     3
1000002     5
1000003     4
1000004     8
1000005    11
Name: SK_ID_CURR, dtype: int64

In [24]:
# Count how many distinct SK_ID_CURR values appear under each SK_ID_PREV.
tmp = POS_CASH_balance.groupby(by=["SK_ID_PREV"])["SK_ID_CURR"].nunique()
print(tmp.head())

# An empty result below means no SK_ID_PREV is linked to more than one
# SK_ID_CURR, i.e. each SK_ID_PREV corresponds to exactly one SK_ID_CURR.
tmp[tmp > 1]

SK_ID_PREV
1000001    1
1000002    1
1000003    1
1000004    1
1000005    1
Name: SK_ID_CURR, dtype: int64


Series([], Name: SK_ID_CURR, dtype: int64)

In [25]:
# NOTE(review): SK_ID_PREV is reported with "No Description" — its name apparently
# does not match any "Row" entry for this table in col_desc; verify against the CSV.
print_col_desc(POS_CASH_balance, "POS_CASH_balance.csv", col_desc)

Column Number: 0
Column Name: SK_ID_PREV
Description:
          No Description, Maybe the column name does not match.
Type: int64
Number of NULL(s): 0
Number of Unique Values: 936325
                             1803195, 1715348, 1784872, 1903291, 2341044, 
                             2207092, 1110516, 1387235, 1220500, 2371489, 
                             2328294, 2023570, 1258390, 1627166, 1711625, 
                             2374177, 1716688, 1846127, 1911764, 2645133 
                             ...
Min: 1000001
Max: 2843499
--------------------------------------------------

Column Number: 1
Column Name: SK_ID_CURR
Description:
           ID of loan in our sample
Special: nan
Type: int64
Number of NULL(s): 0
Number of Unique Values: 337252
                             182943, 367990, 397406, 269225, 334279, 
                             342166, 204376, 153211, 112740, 274851, 
                             287361, 237959, 278261, 146161, 131467, 
                             

# Description of columns in table `credit_card_balance.csv`

Monthly balance snapshots of previous credit cards that the applicant has with Home Credit.

This table has one row for each month of history of every previous credit in Home Credit (consumer credit and cash loans) related to loans in our sample – i.e. the table has (#loans in sample * # of relative previous credit cards * # of months where we have some history observable for the previous credit card) rows.

In [26]:
# Load the monthly balance snapshots of previous credit cards,
# report its dimensions and preview the first rows.
credit_card_balance = pd.read_csv("data/download/credit_card_balance.csv")
print("credit_card_balance.shape:", credit_card_balance.shape)
credit_card_balance.head()

credit_card_balance.shape: (3840312, 23)


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,2562384,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,...,0.0,0.0,0.0,1,0.0,1.0,35.0,Active,0,0
1,2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,...,64875.555,64875.555,1.0,1,0.0,0.0,69.0,Active,0,0
2,1740877,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.0,...,31460.085,31460.085,0.0,0,0.0,0.0,30.0,Active,0,0
3,1389973,337855,-4,236572.11,225000,2250.0,2250.0,0.0,0.0,11795.76,...,233048.97,233048.97,1.0,1,0.0,0.0,10.0,Active,0,0
4,1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,...,453919.455,453919.455,0.0,1,0.0,1.0,101.0,Active,0,0


In [27]:
# One row per month of history per credit card, so SK_ID_PREV may repeat.
credit_card_balance["SK_ID_PREV"].is_unique

False

In [28]:
# Check whether SK_ID_CURR is unique in this table.
credit_card_balance["SK_ID_CURR"].is_unique

False

In [29]:
# Rows per SK_ID_PREV (count of non-null SK_ID_CURR values in each group).
credit_card_balance.groupby(by=["SK_ID_PREV"])["SK_ID_CURR"].count().head()

SK_ID_PREV
1000018     5
1000030     8
1000031    16
1000035     5
1000077    11
Name: SK_ID_CURR, dtype: int64

In [30]:
# Count how many distinct SK_ID_CURR values appear under each SK_ID_PREV.
tmp = credit_card_balance.groupby(by=["SK_ID_PREV"])["SK_ID_CURR"].nunique()
print(tmp.head())

# An empty result below means no SK_ID_PREV is linked to more than one
# SK_ID_CURR, i.e. each SK_ID_PREV corresponds to exactly one SK_ID_CURR.
tmp[tmp > 1]

SK_ID_PREV
1000018    1
1000030    1
1000031    1
1000035    1
1000077    1
Name: SK_ID_CURR, dtype: int64


Series([], Name: SK_ID_CURR, dtype: int64)

In [31]:
# NOTE(review): SK_ID_PREV is reported with "No Description" — its name apparently
# does not match any "Row" entry for this table in col_desc; verify against the CSV.
print_col_desc(credit_card_balance, "credit_card_balance.csv", col_desc)

Column Number: 0
Column Name: SK_ID_PREV
Description:
          No Description, Maybe the column name does not match.
Type: int64
Number of NULL(s): 0
Number of Unique Values: 104307
                             2562384, 2582071, 1740877, 1389973, 1891521, 
                             2646502, 1079071, 2095912, 2181852, 1235299, 
                             1108284, 2740914, 1985699, 1441883, 1864742, 
                             2037377, 1816636, 1039039, 2016842, 1189095 
                             ...
Min: 1000018
Max: 2843496
--------------------------------------------------

Column Number: 1
Column Name: SK_ID_CURR
Description:
           ID of loan in our sample
Special: hashed
Type: int64
Number of NULL(s): 0
Number of Unique Values: 103558
                             378907, 363914, 371185, 337855, 126868, 
                             380010, 171320, 118650, 367360, 203885, 
                             209660, 340339, 302517, 171537, 303581, 
                          

# Description of columns in table `installments_payments.csv`

Repayment history for the previously disbursed credits in Home Credit related to the loans in our sample.

There is a) one row for every payment that was made plus b) one row for each missed payment.

One row is equivalent to one payment of one installment OR one installment corresponding to one payment of one previous Home Credit credit related to loans in our sample.

In [32]:
# Load the repayment history of previously disbursed credits,
# report its dimensions and preview the first rows.
installments_payments = pd.read_csv("data/download/installments_payments.csv")
print("installments_payments.shape:", installments_payments.shape)
installments_payments.head()

installments_payments.shape: (13605401, 8)


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585


In [33]:
# One row per payment / missed payment, so SK_ID_PREV may repeat in this table.
installments_payments["SK_ID_PREV"].is_unique

False

In [34]:
# Check whether SK_ID_CURR is unique in this table.
installments_payments["SK_ID_CURR"].is_unique

False

In [35]:
# Rows per SK_ID_PREV (count of non-null SK_ID_CURR values in each group).
installments_payments.groupby(by=["SK_ID_PREV"])["SK_ID_CURR"].count().head()

SK_ID_PREV
1000001     2
1000002     4
1000003     3
1000004     7
1000005    11
Name: SK_ID_CURR, dtype: int64

In [36]:
# Count how many distinct SK_ID_CURR values appear under each SK_ID_PREV.
tmp = installments_payments.groupby(by=["SK_ID_PREV"])["SK_ID_CURR"].nunique()
print(tmp.head())

# An empty result below means no SK_ID_PREV is linked to more than one
# SK_ID_CURR, i.e. each SK_ID_PREV corresponds to exactly one SK_ID_CURR.
tmp[tmp > 1]

SK_ID_PREV
1000001    1
1000002    1
1000003    1
1000004    1
1000005    1
Name: SK_ID_CURR, dtype: int64


Series([], Name: SK_ID_CURR, dtype: int64)

In [37]:
# NOTE(review): SK_ID_PREV is reported with "No Description" — its name apparently
# does not match any "Row" entry for this table in col_desc; verify against the CSV.
print_col_desc(installments_payments, "installments_payments.csv", col_desc)

Column Number: 0
Column Name: SK_ID_PREV
Description:
          No Description, Maybe the column name does not match.
Type: int64
Number of NULL(s): 0
Number of Unique Values: 997752
                             1054186, 1330831, 2085231, 2452527, 2714724, 
                             1137312, 2234264, 1818599, 2723183, 1413990, 
                             1782554, 2558880, 1570206, 1723268, 1594684, 
                             1456258, 1053286, 1410565, 2391610, 2329072 
                             ...
Min: 1000001
Max: 2843499
--------------------------------------------------

Column Number: 1
Column Name: SK_ID_CURR
Description:
           ID of loan in our sample
Special: hashed
Type: int64
Number of NULL(s): 0
Number of Unique Values: 339587
                             161674, 151639, 193053, 199697, 167756, 
                             164489, 184693, 111420, 112102, 109741, 
                             106597, 154793, 147645, 197273, 100193, 
                          