In [1]:
import os, math, subprocess
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Settings for displaying pandas results in the notebook output.
pd.set_option('display.width', 2000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.precision', 4)
# Never truncate cell contents. Newer pandas spells "no limit" as None and
# rejects the legacy -1 sentinel; older pandas only accepts an int, so fall
# back to -1 there. (The former intermediate value of 200 was immediately
# overridden and has been dropped.)
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# Load data and explore statistics

In [2]:
# List everything in the dataset folder (long format, human-readable sizes, hidden entries included).
!ls -lah home-credit-default-risk/

total 2.5G
drwxrwsr-x 2 hong ubuntu 4.0K Apr 16 16:38 .
drwxrwsr-x 4 hong ubuntu 4.0K Apr 16 16:39 ..
-rw-rw-r-- 1 hong ubuntu  26M Jun 26  2018 application_test.csv
-rw-rw-r-- 1 hong ubuntu 159M Jun 26  2018 application_train.csv
-rw-rw-r-- 1 hong ubuntu 359M Jun 26  2018 bureau_balance.csv
-rw-rw-r-- 1 hong ubuntu 163M Jun 26  2018 bureau.csv
-rw-rw-r-- 1 hong ubuntu 405M Jun 26  2018 credit_card_balance.csv
-rw-rw-r-- 1 hong ubuntu  37K Jun 26  2018 HomeCredit_columns_description.csv
-rw-rw-r-- 1 hong ubuntu 690M Jun 26  2018 installments_payments.csv
-rw-rw-r-- 1 hong ubuntu 375M Jun 26  2018 POS_CASH_balance.csv
-rw-rw-r-- 1 hong ubuntu 387M Jun 26  2018 previous_application.csv
-rw-rw-r-- 1 hong ubuntu 524K Jun 26  2018 sample_submission.csv


In [3]:
# Collect the data files to explore, sorted by name.
# os.listdir is used instead of parsing `ls -lah` output: the parsed version
# depended on the exact number of header lines and broke on file names that
# contain whitespace. Only .csv entries are kept because every entry is later
# passed to pd.read_csv; the column-description file is documentation, not data.
ls_files = sorted(
    f for f in os.listdir("home-credit-default-risk")
    if f.endswith(".csv") and f != "HomeCredit_columns_description.csv"
)
ls_files

[u'POS_CASH_balance.csv',
 u'application_test.csv',
 u'application_train.csv',
 u'bureau.csv',
 u'bureau_balance.csv',
 u'credit_card_balance.csv',
 u'installments_payments.csv',
 u'previous_application.csv',
 u'sample_submission.csv']

In [4]:
def _format_count_pct(count, total):
    """Format ``count`` and its share of ``total`` as ``"<count> (<pct>%)"``."""
    # A frame without rows has nothing to take a share of; report 0% rather
    # than dividing by zero.
    pct = count * 100.0 / total if total else 0.0
    return "{} ({:0.2f}%)".format(count, pct)


def exploring_stats(pdf_input, nsample=10, random_state=None):
    """Build a per-column profiling report for a DataFrame.

    Prints the number of rows and columns, then returns one row per input
    column describing its dtype, distinct / missing / negative / zero counts,
    the ``describe()`` statistics and a few randomly sampled values.

    Parameters
    ----------
    pdf_input : pandas.DataFrame
        Frame to profile.
    nsample : int, default 10
        Number of random rows to show as ``sample_<i>`` columns. When the
        frame has fewer rows, all of them are shown.
    random_state : int or numpy.random.RandomState, optional
        Forwarded to ``DataFrame.sample`` so the sampled rows can be made
        reproducible.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with columns ``sub_type``, ``n_distinct``,
        ``n_miss``, ``n_negative``, ``n_zeros``, ``25%``, ``50%``, ``75%``,
        ``count``, ``max``, ``mean``, ``min``, ``std`` followed by the
        ``sample_<i>`` columns. The count columns are strings of the form
        ``"<count> (<pct>%)"``; the statistics are ``0.0`` for columns that
        are not numeric.
    """
    # check rows, cols
    total_records, total_columns = pdf_input.shape
    # Single formatted argument so the line prints the same on Python 2 and 3
    # (a two-argument print shows up as a tuple under Python 2).
    print("Total records: {}".format(total_records))
    print("Total columns: {}".format(total_columns))

    columns = list(pdf_input.columns)

    # check distinct (nunique ignores missing values)
    ls_ndist = [
        _format_count_pct(pdf_input[cname].nunique(), total_records)
        for cname in columns
    ]

    # check missing
    ls_nmiss = [
        _format_count_pct(pdf_input[cname].isnull().sum(), total_records)
        for cname in columns
    ]

    # check zeros
    ls_zeros = []
    for cname in columns:
        try:
            nzeros = (pdf_input[cname] == 0).sum()
        except (TypeError, ValueError):
            # Column type cannot be compared with a number: report no zeros.
            nzeros = 0
        ls_zeros.append(_format_count_pct(nzeros, total_records))

    # check negative
    ls_neg = []
    for cname in columns:
        try:
            nneg = (pdf_input[cname].astype("float") < 0).sum()
        except (TypeError, ValueError):
            # Column cannot be cast to float (e.g. text): report no negatives.
            nneg = 0
        ls_neg.append(_format_count_pct(nneg, total_records))

    # prepare output
    data = {
        "name": columns,
        "sub_type": list(pdf_input.dtypes),
        "n_distinct": ls_ndist,
        "n_miss": ls_nmiss,
        "n_zeros": ls_zeros,
        "n_negative": ls_neg,
    }

    # check stats
    # Describe the numeric columns explicitly: on a frame without any numeric
    # column a plain describe() switches to object statistics (count / unique /
    # top / freq) and the numeric statistics expected below would be missing.
    stat_names = ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
    pdf_numeric = pdf_input.select_dtypes(include="number")
    if pdf_numeric.shape[1]:
        pdf_stats = pdf_numeric.describe().transpose()
    else:
        pdf_stats = pd.DataFrame()
    for stat in stat_names:
        data[stat] = [
            pdf_stats.loc[cname, stat] if cname in pdf_stats.index else 0.0
            for cname in columns
        ]

    # take samples
    # Draw the rows directly instead of sampling half the frame and keeping
    # the head; cap at the number of available rows so small frames work too.
    n_take = min(nsample, total_records)
    pdf_sample = pdf_input.sample(n=n_take, random_state=random_state).transpose()
    pdf_sample.columns = ["sample_{}".format(i) for i in range(n_take)]

    # output
    col_ordered = ["sub_type", "n_distinct", "n_miss", "n_negative", "n_zeros",
                   "25%", "50%", "75%", "count", "max", "mean", "min", "std"] + list(pdf_sample.columns)
    pdf_data = pd.DataFrame(data).set_index("name")
    pdf_data = pd.concat([pdf_data, pdf_sample], axis=1)
    pdf_data = pdf_data[col_ordered]

    return pdf_data

In [5]:
%%time
# Build one profiling report per data file, keyed by file name.
ls_report = {}
for f in ls_files:
    print("Exploring {}".format(f))
    data_path = os.path.join("home-credit-default-risk", f)
    # The whole file is read into memory; pdf_data is rebound on every iteration.
    pdf_data = pd.read_csv(data_path)
    ls_report[f] = exploring_stats(pdf_data)
    # Show the report for this file in the cell output.
    display(ls_report[f])

Exploring POS_CASH_balance.csv
('Total records:', 10001358)
('Total columns:', 8)


Unnamed: 0_level_0,sub_type,n_distinct,n_miss,n_negative,n_zeros,25%,50%,75%,count,max,mean,min,std,sample_0,sample_1,sample_2,sample_3,sample_4,sample_5,sample_6,sample_7,sample_8,sample_9
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
SK_ID_PREV,int64,936325 (9.36%),0 (0.00%),0 (0.00%),0 (0.00%),1434405.0,1896565.0,2368963.0,10001000.0,2843499.0,1903200.0,1000001.0,535846.5307,2749119,1439762,2404083,1417828,1975679,2465369,2059772,1570568,2313395,2556181
SK_ID_CURR,int64,337252 (3.37%),0 (0.00%),0 (0.00%),0 (0.00%),189550.0,278654.0,367429.0,10001000.0,456255.0,278400.0,100001.0,102763.7451,136725,412165,394468,230896,290188,301648,415723,432016,154158,280427
MONTHS_BALANCE,int64,96 (0.00%),0 (0.00%),10001358 (100.00%),0 (0.00%),-54.0,-28.0,-13.0,10001000.0,-1.0,-35.013,-96.0,26.0666,-40,-11,-24,-6,-63,-12,-2,-92,-21,-22
CNT_INSTALMENT,float64,73 (0.00%),26071 (0.26%),0 (0.00%),0 (0.00%),10.0,12.0,24.0,9975300.0,92.0,17.09,1.0,11.9951,12,12,10,8,10,36,12,10,60,18
CNT_INSTALMENT_FUTURE,float64,79 (0.00%),26087 (0.26%),0 (0.00%),1185960 (11.86%),3.0,7.0,14.0,9975300.0,85.0,10.484,0.0,11.1091,1,3,10,8,1,17,1,10,51,17
NAME_CONTRACT_STATUS,object,9 (0.00%),0 (0.00%),0 (0.00%),0 (0.00%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Active,Active,Active,Active,Active,Active,Active,Active,Active,Active
SK_DPD,int64,3400 (0.03%),0 (0.00%),0 (0.00%),9706131 (97.05%),0.0,0.0,0.0,10001000.0,4231.0,11.607,0.0,132.714,0,0,0,0,0,0,0,0,0,0
SK_DPD_DEF,int64,2307 (0.02%),0 (0.00%),0 (0.00%),9887389 (98.86%),0.0,0.0,0.0,10001000.0,3595.0,0.65447,0.0,32.7625,0,0,0,0,0,0,0,0,0,0


Exploring application_test.csv
('Total records:', 48744)
('Total columns:', 121)


Unnamed: 0_level_0,sub_type,n_distinct,n_miss,n_negative,n_zeros,25%,50%,75%,count,max,mean,min,std,sample_0,sample_1,sample_2,sample_3,sample_4,sample_5,sample_6,sample_7,sample_8,sample_9
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
SK_ID_CURR,int64,48744 (100.00%),0 (0.00%),0 (0.00%),0 (0.00%),188557.75,277549.0,367555.5,48744.0,456250.0,277800.0,100000.0,103169.5473,413769,267284,260484,359319,152020,211553,183878,303497,285833,261749
NAME_CONTRACT_TYPE,object,2 (0.00%),0 (0.00%),0 (0.00%),0 (0.00%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cash loans,Cash loans,Cash loans,Cash loans,Cash loans,Cash loans,Cash loans,Cash loans,Cash loans,Cash loans
CODE_GENDER,object,2 (0.00%),0 (0.00%),0 (0.00%),0 (0.00%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,F,F,F,F,F,M,F,M,F,F
FLAG_OWN_CAR,object,2 (0.00%),0 (0.00%),0 (0.00%),0 (0.00%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Y,N,N,N,N,N,Y,Y,N,N
FLAG_OWN_REALTY,object,2 (0.00%),0 (0.00%),0 (0.00%),0 (0.00%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Y,N,Y,N,N,N,N,Y,N,Y
CNT_CHILDREN,int64,11 (0.02%),0 (0.00%),0 (0.00%),34685 (71.16%),0.0,0.0,1.0,48744.0,20.0,0.39705,0.0,0.709,0,0,0,0,0,0,0,1,0,0
AMT_INCOME_TOTAL,float64,606 (1.24%),0 (0.00%),0 (0.00%),0 (0.00%),112500.0,157500.0,225000.0,48744.0,4410000.0,178430.0,26942.0,101522.5915,1.125e+05,9e+04,3.375e+05,1.8e+05,9e+04,1.35e+05,1.125e+05,2.475e+05,6.75e+04,8.1e+04
AMT_CREDIT,float64,2937 (6.03%),0 (0.00%),0 (0.00%),0 (0.00%),260640.0,450000.0,675000.0,48744.0,2245500.0,516740.0,45000.0,365397.0042,3.769e+05,2.725e+05,1.305e+06,3.6e+05,2.37e+05,2.7e+05,6.75e+05,3.820e+05,6.75e+05,3.644e+05
AMT_ANNUITY,float64,7491 (15.37%),24 (0.05%),0 (0.00%),0 (0.00%),17973.0,26199.0,37390.5,48720.0,180580.0,29426.0,2295.0,16016.3683,1.937e+04,2.166e+04,4.21e+04,2.202e+04,1.528e+04,2.476e+04,3.46e+04,2.35e+04,2.545e+04,1.545e+04
AMT_GOODS_PRICE,float64,677 (1.39%),0 (0.00%),0 (0.00%),0 (0.00%),225000.0,396000.0,630000.0,48744.0,2245500.0,462620.0,45000.0,336710.215,2.7e+05,2.25e+05,1.305e+06,3.6e+05,1.8e+05,2.7e+05,6.75e+05,3.375e+05,6.75e+05,2.385e+05


Exploring application_train.csv
('Total records:', 307511)
('Total columns:', 122)


Unnamed: 0_level_0,sub_type,n_distinct,n_miss,n_negative,n_zeros,25%,50%,75%,count,max,mean,min,std,sample_0,sample_1,sample_2,sample_3,sample_4,sample_5,sample_6,sample_7,sample_8,sample_9
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
SK_ID_CURR,int64,307511 (100.00%),0 (0.00%),0 (0.00%),0 (0.00%),189145.5,278202.0,367142.5,307511.0,456260.0,278180.0,100000.0,102790.1753,326682,414578,432657,346257,169928,228494,305986,450918,393627,121604
TARGET,int64,2 (0.00%),0 (0.00%),0 (0.00%),282686 (91.93%),0.0,0.0,0.0,307511.0,1.0,0.080729,0.0,0.2724,0,1,0,0,1,0,0,1,0,0
NAME_CONTRACT_TYPE,object,2 (0.00%),0 (0.00%),0 (0.00%),0 (0.00%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cash loans,Cash loans,Revolving loans,Cash loans,Cash loans,Cash loans,Cash loans,Cash loans,Cash loans,Cash loans
CODE_GENDER,object,3 (0.00%),0 (0.00%),0 (0.00%),0 (0.00%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,M,M,M,M,M,F,M,F,F,F
FLAG_OWN_CAR,object,2 (0.00%),0 (0.00%),0 (0.00%),0 (0.00%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Y,Y,N,N,N,N,N,Y,N,N
FLAG_OWN_REALTY,object,2 (0.00%),0 (0.00%),0 (0.00%),0 (0.00%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Y,N,Y,Y,Y,Y,Y,N,Y,Y
CNT_CHILDREN,int64,15 (0.00%),0 (0.00%),0 (0.00%),215371 (70.04%),0.0,0.0,1.0,307511.0,19.0,0.41705,0.0,0.7221,0,1,0,0,0,0,0,1,2,0
AMT_INCOME_TOTAL,float64,2548 (0.83%),0 (0.00%),0 (0.00%),0 (0.00%),112500.0,147150.0,202500.0,307511.0,117000000.0,168800.0,25650.0,237123.1463,1.665e+05,4.5e+05,1.575e+05,1.35e+05,2.025e+05,1.17e+05,9e+04,1.8e+05,1.575e+05,9e+04
AMT_CREDIT,float64,5603 (1.82%),0 (0.00%),0 (0.00%),0 (0.00%),270000.0,513531.0,808650.0,307511.0,4050000.0,599030.0,45000.0,402490.777,2.547e+05,1.381e+06,4.5e+05,1.764e+06,2.038e+05,2.547e+05,5.387e+05,6.3e+05,6.795e+05,7.552e+05
AMT_ANNUITY,float64,13672 (4.45%),12 (0.00%),0 (0.00%),0 (0.00%),16524.0,24903.0,34596.0,307499.0,258030.0,27109.0,1615.5,14493.7373,2.519e+04,3.971e+04,2.25e+04,4.851e+04,1.623e+04,2.519e+04,2.605e+04,2.327e+04,3.62e+04,3.646e+04


Exploring bureau.csv
('Total records:', 1716428)
('Total columns:', 17)


Unnamed: 0_level_0,sub_type,n_distinct,n_miss,n_negative,n_zeros,25%,50%,75%,count,max,mean,min,std,sample_0,sample_1,sample_2,sample_3,sample_4,sample_5,sample_6,sample_7,sample_8,sample_9
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
SK_ID_CURR,int64,305811 (17.82%),0 (0.00%),0 (0.00%),0 (0.00%),188870.0,278055.0,367430.0,1716428.0,456260.0,278210.0,100000.0,102940.0,226678,158734,202284,275757,308924,298539,220126,180625,152418,134343
SK_ID_BUREAU,int64,1716428 (100.00%),0 (0.00%),0 (0.00%),0 (0.00%),5464000.0,5926303.5,6385700.0,1716428.0,6843500.0,5924400.0,5000000.0,532270.0,6173729,6281010,6584010,6680033,5347959,5746640,6356593,6387149,6351262,5829930
CREDIT_ACTIVE,object,4 (0.00%),0 (0.00%),0 (0.00%),0 (0.00%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Closed,Closed,Closed,Active,Closed,Closed,Closed,Closed,Active,Active
CREDIT_CURRENCY,object,4 (0.00%),0 (0.00%),0 (0.00%),0 (0.00%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,currency 1,currency 1,currency 1,currency 1,currency 1,currency 1,currency 1,currency 1,currency 1,currency 1
DAYS_CREDIT,int64,2923 (0.17%),0 (0.00%),1716403 (100.00%),25 (0.00%),-1666.0,-987.0,-474.0,1716428.0,0.0,-1142.1,-2922.0,795.16,-1467,-2507,-643,-924,-304,-711,-2742,-405,-1615,-16
CREDIT_DAY_OVERDUE,int64,942 (0.05%),0 (0.00%),0 (0.00%),1712211 (99.75%),0.0,0.0,0.0,1716428.0,2792.0,0.81817,0.0,36.544,0,0,0,0,0,0,0,0,0,0
DAYS_CREDIT_ENDDATE,float64,14096 (0.82%),105553 (6.15%),1007389 (58.69%),883 (0.05%),-1138.0,-330.0,474.0,1610875.0,31199.0,510.52,-42060.0,4994.2,-676,-2202,-97,163,,-619,-2374,326,955,1068
DAYS_ENDDATE_FACT,float64,2917 (0.17%),633653 (36.92%),1082711 (63.08%),64 (0.00%),-1489.0,-897.0,-425.0,1082775.0,0.0,-1017.4,-42023.0,714.01,-676,-2230,-97,,-279,-619,-2374,-160,,
AMT_CREDIT_MAX_OVERDUE,float64,68251 (3.98%),1124488 (65.51%),0 (0.00%),470650 (27.42%),0.0,0.0,0.0,591940.0,115990000.0,3825.4,0.0,206030.0,,0,,,,,,,,
CNT_CREDIT_PROLONG,int64,10 (0.00%),0 (0.00%),0 (0.00%),1707314 (99.47%),0.0,0.0,0.0,1716428.0,9.0,0.0064104,0.0,0.096224,0,0,0,0,0,0,0,0,0,0


Exploring bureau_balance.csv
('Total records:', 27299925)
('Total columns:', 3)


Unnamed: 0_level_0,sub_type,n_distinct,n_miss,n_negative,n_zeros,25%,50%,75%,count,max,mean,min,std,sample_0,sample_1,sample_2,sample_3,sample_4,sample_5,sample_6,sample_7,sample_8,sample_9
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
SK_ID_BUREAU,int64,817395 (2.99%),0 (0.00%),0 (0.00%),0 (0.00%),5730933.0,6070821.0,6431951.0,27300000.0,6842888.0,6036300.0,5001709.0,492348.8569,5162830,5115417,5999691,5898774,5797891,6492595,6809852,6053213,6169803,6686363
MONTHS_BALANCE,int64,97 (0.00%),0 (0.00%),26688960 (97.76%),610965 (2.24%),-46.0,-25.0,-11.0,27300000.0,0.0,-30.742,-96.0,23.8645,-26,-39,-69,-25,-36,-1,-43,-27,-15,-30
STATUS,object,8 (0.00%),0 (0.00%),0 (0.00%),0 (0.00%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,X,1,X,X,C,C,0,C,C,0


Exploring credit_card_balance.csv
('Total records:', 3840312)
('Total columns:', 23)


Unnamed: 0_level_0,sub_type,n_distinct,n_miss,n_negative,n_zeros,25%,50%,75%,count,max,mean,min,std,sample_0,sample_1,sample_2,sample_3,sample_4,sample_5,sample_6,sample_7,sample_8,sample_9
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
SK_ID_PREV,int64,104307 (2.72%),0 (0.00%),0 (0.00%),0 (0.00%),1434400.0,1897122.0,2369300.0,3840312.0,2843500.0,1904500.0,1000000.0,536469.4706,2033976,1298413,1590798,1979274,1903431,2022496,2498831,1235207,2837796,2399399
SK_ID_CURR,int64,103558 (2.70%),0 (0.00%),0 (0.00%),0 (0.00%),189520.0,278396.0,367580.0,3840312.0,456250.0,278320.0,100010.0,102704.4751,379258,281213,422598,393320,223935,240743,167235,359955,102048,387413
MONTHS_BALANCE,int64,96 (0.00%),0 (0.00%),3840312 (100.00%),0 (0.00%),-55.0,-28.0,-11.0,3840312.0,-1.0,-34.522,-96.0,26.6678,-6,-68,-5,-1,-41,-60,-2,-71,-26,-65
AMT_BALANCE,float64,1347904 (35.10%),0 (0.00%),2345 (0.06%),2156420 (56.15%),0.0,0.0,89047.0,3840312.0,1505900.0,58300.0,-420250.0,106307.031,4.221e+04,0,0,0,1.817e+05,0,1.879e+05,0,0,1.717e+04
AMT_CREDIT_LIMIT_ACTUAL,int64,181 (0.00%),0 (0.00%),0 (0.00%),753823 (19.63%),45000.0,112500.0,180000.0,3840312.0,1350000.0,153810.0,0.0,165145.6995,225000,67500,135000,0,180000,0,180000,0,90000,90000
AMT_DRAWINGS_ATM_CURRENT,float64,2267 (0.06%),749816 (19.52%),1 (0.00%),2665718 (69.41%),0.0,0.0,0.0,3090496.0,2115000.0,5961.3,-6827.3,28225.6886,0,0,,0,9000,0,2250,0,0,0
AMT_DRAWINGS_CURRENT,float64,187005 (4.87%),0 (0.00%),3 (0.00%),3223443 (83.94%),0.0,0.0,0.0,3840312.0,2287100.0,7433.4,-6211.6,33846.0773,2601,0,0,0,9000,0,2250,0,0,0
AMT_DRAWINGS_OTHER_CURRENT,float64,1832 (0.05%),749816 (19.52%),0 (0.00%),3078163 (80.15%),0.0,0.0,0.0,3090496.0,1529800.0,288.17,0.0,8201.9893,0,0,,0,0,0,0,0,0,0
AMT_DRAWINGS_POS_CURRENT,float64,168748 (4.39%),749816 (19.52%),0 (0.00%),2825595 (73.58%),0.0,0.0,0.0,3090496.0,2239300.0,2968.8,0.0,20796.887,2601,0,,0,0,0,0,0,0,0
AMT_INST_MIN_REGULARITY,float64,312266 (8.13%),305236 (7.95%),0 (0.00%),1928864 (50.23%),0.0,0.0,6633.9,3535076.0,202880.0,3540.2,0.0,5600.1541,2250,0,0,0,9000,0,9338,0,0,4500


Exploring installments_payments.csv
('Total records:', 13605401)
('Total columns:', 8)


Unnamed: 0_level_0,sub_type,n_distinct,n_miss,n_negative,n_zeros,25%,50%,75%,count,max,mean,min,std,sample_0,sample_1,sample_2,sample_3,sample_4,sample_5,sample_6,sample_7,sample_8,sample_9
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
SK_ID_PREV,int64,997752 (7.33%),0 (0.00%),0 (0.00%),0 (0.00%),1434200.0,1896500.0,2369100.0,13605000.0,2843500.0,1903400.0,1000001.0,536202.9055,1434300.0,1614000.0,1503500.0,2674600.0,2473148.0,1289400.0,2177200.0,1157500.0,1515366.0,1767561.0
SK_ID_CURR,int64,339587 (2.50%),0 (0.00%),0 (0.00%),0 (0.00%),189640.0,278680.0,367530.0,13605000.0,456260.0,278440.0,100001.0,102718.3104,149430.0,178200.0,371770.0,124630.0,355290.0,369560.0,131840.0,114880.0,369258.0,315860.0
NUM_INSTALMENT_VERSION,float64,65 (0.00%),0 (0.00%),0 (0.00%),4082498 (30.01%),0.0,1.0,1.0,13605000.0,178.0,0.85664,0.0,1.0352,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,2.0,1.0
NUM_INSTALMENT_NUMBER,int64,277 (0.00%),0 (0.00%),0 (0.00%),0 (0.00%),4.0,8.0,19.0,13605000.0,277.0,18.871,1.0,26.6641,5.0,7.0,8.0,1.0,63.0,5.0,3.0,1.0,15.0,5.0
DAYS_INSTALMENT,float64,2922 (0.02%),0 (0.00%),13605401 (100.00%),0 (0.00%),-1654.0,-818.0,-361.0,13605000.0,-1.0,-1042.3,-2922.0,800.9463,-401.0,-2258.0,-410.0,-553.0,-1816.0,-775.0,-395.0,-616.0,-450.0,-727.0
DAYS_ENTRY_PAYMENT,float64,3039 (0.02%),2905 (0.02%),13602496 (99.98%),0 (0.00%),-1662.0,-827.0,-370.0,13602000.0,-1.0,-1051.1,-4921.0,800.5859,-413.0,-2264.0,-412.0,-558.0,-1829.0,-791.0,-399.0,-630.0,-466.0,-735.0
AMT_INSTALMENT,float64,902539 (6.63%),0 (0.00%),0 (0.00%),290 (0.00%),4226.1,8884.1,16710.0,13605000.0,3771500.0,17051.0,0.0,50570.2544,11093.0,24050.0,4784.0,20124.0,7875.0,16709.0,11969.0,12441.0,294150.6,10495.8
AMT_PAYMENT,float64,944235 (6.94%),2905 (0.02%),0 (0.00%),1440 (0.01%),3398.3,8125.5,16108.0,13602000.0,3771500.0,17238.0,0.0,54735.784,11093.0,24050.0,4784.0,20124.0,7875.0,16709.0,11969.0,12441.0,294150.6,10495.8


Exploring previous_application.csv
('Total records:', 1670214)
('Total columns:', 37)


Unnamed: 0_level_0,sub_type,n_distinct,n_miss,n_negative,n_zeros,25%,50%,75%,count,max,mean,min,std,sample_0,sample_1,sample_2,sample_3,sample_4,sample_5,sample_6,sample_7,sample_8,sample_9
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
SK_ID_PREV,int64,1670214 (100.00%),0 (0.00%),0 (0.00%),0 (0.00%),1461900.0,1923100.0,2384300.0,1670214.0,2845400.0,1923100.0,1000000.0,532597.9587,1092395,1252745,1426776,1649070,1937880,2786742,2369053,2100244,2751684,1685570
SK_ID_CURR,int64,338857 (20.29%),0 (0.00%),0 (0.00%),0 (0.00%),189330.0,278710.0,367510.0,1670214.0,456260.0,278360.0,100000.0,102814.8238,334586,307326,164677,289639,316386,325443,329812,151568,184899,428430
NAME_CONTRACT_TYPE,object,4 (0.00%),0 (0.00%),0 (0.00%),0 (0.00%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Revolving loans,Cash loans,Cash loans,Consumer loans,Consumer loans,Cash loans,Consumer loans,Revolving loans,Cash loans,Consumer loans
AMT_ANNUITY,float64,357959 (21.43%),372235 (22.29%),0 (0.00%),1637 (0.10%),6321.8,11250.0,20658.0,1297979.0,418060.0,15955.0,0.0,14782.1373,2250,,,6263,7556,,1.26e+04,2.025e+04,2.758e+04,1.289e+04
AMT_APPLICATION,float64,93885 (5.62%),0 (0.00%),0 (0.00%),392402 (23.49%),18720.0,71046.0,180360.0,1670214.0,6905200.0,175230.0,0.0,292779.7624,4.5e+04,0,0,4.364e+04,7.308e+04,0,1.35e+05,0,4.5e+05,1.179e+05
AMT_CREDIT,float64,86803 (5.20%),1 (0.00%),0 (0.00%),336768 (20.16%),24160.0,80541.0,216420.0,1670213.0,6905200.0,196110.0,0.0,318574.6165,4.5e+04,0,0,5.328e+04,7.308e+04,0,1.35e+05,4.05e+05,4.916e+05,1.146e+05
AMT_DOWN_PAYMENT,float64,29278 (1.75%),895844 (53.64%),2 (0.00%),369854 (22.14%),0.0,1638.0,7740.0,774370.0,3060000.0,6697.4,-0.9,20921.4954,,,,0,0,,0,,,2.357e+04
AMT_GOODS_PRICE,float64,93885 (5.62%),385515 (23.08%),0 (0.00%),6869 (0.41%),50841.0,112320.0,234000.0,1284699.0,6905200.0,227850.0,0.0,315396.5579,4.5e+04,,,4.364e+04,7.308e+04,,1.35e+05,,4.5e+05,1.179e+05
WEEKDAY_APPR_PROCESS_START,object,7 (0.00%),0 (0.00%),0 (0.00%),0 (0.00%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,WEDNESDAY,FRIDAY,WEDNESDAY,SUNDAY,SATURDAY,FRIDAY,TUESDAY,MONDAY,SATURDAY,FRIDAY
HOUR_APPR_PROCESS_START,int64,24 (0.00%),0 (0.00%),0 (0.00%),109 (0.01%),10.0,12.0,15.0,1670214.0,23.0,12.484,0.0,3.334,12,16,11,7,10,16,21,17,13,17


Exploring sample_submission.csv
('Total records:', 48744)
('Total columns:', 2)


Unnamed: 0_level_0,sub_type,n_distinct,n_miss,n_negative,n_zeros,25%,50%,75%,count,max,mean,min,std,sample_0,sample_1,sample_2,sample_3,sample_4,sample_5,sample_6,sample_7,sample_8,sample_9
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
SK_ID_CURR,int64,48744 (100.00%),0 (0.00%),0 (0.00%),0 (0.00%),188557.75,277549.0,367555.5,48744.0,456250.0,277796.6763,100001.0,103169.5473,262887.0,279679.0,340511.0,256663.0,261967.0,359353.0,364403.0,423087.0,115680.0,370913.0
TARGET,float64,1 (0.00%),0 (0.00%),0 (0.00%),0 (0.00%),0.5,0.5,0.5,48744.0,0.5,0.5,0.5,0.0,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5


CPU times: user 6min 36s, sys: 16 s, total: 6min 52s
Wall time: 1min 39s


In [7]:
%%time
# save reports
# Make sure the output folder exists first; to_csv does not create missing
# directories, so a fresh checkout would otherwise fail here.
if not os.path.isdir("reports"):
    os.makedirs("reports")
for k, v in ls_report.items():
    print("Saving {}".format(k))
    v.to_csv(os.path.join("reports", "report_{}".format(k)))

Saving previous_application.csv
Saving application_test.csv
Saving installments_payments.csv
Saving sample_submission.csv
Saving credit_card_balance.csv
Saving bureau.csv
Saving application_train.csv
Saving POS_CASH_balance.csv
Saving bureau_balance.csv
CPU times: user 14.3 ms, sys: 8.26 ms, total: 22.6 ms
Wall time: 26.1 ms
