In [1]:
import os
import numpy as np
import pandas as pd
import ppscore as pps
import time

In [2]:
# Build the path of every entry under ../data/raw and its size in MiB
# (bytes divided by 1024*1024), then show them as a {path: size} mapping.
list_files = [os.path.join("../data/raw", entry) for entry in os.listdir("../data/raw")]
size_files = [os.path.getsize(path) / (1024 * 1024) for path in list_files]
print({path: size for path, size in zip(list_files, size_files)})

{'../data/raw/train_labels.csv': 29.322805404663086, '../data/raw/test_data.csv': 32257.8899641037, '../data/raw/sample_submission.csv': 59.07977104187012, '../data/raw/train_data.csv': 15633.853614807129}


In [3]:
# Load the labels file into memory; later cells read its 'customer_ID' and
# 'target' columns.
train_label_df = pd.read_csv("../data/raw/train_labels.csv")

In [4]:
%%time
# Give every row of the labels frame a running integer code (0, 1, 2, ...)
# to use in place of the customer_ID value in later cells.
train_label_df['encoded_customerID'] = range(len(train_label_df))

CPU times: user 0 ns, sys: 2.47 ms, total: 2.47 ms
Wall time: 1.76 ms


In [5]:
%%time
# Lookup table {customer_ID: encoded_customerID}, built straight from the
# labels frame by indexing the code column on customer_ID.
custid_mapper = train_label_df.set_index('customer_ID')['encoded_customerID'].to_dict()

CPU times: user 273 ms, sys: 21.6 ms, total: 295 ms
Wall time: 294 ms


In [6]:
# Class balance of the label: normalize=True returns fractions instead of
# counts, and dropna=False would list missing labels as their own row.
train_label_df['target'].value_counts(dropna=False, normalize=True)

0    0.741066
1    0.258934
Name: target, dtype: float64

The dataset contains aggregated profile features for each customer at each statement date. Features are anonymized and normalized, and fall into the following general categories:

D_* = Delinquency variables
S_* = Spend variables
P_* = Payment variables
B_* = Balance variables
R_* = Risk variables
with the following features being categorical:

['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
Your task is to predict, for each customer_ID, the probability of a future payment default (target = 1).

Note that the negative class has been subsampled for this dataset at 5%, and thus receives a 20x weighting in the scoring metric.

## Data Segregation

In [7]:
# Read only the first 10 rows of the training data: enough to obtain the
# column names without loading the whole file.
df_sample10 = pd.read_csv("../data/raw/train_data.csv", nrows=10)
# NOTE(review): this reads the same labels file that train_label_df was loaded
# from, and target_col is not referenced in the cells visible here — confirm
# whether it is still needed.
target_col = pd.read_csv("../data/raw/train_labels.csv")

In [8]:
# Column names of the training data, taken from the 10-row sample.
all_columns = list(df_sample10.columns)
index_col = 'customer_ID'

# Split the columns into the five feature families by the first letter of the
# column name: D(elinquency), S(pend), P(ayment), B(alance), R(isk).
delinquency_cols, spend_cols, payment_cols, balance_cols, risk_cols = (
    [name for name in all_columns if name.startswith(prefix)]
    for prefix in ('D', 'S', 'P', 'B', 'R')
)

In [9]:
%%time
# Column Profiling: data type, null percentage, describe (min, max, [5,25,50,75,95], mean, std, variance)

def col_profiling(dataframe, col_name):
    """Summarise a single column of ``dataframe`` as a flat dictionary.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that contains ``col_name``.
    col_name : str
        Name of the column to profile. The column has to support ``min``,
        ``max``, ``mean``, ``std``, ``var`` and ``quantile``.

    Returns
    -------
    dict
        One entry per statistic, in this order: ``column_name``, ``datatype``,
        ``null_pct`` (percentage of missing values relative to the number of
        rows), ``min_val``, ``max_val``, ``mean_val``, ``std``, ``variance``,
        ``5_pct``, ``25_pct``, ``50_pct``, ``75_pct``, ``95_pct`` and
        ``num_uniq`` (number of distinct non-null values).
    """
    # Look the column up once instead of once per statistic.
    series = dataframe[col_name]
    res_dict = {
        'column_name': col_name,
        'datatype': series.dtype,
        'null_pct': series.isna().sum() * 100 / len(dataframe),
        'min_val': series.min(),
        'max_val': series.max(),
        'mean_val': series.mean(),
        'std': series.std(),
        # Was series.std(), which made this entry a duplicate of 'std'.
        'variance': series.var(),
        '5_pct': series.quantile(q=0.05),
        '25_pct': series.quantile(q=0.25),
        '50_pct': series.quantile(q=0.5),
        '75_pct': series.quantile(q=0.75),
        '95_pct': series.quantile(q=0.95),
        'num_uniq': series.nunique()}
    return res_dict

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.77 µs


In [10]:
%%time
def get_pps(dataframe, x_col, y_col='target'):
    """Return the predictive-power-score summary of ``x_col`` against ``y_col``.

    Calls ``pps.score(dataframe, x_col, y_col)`` and keeps only the
    ``ppscore``, ``baseline_score`` and ``model_score`` entries of its result,
    then adds ``column_name`` (set to ``x_col``) so the row can be joined to
    other per-column tables.
    """
    wanted = ('ppscore', 'baseline_score', 'model_score')
    full_result = pps.score(dataframe, x_col, y_col)
    trimmed = dict(
        (name, value) for name, value in full_result.items() if name in wanted
    )
    trimmed['column_name'] = x_col
    return trimmed

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 3.58 µs


In [16]:
%%time
def get_col_profile(dataframe):
    """Build one table with profile statistics and PPS for ``dataframe``.

    For every column, in column order:

    * float64 / int64 columns are summarised with ``col_profiling``;
    * every column other than 'target' and 'encoded_customerID' is scored
      with ``get_pps``.

    The two result sets are joined on 'column_name' with a left join from the
    profile table, so the returned frame has one row per profiled column and
    the PPS fields are missing where no score was computed for it.
    """
    numeric_dtypes = (np.float64, np.int64)
    pps_excluded = ('target', 'encoded_customerID')

    profile_rows = []
    pps_rows = []
    for column in dataframe.columns:
        if dataframe[column].dtype in numeric_dtypes:
            profile_rows.append(col_profiling(dataframe, column))
        if column in pps_excluded:
            continue
        pps_rows.append(get_pps(dataframe, column))

    return pd.merge(
        pd.DataFrame(profile_rows),
        pd.DataFrame(pps_rows),
        on='column_name',
        how='left',
    )

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 6.68 µs


In [None]:
%%time
# Feature families to profile, keyed by a readable group name. The training
# file is processed one family at a time so that only that family's columns
# are loaded in each pass.
data_sep = {"spend": spend_cols, "payments": payment_cols, "balance": balance_cols, "risk": risk_cols, "delinquency": delinquency_cols}
col_profile_ls = []
start_time = time.time()
for key, val in data_sep.items():
    # Per-group clock: the read figure below must cover this group only.
    # Measuring it from start_time made every group after the first report
    # the cumulative time of all earlier groups as well.
    group_start = time.time()
    df = pd.read_csv("../data/raw/train_data.csv", usecols=[index_col]+val)
    # Replace customer_ID by its integer code from custid_mapper, then drop it.
    df['encoded_customerID'] = df[index_col].map(custid_mapper)
    df = df.drop(index_col, axis=1)
    # Attach the label to every row via the integer customer code.
    df2 = pd.merge(df, train_label_df[['encoded_customerID', 'target']], on='encoded_customerID', how='left')
    read_time = time.time()
    print(f"{key} reading completed in {read_time - group_start}")
    profile_df = get_col_profile(df2)
    col_profile_ls.append(profile_df)
    prof_time = time.time()
    print(f"{key} profiling time is {prof_time - read_time}")
    # Running total since the loop started.
    print("total time taken ", prof_time - start_time)
# Nothing is written to disk in this cell; the per-group profiles are only
# collected in col_profile_ls, so the message says exactly that.
print("column profiles collected for all groups!")

In [13]:
# Stack the per-group profile tables into one frame. Every element of
# col_profile_ls must be a DataFrame (pd.concat rejects tuples).
# ignore_index=True replaces the per-group 0..k indexes, which would otherwise
# repeat across groups, with a single unique 0..N-1 index.
complete_profile_df = pd.concat(col_profile_ls, ignore_index=True)
# Make sure the output folder exists so the write does not fail on a fresh
# checkout.
os.makedirs("../data/metadata", exist_ok=True)
complete_profile_df.to_csv("../data/metadata/column_profile.csv")

TypeError: cannot concatenate object of type '<class 'tuple'>'; only Series and DataFrame objs are valid

In [15]:
# Inspect the first element collected by the profiling loop (the "spend"
# group, the first key of data_sep) to see what was actually appended.
col_profile_ls[0]

(           column_name datatype   null_pct       min_val        max_val  \
 0                  S_3  float64  18.449843 -6.271320e-01       5.482888   
 1                  S_5  float64   0.000000  8.168135e-09     206.875280   
 2                  S_6  float64   0.000000  2.541465e-09       1.010000   
 3                  S_7  float64  18.449843 -4.701318e-01       3.948271   
 4                  S_8  float64   0.000000  6.187214e-09       1.231413   
 5                  S_9  float64  53.035686  2.823498e-07       2.839635   
 6                 S_11  float64   0.000000 -1.999987e-01       3.801432   
 7                 S_12  float64   0.000000 -4.041090e-01     175.331420   
 8                 S_13  float64   0.000000  3.655658e-09       1.010000   
 9                 S_15  float64   0.000000 -2.999998e-01       5.308959   
 10                S_16  float64   0.000000  5.083305e-09     231.758947   
 11                S_17  float64   0.000000  1.019806e-09       4.049070   
 12         