In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
from tqdm import tqdm

***
## feature groups
- D_* = Delinquency variables
- S_* = Spend variables
- P_* = Payment variables
- B_* = Balance variables
- R_* = Risk variables

In [2]:
# Load the training frame and the label table from their on-disk locations.
data = pd.read_parquet("../data/ext/amex-data-integer-dtypes-parquet-format/train.parquet")
labels = pd.read_csv("../data/raw/train_labels.csv")

# Every column after the first two is treated as an input feature
# (the displayed frame shows customer_ID and S_2 in those first two positions).
input_cols = data.columns[2:]

# Features handled as categorical; all remaining inputs are considered numerical.
categorical_cols = [
    'B_30', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    'D_63', 'D_64', 'D_66', 'D_68',
]
numerical_cols = [name for name in input_cols if name not in categorical_cols]

display(data)

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938469,0,0.008724,1.006838,0.009228,0.124035,0.0,0.004709,...,-1,-1,-1,0,0,0.0,,0,0.000610,0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936665,0,0.004923,1.000653,0.006151,0.126750,0.0,0.002714,...,-1,-1,-1,0,0,0.0,,0,0.005492,0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.954180,3,0.021655,1.009672,0.006815,0.123977,0.0,0.009423,...,-1,-1,-1,0,0,0.0,,0,0.006986,0
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960384,0,0.013683,1.002700,0.001373,0.117169,0.0,0.005531,...,-1,-1,-1,0,0,0.0,,0,0.006527,0
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947248,0,0.015193,1.000727,0.007605,0.117325,0.0,0.009312,...,-1,-1,-1,0,0,0.0,,0,0.008126,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5531446,fffff1d38b785cef84adeace64f8f83db3a0c31e8d92ea...,2017-11-05,0.979333,14,0.020818,0.828199,0.003487,0.090743,0.0,0.025139,...,-1,-1,-1,0,0,0.0,,0,0.001498,0
5531447,fffff1d38b785cef84adeace64f8f83db3a0c31e8d92ea...,2017-12-23,0.984907,10,0.007209,0.812610,0.005904,0.079886,0.0,0.023691,...,-1,-1,-1,0,0,0.0,,0,0.008225,0
5531448,fffff1d38b785cef84adeace64f8f83db3a0c31e8d92ea...,2018-01-06,0.983019,15,0.013151,0.815422,0.003457,0.100503,0.0,0.012343,...,-1,-1,-1,0,0,0.0,,0,0.006773,0
5531449,fffff1d38b785cef84adeace64f8f83db3a0c31e8d92ea...,2018-02-06,0.969861,15,0.009855,1.003541,0.005117,0.101802,0.0,0.008578,...,-1,-1,-1,0,0,0.0,,0,0.001168,0


In [3]:
# Column-wise mean of every input feature, ordered from largest to smallest.
(
    data[input_cols]
    .mean(axis=0)
    .sort_values(ascending=False)
)

S_8      1021.351741
S_13      258.162014
D_59       21.709431
B_19       14.507921
S_11       14.258346
            ...     
D_135      -0.963922
D_137      -0.964476
D_111      -0.984484
D_108      -0.994421
D_87       -0.998603
Length: 188, dtype: float64

In [4]:
# Column-wise standard deviation of every input feature, largest spread first.
(
    data[input_cols]
    .std(axis=0)
    .sort_values(ascending=False)
)

S_8     959.537364
S_13    306.738952
B_19     28.956450
D_65     17.977417
B_4      17.346935
           ...    
R_28      0.027225
B_36      0.021064
R_23      0.017195
S_19      0.003061
B_27      0.003004
Length: 188, dtype: float64

***
## correlation analysis

In [5]:
# Collapse the frame to one row per customer, then attach the labels.
# NOTE(review): GroupBy.last() returns the last *non-null* entry of each column
# within a group, so a customer's row may combine values from different
# original rows -- confirm this is intended. It also presumably relies on the
# rows of `data` already being in chronological order per customer; verify.
data_feats = (
    data
    .groupby("customer_ID")[numerical_cols]
    .last()
    .reset_index()
    .merge(labels, how="inner", on="customer_ID")
)
data_feats

Unnamed: 0,customer_ID,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.934745,0,0.009382,1.007647,0.006104,0.135021,0.0,0.007174,,...,-1,-1,0,0,0.000000,,0,0.002970,0,0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.880519,6,0.034684,1.004028,0.006911,0.165509,0.0,0.005068,,...,-1,-1,0,0,0.000000,,0,0.003169,0,0
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0.880875,0,0.004284,0.812649,0.006450,,0.0,0.007196,,...,-1,-1,0,0,0.000000,,0,0.000834,0,0
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0.621776,0,0.012564,1.006183,0.007829,0.287766,0.0,0.009937,,...,-1,-1,0,0,0.000000,,0,0.005560,0,0
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0.871900,0,0.007679,0.815746,0.001247,0.176403,0.0,0.005528,,...,-1,-1,0,0,0.000000,,0,0.006944,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458908,ffff41c8a52833b56430603969b9ca48d208e7c192c6a4...,0.844229,15,0.028515,1.009866,0.001928,0.128707,0.0,0.005893,,...,-1,-1,0,0,0.000000,,0,0.003009,0,0
458909,ffff518bb2075e4816ee3fe9f3b152c57fc0e6f01bf7fd...,0.831279,1,0.292360,0.055656,0.006953,0.194697,0.0,0.233078,,...,-1,-1,0,0,0.000000,,0,0.009230,0,0
458910,ffff9984b999fccb2b6127635ed0736dda94e544e67e02...,0.800522,9,0.020563,1.007023,0.000957,0.066648,0.0,0.006314,,...,-1,-1,0,0,0.000000,,0,0.000340,0,0
458911,ffffa5c46bc8de74f5a4554e74e239c8dee6b9baf38814...,0.754129,0,0.015838,0.714486,0.000993,0.408849,0.0,0.050048,,...,-1,-1,1,0,0.949723,0.446255,1,0.002502,2,1


In [6]:
def compute_feat_correlation(array, i):
    """Pearson correlation of column ``i`` against every column ``j <= i``.

    Only the lower triangle (including the diagonal) is produced; the caller
    is expected to mirror the values to obtain the full symmetric matrix.
    Rows in which either of the two columns is NaN are dropped pairwise
    before the coefficient is computed.

    Parameters
    ----------
    array : np.ndarray
        2-D float array; columns are the variables to correlate.
    i : int
        Index of the reference column.

    Returns
    -------
    list of (int, int, float)
        ``(i, j, r)`` tuples for ``j = 0 .. i`` in ascending ``j``. ``r`` is
        NaN when fewer than two rows are jointly non-NaN (previously this
        case raised inside ``stats.pearsonr`` and aborted the whole job).
    """
    # Column i and its NaN mask do not depend on j, so compute them once.
    x_full = array[:, i]
    x_valid = ~np.isnan(x_full)
    out = list()

    # Iterate the lower triangle directly instead of scanning all columns
    # and skipping those with j > i.
    for j in range(i + 1):
        y_full = array[:, j]
        mask = x_valid & ~np.isnan(y_full)

        # pearsonr needs at least two observations; report an undefined
        # correlation rather than raising for sparsely populated pairs.
        if np.count_nonzero(mask) < 2:
            out.append((i, j, np.nan))
            continue

        out.append((i, j, stats.pearsonr(x_full[mask], y_full[mask])[0]))

    return out

def compute_correlation(dataframe, columns):
    """Build a symmetric pairwise Pearson correlation matrix in parallel.

    Each column index is dispatched as a separate joblib task to
    ``compute_feat_correlation``, which returns the lower-triangle entries
    for that column; the results are then mirrored into a full matrix.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame containing at least the requested ``columns``.
    columns : list of str
        Names of the columns to correlate; also used as the index and
        column labels of the result.

    Returns
    -------
    pd.DataFrame
        ``len(columns) x len(columns)`` correlation matrix.
    """
    n = len(columns)
    # Bug fix: use the `dataframe` argument. The original read the global
    # `data_feats`, silently ignoring whatever frame the caller passed in.
    array = dataframe[columns].values
    corr_array = np.empty((n, n))

    # n_jobs=-1 asks joblib to use all available cores. tqdm wraps the task
    # generator, so the bar advances as tasks are consumed by the pool.
    with Parallel(n_jobs=-1) as parallel:
        delayed_func = delayed(compute_feat_correlation)
        results = parallel(
            delayed_func(array, i)
            for i in tqdm(range(n))
        )

    # Each task yields (i, j, corr) for j <= i; write both (i, j) and (j, i)
    # so every cell of the preallocated matrix ends up populated.
    for r in results:
        for i, j, corr in r:
            corr_array[i, j] = corr
            corr_array[j, i] = corr

    corr_df = pd.DataFrame(corr_array, columns=columns, index=columns)
    return corr_df

In [7]:
# Pairwise Pearson correlations among the numerical features and the label column.
corr_df = compute_correlation(
    data_feats,
    columns=[*numerical_cols, "target"],
)
corr_df

100%|██████████████████████████████████████████████████████| 178/178 [00:58<00:00,  3.06it/s]


Unnamed: 0,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,D_43,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
P_2,1.000000,-0.329898,-0.399579,0.598197,-0.554895,-0.345474,-0.419560,-0.495857,-0.437932,-0.312842,...,-0.163460,-0.153595,-0.179396,-0.149200,-0.179575,0.103587,-0.179423,0.008972,-0.163353,-0.666979
D_39,-0.329898,1.000000,0.244918,-0.290357,0.387437,0.113234,0.645378,0.228649,0.138113,0.085187,...,0.021940,0.023319,0.044113,0.040186,0.041261,-0.005302,0.044068,0.003378,0.033710,0.339495
B_1,-0.399579,0.244918,1.000000,-0.627208,0.280121,0.171363,0.273028,0.735630,-0.027966,0.088642,...,0.060322,0.056622,0.104008,0.061279,0.105603,0.001925,0.103997,0.034230,0.082072,0.449427
B_2,0.598197,-0.290357,-0.627208,1.000000,-0.369797,-0.259755,-0.334210,-0.720642,-0.132882,-0.170853,...,-0.102853,-0.095190,-0.160060,-0.106513,-0.157754,0.022543,-0.160091,-0.035525,-0.130273,-0.557697
R_1,-0.554895,0.387437,0.280121,-0.369797,1.000000,0.231973,0.425737,0.320428,0.194729,0.215257,...,0.090682,0.090597,0.090044,0.082560,0.086529,-0.006612,0.090055,0.001547,0.079535,0.473530
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D_142,0.103587,-0.005302,0.001925,0.022543,-0.006612,-0.075517,-0.010093,0.004682,-0.055475,-0.056672,...,-0.029384,-0.025727,-0.051845,-0.039371,0.111732,1.000000,-0.052029,0.380718,0.124662,-0.056792
D_143,-0.179423,0.044068,0.103997,-0.160091,0.090055,0.045894,0.064628,0.146416,-0.001540,0.073936,...,0.058877,0.045753,0.999679,0.332814,0.994432,-0.052029,1.000000,0.548649,0.640152,0.126769
D_144,0.008972,0.003378,0.034230,-0.035525,0.001547,-0.018357,0.007409,0.040517,-0.052557,-0.007470,...,-0.000661,-0.003611,0.549414,-0.040433,0.586529,0.380718,0.548649,1.000000,0.209424,-0.004368
D_145,-0.163353,0.033710,0.082072,-0.130273,0.079535,0.044715,0.055847,0.117478,0.023305,0.067403,...,0.058289,0.045170,0.640117,0.222767,0.631540,0.124662,0.640152,0.209424,1.000000,0.116926


In [11]:
# The 20 columns with the largest absolute correlation with the target
# (the target itself appears first with a value of 1).
(
    corr_df["target"]
    .abs()
    .sort_values(ascending=False)
    .head(20)
)

target    1.000000
P_2       0.666979
D_48      0.612167
B_2       0.557697
B_18      0.546708
B_9       0.540561
D_61      0.537193
D_55      0.526979
B_33      0.520457
B_3       0.506833
D_44      0.506809
B_7       0.501943
D_75      0.499836
D_58      0.498944
B_23      0.495195
R_1       0.473530
B_16      0.468953
B_4       0.462154
D_74      0.459716
B_20      0.459683
Name: target, dtype: float64

In [12]:
# The 20 columns with the smallest absolute correlation with the target.
(
    corr_df["target"]
    .abs()
    .sort_values(ascending=True)
    .head(20)
)

D_134    0.001769
S_12     0.002455
S_18     0.003860
D_144    0.004368
S_19     0.008411
R_23     0.008443
B_15     0.013630
R_18     0.013649
D_106    0.015462
D_69     0.015657
B_27     0.017619
R_28     0.019002
D_109    0.019424
D_102    0.025011
D_108    0.025189
B_10     0.025842
B_13     0.028724
B_12     0.030271
D_105    0.031873
D_93     0.034180
Name: target, dtype: float64

***