In [None]:
# Set JOBLIB_TEMP_FOLDER so joblib places its temporary files under /tmp
# (joblib consults this variable when it memory-maps large arrays for workers).
%env JOBLIB_TEMP_FOLDER=/tmp

In [None]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
from tqdm import tqdm

***
## feature groups
- D_* = Delinquency variables
- S_* = Spend variables
- P_* = Payment variables
- B_* = Balance variables
- R_* = Risk variables

In [None]:
# Load the processed training set (dsv05) from parquet into a DataFrame.
data = pd.read_parquet("../data/processed/dsv05/train.parquet")

In [None]:
# Summarise the loaded frame (columns, dtypes, non-null counts, memory usage).
data.info()

***
## correlation analysis

In [None]:
def compute_feat_correlation(array, i):
    """Pearson correlation of column ``i`` against every column ``j <= i``.

    Rows in which either of the two columns is NaN are dropped pairwise
    before the coefficient is computed.

    Parameters
    ----------
    array : numpy.ndarray
        2-D float array; columns are the features being compared.
    i : int
        Index of the reference column.

    Returns
    -------
    list of tuple
        One ``(i, j, corr)`` triple per ``j`` in ``0..min(i, n_cols - 1)``.
        ``corr`` is NaN when fewer than two rows are non-NaN in both
        columns, because ``scipy.stats.pearsonr`` raises ``ValueError`` on
        inputs shorter than two elements.
    """
    n = array.shape[1]
    last = min(i, n - 1)
    if last < 0:
        # No columns at all, or a negative ``i``: there is no j <= i to pair with.
        return []

    # The reference column and its NaN mask do not depend on j.
    x_full = array[:, i]
    x_valid = ~np.isnan(x_full)

    out = []
    for j in range(last + 1):
        y_full = array[:, j]
        mask = x_valid & ~np.isnan(y_full)

        if np.count_nonzero(mask) < 2:
            # Too few overlapping observations: correlation is undefined.
            out.append((i, j, np.nan))
            continue

        out.append((i, j, stats.pearsonr(x_full[mask], y_full[mask])[0]))

    return out

def compute_correlation(dataframe, columns):
    """Build a symmetric correlation matrix over ``columns`` of ``dataframe``.

    One ``compute_feat_correlation(values, idx)`` task is dispatched per
    column index through a joblib pool using all available cores
    (``n_jobs=-1``). Each task's ``(i, j, corr)`` triples are then written
    to both ``[i, j]`` and ``[j, i]`` of the output matrix.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source of the feature columns.
    columns : sequence of column labels
        Columns to correlate; also used as the index and columns of the result.

    Returns
    -------
    pandas.DataFrame
        ``len(columns) x len(columns)`` frame of correlation values.
    """
    n_feats = len(columns)
    values = dataframe[columns].values
    matrix = np.empty((n_feats, n_feats))

    worker = delayed(compute_feat_correlation)
    with Parallel(n_jobs=-1) as pool:
        per_feature = pool(worker(values, idx) for idx in tqdm(range(n_feats)))

    # Mirror every returned triple across the diagonal.
    for triples in per_feature:
        for row, col, value in triples:
            matrix[row, col] = value
            matrix[col, row] = value

    return pd.DataFrame(matrix, index=columns, columns=columns)

In [None]:
# Correlate every column of `data` against every other column.
# NOTE(review): all columns are passed through and the helper applies
# np.isnan to them, so this presumably assumes every column is numeric — confirm.
corr_df = compute_correlation(data, columns=data.columns)
corr_df

In [None]:
# Write the correlation matrix to parquet.
# NOTE(review): assumes ../data/feat-selection/ already exists — confirm.
corr_df.to_parquet("../data/feat-selection/corr.parquet")

***