### Import libraries

In [1]:
import scorecardpy as sc
import pandas as pd
import numpy as np
import warnings
import json

from scipy.stats import chi2_contingency

# Render floats in DataFrame output with six fixed decimals (no scientific notation)
pd.set_option('display.float_format', '{:.6f}'.format)
# Suppress all warnings raised from here on
warnings.filterwarnings('ignore')

### Read dataframe

In [2]:
# Names of the numerical features, read from JSON
# (presumably a list of column labels -- it is passed to DataFrame.drop later on)
input_file = "data/numerical_features.json"
with open(input_file, 'r') as features_file:
    continuous_features = json.load(features_file)

# Training set
df = pd.read_csv('data/train_set.csv')

In [3]:
# Drop the continuous features so that only the remaining (categorical) columns
# and the target flag are kept; .copy() detaches the result from `df`
df_categorical = df.drop(continuous_features, axis=1).copy()

### Calculate information values for each variable in order to see the predictive power

In [4]:
# WOE-bin every column of df_categorical against the target flag.
# `bins` maps each variable name to its binning DataFrame.
bins = sc.woebin(df_categorical, y='payment_delay')

# Tag each binning table with its variable name. `assign` returns a new frame,
# so the DataFrames held in `bins` are not mutated (the original loop added a
# 'variables' column to them in place).
df_list = [binning.assign(variables=name) for name, binning in bins.items()]

df_iv = pd.concat(df_list, ignore_index=True)
df_iv = df_iv[['variables', 'total_iv']]
# Presumably each binning table has one row per bin with the same total_iv,
# so dropping duplicates leaves one row per variable -- verify against woebin output.
df_iv = df_iv.drop_duplicates().sort_values('total_iv', ascending=False).reset_index(drop=True)
df_iv.to_csv('data/iv.csv', index=False)

[INFO] creating woe binning ...


- IV < 0.02: No predictive power
- 0.02 ≤ IV < 0.1: Weak predictive power
- 0.1 ≤ IV < 0.3: Medium predictive power
- IV ≥ 0.3: Strong predictive power

In [5]:
# Show the information-value table (one row per variable, highest total_iv first)
df_iv

Unnamed: 0,variables,total_iv
0,international_plan,0.379098
1,voice_mail_plan,0.116535
2,state_freq_encoded,0.027384
3,area_code_area_code_510,0.005778
4,area_code_area_code_408,0.002885
5,area_code_area_code_415,0.000452


### Chi-Square Test

In [6]:
# Target flag, and every other column of df_categorical as a candidate feature
target_var = 'payment_delay'
categorical_vars = [col for col in df_categorical.columns if col != target_var]

# Accumulates one [variable, chi2, Cramér's V, p-value] row per feature
results = []

In [7]:
# (Re)initialise the accumulator in this cell so that re-running it does not
# append a second copy of every row to results_df / chi_square.csv.
results = []

for var in categorical_vars:
    # Observed counts: feature levels (rows) x target classes (columns)
    contingency_table = pd.crosstab(df_categorical[var], df_categorical[target_var])

    # Chi-square test of independence.
    # NOTE: scipy applies Yates' continuity correction by default when the table
    # has one degree of freedom (2x2), so chi2 -- and the Cramér's V derived from
    # it -- is the corrected statistic for binary features.
    chi2, p, _, _ = chi2_contingency(contingency_table)

    # Cramér's V = sqrt((chi2 / n) / min(k - 1, r - 1))
    n = contingency_table.sum().sum()
    r, k = contingency_table.shape
    min_dim = min(k - 1, r - 1)
    # A feature (or target) with a single level gives min_dim == 0; the effect
    # size is undefined there, so report NaN instead of dividing by zero.
    cramers_v = np.sqrt((chi2 / n) / min_dim) if min_dim > 0 else np.nan

    results.append([var, chi2, cramers_v, p])

# One row per feature, most significant (smallest p-value) first
results_df = pd.DataFrame(results, columns=['variables', 'chi-Square', 'cramers_V', 'p-value'])
results_df = results_df.sort_values('p-value').reset_index(drop=True)
results_df.to_csv('data/chi_square.csv', index=False)

In [8]:
# Show the chi-square results (one row per variable, smallest p-value first)
results_df

Unnamed: 0,variables,chi-Square,cramers_V,p-value
0,international_plan,195.164743,0.255059,0.0
1,voice_mail_plan,34.384006,0.107058,0.0
2,state_freq_encoded,40.481928,0.116164,0.046153
3,area_code_area_code_510,1.945206,0.025464,0.163104
4,area_code_area_code_408,0.885552,0.017181,0.346686
5,area_code_area_code_415,0.121093,0.006353,0.727853
