# EDA

In [1]:
# ====================================================
# Library
# ====================================================
import gc
import warnings
warnings.filterwarnings('ignore')
import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from scipy import stats

# Check stable columns between train and test period

### Characteristic Stability Index (CSI)

The Characteristic Stability Index (CSI) is used to evaluate the stability or drift of each feature so that we can find the problematic one. As PSI is concerned with the effects of the population drift on the model’s predictions, the CSI is concerned with understanding how the feature distributions have changed

- CSI < 0.1 = The characteristic hasn’t changed, and we can use to train the model
- 0.1 ≤ CS1 < 0.2 = The characteristic has slightly changed, and it is advisable to evaluate the impacts of these changes
- CSI ≥ 0.2 = The changes in characteristic are significant, and the model should not be used the characteristic in model.



### Kolmogorov–Smirnov method (K–S test) 
The Kolmogorov–Smirnov method (K–S test) is used to compare the maximum distance between the experimental cumulative distribution function and the theoretical cumulative distribution function. A more general approach is to test for differences in the entire two distributions, e.g. training features vs test features.


In [2]:
def ks_test():
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    ]
    binary_features = ['B_31', 'D_87']

    cat_features = cat_features + binary_features
    statistic = []
    pvalue = []    

    num_features = list(set(train.columns) - set(cat_features))
    # num_features = list(train.dtypes[(train.dtypes == 'float32') | (train.dtypes == 'float64')].index)    
    for i, name in enumerate(num_features):
        statistic_, pvalue_ = stats.ks_2samp(train[name], test[name])
        statistic.append(statistic_)
        pvalue.append(pvalue_) 
    return pd.DataFrame({'name':num_features, 'ks':statistic, 'pvalue':pvalue})

def csi(var):    
    x = train.groupby(var).size().to_frame()
    x.reset_index(inplace = True)        
    y = test.groupby(var).size().to_frame()
    y.reset_index(inplace = True)    
    csi_tbl = x.merge(y, how = 'inner', on = var)
    csi_tbl['perc_train'] = csi_tbl['0_x']/sum(csi_tbl['0_x'])
    csi_tbl['perc_test']= csi_tbl['0_y']/sum(csi_tbl['0_y'])
    csi_tbl['csi_sub']= (csi_tbl['perc_train']-csi_tbl['perc_test']) * np.log(csi_tbl['perc_train']/csi_tbl['perc_test'])    
    return sum(csi_tbl['csi_sub'])

def csi_test():
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    ]
    binary_features = ['B_31', 'D_87']
    cat_features = cat_features + binary_features
    csi_out = []
    for i, name in enumerate(cat_features):        
        csi_out.append(csi(name))                
    return pd.DataFrame({'name':cat_features, 'csi':csi_out})

In [3]:
train = pd.read_parquet('../input/amex-data-integer-dtypes-parquet-format/train.parquet')
test = pd.read_parquet('../input/amex-data-integer-dtypes-parquet-format/test.parquet')

In [4]:
csi_table = csi_test().sort_values('csi')
csi_table

Unnamed: 0,name,csi
11,B_31,4.8e-05
12,D_87,8.6e-05
0,B_30,0.000444
9,D_66,0.000495
2,D_114,0.002055
3,D_116,0.002946
10,D_68,0.003286
4,D_117,0.005318
8,D_64,0.006509
7,D_63,0.006725


==> Category variables are stable between train and test period


In [5]:
ks_table = ks_test().sort_values('ks')
ks_table

Unnamed: 0,name,ks,pvalue
174,R_18,2e-05,1.0
86,R_24,2.5e-05,1.0
75,R_13,8.7e-05,1.0
93,D_88,9.4e-05,1.0
21,R_28,0.000139,0.9999997
130,R_8,0.000216,0.9950105
142,R_4,0.000228,0.9902162
80,R_20,0.000245,0.9784611
107,R_17,0.000257,0.9665147
81,R_23,0.000269,0.9498369


S_9, B_29, D_59, S_11, R1, S_2 should be careful to using in training model.
