In [1]:
import pandas as pd
import numpy as np

In [31]:
df_sum = pd.read_csv('./data/winemag-data-130k-v2.csv')
df_sum.columns

Index(['Unnamed: 0', 'country', 'description', 'designation', 'points',
       'price', 'province', 'region_1', 'region_2', 'taster_name',
       'taster_twitter_handle', 'title', 'variety', 'winery'],
      dtype='object')

In [22]:
variables = ['country', 'points', 'province', 'region_1', 'region_2', 'taster_name',
       'taster_twitter_handle', 'variety']

In [24]:
def is_discrete(col_name, df=df_sum ):
    first = df[col_name].iloc[0]
    if isinstance(first,str):
        return 0 # 0--discrete, 1--continue, -1-- just skip
    elif isinstance(first,np.bool_):
        return 0
    elif isinstance(first,np.int64):
        if (len(df[col_name].unique())>100):
            return 1
        else:
            return 0
    elif isinstance(first,np.float64):
        return 1
    elif isinstance(first,pd._libs.tslibs.timestamps.Timestamp):
        return -1
    
    print("error___",type(first))
    assert False
    
def percent_range(col_name, df_=df_sum, N = 3):
    percent = [(i+1) * (1.0/N) for i in range(N-1)]
    x_percentile = df_[col_name].describe(percentiles=percent)
    
    if N == 3:
        x_percentile = x_percentile.drop(index=['50%'])
    
    ranges = x_percentile[3:].values
    return ranges

ranges = percent_range("price")
from scipy import stats

def indepedence_test(col_name):
    df_tmp = df_sum[['price', col_name]].dropna()
    
    # separate to grid
    df_tmp['range'] = pd.cut(df_tmp['price'], ranges)
    
    if is_discrete(col_name,df_tmp)==1: # considered as continueous
        # get range
        temp_range = percent_range(col_name, df_tmp) # min, max included
        df_tmp['temp_range'] = pd.cut(df_tmp[col_name], temp_range)
        pivot = df_tmp.pivot_table(index=['range'], columns=['temp_range'], aggfunc='size')
    elif is_discrete(col_name,df_tmp)==0: # considered as discrete
        pivot = df_tmp.pivot_table(index=['range'], columns=[col_name], aggfunc='size')
    elif is_discrete(col_name,df_tmp)==-1:
        # just skip
        print(col_name,"   skip")
        return None
    else:
        assert False # not implemented
    
    cols = np.array(pivot.columns)
    tmp = pd.DataFrame(pivot.to_numpy(), columns = cols, index=np.array( pivot.index ))
    # display(tmp)
    
    # independence test of chi-square
    res = stats.chi2_contingency(tmp.to_numpy())
    # display(res)
    print(col_name, "    chi(x^2)=", res[0], "    p-value=", res[1])
    return [res[0], res[1]]


In [29]:
independence = []
for col_name in variables:
    #print(col_name)
    res = indepedence_test(col_name) 
    if res is not None:
        independence.append(res+[col_name])

for i in range(len(independence)):
    tmp = independence[i]
    independence[i] = [tmp[2], tmp[0], tmp[1]]

country     chi(x^2)= 11537.802420755355     p-value= 0.0
points     chi(x^2)= 43032.181504008826     p-value= 0.0
province     chi(x^2)= 26505.342859941386     p-value= 0.0
region_1     chi(x^2)= 54693.097473815695     p-value= 0.0
region_2     chi(x^2)= 12273.673062244845     p-value= 0.0
taster_name     chi(x^2)= 12741.533449971726     p-value= 0.0
taster_twitter_handle     chi(x^2)= 12129.770005126187     p-value= 0.0
variety     chi(x^2)= 32089.11090622175     p-value= 0.0


In [30]:
from tabulate import tabulate
table =[["Feature name", "Chi-square value", "P-value"]] + independence
print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

╒═══════════════════════╤════════════════════╤═══════════╕
│ Feature name          │   Chi-square value │   P-value │
╞═══════════════════════╪════════════════════╪═══════════╡
│ country               │            11537.8 │         0 │
├───────────────────────┼────────────────────┼───────────┤
│ points                │            43032.2 │         0 │
├───────────────────────┼────────────────────┼───────────┤
│ province              │            26505.3 │         0 │
├───────────────────────┼────────────────────┼───────────┤
│ region_1              │            54693.1 │         0 │
├───────────────────────┼────────────────────┼───────────┤
│ region_2              │            12273.7 │         0 │
├───────────────────────┼────────────────────┼───────────┤
│ taster_name           │            12741.5 │         0 │
├───────────────────────┼────────────────────┼───────────┤
│ taster_twitter_handle │            12129.8 │         0 │
├───────────────────────┼────────────────────┼──────────