# Chi-Squared Test for Feature Selection

A higher Chi-Square value means the feature is more dependent on the label. 

Steps: 
- Define a hypothesis
- Build a contingency table
- Find the expected values
- Calculate the Chi-Square statistics
- Reject or fail to reject the null hypothesis

Assumptions:
- The observations are independent
- No expected cell count equals 0
- No more than 20% of the cells have an expected cell count of less than 5

Current goal: figure out how to build a contingency table for every feature and compute the chi-square statistic for ALL FEATURES vs. the LABEL (or between ALL FEATURES in general).

In [103]:
import pandas as pd 
from sklearn.datasets import load_iris 
from sklearn.feature_selection import SelectKBest 
from sklearn.feature_selection import chi2 
import scipy.stats as stats
import numpy as np
from itertools import combinations
from sklearn.model_selection import train_test_split

In [105]:
# Data-loading cell (implemented by Fabi)
# Load the dataset: np.loadtxt parses the text file into a NumPy array
data = np.loadtxt("uniform_large_d_1.tex")

# Creating NumPy array (np.array makes a separate copy of the loaded data)
array = np.array(data)

# Converting to Pandas DataFrame (rows/columns get default integer labels)
df_table = pd.DataFrame(array)

# Displaying the table (uncomment to print the whole DataFrame)
#print(df_table)

In [107]:
# Convert the columns at positions 0-25 (26 columns) to 'categorical'.
# Each column is rounded, cast to int and then cast to category.
#
# The columns are assigned by label (df_table[name] = ...) rather than through
# .iloc[:, i] = ...: depending on the pandas version, an .iloc assignment may
# write the new values into the existing float column and keep its float
# dtype, silently dropping the int/category conversion.
for i in range(26):
    df_table[df_table.columns[i]] = (
        df_table.iloc[:, i].round().astype(int).astype("category")
    )

# The column at position 150 is made categorical as-is (no rounding).
df_table[df_table.columns[150]] = df_table.iloc[:, 150].astype("category")

df_table.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,141,142,143,144,145,146,147,148,149,150
0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.646452,2.997843,2.54626,2.423437,2.825879,2.566278,2.403595,2.396183,2.684211,1.0
1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.024768,2.424598,2.349128,2.325874,2.524994,2.862275,2.060383,2.505475,2.334364,1.0
2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.625961,2.962051,2.420763,2.411974,2.361735,2.667143,2.073825,2.388143,2.831569,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.864361,0.083862,0.616211,0.898456,0.117597,0.664931,0.813385,0.573604,0.117329,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.455617,0.51657,0.956458,0.97284,0.550108,0.503513,0.337278,0.735706,0.284006,0.0


In [109]:
# Train/test split: 20% of the rows are held out, with a fixed random seed.
# Features are the columns at positions 1-149; the target is the last column.
X_train, X_test, y_train, y_test = train_test_split(
    df_table.iloc[:, 1:150],
    df_table.iloc[:, -1],
    test_size=0.2,
    random_state=52,
)

In [111]:
# Keep only the numeric-dtype columns of X_train
numerical_df = X_train.select_dtypes(include="number")
numerical_df


Unnamed: 0,26,27,28,29,30,31,32,33,34,35,...,140,141,142,143,144,145,146,147,148,149
159,2.737286,2.564694,2.078000,2.701250,2.172147,2.306469,2.833275,2.121672,2.214934,2.381538,...,2.653517,2.825271,2.758778,2.136084,2.767652,2.302499,2.246025,2.683174,2.566359,2.813462
198,0.449442,0.769937,0.043365,0.899528,0.743107,0.055306,0.189964,0.881079,0.855876,0.177939,...,0.238594,0.022022,0.508925,0.028390,0.660544,0.118136,0.086415,0.946498,0.558883,0.899343
259,2.730227,2.558789,2.801426,2.055368,2.278217,2.426925,2.431457,2.903312,2.106255,2.858540,...,2.887773,2.945521,2.368866,2.799701,2.487134,2.698392,2.588976,2.320205,2.480301,2.136642
301,2.233624,2.514446,2.930156,2.916944,2.435773,2.614566,2.833691,2.577013,2.563006,2.058731,...,2.395805,2.372597,2.512598,2.065079,2.459612,2.946646,2.619441,2.235146,2.659467,2.811038
220,0.687277,0.112971,0.248352,0.483834,0.189635,0.360817,0.808924,0.975149,0.609001,0.142674,...,0.319568,0.561645,0.396156,0.907804,0.942291,0.830752,0.881505,0.321756,0.910684,0.283537
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,2.465128,2.600074,2.676940,2.477864,2.639337,2.902997,2.084804,2.669015,2.656391,2.421545,...,2.604090,2.501031,2.667957,2.456940,2.130424,2.018010,2.816774,2.800348,2.036940,2.408504
151,0.688109,0.625346,0.802530,0.602819,0.343456,0.936555,0.263959,0.375823,0.553923,0.799001,...,0.228123,0.544925,0.596277,0.942976,0.213394,0.542929,0.105415,0.244743,0.052109,0.638471
13,0.839154,0.999741,0.972734,0.602158,0.763564,0.515336,0.740383,0.434245,0.317270,0.297843,...,0.710380,0.815848,0.727969,0.898495,0.241817,0.637411,0.740484,0.615302,0.018263,0.876028
267,0.927585,0.265717,0.361995,0.526540,0.688182,0.129511,0.233144,0.966997,0.239566,0.344881,...,0.850673,0.191450,0.968260,0.085316,0.577563,0.512694,0.746791,0.422051,0.036703,0.870898


In [113]:
# Keep only the object- or category-dtype columns of X_train
categorical_df = X_train.select_dtypes(include=("object", "category"))
categorical_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,16,17,18,19,20,21,22,23,24,25
159,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
259,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
301,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
220,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
267,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [115]:
####################### Chi-Square (F vs F) Code ################################################
# Finds dependency between all features in X_train
def chi_squared_fvf(X_train):
    """Run a chi-squared test of independence on every pair of columns.

    For each unordered pair of columns a contingency table is built with
    ``pd.crosstab`` and passed to ``scipy.stats.chi2_contingency`` (called
    with its default arguments). The statistic and p-value are stored in
    symmetric matrices, printed, and returned.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Columns to compare; each column is treated as a categorical variable.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(chi_squared_df, p_values_df)``, both indexed and labelled by the
        column names of ``X_train``. The diagonal is never computed and is
        left at 0 in both frames (a 0 p-value there is a placeholder, not a
        test result).

    Raises
    ------
    ValueError
        Propagated from ``scipy.stats.chi2_contingency``, e.g. when a
        contingency table has an expected frequency of zero.
    """
    # Extract variable names
    variable_names = list(X_train.columns)

    # Initialize matrices to store chi-squared and p-values
    num_variables = len(variable_names)
    chi_squared = np.zeros((num_variables, num_variables))
    p_values = np.zeros((num_variables, num_variables))

    # Compute chi-squared and p-values for each pair of variables
    print("Chi-Squared Statistics for Features v. Features")
    for i, j in combinations(range(num_variables), 2):
        contingency_table = pd.crosstab(X_train.iloc[:, i], X_train.iloc[:, j])

        # Run the test once and unpack both values; the remaining outputs
        # (degrees of freedom, expected frequencies) are not used here.
        statistic, p_value, *_ = stats.chi2_contingency(contingency_table)

        # The test is symmetric in its two variables, so mirror the result
        chi_squared[i, j] = chi_squared[j, i] = statistic
        p_values[i, j] = p_values[j, i] = p_value

    # Create a DataFrame with variable names as index and columns
    chi_squared_df = pd.DataFrame(chi_squared, index=variable_names, columns=variable_names)
    p_values_df = pd.DataFrame(p_values, index=variable_names, columns=variable_names)

    # Printing the matrix-like output with variable names
    print("Chi-Squared Values:")
    print(chi_squared_df)
    print("\nP-Values:")
    print(p_values_df)

    return chi_squared_df, p_values_df
    
# Run the feature-vs-feature chi-squared test on the categorical feature columns
chi_squared_fvf(categorical_df)

Chi-Squared Statistics for Features v. Features
Chi-Squared Values:
            1           2           3           4           5           6   \
1     0.000000  396.009602  396.009602  396.009602  396.009602  396.009602   
2   396.009602    0.000000  396.009602  396.009602  396.009602  396.009602   
3   396.009602  396.009602    0.000000  396.009602  396.009602  396.009602   
4   396.009602  396.009602  396.009602    0.000000  396.009602  396.009602   
5   396.009602  396.009602  396.009602  396.009602    0.000000  396.009602   
6   396.009602  396.009602  396.009602  396.009602  396.009602    0.000000   
7   396.009602  396.009602  396.009602  396.009602  396.009602  396.009602   
8   396.009602  396.009602  396.009602  396.009602  396.009602  396.009602   
9   396.009602  396.009602  396.009602  396.009602  396.009602  396.009602   
10  396.009602  396.009602  396.009602  396.009602  396.009602  396.009602   
11  396.009602  396.009602  396.009602  396.009602  396.009602  396.009602

In [116]:
##################### Chi-Square (F vs label column) Code ####################################
# Finds dependency between all features in X_train & the label in y_train
def chi_squared_fvl(X_train, y_train):
    """Run a chi-squared test of independence of every feature vs. the label.

    ``y_train`` is attached as a ``'label'`` column to a *copy* of
    ``X_train`` (the caller's DataFrame is left unmodified). Every column
    except the last is then cross-tabulated against the last column and
    passed to ``scipy.stats.chi2_contingency`` (called with its default
    arguments). The results table is printed and returned.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature columns; each is treated as a categorical variable.
    y_train : pandas.Series or array-like
        Label values, assigned as a new column of the copied frame.

    Returns
    -------
    pandas.DataFrame
        One row per feature with the columns ``"Feature"``,
        ``"Chi Squared Statistic"`` and ``"P-Value"``.

    Raises
    ------
    ValueError
        Propagated from ``scipy.stats.chi2_contingency``, e.g. when a
        contingency table has an expected frequency of zero.
    """
    # Combining X_train and y_train on a copy, so the caller's DataFrame
    # does not silently gain a 'label' column
    df = X_train.copy()
    df['label'] = y_train

    # The last column is the label; every column before it is a feature
    label_column = df.iloc[:, -1]

    # Collects one result row per feature
    results = []

    for feature_name, feature_column in df.iloc[:, :-1].items():

        # Create contingency table of this feature v. label
        crosstab = pd.crosstab(feature_column, label_column)

        # Run the test once and unpack both values; the remaining outputs
        # (degrees of freedom, expected frequencies) are not used here.
        statistic, p_value, *_ = stats.chi2_contingency(crosstab)

        # Append results to the list
        results.append({
            "Feature": feature_name,
            "Chi Squared Statistic": statistic,
            "P-Value": p_value})

    # Create a DataFrame from the results
    results_df = pd.DataFrame(results)

    # Print the DataFrame
    print("Label:", df.columns.values[-1])
    print(results_df.to_string(index=False))

    return results_df
    
# Run the feature-vs-label chi-squared test on the categorical feature columns
chi_squared_fvl(categorical_df, y_train)

Label: label
 Feature  Chi Squared Statistic      P-Value
       1             396.009602 4.070098e-88
       2             396.009602 4.070098e-88
       3             396.009602 4.070098e-88
       4             396.009602 4.070098e-88
       5             396.009602 4.070098e-88
       6             396.009602 4.070098e-88
       7             396.009602 4.070098e-88
       8             396.009602 4.070098e-88
       9             396.009602 4.070098e-88
      10             396.009602 4.070098e-88
      11             396.009602 4.070098e-88
      12             396.009602 4.070098e-88
      13             396.009602 4.070098e-88
      14             396.009602 4.070098e-88
      15             396.009602 4.070098e-88
      16             396.009602 4.070098e-88
      17             396.009602 4.070098e-88
      18             396.009602 4.070098e-88
      19             396.009602 4.070098e-88
      20             396.009602 4.070098e-88
      21             396.009602 4.070098e-