# A Gentle Introduction to the Chi-Squared Test for Machine Learning

Source: <https://machinelearningmastery.com/chi-squared-test-for-machine-learning/>

## Objective
- Feature selection: Determine whether input features are relevant to the outcome to be predicted

In [16]:
import pandas as pd
from scipy.stats import chi2_contingency
from scipy.stats import chi2

## Contingency table

In [23]:
# Illustrative contingency table: rows are genders, columns are interests.
# It is only displayed here; its counts differ from `table` below.
labels_row = ['Male', 'Female']
labels_col = ['Science', 'Math', 'Art']
dat = [
    (20, 30, 15),
    (20, 15, 30),
]
dat_df = pd.DataFrame(data=dat, index=labels_row, columns=labels_col)
print(dat_df)

# Observed counts (2 rows x 3 columns) that are passed to the chi-squared test.
table = [
    [10, 20, 30],
    [6, 9, 17],
]
print(table)

        Science  Math  Art
Male         20    30   15
Female       20    15   30
[[10, 20, 30], [6, 9, 17]]


## Pearson's Chi-squared test

In [24]:
# Run the chi-squared test of independence on the observed counts.
# Unpacks, in order: the test statistic, the p-value, the degrees of
# freedom, and the expected frequencies.
stat, p, dof, expected = chi2_contingency(table)

# Show each result on its own line.
print(stat, p, dof, expected, sep='\n')

0.271574651504
0.87302828338
2
[[ 10.43478261  18.91304348  30.65217391]
 [  5.56521739  10.08695652  16.34782609]]


In [25]:
# Interpret the test statistic: compare it with the chi-squared critical
# value, i.e. the quantile at probability `prob` for `dof` degrees of freedom.
prob = 0.95
critical = chi2.ppf(prob, dof)

# A statistic at or beyond the critical value rejects H0.
print(
    'Dependent (reject H0)'
    if abs(stat) >= critical
    else 'Independent (fail to reject H0)'
)

Independent (fail to reject H0)


In [26]:
# Interpret the p-value: the significance level is the complement of `prob`.
alpha = 1.0 - prob
print('significance=%.3f, p=%.3f' % (alpha, p))

# A p-value at or below the significance level rejects H0.
print(
    'Dependent (reject H0)'
    if p <= alpha
    else 'Independent (fail to reject H0)'
)

significance=0.050, p=0.873
Independent (fail to reject H0)
