# Chi-Square Test

In [29]:
# Import Libraries
import pandas as pd # For DataFrame Manipulation
from scipy.stats import chi2_contingency
from scipy.stats import chi2

In [44]:
# Load the income-evaluation CSV into a DataFrame and preview its first rows.
# NOTE(review): the path is absolute and machine-specific (Windows E: drive),
# so this cell only runs where the file exists at exactly this location.
df = pd.read_csv(
    filepath_or_buffer="E:\\ML Projects\\Income Classification\\income_evaluation.csv"
)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [45]:
# Print column names
# (the output below shows a leading space on every label except 'age';
# the following cell strips it)
df.columns

Index(['age', ' workclass', ' fnlwgt', ' education', ' education-num',
       ' marital-status', ' occupation', ' relationship', ' race', ' sex',
       ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',
       ' income'],
      dtype='object')

In [46]:
# Trim surrounding whitespace from every column label (the raw header has a
# space in front of most names), then display the cleaned labels.
df.columns = list(map(str.strip, df.columns))
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

### Checking Relationship Between "race" and "income".

In [47]:
# Strip leading/trailing whitespace from the category labels of the two
# columns used in this test.
for column in ("race", "income"):
    df[column] = df[column].str.strip()

In [48]:
# Contingency table of observed counts: one row per income bracket,
# one column per race category.
income_race = pd.crosstab(index=df["income"], columns=df["race"])
income_race

race,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White
income,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<=50K,275,763,2737,246,20699
>50K,36,276,387,25,7117


In [59]:
# Chi-square test of independence between "income" and "race".
# Null Hypothesis (H0)        : there is no significant relationship between income and race
# Alternative Hypothesis (H1) : there is a significant relationship between income and race

# chi2_contingency() returns the test statistic, its p-value, the degrees of
# freedom, and the table of expected frequencies under H0.
stat, p, dof, expected = chi2_contingency(income_race)
print("Observed Statistic:", round(stat, 2))
print("P-value:", p)
print("Degrees of Freedom:", dof)

# Taking significance level of 0.05
significance_level = 0.05
# Use the degrees of freedom reported by the test instead of a hard-coded 4,
# so the critical value stays correct if the shape of the table changes.
critical_stat = chi2.ppf(1 - significance_level, dof)
print("Critical Statistic:", critical_stat)

print("-----------------------------------------------")
# Decision rule: reject H0 when the p-value is below the significance level.
if p < significance_level:
    print("Reject H0")
    print("So, There is significance relationship between income and race")
else:
    print("Fail to Reject H0")

Observed Statistic: 330.92
P-value: 2.305960610160958e-70
Degrees of Freedom: 4
Critical Statistic: 9.487729036781154
-----------------------------------------------
Reject H0
So, There is significance relationship between income and race


### Checking Relation between "workclass" and "income".

In [61]:
# Strip leading/trailing whitespace from the "workclass" values
df["workclass"] = df["workclass"].str.strip()

In [65]:
# Observed counts of income bracket (rows) by workclass (columns).
# The "?" column (presumably the dataset's marker for an unknown workclass --
# confirm) is dropped so it does not take part in the test.
income_workclass = pd.crosstab(
    index=df["income"], columns=df["workclass"]
).drop(columns=["?"])
income_workclass

workclass,Federal-gov,Local-gov,Never-worked,Private,Self-emp-inc,Self-emp-not-inc,State-gov,Without-pay
income,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
<=50K,589,1476,7,17733,494,1817,945,14
>50K,371,617,0,4963,622,724,353,0


In [66]:
# Chi-square test of independence: "income" vs "workclass".
#   H0: there is no significant relationship between income and workclass
#   H1: there is a significant relationship between income and workclass
# NOTE(review): the table printed above has zero ">50K" counts for
# Never-worked and Without-pay, so some expected frequencies are small;
# worth confirming the chi-square approximation is acceptable here.

significance_level = 0.05  # alpha used for the decision rule

# Test statistic, p-value, degrees of freedom and expected counts under H0.
stat, p, dof, expected = chi2_contingency(income_workclass)
# Chi-square critical value at the (1 - alpha) quantile for `dof` degrees of freedom.
critical_stat = chi2.ppf(1 - significance_level, df=dof)

for label, value in (
    ("Observed Statistic:", round(stat, 2)),
    ("P-value:", p),
    ("Degrees of Freedom:", dof),
    ("Critical Statistic:", critical_stat),
):
    print(label, value)

print("-----------------------------------------------")
# Reject H0 only when the p-value is strictly below the significance level.
verdict = (
    ["Reject H0", "So, There is significance relationship between income and workclass"]
    if p < significance_level
    else ["Fail to Reject H0"]
)
print(*verdict, sep="\n")

Observed Statistic: 827.72
P-value: 1.9338476684848218e-174
Degrees of Freedom: 7
Critical Statistic: 14.067140449340169
-----------------------------------------------
Reject H0
So, There is significance relationship between income and workclass


### Checking Relation between "education" and "income".

In [67]:
# Strip leading/trailing whitespace from the "education" values
df["education"] = df["education"].str.strip()

In [68]:
# Observed counts of income bracket (rows) by education level (columns).
income_education = pd.crosstab(index=df["income"], columns=df["education"])
income_education

education,10th,11th,12th,1st-4th,5th-6th,7th-8th,9th,Assoc-acdm,Assoc-voc,Bachelors,Doctorate,HS-grad,Masters,Preschool,Prof-school,Some-college
income,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
<=50K,871,1115,400,162,317,606,487,802,1021,3134,107,8826,764,51,153,5904
>50K,62,60,33,6,16,40,27,265,361,2221,306,1675,959,0,423,1387


In [69]:
# Chi-square test of independence: "income" vs "education".
#   H0: there is no significant relationship between income and education
#   H1: there is a significant relationship between income and education

significance_level = 0.05  # alpha used for the decision rule

# Test statistic, p-value, degrees of freedom and expected counts under H0.
stat, p, dof, expected = chi2_contingency(income_education)
# Chi-square critical value at the (1 - alpha) quantile for `dof` degrees of freedom.
critical_stat = chi2.ppf(1 - significance_level, df=dof)

for label, value in (
    ("Observed Statistic:", round(stat, 2)),
    ("P-value:", p),
    ("Degrees of Freedom:", dof),
    ("Critical Statistic:", critical_stat),
):
    print(label, value)

print("-----------------------------------------------")
# Reject H0 only when the p-value is strictly below the significance level.
verdict = (
    ["Reject H0", "So, There is significance relationship between income and education."]
    if p < significance_level
    else ["Fail to Reject H0"]
)
print(*verdict, sep="\n")

Observed Statistic: 4429.65
P-value: 0.0
Degrees of Freedom: 15
Critical Statistic: 24.995790139728616
-----------------------------------------------
Reject H0
So, There is significance relationship between income and education.


### Checking Relation between "sex" and "income".

In [70]:
# Strip leading/trailing whitespace from the "sex" values
df["sex"] = df["sex"].str.strip()

In [71]:
# Observed counts of income bracket (rows) by sex (columns).
income_sex = pd.crosstab(index=df["income"], columns=df["sex"])
income_sex

sex,Female,Male
income,Unnamed: 1_level_1,Unnamed: 2_level_1
<=50K,9592,15128
>50K,1179,6662


In [72]:
# Chi-square test of independence: "income" vs "sex".
#   H0: there is no significant relationship between income and sex
#   H1: there is a significant relationship between income and sex
# NOTE: income_sex is a 2x2 table (1 degree of freedom); per the SciPy
# documentation, chi2_contingency applies Yates' continuity correction by
# default in that case.

significance_level = 0.05  # alpha used for the decision rule

# Test statistic, p-value, degrees of freedom and expected counts under H0.
stat, p, dof, expected = chi2_contingency(income_sex)
# Chi-square critical value at the (1 - alpha) quantile for `dof` degrees of freedom.
critical_stat = chi2.ppf(1 - significance_level, df=dof)

for label, value in (
    ("Observed Statistic:", round(stat, 2)),
    ("P-value:", p),
    ("Degrees of Freedom:", dof),
    ("Critical Statistic:", critical_stat),
):
    print(label, value)

print("-----------------------------------------------")
# Reject H0 only when the p-value is strictly below the significance level.
verdict = (
    ["Reject H0", "So, There is significance relationship between income and sex."]
    if p < significance_level
    else ["Fail to Reject H0"]
)
print(*verdict, sep="\n")

Observed Statistic: 1517.81
P-value: 0.0
Degrees of Freedom: 1
Critical Statistic: 3.841458820694124
-----------------------------------------------
Reject H0
So, There is significance relationship between income and sex.
