# Chi-Square Test

In [38]:
# Import dependencies
import pandas as pd
import requests
import numpy as np
import scipy.stats as stats

In [3]:
# Load the cleaned tobacco survey CSV from the local Resources folder
tobacco_data = pd.read_csv(filepath_or_buffer="./Resources/Clean_Tobacco_data.csv")

# Preview the first five rows to sanity-check the load
tobacco_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,YEAR,LocationDesc,TopicDesc,MeasureDesc,Response,Data_Value,Sample_Size,Gender,Race,Age
0,0,2019,National Median (States and DC),Cessation (Adults),Percent of Former Smokers Among Ever Smokers,,60.8,,Overall,All Races,All Ages
1,1,2019,New Hampshire,Cigarette Use (Adults),Smoking Frequency,Every Day,76.3,357.0,Female,All Races,All Ages
2,2,2019,Florida,Cigarette Use (Adults),Current Smoking,,14.8,15824.0,Overall,All Races,All Ages
3,3,2019,Hawaii,Smokeless Tobacco Use (Adults),Current Use,,2.1,459.0,Overall,All Races,18 to 24 Years
4,4,2019,Alabama,Smokeless Tobacco Use (Adults),User Status,Not Current,88.6,2729.0,Male,All Races,All Ages


In [6]:
# Keep only the named analysis columns; any column not listed here
# (including the "Unnamed" one) is dropped.
tobacco_data = tobacco_data.loc[
    :,
    [
        "YEAR",
        "LocationDesc",
        "TopicDesc",
        "MeasureDesc",
        "Response",
        "Data_Value",
        "Sample_Size",
        "Gender",
        "Race",
        "Age",
    ],
]

In [11]:
# Restrict the data to rows whose YEAR is 2019
year_data = tobacco_data.loc[tobacco_data["YEAR"].eq(2019)]

year_data

Unnamed: 0,YEAR,LocationDesc,TopicDesc,MeasureDesc,Response,Data_Value,Sample_Size,Gender,Race,Age
0,2019,National Median (States and DC),Cessation (Adults),Percent of Former Smokers Among Ever Smokers,,60.8,,Overall,All Races,All Ages
1,2019,New Hampshire,Cigarette Use (Adults),Smoking Frequency,Every Day,76.3,357.0,Female,All Races,All Ages
2,2019,Florida,Cigarette Use (Adults),Current Smoking,,14.8,15824.0,Overall,All Races,All Ages
3,2019,Hawaii,Smokeless Tobacco Use (Adults),Current Use,,2.1,459.0,Overall,All Races,18 to 24 Years
4,2019,Alabama,Smokeless Tobacco Use (Adults),User Status,Not Current,88.6,2729.0,Male,All Races,All Ages
...,...,...,...,...,...,...,...,...,...,...
3235,2019,Louisiana,Smokeless Tobacco Use (Adults),User Status,Not Current,94.2,4467.0,Overall,All Races,All Ages
3236,2019,Missouri,Cigarette Use (Adults),Smoking Frequency,Every Day,74.4,1219.0,Overall,All Races,All Ages
3237,2019,Wisconsin,Smokeless Tobacco Use (Adults),Current Use,,5.0,936.0,Overall,All Races,25 to 44 Years
3238,2019,Idaho,Smokeless Tobacco Use (Adults),Current Use,,4.4,3357.0,Overall,All Races,Age 25 and Older


In [20]:
# Male subset: rows of the 2019 data whose Gender is "Male"
male_data = year_data.loc[year_data["Gender"].eq("Male")]

male_data.head(n=5)

Unnamed: 0,YEAR,LocationDesc,TopicDesc,MeasureDesc,Response,Data_Value,Sample_Size,Gender,Race,Age
4,2019,Alabama,Smokeless Tobacco Use (Adults),User Status,Not Current,88.6,2729.0,Male,All Races,All Ages
13,2019,North Carolina,Cigarette Use (Adults),Smoking Status,Current,20.7,1909.0,Male,All Races,All Ages
14,2019,Guam,Cigarette Use (Adults),Smoking Frequency,Some Days,28.4,236.0,Male,All Races,All Ages
18,2019,Nevada,Cigarette Use (Adults),Smoking Frequency,Some Days,38.6,211.0,Male,All Races,All Ages
33,2019,Vermont,Cessation (Adults),Percent of Former Smokers Among Ever Smokers,,65.8,1389.0,Male,All Races,All Ages


In [28]:
# Narrow the male rows to the cigarette-use topic only
mcig_data = male_data.loc[male_data["TopicDesc"].eq("Cigarette Use (Adults)")]

mcig_data

Unnamed: 0,YEAR,LocationDesc,TopicDesc,MeasureDesc,Response,Data_Value,Sample_Size,Gender,Race,Age
13,2019,North Carolina,Cigarette Use (Adults),Smoking Status,Current,20.7,1909.0,Male,All Races,All Ages
14,2019,Guam,Cigarette Use (Adults),Smoking Frequency,Some Days,28.4,236.0,Male,All Races,All Ages
18,2019,Nevada,Cigarette Use (Adults),Smoking Frequency,Some Days,38.6,211.0,Male,All Races,All Ages
40,2019,Maine,Cigarette Use (Adults),Smoking Status,Former,32.3,4652.0,Male,All Races,All Ages
52,2019,Minnesota,Cigarette Use (Adults),Smoking Frequency,Every Day,72.1,1094.0,Male,All Races,All Ages
...,...,...,...,...,...,...,...,...,...,...
3172,2019,California,Cigarette Use (Adults),Smoking Frequency,Every Day,54.2,678.0,Male,All Races,All Ages
3190,2019,Virginia,Cigarette Use (Adults),Smoking Frequency,Some Days,30.1,591.0,Male,All Races,All Ages
3198,2019,Florida,Cigarette Use (Adults),Smoking Status,Never,54.8,7178.0,Male,All Races,All Ages
3211,2019,Mississippi,Cigarette Use (Adults),Smoking Status,Never,49.4,1982.0,Male,All Races,All Ages


In [30]:
# Keep only the rows whose Response is "Current" or "Former"
mresponse_data = mcig_data.loc[mcig_data["Response"].isin(["Current", "Former"])]

mresponse_data

Unnamed: 0,YEAR,LocationDesc,TopicDesc,MeasureDesc,Response,Data_Value,Sample_Size,Gender,Race,Age
13,2019,North Carolina,Cigarette Use (Adults),Smoking Status,Current,20.7,1909.0,Male,All Races,All Ages
40,2019,Maine,Cigarette Use (Adults),Smoking Status,Former,32.3,4652.0,Male,All Races,All Ages
82,2019,Virginia,Cigarette Use (Adults),Smoking Status,Former,26.0,4249.0,Male,All Races,All Ages
121,2019,Massachusetts,Cigarette Use (Adults),Smoking Status,Former,26.3,3409.0,Male,All Races,All Ages
134,2019,Maryland,Cigarette Use (Adults),Smoking Status,Current,14.2,7239.0,Male,All Races,All Ages
...,...,...,...,...,...,...,...,...,...,...
3040,2019,Pennsylvania,Cigarette Use (Adults),Smoking Status,Former,30.1,3098.0,Male,All Races,All Ages
3073,2019,Delaware,Cigarette Use (Adults),Smoking Status,Current,15.8,1651.0,Male,All Races,All Ages
3085,2019,South Carolina,Cigarette Use (Adults),Smoking Status,Current,19.5,3096.0,Male,All Races,All Ages
3104,2019,Missouri,Cigarette Use (Adults),Smoking Status,Former,30.2,3108.0,Male,All Races,All Ages


In [33]:
# Average Sample_Size over every remaining male row
# (one row per location/response combination)
male_mean = mresponse_data.loc[:, "Sample_Size"].mean()

male_mean

3484.826923076923

In [34]:
# Female subset: rows of the 2019 data whose Gender is "Female"
female_data = year_data.loc[year_data["Gender"].eq("Female")]

female_data.head(n=5)

Unnamed: 0,YEAR,LocationDesc,TopicDesc,MeasureDesc,Response,Data_Value,Sample_Size,Gender,Race,Age
1,2019,New Hampshire,Cigarette Use (Adults),Smoking Frequency,Every Day,76.3,357.0,Female,All Races,All Ages
6,2019,South Carolina,Cigarette Use (Adults),Smoking Status,Current,15.8,3761.0,Female,All Races,All Ages
8,2019,Michigan,Cigarette Use (Adults),Smoking Status,Current,17.2,5662.0,Female,All Races,All Ages
11,2019,Ohio,Smokeless Tobacco Use (Adults),User Status,Current,1.4,7547.0,Female,All Races,All Ages
20,2019,District of Columbia,Smokeless Tobacco Use (Adults),Frequency of Use,Every Day,,,Female,All Races,All Ages


In [35]:
# Narrow the female rows to the cigarette-use topic only
fcig_data = female_data.loc[female_data["TopicDesc"].eq("Cigarette Use (Adults)")]

fcig_data

Unnamed: 0,YEAR,LocationDesc,TopicDesc,MeasureDesc,Response,Data_Value,Sample_Size,Gender,Race,Age
1,2019,New Hampshire,Cigarette Use (Adults),Smoking Frequency,Every Day,76.3,357.0,Female,All Races,All Ages
6,2019,South Carolina,Cigarette Use (Adults),Smoking Status,Current,15.8,3761.0,Female,All Races,All Ages
8,2019,Michigan,Cigarette Use (Adults),Smoking Status,Current,17.2,5662.0,Female,All Races,All Ages
24,2019,Oregon,Cigarette Use (Adults),Current Smoking,,14.6,983.0,Female,All Races,18 to 44 Years
29,2019,Iowa,Cigarette Use (Adults),Smoking Status,Current,14.8,5064.0,Female,All Races,All Ages
...,...,...,...,...,...,...,...,...,...,...
3161,2019,Utah,Cigarette Use (Adults),Smoking Frequency,Every Day,64.7,347.0,Female,All Races,All Ages
3171,2019,Nevada,Cigarette Use (Adults),Current Smoking,,13.9,1424.0,Female,All Races,All Ages
3199,2019,North Carolina,Cigarette Use (Adults),Smoking Status,Current,16.4,2222.0,Female,All Races,All Ages
3210,2019,Maine,Cigarette Use (Adults),Smoking Frequency,Some Days,20.2,849.0,Female,All Races,All Ages


In [36]:
# Keep only the rows whose Response is "Current" or "Former"
fresponse_data = fcig_data.loc[fcig_data["Response"].isin(["Current", "Former"])]

fresponse_data

Unnamed: 0,YEAR,LocationDesc,TopicDesc,MeasureDesc,Response,Data_Value,Sample_Size,Gender,Race,Age
6,2019,South Carolina,Cigarette Use (Adults),Smoking Status,Current,15.8,3761.0,Female,All Races,All Ages
8,2019,Michigan,Cigarette Use (Adults),Smoking Status,Current,17.2,5662.0,Female,All Races,All Ages
29,2019,Iowa,Cigarette Use (Adults),Smoking Status,Current,14.8,5064.0,Female,All Races,All Ages
71,2019,Ohio,Cigarette Use (Adults),Smoking Status,Current,20.0,7519.0,Female,All Races,All Ages
133,2019,Pennsylvania,Cigarette Use (Adults),Smoking Status,Current,16.5,3287.0,Female,All Races,All Ages
...,...,...,...,...,...,...,...,...,...,...
3012,2019,Idaho,Cigarette Use (Adults),Smoking Status,Current,13.9,2793.0,Female,All Races,All Ages
3063,2019,Louisiana,Cigarette Use (Adults),Smoking Status,Former,18.6,2513.0,Female,All Races,All Ages
3099,2019,Massachusetts,Cigarette Use (Adults),Smoking Status,Former,24.4,3985.0,Female,All Races,All Ages
3140,2019,Puerto Rico,Cigarette Use (Adults),Smoking Status,Current,6.7,3716.0,Female,All Races,All Ages


In [37]:
# Average Sample_Size over every remaining female row
# (one row per location/response combination)
female_mean = fresponse_data.loc[:, "Sample_Size"].mean()

female_mean

4194.788461538462

In [39]:
# Observed data: the male and female mean sample sizes computed above,
# rounded to whole numbers. Deriving them from male_mean / female_mean
# (instead of re-typing the printed values) keeps this cell in sync with
# the data on a fresh Restart & Run All.
# NOTE(review): these are mean survey sample sizes, not counts of smokers —
# confirm this is the quantity intended for the goodness-of-fit test.
observed = pd.Series(
    [int(round(male_mean)), int(round(female_mean))],
    index=["male", "female"],
)

In [40]:
# Build a one-column data frame from the observed Series; the column is
# labelled 0 and the rows keep the Series index ("male", "female").
df = pd.DataFrame({0: observed})

In [41]:
# Add a column holding the expected value for every category.
# Under the null hypothesis of an even split, each category is expected to
# receive the same share of the observed total, so compute total / number of
# categories from column 0 rather than hard-coding the result.
# The value is a float (e.g. 3840.0), which is what stats.chisquare works with.
df[1] = df[0].sum() / len(df)

In [42]:
# Give the two positional columns descriptive labels
# (first column -> "observed", second column -> "expected")
df.columns = pd.Index(["observed", "expected"])

In [43]:
# View the data frame: one row per gender, with the observed value next to
# the expected value used in the chi-square test below
df

Unnamed: 0,observed,expected
male,3485,3840
female,4195,3840


In [45]:
# Two categories (male, female) give 2 - 1 = 1 degree of freedom.
DEGREES_OF_FREEDOM = 2 - 1

# Significance level (alpha) chosen for the test. This is the threshold the
# test's p-value is compared against, not a p-value itself; the matching
# confidence level is 1.00 - 0.05 = 0.95.
ALPHA = 0.05

# Critical value: the chi-square quantile at the confidence level. A test
# statistic above this value is significant at the ALPHA level.
critical_value = stats.chi2.ppf(q=1 - ALPHA, df=DEGREES_OF_FREEDOM)

critical_value

3.841458820694124

In [46]:
# Chi-square goodness-of-fit test: compare the observed column against the
# expected column; the result carries the test statistic and its p-value.
stats.chisquare(f_obs=df["observed"], f_exp=df["expected"])

Power_divergenceResult(statistic=65.63802083333333, pvalue=5.418331493598966e-16)

### Conclusion

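* Since the chi-square statistic of 65.6 exceeds the critical value of 3.84 (1 degree of freedom, significance level 0.05) and the p-value (about 5.4e-16) is far below 0.05, we reject the null hypothesis that the male and female values are equal and conclude that the results are statistically significant.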