## Example 1: Effectiveness of flu shot.

In [1]:
import statsmodels.api as sm
from scipy.stats import ttest_ind
import numpy as np

flushot_num_samples = 5103
flushot_flu_cases   = 49
proportion_flu_shot = flushot_flu_cases/flushot_num_samples

placebo_num_samples = 2549
placebo_flu_cases   = 74
proportion_placebo = placebo_flu_cases/placebo_num_samples

total_samples = flushot_num_samples + placebo_num_samples

# null hypothesis: the flu shot group was not more protected from the flu (did not have a smaller incidence of flu)
# relative to the placebo group
# alternative hypothesis: the flu shot group did have a statistically significant smaller incidence of flu relative 
# to the placebo group

zstatistic, pvalue = sm.stats.proportions_ztest(proportion_flu_shot*total_samples, total_samples, proportion_placebo, alternative='smaller')

print(pvalue)

2.5367467798836333e-68


## Example 2: Differences in age between female and male passengers on the titanic.

In [3]:
import pandas as pd

titanic_df = pd.read_csv('train.csv')
titanic_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
population1 = titanic_df['Age'].where(titanic_df['Sex'] == 'female')
population1.dropna(inplace=True)

population2 = titanic_df['Age'].where(titanic_df['Sex'] == 'male')
population2.dropna(inplace=True)

print(np.mean(population1), np.mean(population2))

27.915708812260537 30.72664459161148


In [5]:
# null hypothesis: men and woman who were on the titanic have the same mean age
# alternative hypothesis: men and women who were on the titanic do not have the same mean age

ttest, pvalue =  sm.stats.ztest(population1, population2, alternative = 'two-sided') 

print(pvalue)

0.01244718076724336


In [6]:
# null hypothesis: woman on the titanic are not younger on average than men
# alternative hypothesis: woman on the titanic are younger on average than men

ttest, pvalue =  sm.stats.ztest(population1, population2, alternative = 'smaller') 

print(pvalue)

0.00622359038362168


In [7]:
# null hypothesis: the average age of women on the titanic is not higher than "age"
# alternative hypothesis: the average age of women on the titanic is more than "age"

age = 27

ttest, pvalue =  sm.stats.ztest(population1, value = age, alternative = 'larger') 

print(pvalue)

0.14721655212875273


### The end