In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import ttest_ind
from subprocess import check_output
from statsmodels.stats import weightstats as stests


  import pandas.util.testing as tm


In [20]:
# Define a function to perform a one-sample z-test
# Inputs ->
# 1. Sample column
# 2. Population column
# 3. No. of tails (1 or 2)
# 4. alpha / significance level

def ztest(sample_x,pop_x,tails=1,alpha=0.05):
    """Run a one-sample z-test of a sample mean against a population mean.

    The population mean and standard deviation are taken from ``pop_x``
    (``np.std`` with its default ``ddof=0``), and the statistic is

        z = (mean(sample_x) - mean(pop_x)) / (std(pop_x) / sqrt(len(sample_x)))

    Parameters
    ----------
    sample_x : sequence of numbers (list, array, pandas Series)
        Observations in the sample. Must support ``len()``.
    pop_x : sequence of numbers
        Observations making up the population.
    tails : int, default 1
        ``1`` runs a right-tailed test: the null is rejected only when the
        sample mean is significantly *greater* than the population mean.
        ``2`` runs a two-tailed test.
    alpha : float, default 0.05
        Significance level, strictly between 0 and 1. For a two-tailed test
        it is split evenly between both tails.

    Returns
    -------
    str
        ``'Null is false'`` when the null hypothesis is rejected,
        ``'Null is True'`` otherwise.

    Raises
    ------
    ValueError
        If ``tails`` is not 1 or 2, ``alpha`` is outside (0, 1), the sample
        is empty, or the population standard deviation is not positive.
    """
    # Imported locally because the notebook's import cell only pulls
    # ttest_ind from scipy.stats.
    from scipy.stats import norm

    if tails not in (1, 2):
        raise ValueError("tails must be 1 or 2, got %r" % (tails,))
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1, got %r" % (alpha,))

    n_sam = len(sample_x)  # sample size
    if n_sam == 0:
        raise ValueError("sample_x must contain at least one observation")

    # Keep full precision here: rounding the moments before dividing distorts
    # the statistic and can even turn a small standard deviation into 0.
    mu_pop = np.mean(pop_x)   # mean of the population
    mu_sam = np.mean(sample_x)  # mean of the sample
    sd_pop = np.std(pop_x)    # standard deviation of the population (ddof=0)

    # "not > 0" also catches nan, which is what an empty population yields.
    if not sd_pop > 0:
        raise ValueError("population standard deviation must be positive")

    # z statistic: (sample mean - population mean) / standard error
    zstats = (mu_sam - mu_pop) / (sd_pop / (n_sam ** 0.5))

    # Critical value from the standard normal quantile function, so any alpha
    # works rather than only the handful listed in a lookup table.
    z_crit = norm.ppf(1 - alpha / tails)

    if tails == 1:
        reject_null = zstats > z_crit
    else:
        reject_null = abs(zstats) > z_crit

    return 'Null is false' if reject_null else 'Null is True'
    


In [9]:
# Load the cereal dataset into a DataFrame and preview its first five rows.
# NOTE(review): the preview shows potass == -1 for one row, which looks like a
# missing-value marker rather than a real measurement — confirm before analysis.
cereal = pd.read_csv(filepath_or_buffer='cereal.csv')
cereal.head(n=5)

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.5,93.704912
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843


In [10]:
# Summary statistics of the numeric columns, transposed so each column becomes a row.
cereal.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
calories,77.0,106.883117,19.484119,50.0,100.0,110.0,110.0,160.0
protein,77.0,2.545455,1.09479,1.0,2.0,3.0,3.0,6.0
fat,77.0,1.012987,1.006473,0.0,0.0,1.0,2.0,5.0
sodium,77.0,159.675325,83.832295,0.0,130.0,180.0,210.0,320.0
fiber,77.0,2.151948,2.383364,0.0,1.0,2.0,3.0,14.0
carbo,77.0,14.597403,4.278956,-1.0,12.0,14.0,17.0,23.0
sugars,77.0,6.922078,4.444885,-1.0,3.0,7.0,11.0,15.0
potass,77.0,96.077922,71.286813,-1.0,40.0,90.0,120.0,330.0
vitamins,77.0,28.246753,22.342523,0.0,25.0,25.0,25.0,100.0
shelf,77.0,2.207792,0.832524,1.0,1.0,2.0,3.0,3.0


In [21]:
# Hypothesis testing - 1

# There is no difference in the amount of sugar in Hot cereal and the population

# Null Hypothesis - Amount of sugar in hot cereal is no different than the entire population
# Alternative Hypothesis - Amount of sugar in hot cereal is not the same as the entire population

# Null Hypothesis - mu_hot = mu_pop
# Alternative Hypothesis - mu_hot != mu_pop

# Alpha = 0.05

# This is a two tailed test

# We will call our function, after separating the data

# Sample = hot cereals only (type 'H'); population = every row of the frame.
# NOTE(review): describe() reports a minimum of -1 for sugars, which is presumably
# a missing-value marker — confirm whether such rows should be dropped before testing.
hot_cereal = cereal[cereal['type'] == 'H']
result = ztest(sample_x=hot_cereal['sugars'], pop_x=cereal['sugars'], tails=2, alpha=0.05)
print(result)



Null is false


In [22]:
# Hypothesis testing - 1 (repeat of the previous cell; the sample is stored in cereal_sam this time)

# There is no difference in the amount of sugar in Hot cereal and the population

# Null Hypothesis - Amount of sugar in hot cereal is no different than the entire population
# Alternative Hypothesis - Amount of sugar in hot cereal is not the same as the entire population

# Null Hypothesis - mu_hot = mu_pop
# Alternative Hypothesis - mu_hot != mu_pop

# Alpha = 0.05

# This is a two tailed test

# We will call our function, after separating the data

# Same two-tailed sugar test as above: hot cereals (type 'H') against all cereals.
cereal_sam = cereal[cereal['type'] == 'H']
result = ztest(sample_x=cereal_sam['sugars'], pop_x=cereal['sugars'], tails=2, alpha=0.05)
print(result)



Null is false


In [23]:
# Hypothesis testing - 2
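# Null Hypothesis - The cereals with calories greater than 110 do not have a greater mean rating than all the cereals (mu_sam <= mu_pop)
# Alternative Hypothesis - The cereals with calories greater than 110 have a greater mean rating than all the cereals (mu_sam > mu_pop)
# (The one-tailed branch of ztest rejects the null only when the z statistic exceeds the critical value,
#  so the "greater rating" claim has to be the alternative, not the null.)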

# Alpha = 0.05

# This is a one tailed test

# We will call our function, after separating the data

# Sample = cereals with strictly more than 110 calories; population = all cereals.
cereal_sam = cereal[cereal['calories'] > 110]
result = ztest(sample_x=cereal_sam['rating'], pop_x=cereal['rating'], tails=1, alpha=0.05)
print(result)



Null is True
