# Sample size calculator

A simple program to calculate the required sample size to perform statistical testing.

**Input**

1. _alpha_ ($\alpha$ or significance level)
2. _beta_ ($\beta$, the acceptable type II error rate, i.e. 1 − power)
3. Observed _p_ (proportion)
4. Desired minimum difference in the experiment: _d_


**Example**

Assume that you observe that 20% of your website visitors click a button. You want to change the color of this button, so you need to determine the sample size required to tell whether the change has an effect. You expect the change to improve the click-through probability by 3 percentage points (from 20% to 23%). You want to use a significance level of 5% and a test power of 80%. Supply the following parameters to find the required sample size in each group:

    alpha = 0.05
    beta = 0.2
    p = 0.2
    d = 0.03

In [42]:
#Import required packages
import scipy.stats as st
import numpy as np

#Lower-tail critical value of the standard normal for a two-tailed test.
#Half of alpha is placed in the lower tail, so the returned z-score is NEGATIVE
#(about -1.96 for alpha=0.05); callers wanting the positive boundary must negate it.
def get_z_score(alpha):
    lower_tail_area = alpha / 2.0
    return st.norm.ppf(lower_tail_area)

#Cumulative probability P(Z <= z_score) under the standard normal distribution.
def get_pvalue(z_score):
    lower_tail_probability = st.norm.cdf(z_score)
    return lower_tail_probability


def check_values(alpha, beta, p, d):
    """Validate the inputs of the sample-size search; return None when all are valid.

    alpha, beta and p must be real numbers inside the closed interval [0, 1]
    (the same bounds the comparison has always used, even though the message
    text says "(0,1)"). d must be a real number.

    Invalid input now raises instead of only printing a message, so a bad
    value can no longer flow silently into the computation.

    Raises:
        TypeError: if alpha, beta, p or d is not a real number.
        ValueError: if alpha, beta or p lies outside [0, 1] or is NaN.
    """
    # Function-scope import so the block does not depend on the import cell.
    import numbers

    def _is_number(value):
        # isinstance (not an exact type() match) accepts float subclasses such
        # as numpy.float64; bool is excluded although it subclasses int.
        return isinstance(value, numbers.Real) and not isinstance(value, bool)

    for name, value in (("alpha", alpha), ("beta", beta), ("p", p)):
        message = "Supplied %s value must be float and between (0,1)" % name
        if not _is_number(value):
            raise TypeError(message)
        # Written as a chained comparison so that NaN is rejected as well.
        if not 0 <= value <= 1:
            raise ValueError(message)

    if not _is_number(d):
        raise TypeError("Supplied d value must be float")

def find_sample_size(alpha, beta, p, d, max_n=20000):
    """Return the smallest per-group sample size n whose type-2 error is <= beta.

    For each candidate n the standard error of the difference between two
    groups is computed twice: once assuming both groups share proportion p
    (true difference 0) and once assuming the second group has proportion
    p + d. The upper rejection boundary under the first assumption is then
    located in the second distribution; its lower-tail probability is the
    type-2 error for that n.

    Parameters:
        alpha: two-tailed significance level.
        beta:  acceptable type-2 error (1 - power).
        p:     observed baseline proportion.
        d:     minimum difference to detect.
        max_n: exclusive upper bound of the search; candidates 1 .. max_n-1
               are tried. Defaults to the previously hard-coded 20000.

    Returns:
        The first n meeting the requirement, or -1 if none below max_n does.
    """
    check_values(alpha, beta, p, d)

    #get_z_score gives the lower-tail (negative) quantile; flip it to get the
    #positive critical value for the sig. level of alpha
    z_score = -1*get_z_score(alpha)

    #q is 1-p
    q = 1-p

    #Variance terms at n=1 do not depend on n, so compute them once.
    #Dividing by n inside the loop reproduces p*q*2.0/n and
    #(p+d)*(1-p-d)*2.0/n with the same order of operations.
    null_variance_at_1 = p*q*2.0
    alt_variance_at_1 = (p+d)*(1-p-d)*2.0

    for n in range(1, max_n):
        se_1 = np.sqrt(null_variance_at_1/n)

        #Positive rejection boundary, assuming the true difference is 0
        x = z_score * se_1

        #Standardise that boundary under the assumption that the true difference is d
        se_2 = np.sqrt(alt_variance_at_1/n)
        z_score_2 = (x-d)/se_2

        #Lower-tail probability of the boundary = type-2 error at this n
        temp_beta = get_pvalue(z_score_2)

        if temp_beta <= beta:
            return n

    #No n below max_n reached the requested power
    return -1
        
        
        
    

In [44]:
#Worked example from the introduction: 20% baseline, 0.03 minimum difference,
#5% significance level, 80% power (beta = 0.2)
find_sample_size(alpha=0.05, beta=0.2, p=0.2, d=0.03)

2879

In [52]:
##Inspired from Udacity's course

#Import required packages
import scipy.stats as st
import numpy as np

#Two-tailed critical value lookup: alpha is split evenly between both tails and
#the quantile of the LOWER tail is returned, so the result is negative
#(roughly -1.96 when alpha is 0.05). Negate it to obtain the positive boundary.
def get_z_score(alpha):
    half_alpha = alpha / 2.0
    return st.norm.ppf(half_alpha)

#Standard normal CDF: probability of observing a value at or below z_score.
def get_pvalue(z_score):
    cumulative = st.norm.cdf(z_score)
    return cumulative


def check_values(alpha, beta, general_se_at_1, desired_se_at_1, d_min):
    """Validate the inputs of the standard-error based sample-size search.

    alpha and beta must be real numbers inside the closed interval [0, 1]
    (the bounds the comparison has always used, even though the message text
    says "(0,1)"). general_se_at_1, desired_se_at_1 and d_min must be real
    numbers; float subclasses such as numpy.float64 (what np.sqrt returns)
    are accepted.

    Invalid input raises instead of only printing a message. Returns None
    when every value is valid.

    Raises:
        TypeError: if any argument is not a real number.
        ValueError: if alpha or beta lies outside [0, 1] or is NaN.
    """
    # Function-scope import so the block does not depend on the import cell.
    import numbers

    def _is_number(value):
        # isinstance instead of an exact type() match, so numpy scalars pass;
        # bool is excluded although it subclasses int.
        return isinstance(value, numbers.Real) and not isinstance(value, bool)

    for name, value in (("alpha", alpha), ("beta", beta)):
        message = "Supplied %s value must be float and between (0,1)" % name
        if not _is_number(value):
            raise TypeError(message)
        # Chained comparison so that NaN is rejected as well.
        if not 0 <= value <= 1:
            raise ValueError(message)

    numeric_only = (
        ("general_se_at_1", general_se_at_1),
        ("desired_se_at_1", desired_se_at_1),
        ("d_min", d_min),
    )
    for name, value in numeric_only:
        if not _is_number(value):
            raise TypeError("Supplied %s value must be float" % name)

def find_sample_size(alpha, beta, general_se_at_1, desired_se_at_1, d_min, max_n=20000):
    """Return the smallest sample size n whose type-2 error is <= beta.

    This variant takes precomputed standard errors for n = 1 instead of a
    proportion, and scales each by 1/sqrt(n) for every candidate n.

    Parameters:
        alpha:           two-tailed significance level.
        beta:            acceptable type-2 error (1 - power).
        general_se_at_1: standard error at n=1 assuming a true difference of 0.
        desired_se_at_1: standard error at n=1 assuming a true difference of
                         d_min (supplied by the caller, already computed for
                         the shifted mean).
        d_min:           minimum difference to detect.
        max_n:           exclusive upper bound of the search; candidates
                         1 .. max_n-1 are tried. Defaults to the previously
                         hard-coded 20000.

    Returns:
        The first n meeting the requirement, or -1 if none below max_n does.
    """
    #get_z_score gives the lower-tail (negative) quantile; flip it to get the
    #positive critical value for the sig. level alpha
    z_score = -1*get_z_score(alpha)

    for n in range(1, max_n):
        #Both standard errors shrink by the same factor, so take the root once
        root_n = np.sqrt(n)

        #Std. error at n when the true difference is 0
        general_se_at_n = general_se_at_1/root_n

        #Positive rejection boundary, assuming the true difference is 0
        x = z_score * general_se_at_n

        #Std. error at n when the true difference is d_min. desired_se_at_1 was
        #already precomputed for that case, so no need for any other change
        desired_se_at_n = desired_se_at_1/root_n

        #Lower-tail probability of x under mean d_min and std. error
        #desired_se_at_n = type-2 error at this n
        z_score_2 = (x-d_min)/desired_se_at_n
        temp_beta = get_pvalue(z_score_2)

        if temp_beta <= beta:
            return n

    #No n below max_n reached the requested power
    return -1
        
        
        
    

In [53]:
#Same scenario as the introduction's example (p=0.2, difference 0.03), expressed
#through n=1 standard errors: sqrt(2*p*(1-p)) at p=0.2 and at p=0.23
find_sample_size(
    alpha=0.05,
    beta=0.2,
    general_se_at_1=np.sqrt(0.2*0.8*2),
    desired_se_at_1=np.sqrt(0.23*0.77*2),
    d_min=0.03,
)

2879