In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy import stats

## Types of Sampling
<br>
- 1. Simple Random
- 2. Systematic (Every Kth)
- 3. Stratified
- 4. Cluster
- 5. Resampling

## Central limit theorem
The Central Limit Theorem states that the distribution of sample means approaches a normal distribution as the sample size grows, regardless of the shape of the population distribution (provided the population has a finite variance).

1. When Population standard deviation is known 
$$ UCL = \bar {x} + z_{\alpha/2} \frac {\sigma}  {\sqrt{n}} $$

In [3]:
# z-based interval: the population standard deviation (sigma) is known.
sigma, n = 40.60, 32

# 90% confidence interval centred on the sample mean of 129.20
confidence_interval = stats.norm.interval(
    0.90,
    loc=129.20,
    scale=sigma / n ** 0.5,  # standard error of the mean: sigma / sqrt(n)
)
confidence_interval

(117.39466539018174, 141.00533460981822)

In [4]:
# 95% confidence interval (z-based; reuses sigma and n defined in an earlier cell)
confidence_interval2 = stats.norm.interval(
    0.95,
    loc=129.20,
    scale=sigma / n ** 0.5,  # standard error of the mean
)
confidence_interval2

(115.13307618285076, 143.2669238171492)

In [5]:
# 99% confidence interval (z-based; reuses sigma and n defined in an earlier cell)
stats.norm.interval(
    0.99,
    loc=129.20,
    scale=sigma / n ** 0.5,  # standard error of the mean
)

(110.71292836765772, 147.68707163234225)

2. When Population standard deviation is unknown (Student's t-Distribution)
$$ UCL = \bar {x} + t_{\alpha/2, df} \frac {s}  {\sqrt{n}} $$

In [7]:
# t-based interval: only the sample standard deviation (s) is available.
mean, s, n = 100.3, 16.6, 18

# 95% confidence interval for the mean
stats.t.interval(
    0.95,
    df=n - 1,            # degrees of freedom
    loc=mean,
    scale=s / n ** 0.5,  # estimated standard error of the mean
)

(92.04501420816499, 108.55498579183501)

In [2]:
# cost charged by a tax preparation service
data = np.array([135,120,150,110,90,115,130,200,180,200,140,150,100,75,70])

# a. Using a 98% confidence interval, estimate the average cost of the service for preparing a tax return for a customer.
# b. What is the margin of error for this sample?
# c. Verify your result using Excel.
# d. What assumptions are necessary for this analysis?

# The confidence level is passed positionally: the `alpha` keyword was renamed
# to `confidence` in SciPy 1.9 and later removed, so `alpha=0.98` fails on
# current SciPy. The positional form works on every version.
res_a = stats.t.interval(
    0.98,
    df=len(data) - 1,       # degrees of freedom
    loc=data.mean(),        # sample mean
    scale=stats.sem(data),  # standard error of the mean (sample std, ddof=1)
)
res_a

(103.497916296103, 158.502083703897)

In [9]:
## Q.
n = 33
mean = 222
std = 76
stats.t.interval(alpha=0.90,df=n-1, loc=mean,
                scale=std/(33)**0.5)

(199.59001803523282, 244.40998181110837)

In [10]:
data = np.array([356,275,371,384,457,326,
                414,367,362,286,104,136,
                320,244,370,215,322,409,
                303,489,251,361,337,265])

# 99% t-based confidence interval for the mean of `data`.
# The confidence level is passed positionally: the `alpha` keyword was renamed
# to `confidence` in SciPy 1.9 and later removed, so `alpha=0.99` fails on
# current SciPy.
avg_domestic_fare = stats.t.interval(
    0.99,
    df=len(data) - 1,       # degrees of freedom
    loc=data.mean(),        # sample mean
    scale=stats.sem(data),  # standard error of the mean (sample std, ddof=1)
)
avg_domestic_fare

(269.87984899822425, 373.7868176684424)

In [12]:
# Margin of error: distance from the sample mean down to the lower confidence limit.
margin_of_error = np.mean(data) - avg_domestic_fare[0]
margin_of_error

51.95348433510907

## Calculating confidence interval for proportions
$$ UCL = \bar{p} + z_{\alpha/2} \sqrt{\frac{\bar{p} (1-\bar{p})}{n}} $$

In [11]:
from statsmodels.stats.proportion import proportion_confint

In [13]:
# Confidence interval for a population proportion from a sample proportion.
p_bar = 0.663  # sample proportion
n = 175        # sample size

# proportion_confint takes the number of successes, so convert the proportion
# back to a count. round() is used instead of int(): int() truncates, so a
# product that lands just below an integer through floating-point error would
# be undercounted by one.
# alpha is the significance level here, so alpha=0.01 gives a 99% interval.
proportion_confint(count=round(p_bar * n), nobs=n, alpha=0.01)

(0.5708090050263673, 0.7549052806879183)