In [1]:
import statistics as stat
import math
import numpy as np
import pandas as pd
from scipy.stats import norm
from scipy.stats import chi2

### Confidence Interval

In [2]:
pi_pop = 200
pi_samp = 78
alpha = 0.05

In [3]:
pi_hat = pi_samp / pi_pop
z_val = norm.ppf(1-(alpha/2))
pi_hat, z_val

(0.39, 1.959963984540054)

In [4]:
hcl = pi_hat + (z_val * math.sqrt(pi_hat * (1 - pi_hat)/ pi_pop))
lcl = pi_hat - (z_val * math.sqrt(pi_hat * (1 - pi_hat)/ pi_pop))
lcl, hcl

(0.3224025498467904, 0.45759745015320963)

### Sample Size

In [5]:
alpha = 0.05
E = 0.05
pi = 0.5 # assume 0.5 if not given

In [6]:
z_val = norm.ppf(1-(alpha/2))
n = ((z_val ** 2) * (pi * (1 - pi))) / E ** 2
n

384.14588206941244

### Hypothesis tests for pi

In [7]:
pi = 0.5
prop = 60
n = 100
alpha = 0.01 # 1 - alpha is lower tailed

In [8]:
pi_hat = prop / n
z = (pi_hat - pi) / math.sqrt((pi * (1 - pi))/ n)
z

1.9999999999999996

In [9]:
norm.ppf(1-(alpha))

2.3263478740408408

### Inference for 2 population proportions

In [10]:
n1 = 500
n2 = 500
prop2 = 25
prop1 = 75 # keep larger one as prop1
alpha = 0.1

In [11]:
pi_hat_1 = prop1 / n1
pi_hat_2 = prop2 / n2

In [12]:
z_val = norm.ppf(1-(alpha/2))
t1 = pi_hat_1 * (1 - pi_hat_1) / n1
t2 = pi_hat_2 * (1 - pi_hat_2) / n2
t3 = z_val * math.sqrt(t1 + t2)

In [13]:
hcl = (pi_hat_1 - pi_hat_2) + t3
lcl = (pi_hat_1 - pi_hat_2) - t3
lcl, hcl

(0.06922760638277556, 0.13077239361722443)

### Hypothesis for 2 population proportion

In [14]:
n1 = 100
prop1 = 50
n2 = 100
prop2 = 25
alpha = 0.05

In [15]:
pi_hat_1 = prop1/n1
pi_hat_2 = prop2/n2
pi_hat = (prop1 + prop2) / (n1 + n2)
pi_hat

0.375

#### If pi0 = 0

In [16]:
t1 = math.sqrt((pi_hat) * (1 - pi_hat) * ((1/n1) + (1/n2)))
z = (pi_hat_1 - pi_hat_2) / t1
z, norm.ppf(1-(alpha/2))

(3.6514837167011076, 1.959963984540054)

#### if pi0 != 0

In [17]:
pi_hat_1 = 0.000778
pi_hat_2 = 0.0131
n1 = 14134
n2 = 14073
alpha = 1 - 0.05 # left-tailed

In [18]:
t1 = math.sqrt((pi_hat_1*(1 - pi_hat_1) / n1) + (pi_hat_2*(1 - pi_hat_2) / n2))
z

3.6514837167011076

In [19]:
norm.ppf(1-alpha)

-1.6448536269514722

### Tests for Independence and Homogeneity

In [20]:
df = [[16, 21, 11], [24, 17, 13]]
alpha = 0.05
df = pd.DataFrame(df)
df

Unnamed: 0,0,1,2
0,16,21,11
1,24,17,13


In [21]:
c_total = df.sum()
r_total = df.sum(axis = 1)
total = sum(c_total)
dof = (len(c_total) - 1) * (len(r_total) - 1)
total, c_total, r_total

(102,
 0    40
 1    38
 2    24
 dtype: int64,
 0    48
 1    54
 dtype: int64)

In [22]:
t1 = []
for i in range(len(r_total)):
    for j in range(len(c_total)):
        e = (r_total[i] * c_total[j]) / total
        print(e)
        t1.append(((df.iloc[i][j] - e) ** 2) / e)
X = sum(t1)
X

18.823529411764707
17.88235294117647
11.294117647058824
21.176470588235293
20.11764705882353
12.705882352941176


1.8411488791423

In [23]:
chi2.ppf(1 - alpha, dof)

5.991464547107979

In [25]:
t1 # (obs-E) ^ 2 / E

[0.42352941176470604,
 0.5435371517027862,
 0.007659313725490234,
 0.3764705882352943,
 0.4831441348469211,
 0.006808278867102432]