## **One Sample Z test**

Used when population mean and std is known

In [2]:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from math import sqrt

#### Example

- expected mercury level in a sea is 10 p/m (parts/million)
- population std = 1.5
- population mean = 10
- we took 40 different samples to measure mercury levels and got sample mean = 10.5
- is the mean significant at the 0.05 alpha level.

In [3]:
# identifying variables

x_bar = 10.5    # sample mean
n = 40          # sample size
sigma = 1.5     # pop std
mu = 10         # pop mean 

In [4]:
# calculating z score

z = (x_bar - mu)/(sigma/sqrt(n))
z

2.1081851067789197

In [5]:
# calculating p value

p_value = 1 - stats.norm.cdf(z)
p_value

0.017507490509831247

In [9]:
alpha = 0.05
if p_value<alpha:
    print(f'At {alpha} level of significance, we can reject the null hypothesis in favor of alternative hypothesis.')
else:
    print(f'At {alpha} level of significance, we fail to reject the null hypothesis')

At 0.05 level of significance, we can reject the null hypothesis in favor of alternative hypothesis.


### Example 2

- if average sales is greater than 170, it is more efficient to transfer into a new billing system
- 400 reports of monthly average sales a random is chosen with mean 178
- std of pop = 65
- is it efficient to transfer into new billing system

In [10]:
x_bar = 178  # sample mean
n = 400      # sample size
sigma = 65   # pop std
mu = 170     # pop mean

In [11]:
# calculating z score

z = (x_bar - mu)/(sigma/sqrt(n))
z

2.4615384615384617

In [12]:
# calculating p value

p_value = 1 - stats.norm.cdf(z)
p_value

0.006917128192854505

In [13]:
alpha = 0.05
if p_value<alpha:
    print(f'At {alpha} level of significance, we can reject the null hypothesis in favor of alternative hypothesis.')
else:
    print(f'At {alpha} level of significance, we fail to reject the null hypothesis')

At 0.05 level of significance, we can reject the null hypothesis in favor of alternative hypothesis.


## **One Sample t test**


### Example

- a school has 1000 students
- mean IQ = 110
- 20 students are chosen at random
- sample mean = 108
- sample std = 10
- are the results significant at the 0.01 level

In [14]:
x_bar = 108 # sample mean
n = 20      # sample size
s = 10      # pop std
mu = 110    # pop mean
alpha = 0.01 

In [15]:
# calculating t score
t = (x_bar - mu)/(s/sqrt(n))
t

-0.8944271909999159

In [17]:
# calculating p value
#df = n-1

p_value = stats.t.cdf(t, n-1)
p_value


0.1911420676837155

In [18]:
 alpha = 0.05
if p_value<alpha:
    print(f'At {alpha} level of significance, we can reject the null hypothesis in favor of alternative hypothesis.')
else:
    print(f'At {alpha} level of significance, we fail to reject the null hypothesis')

At 0.05 level of significance, we fail to reject the null hypothesis


In [19]:
import statsmodels.api as sm

## **Statsmodels Example**


In [19]:
import statsmodels.api as sm

In [20]:
# pulling dataset from statsmodels
ds1 = sm.datasets.get_rdataset(dataname = 'Pima.tr', package = 'MASS')

In [21]:
ds1.keys()

dict_keys(['data', '__doc__', 'package', 'title', 'from_cache'])

In [22]:
# converting dataset to dataframe
df1 = ds1.data
df1.head()

Unnamed: 0,npreg,glu,bp,skin,bmi,ped,age,type
0,5,86,68,28,30.2,0.364,24,No
1,7,195,70,33,25.1,0.163,55,Yes
2,5,77,82,41,35.8,0.156,35,No
3,0,165,76,43,47.9,0.259,26,No
4,0,107,60,25,26.4,0.133,23,No


In [23]:
# datas descriptive statistics
df1.describe()

Unnamed: 0,npreg,glu,bp,skin,bmi,ped,age
count,200.0,200.0,200.0,200.0,200.0,200.0,200.0
mean,3.57,123.97,71.26,29.215,32.31,0.460765,32.11
std,3.366268,31.667225,11.479604,11.724594,6.130212,0.307225,10.975436
min,0.0,56.0,38.0,7.0,18.2,0.085,21.0
25%,1.0,100.0,64.0,20.75,27.575,0.2535,23.0
50%,2.0,120.5,70.0,29.0,32.8,0.3725,28.0
75%,6.0,144.0,78.0,36.0,36.5,0.616,39.25
max,14.0,199.0,110.0,99.0,47.9,2.288,63.0


In [24]:

# type feature count

df1["type"].value_counts()

No     132
Yes     68
Name: type, dtype: int64

In [25]:
# BMI evaluation
df1.groupby(['type'])['bmi'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,132.0,31.074242,6.381457,18.2,25.825,31.05,35.5,47.9
Yes,68.0,34.708824,4.810956,22.9,31.6,34.6,37.625,46.1


In [30]:
# hypothesis testing
# sample size = 200
# null = mean = 30 bmi in women

colN = 'bmi'
n = df1.shape[0]
sample_mean = df1[colN].mean()
sample_std = df1[colN].std()
print("sample size is ", n)
print('sample mean is {:.2f}'.format(sample_mean))
print('sample standard deviation is {:.2f}'.format(sample_std))

sample size is  200
sample mean is 32.31
sample standard deviation is 6.13


In [31]:
# calculating t score
mu_pop = 30
t_score = (sample_mean - mu_pop)/(sample_std/np.sqrt(n))
print('the t score is {:.2f}'.format(t_score))

the t score is 5.33


In [32]:
# calculating p-value
degrees_of_freedom = n-1
p_value = stats.t.sf(abs(t_score), df = degrees_of_freedom)
print('the p value is ', p_value)

the p value is  1.3307205153730877e-07
