In [1]:
import os
import numpy as np
import pandas as pd
from scipy.stats import binom, norm, t

In [2]:
# Directory containing the Excel data files, given relative to the notebook's working directory.
DATA_DIR = "../Datasets/"

In [3]:
# Load the birth-weight sample from the Excel workbook stored under DATA_DIR.
birthweight_path = os.path.join(DATA_DIR, "birthweight.xls")
data = pd.read_excel(birthweight_path)

In [4]:
# Preview the first rows to confirm the file loaded with the expected column.
data.head()

Unnamed: 0,Birthweight
0,3870
1,3400
2,3430
3,3560
4,3220


In [5]:
# Count missing birth weights; a non-zero count would have to be handled before averaging.
data["Birthweight"].isna().sum()

0

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the birth weights.
data.describe()

Unnamed: 0,Birthweight
count,125.0
mean,3111.36
std,501.318172
min,1670.0
25%,2800.0
50%,3100.0
75%,3440.0
max,4230.0


In [7]:
# Column dtypes, non-null counts and memory usage of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 1 columns):
Birthweight    125 non-null int64
dtypes: int64(1)
memory usage: 1.1 KB


In [8]:
def get_z_score_from_prob(prob):
    """Return the standard-normal quantile (z-score) for a cumulative probability.

    Parameters
    ----------
    prob : float or array-like
        Cumulative (lower-tail) probability, passed straight to ``norm.ppf``.

    Returns
    -------
    float or ndarray
        The value z such that P(Z <= z) = prob for a standard normal Z.
        It is negative for prob < 0.5, so callers wanting a critical value
        from a lower-tail probability take the absolute value.
    """
    z_score = norm.ppf(prob)
    return z_score

#### PROBLEM : Some studies suggest that women having their first baby at age 35 or older are at increased risk of having a baby with a low birth weight. A medical researcher wanted to estimate μ, the mean weight of newborns who are the first child for women over the age of 35. To this end, the researcher chose a random sample of 125 women ages 35 and older who were pregnant with their first child and followed them through the pregnancy. The datafile linked below contains the birth weight (in grams) of the 125 newborns (women pregnant with more than one child were excluded from the study). From past research, it is assumed that the weight of newborns has a standard deviation of σ = 500 grams. We will estimate μ with a 99% confidence interval.

#### Solution 1 : In the problem above, the sample data is given. We can compute the sample mean with the pandas DataFrame `mean` method or with NumPy's `mean` function. Since the population standard deviation is given, we can use the following formula to calculate the confidence interval for the population mean at the given confidence level.
Formula : 
Confidence Interval for Population mean = sample_mean +/- (z_score * population_std_dev) / sqrt(sample_size)

The z-score can be computed with the `get_z_score_from_prob` helper defined above; the solution to the problem follows.

In [9]:
# Inputs for the z-interval; the population standard deviation is known,
# so the critical value comes from the standard normal distribution.
pop_std_dev = 500  # population standard deviation, given in the problem statement (grams)
c = 0.01 / 2  # probability in each tail for a 99% confidence level
n = data.shape[0]
sample_mean = data["Birthweight"].mean()
# ppf of a lower-tail probability is negative; abs() turns it into the critical value z*.
z_star = abs(get_z_score_from_prob(c))

In [10]:
# Display the quantities that go into the z-interval formula.
n, sample_mean, pop_std_dev, c, z_star

(125, 3111.36, 500, 0.005, 2.575829303548901)

In [11]:
# Margin of error of the z-interval: z* times sigma over sqrt(n), computed once for both bounds.
margin_of_error = (z_star * pop_std_dev) / np.sqrt(n)
pop_mean_lower_bound = sample_mean - margin_of_error
pop_mean_upper_bound = sample_mean + margin_of_error

In [12]:
# Report the 99% confidence interval for the mean birth weight.
ci_message = "Confidence Interval for population mean for above scenario with 99% confidence is : [{}, {}]"
print(ci_message.format(pop_mean_lower_bound, pop_mean_upper_bound))

Confidence Interval for population mean for above scenario with 99% confidence is : [2996.1654115765746, 3226.5545884234257]


#### PROBLEM 2 : As part of a large survey conducted at a large state university, a random sample of 142 students were asked: "How many hours do you sleep in a typical day?" The datafile linked below contains the data. Use these data to estimate μ, the mean number of hours college students at this university sleep in a typical day, with a 95% confidence interval.

#### Solution 2 : This problem is the same as the one above, with one change: here the population standard deviation is unknown, so we cannot use the formula above. Since it's not given, we replace the population standard deviation with the sample standard deviation. Because of this substitution we can no longer use the normal table (z-score); we have to use a t-interval, whose critical value comes from the t distribution with sample_size - 1 degrees of freedom. So, the formula becomes as follows:
Confidence Interval for Population Mean when population standard deviation is unknown = 
sample_mean +/- (t_score * sample_standard_deviation) / sqrt(sample_size)

In [13]:
# Load the sleep-hours sample from the Excel workbook stored under DATA_DIR.
sleep_path = os.path.join(DATA_DIR, "sleep.xls")
data_sleep = pd.read_excel(sleep_path)

In [14]:
# Preview the first rows to confirm the file loaded with the expected column.
data_sleep.head()

Unnamed: 0,Sleep
0,5.0
1,7.0
2,6.0
3,6.0
4,5.0


In [15]:
# Summary statistics of the sleep hours; the mean and std rows are reused for the t-interval below.
data_sleep.describe()

Unnamed: 0,Sleep
count,142.0
mean,7.355634
std,1.598515
min,3.0
25%,6.5
50%,7.5
75%,8.0
max,14.0


In [16]:
# Column dtypes, non-null counts and memory usage of the loaded frame.
data_sleep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Data columns (total 1 columns):
Sleep    142 non-null float64
dtypes: float64(1)
memory usage: 1.2 KB


In [17]:
ds_sample_mean = data_sleep.describe().loc["mean"].values[0]
ds_sample_std = data_sleep.describe().loc["std"].values[0]
ds_n = len(data_sleep)
ds_c = (1 - 0.95) / 2
t_star = abs(t.ppf(ds_c, ds_n)) # t-interval for a value depends upon confidence as well as sample size

In [21]:
# for getting the confidence score back from the t-interval value
1 - t.cdf(t_star, ds_n), ds_c

(0.025000000000730105, 0.025000000000000022)

In [22]:
# Display the quantities that go into the t-interval formula.
ds_sample_mean, ds_sample_std, ds_n, ds_c, t_star

(7.355633802816901,
 1.598515474777953,
 142,
 0.025000000000000022,
 1.9768109936200895)

In [23]:
# Margin of error of the t-interval: t* times s over sqrt(n), computed once for both bounds.
ds_margin_of_error = (t_star * ds_sample_std) / np.sqrt(ds_n)
ds_pop_mean_lower_bound = ds_sample_mean - ds_margin_of_error
ds_pop_mean_upper_bound = ds_sample_mean + ds_margin_of_error

In [24]:
# Report the 95% confidence interval for the mean hours of sleep.
ds_ci_message = "Confidence Interval for population mean for above scenario with 95% confidence is : [{}, {}]"
print(ds_ci_message.format(ds_pop_mean_lower_bound, ds_pop_mean_upper_bound))

Confidence Interval for population mean for above scenario with 95% confidence is : [7.090455601586144, 7.620812004047658]
