In [2]:
import pandas as pd
import numpy as np
import random
import scipy.stats
import math
from matplotlib import pyplot as plt
from random import sample
z90 = 1.645
z95 = 1.96
z99 = 2.576

In [98]:
# example 6.11: generating the contingency table in the example 
group = np.array(["control","treatment"])
group = np.repeat(group, [50,40],axis=0)
result = np.array(["survived","died","survived","died"])
result = np.repeat(result,[11,39,14,26],axis=0)
df = pd.DataFrame ({'Group':group,'Result':result})
group = df["Group"]
result = df["Result"]
ct = pd.crosstab(group,result,margins=True,normalize=False)
ct_n = pd.crosstab(group,result,margins=True,normalize='index')
print(ct)
print('''
checking independence. this is a randomized experiment, the condition is satisfied.
s/f condition: each experiment arm has at least 10 successes and failures. satisfied.
sample proportions can be modeled as normal distribution.
creating 90% confidence interval:''')
phat_1 = ct_n["survived"]["control"] # survive rate of the control group
minus_phat1 = 1 - phat_1
n1 = ct["All"]["control"] # sample size of the control group
phat_2 = ct_n["survived"]["treatment"] # survive rate of the treatment group.
minus_phat2 = 1 - phat_2
n2 = ct["All"]["treatment"] # sample size of the treatment group
# we want the difference between two groups, there are two point estimates and we'll use their differences as last point estimate.
phat =  phat_2 - phat_1

z = z90 # for the 90% confidence level.
se = np.sqrt( (phat_1*minus_phat1 / n1) + (phat_2 *minus_phat2 / n2) )
me = z * se
lo = phat - me
hi = phat + me
print('''SE = {se}
z* = {z} , margin of error: z* x SE = {me}
confidence interval: (phat - z* x SE , phat + z* x SE) = ({lo},{hi})
we're 90% confident that the blood thinners have a difference of {lo} to {hi} percentage point impact on survival rate.
because the confidence interval contains the 0%, we can conclude we don't have enough information to state there's a 
statistically significant difference whether using blood thinners or not for a CPR.
'''.format(z=z,se = se, me = me, lo=format(lo, '.3f'), hi=format(hi, '.3f')))


Result     died  survived  All
Group                         
control      39        11   50
treatment    26        14   40
All          65        25   90

checking independence. this is a randomized experiment, the condition is satisfied.
s/f condition: each experiment arm has at least 10 successes and failures. satisfied.
sample proportions can be modeled as normal distribution.
creating 90% confidence interval:
SE = 0.09549607321769832
z* = 1.645 , margin of error: z* x SE = 0.15709104044311373
confidence interval: (phat - z* x SE , phat + z* x SE) = (-0.027,0.287)
we're 90% confident that the blood thinners have a difference of -0.027 to 0.287 percentage point impact on survival rate.
because the confidence interval contains the 0%, we can conclude we don't have enough information to state there's a 
statistically significant difference whether using blood thinners or not for a CPR.



In [130]:
# guided practice 6.11: generating the contingency table in the example
# for 2x2 matrices
group_t = 'fish oil'
group_c = 'placebo'
outcome_s = 'heart attack'
outcome_f = 'no event'
group = np.array([group_t,group_c])
group = np.repeat(group, [12933,12938],axis=0)
outcome = np.array([outcome_s,outcome_f,outcome_s,outcome_f])
outcome = np.repeat(outcome,[145,12788,200,12738],axis=0)
df = pd.DataFrame ({'Group':group,'Outcome':outcome})
group = df["Group"]
outcome = df["Outcome"]
ct = pd.crosstab(group,outcome,margins=True,normalize=False)
ct_n = pd.crosstab(group,outcome,margins=True,normalize='index')
print(ct)
print('''
checking independence. this is a randomized experiment, the condition is satisfied.
s/f condition: each experiment arm has at least 10 successes and failures. satisfied.
sample proportions can be modeled as normal distribution.

creating 95% confidence interval:''')
phat_1 = ct_n[outcome_s][group_c] # survive rate of the control group
minus_phat1 = 1 - phat_1
n1 = ct["All"][group_c] # sample size of the control group
phat_2 = ct_n[outcome_s][group_t] # survive rate of the treatment group.
minus_phat2 = 1 - phat_2
n2 = ct["All"][group_t] # sample size of the treatment group
# we want the difference between two groups, there are two point estimates and we'll use their differences as last point estimate.
phat =  phat_2 - phat_1
z = z95 # for the 95% confidence level.
se = np.sqrt( (phat_1*minus_phat1 / n1) + (phat_2 *minus_phat2 / n2) ) # extended standard error calculation for two proportions
me = z * se
lo = phat - me
hi = phat + me
print('''SE = {se}
z* = {z} , margin of error: z* x SE = {me}
confidence interval: (phat - z* x SE , phat + z* x SE) = ({lo},{hi})
we're 95% confident that fish oil decrease heart attack rate from {lo} to {hi}.
because the confidence interval doesn't contain the 0%, we can conclude there's a statistically significant evidence that 
fish oil decreass heart attacks.'''.format(z=z,se = se, me = me, lo=format(lo, '.4f'), hi=format(hi, '.4f')))

Outcome   heart attack  no event    All
Group                                  
fish oil           145     12788  12933
placebo            200     12738  12938
All                345     25526  25871

checking independence. this is a randomized experiment, the condition is satisfied.
s/f condition: each experiment arm has at least 10 successes and failures. satisfied.
sample proportions can be modeled as normal distribution.

creating 95% confidence interval:
SE = 0.0014260130479947562
z* = 1.96 , margin of error: z* x SE = 0.002794985574069722
confidence interval: (phat - z* x SE , phat + z* x SE) = (-0.0070,-0.0015)
we're 95% confident that fish oil decrease heart attack rate from -0.0070 to -0.0015.
because the confidence interval doesn't contain the 0%, we can conclude there's a statistically significant evidence that 
fish oil decreass heart attacks.


In [197]:
# guided practice 6.11: generating the contingency table in the example
# for 2x2 matrices
group_t = 'mammogram'
group_c = 'control'
outcome_s = 'dead'
outcome_f = 'not dead'
group = np.array([group_t,group_c])
group = np.repeat(group, [44925,44910],axis=0)
outcome = np.array([outcome_s,outcome_f,outcome_s,outcome_f])
outcome = np.repeat(outcome,[500,44425,505,44405],axis=0)
df = pd.DataFrame ({'Group':group,'Outcome':outcome})
group = df["Group"]
outcome = df["Outcome"]
ct = pd.crosstab(group,outcome,margins=True,normalize=False) # the version that the values are not normalized
ct_n = pd.crosstab(group,outcome,margins=True,normalize='index') # the normalized version based on group totals.
ct_n1 = pd.crosstab(group,outcome,margins=True,normalize=True) # the normalized version based on the sample size of entire study.
print(ct)
p_pooled = ct_n1["dead"]["All"] # pooled proportion is: # of successes in study / sample size in the entire study.
# we use pooled proportion since the difference between two samples that we're testing in our hypotheses is 0
minus_p_pooled = 1 - p_pooled
succ_t = p_pooled * ct["All"][group_t]
fail_t = minus_p_pooled * ct["All"][group_t]
succ_c = p_pooled * ct["All"][group_c]
fail_c = minus_p_pooled * ct["All"][group_c]
print('''
H0: there's no difference in breath cancer death rate whether using mammogram or not. phat_trt - phat_ctrl = 0
HA: there's a difference in breath cancer death rate whether using mammogram or not. phat_trt - phat_ctrl != 0

checking independence. this is a randomized experiment, the condition is satisfied.
s/f condition: for the hypothesis tests that contains two proportion (phat1-phat2 = 0) we have to use the pooled proportion.
pooled * n_control = {succ_c} // (1-pooled) * n_control = {fail_c} ==> satisfied.
pooled * n_treatment = {succ_t} // (1- pooled) * n_treatment = {fail_t} ==> satisfied.
sample proportions can be modeled as normal distribution since all conditions are satisfied.
'''.format(succ_t=succ_t, fail_t=fail_t, succ_c=succ_c, fail_c=fail_c))
phat_1 = ct_n[outcome_s][group_c] # survive rate of the control group
minus_phat1 = 1 - phat_1
n1 = ct["All"][group_c] # sample size of the control group
phat_2 = ct_n[outcome_s][group_t] # survive rate of the treatment group.
minus_phat2 = 1 - phat_2
n2 = ct["All"][group_t] # sample size of the treatment group
# we want the difference between two groups, there are two point estimates and we'll use their differences as last point estimate.
phat =  phat_2 - phat_1
p0 = 0 # null value is: there's no difference, [phat_treatment - phat_control = 0]
se = np.sqrt( (phat_1*minus_phat1 / n1) + (phat_2 *minus_phat2 / n2) ) # extended standard error calculation for two proportions
z = (phat-p0)/se # how many standard errors away the phat is from the null value in the null distribution
pvalue = scipy.stats.norm.sf(abs(z))*2 # find p-value for two-tailed test

print('''SE = {se} , z* = {z} , p-value = {pvalue}
for a .05 significance level, because the p-value is larger than significance, we can't reject H0 and conclude
the difference in breast cancer death rates are reasonably explained by chance. in the data, we don't observe any benefits
or harm from mammograms relative to a regular breast exam.
'''.format(z=z, se=se, pvalue=format(pvalue, '.6f')) )

Outcome    dead  not dead    All
Group                           
control     505     44405  44910
mammogram   500     44425  44925
All        1005     88830  89835

H0: there's no difference in breath cancer death rate whether using mammogram or not. phat_trt - phat_ctrl = 0
HA: there's a difference in breath cancer death rate whether using mammogram or not. phat_trt - phat_ctrl != 0

checking independence. this is a randomized experiment, the condition is satisfied.
s/f condition: for the hypothesis tests that contains two proportion (phat1-phat2 = 0) we have to use the pooled proportion.
pooled * n_control = 502.41609617632326 // (1-pooled) * n_control = 44407.58390382368 ==> satisfied.
pooled * n_treatment = 502.5839038236768 // (1- pooled) * n_treatment = 44422.41609617633 ==> satisfied.
sample proportions can be modeled as normal distribution since all conditions are satisfied.

SE = 0.0007018184952669552 , z* = -0.1639328415192579 , p-value = 0.869784
for a .05 significance leve

In [198]:
''' example 6.20
A quadcopter company is considering a new manufacturer for rotor blades. The new manufacturer
would be more expensive, but they claim their higher-quality blades are more reliable, with 3% more
blades passing inspection than their competitor.

The quality control engineer from Guided Practice 6.19 collects a sample of blades, examining 1000
blades from each company, and she finds that 899 blades pass inspection from the current supplier
and 958 pass inspection from the prospective supplier. Using these data, evaluate the hypotheses
with a significance level of 5%.

'''
n1 = 1000
n2 = 1000
p0 = .03 # null value
phat1 = 899 / n # success proportion in the first sample
minus_phat1 = 1 - phat1
phat2 = 958 / n # success proportion in the second sample
minus_phat2 = 1 - phat2
phat = phat2 - phat1 # point estimate
minus_phat = 1 - phat
succ1 = phat1 * n # success condition for the first sample
fail1 = (1-phat1) * n # failure condition for the first sample
succ2 = phat2 * n # success condition for the second sample
fail2 = (1-phat2) * n # failure condition for the second sample
se = np.sqrt( (phat1*minus_phat1/n1) + (phat2*minus_phat2/n2) ) # calculate standard error and include both sample proportions.
z = (phat-p0)/se # how many standard errors away the phat is from the null value in the null distribution
pvalue = scipy.stats.norm.sf(abs(z))*2 # find p-value for two-tailed test
pvalue

print('''H0: the prospective supplier have a %3 more inspection success rate than the current supplier. p_pros - p_curr = .03
HA: the prospective supplier have a different inspection success rate than %3.  p_pros - p_curr != .03
independence: nothing said but to proceed, we'll assume it's randomly sampled.
s/f condition: phat1 * n1 = {succ1} // (1-phat1) * n1 = {fail1} ==> satisfied.
phat2 * n2 = {succ2} // (1-phat2) * n2 = {fail2} ==> satisfied.

SE = {se} // we use the p0 for the null distribution
z* = {z}
p-value = {pvalue}
for a .05 significance level, because the p-value is smaller than significance, we can reject H0 and conclude
there's a statistically significant evidence the prospetive supplier has a more than 3% higher inspection success rate.
'''.format(succ1=succ1, fail1=fail1, succ2=succ2, fail2=fail2, se = se,z=z,pvalue=format(pvalue, '.6f')))

H0: the prospective supplier have a %3 more inspection success rate than the current supplier. p_pros - p_curr = .03
HA: the prospective supplier have a different inspection success rate than %3.  p_pros - p_curr != .03
independence: nothing said but to proceed, we'll assume it's randomly sampled.
s/f condition: phat1 * n1 = 899.0 // (1-phat1) * n1 = 100.99999999999997 ==> satisfied.
phat2 * n2 = 958.0 // (1-phat2) * n2 = 42.000000000000036 ==> satisfied.

SE = 0.011447052022245729 // we use the p0 for the null distribution
z* = 2.533403355173239
p-value = 0.011296
for a .05 significance level, because the p-value is smaller than significance, we can reject H0 and conclude
there's a statistically significant evidence the prospetive supplier has a more than 3% higher inspection success rate.



In [6]:
'''6.17 Social experiment, Part I. A "social experiment" conducted by a TV program questioned what
people do when they see a very obviously bruised woman getting picked on by her boyfriend. On two
different occasions at the same restaurant, the same couple was depicted. In one scenario the woman was
dressed "provocatively" and in the other scenario the woman was dressed "conservatively". The table below
shows how many restaurant diners were present under each scenario, and whether or not they intervened.

Explain why the sampling distribution of the difference between the proportions of interventions under
provocative and conservative scenarios does not follow an approximately normal distribution.'''

group_t = 'yes'
group_c = 'no'
outcome_s = 'provocative'
outcome_f = 'conservative'
group = np.array([group_t,group_c])
group = np.repeat(group, [20,25],axis=0)
outcome = np.array([outcome_s,outcome_f,outcome_s,outcome_f])
outcome = np.repeat(outcome,[5,15,15,10],axis=0)
df = pd.DataFrame ({'Intervene':group,'Scenario':outcome})
group = df["Intervene"]
outcome = df["Scenario"]
ct = pd.crosstab(group,outcome,margins=True,normalize=False) # the version that the values are not normalized

print(ct)
print('''
checking independence. this seems like not a randomized experiment. independence condition is not satisfied here.
s/f condition: as seen, for provocatives that intervene, there's less than 10. in that condition success-failure condition
is not satisfied. since those conditions does not satisfy, we can not reasonably model the sample distributions as normal.''')

Scenario   conservative  provocative  All
Intervene                                
no                   10           15   25
yes                  15            5   20
All                  25           20   45

checking independence. this seems like not a randomized experiment. independence condition is not satisfied here.
s/f condition: as seen, for provocatives that intervene, there's less than 10. in that condition success-failure condition
is not satisfied. since those conditions does not satisfy, we can not reasonably model the sample distributions as normal.


In [10]:
'''6.18 Heart transplant success. The Stanford University Heart Transplant Study was conducted to de-
termine whether an experimental heart transplant program increased lifespan. Each patient entering the
program was officially designated a heart transplant candidate, meaning that he was gravely ill and might
benefit from a new heart. Patients were randomly assigned into treatment and control groups. Patients in
the treatment group received a transplant, and those in the control group did not. The table below displays
how many patients survived and died in each group.

Suppose we are interested in estimating the difference in survival rate between the control and treatment
groups using a confidence interval. Explain why we cannot construct such an interval using the normal
approximation. What might go wrong if we constructed the confidence interval despite this problem?'''
group_t = 'control'
group_c = 'treatment'
outcome_s = 'alive'
outcome_f = 'dead'
group = np.array([group_t,group_c])
group = np.repeat(group, [34,69],axis=0)
outcome = np.array([outcome_s,outcome_f,outcome_s,outcome_f])
outcome = np.repeat(outcome,[4,30,24,45],axis=0)
df = pd.DataFrame ({'Group':group,'Outcome':outcome})
group = df["Group"]
outcome = df["Outcome"]
ct = pd.crosstab(group,outcome,margins=True,normalize=False) # the version that the values are not normalized
print(ct)
print('''it's a randomized experiment, so the independence condition satisfies. but unfortunately the in the control group
who are alive isn't enough to satisfy the success-failure condition (<10). Because of this, we can not use the normal
approximation in building the confidence interval.''')

Outcome    alive  dead  All
Group                      
control        4    30   34
treatment     24    45   69
All           28    75  103
it's a randomized experiment, so the independence condition satisfies. but unfortunately the in the control group
who are alive isn't enough to satisfy the success-failure condition (<10). Because of this, we can not use the normal
approximation in building the confidence interval.


In [11]:
'''6.19 Gender and color preference. A study asked 1,924 male and 3,666 female undergraduate college
students their favorite color. A 95% confidence interval for the difference between the proportions of males
and females whose favorite color is black (pmale - pfemale) was calculated to be (0.02, 0.06). Based on this
information, determine if the following statements about undergraduate college students are true or false,
and explain your reasoning for each statement you identify as false.
(a) We are 95% confident that the true proportion of males whose favorite color is black is 2% lower to 6%
higher than the true proportion of females whose favorite color is black.
(b) We are 95% confident that the true proportion of males whose favorite color is black is 2% to 6% higher
than the true proportion of females whose favorite color is black.
(c) 95% of random samples will produce 95% confidence intervals that include the true difference between
the population proportions of males and females whose favorite color is black.
(d) We can conclude that there is a significant difference between the proportions of males and females
whose favorite color is black and that the difference between the two sample proportions is too large to
plausibly be due to chance.
(e) The 95% confidence interval for (pfemale -pmale) cannot be calculated with only the information given
in this exercise.'''
print('''a) false.
b) true.
c) true.
d) true
e) false. multiply the values with (-1) and the new order will be (-0.06,-0.02).''')

a) false.
b) true.
c) true.
d) true
e) false. multiply the values with (-1) and the new order will be (-0.06,-0.02).


In [13]:
'''6.20 Government shutdown. The United States federal government shutdown of 2018{2019 occurred
from December 22, 2018 until January 25, 2019, a span of 35 days. A Survey USA poll of 614 randomly
sampled Americans during this time period reported that 48% of those who make less than $40,000 per
year and 55% of those who make $40,000 or more per year said the government shutdown has not at all
affected them personally. A 95% confidence interval for (p<40K - p>=40K), where p is the proportion of those
who said the government shutdown has not at all affected them personally, is (-0.16, 0.02). Based on this
information, determine if the following statements are true or false, and explain your reasoning if you identify
the statement as false.
(a) At the 5% significance level, the data provide convincing evidence of a real difference in the proportion
who are not affected personally between Americans who make less than $40,000 annually and Americans
who make $40,000 annually.
(b) We are 95% confident that 16% more to 2% fewer Americans who make less than $40,000 per year are
not at all personally affected by the government shutdown compared to those who make $40,000 or more
per year.
(c) A 90% confidence interval for (p<40K - p>=40K) would be wider than the (-0.16, 0.02) interval.
(d) A 95% confidence interval for (p>=40K - p<40K) is (-0.02, 0.16).'''
print('''a) false. no difference level (p<40K - p>=40K = 0) is in the confidence interval. we can't state there's a 
convincing evidence in this significance level and in this sample size there's a difference between two groups.
b) true.
c) false, it would be narrower than the 95% confidence level.
d) true. simply negate and reverse.''')

a) false. no difference level (p<40K - p>=40K = 0) is in the confidence interval. we can't state there's a 
convincing evidence in this significance level and in this sample size there's a difference between two groups.
b) true.
c) false, it would be narrower than the 95% confidence level.
d) true. simply negate and reverse.


In [23]:
'''6.21 National Health Plan, Part III. Exercise 6.11 presents the results of a poll evaluating support for a
generically branded "National Health Plan" in the United States. 79% of 347 Democrats and 55% of 617
Independents support a National Health Plan.
(a) Calculate a 95% confidence interval for the difference between the proportion of Democrats and Inde-
pendents who support a National Health Plan (pD - pI), and interpret it in this context. We have
already checked conditions for you.
(b) True or false: If we had picked a random Democrat and a random Independent at the time of this poll,
it is more likely that the Democrat would support the National Health Plan than the Independent.'''
nD = 347
nI = 617
pD = .79 # proportion of democrats that support the national health plan
minus_pD = 1 - pD
pI = .55 # proportion of independents that support the national health plan
minus_pI = 1 - pI
z = z95 # for the 95% confidence level
phat =  pD - pI
se = np.sqrt( (pD*minus_pD / nD) + (pI *minus_pI / nI) )
me = z * se
lo = phat - me
hi = phat + me

print('''a) conditions are already checked.
SE = {se}
z* = {z} , margin of error: z* x SE = {me}
confidence interval: (phat - z* x SE , phat + z* x SE) = ({lo},{hi})
we're 95% confident that the percentage of the democrats who support the national health plan is from {lo} to {hi} higher 
than the independents.
b) true.
'''.format(succ1=succ_D, fail1=fail_D, succ2=succ_I, fail2=fail_I, z=z,se = se, me = me, lo=format(lo, '.3f'), hi=format(hi, '.3f')))


a) conditions are already checked.
SE = 0.029651854994064007
z* = 1.96 , margin of error: z* x SE = 0.05811763578836545
confidence interval: (phat - z* x SE , phat + z* x SE) = (0.182,0.298)
we're 95% confident that the percentage of the democrats who support the national health plan is from 0.182 to 0.298 higher 
than the independents.
b) true.



In [209]:
'''6.22 Sleep deprivation, CA vs. OR, Part I. According to a report on sleep deprivation by the Centers for
Disease Control and Prevention, the proportion of California residents who reported insucient rest or sleep
during each of the preceding 30 days is 8.0%, while this proportion is 8.8% for Oregon residents. These
data are based on simple random samples of 11,545 California and 4,691 Oregon residents. Calculate a 95%
confidence interval for the dierence between the proportions of Californians and Oregonians who are sleep
deprived and interpret it in context of the data.25'''

'6.22 Sleep deprivation, CA vs. OR, Part I. According to a report on sleep deprivation by the Centers for\nDisease Control and Prevention, the proportion of California residents who reported insu\x0ecient rest or sleep\nduring each of the preceding 30 days is 8.0%, while this proportion is 8.8% for Oregon residents. These\ndata are based on simple random samples of 11,545 California and 4,691 Oregon residents. Calculate a 95%\ncon\ndence interval for the di\nerence between the proportions of Californians and Oregonians who are sleep\ndeprived and interpret it in context of the data.25'

In [210]:
'''6.23 Offshore drilling, Part I. A survey asked 827 randomly sampled registered voters in California \Do
you support? Or do you oppose? Drilling for oil and natural gas off the Coast of California? Or do you
not know enough to say?" Below is the distribution of responses, separated based on whether or not the
respondent graduated from college.
(a) What percent of college graduates and what percent of the non-college graduates in this sample do not know enough to
have an opinion on drilling for oil and natural gas off the Coast of California?
(b) Conduct a hypothesis test to determine if the data provide strong evidence that the proportion of college graduates who
do not have an opinion on this issue is different than that of non-college graduates.'''

'6.23 Offshore drilling, Part I. A survey asked 827 randomly sampled registered voters in California \\Do\nyou support? Or do you oppose? Drilling for oil and natural gas off the Coast of California? Or do you\nnot know enough to say?" Below is the distribution of responses, separated based on whether or not the\nrespondent graduated from college.\n(a) What percent of college graduates and what percent of the\nnon-college graduates in this sample do not know enough to\nhave an opinion on drilling for oil and natural gas off the\nCoast of California?\n(b) Conduct a hypothesis test to determine if the data provide\nstrong evidence that the proportion of college graduates who\ndo not have an opinion on this issue is different than that of\nnon-college graduates.'

In [211]:
'''6.24 Sleep deprivation, CA vs. OR, Part II. Exercise 6.22 provides data on sleep deprivation rates of
Californians and Oregonians. The proportion of California residents who reported insucient rest or sleep
during each of the preceding 30 days is 8.0%, while this proportion is 8.8% for Oregon residents. These data
are based on simple random samples of 11,545 California and 4,691 Oregon residents.
(a) Conduct a hypothesis test to determine if these data provide strong evidence the rate of sleep deprivation
is different for the two states. (Reminder: Check conditions)
(b) It is possible the conclusion of the test in part (a) is incorrect. If this is the case, what type of error was
made?'''

'6.24 Sleep deprivation, CA vs. OR, Part II. Exercise 6.22 provides data on sleep deprivation rates of\nCalifornians and Oregonians. The proportion of California residents who reported insu\x0ecient rest or sleep\nduring each of the preceding 30 days is 8.0%, while this proportion is 8.8% for Oregon residents. These data\nare based on simple random samples of 11,545 California and 4,691 Oregon residents.\n(a) Conduct a hypothesis test to determine if these data provide strong evidence the rate of sleep deprivation\nis different for the two states. (Reminder: Check conditions)\n(b) It is possible the conclusion of the test in part (a) is incorrect. If this is the case, what type of error was\nmade?'

In [212]:
'''6.25 Offshore drilling, Part II. Results of a poll evaluating support for drilling for oil and natural gas o
the coast of California were introduced in Exercise 6.23.
(a) What percent of college graduates and what percent of the non-college graduates in this sample support
drilling for oil and natural gas o the Coast of California?
(b) Conduct a hypothesis test to determine if the data provide strong evidence that the proportion of college
graduates who support o-shore drilling in California is different than that of non-college graduates.'''

'6.25 Offshore drilling, Part II. Results of a poll evaluating support for drilling for oil and natural gas o\n\nthe coast of California were introduced in Exercise 6.23.\n(a) What percent of college graduates and what percent of the non-college graduates in this sample support\ndrilling for oil and natural gas o\n the Coast of California?\n(b) Conduct a hypothesis test to determine if the data provide strong evidence that the proportion of college\ngraduates who support o\n-shore drilling in California is different than that of non-college graduates.'

In [213]:
'''6.26 Full body scan, Part I. A news article reports that "Americans have differing views on two potentially
inconvenient and invasive practices that airports could implement to uncover potential terrorist attacks."
This news piece was based on a survey conducted among a random sample of 1,137 adults nationwide, where
one of the questions on the survey was \Some airports are now using `full-body' digital x-ray machines to
electronically screen passengers in airport security lines. Do you think these new x-ray machines should or
should not be used at airports?" Below is a summary of responses based on party affiliation.27
(a) Conduct an appropriate hypothesis test evaluating whether there is a difference in the proportion of
Republicans and Democrats who think the full- body scans should be applied in airports. Assume that
all relevant conditions are met.
(b) The conclusion of the test in part (a) may be incorrect, meaning a testing error was made. If an error
was made, was it a Type 1 or a Type 2 Error? Explain.'''

'6.26 Full body scan, Part I. A news article reports that "Americans have differing views on two potentially\ninconvenient and invasive practices that airports could implement to uncover potential terrorist attacks."\nThis news piece was based on a survey conducted among a random sample of 1,137 adults nationwide, where\none of the questions on the survey was \\Some airports are now using `full-body\' digital x-ray machines to\nelectronically screen passengers in airport security lines. Do you think these new x-ray machines should or\nshould not be used at airports?" Below is a summary of responses based on party affiliation.27\n(a) Conduct an appropriate hypothesis test evaluating whether there is a difference in the proportion of\nRepublicans and Democrats who think the full- body scans should be applied in airports. Assume that\nall relevant conditions are met.\n(b) The conclusion of the test in part (a) may be incorrect, meaning a testing error was made. If an error\nwas made, was 

In [214]:
'''6.27 Sleep deprived transportation workers. The National Sleep Foundation conducted a survey on
the sleep habits of randomly sampled transportation workers and a control sample of non-transportation
workers. The results of the survey are shown below.
Conduct a hypothesis test to evaluate if these data provide evidence of a difference between the proportions
of truck drivers and non-transportation workers (the control group) who get less than 6 hours of sleep per
day, i.e. are considered sleep deprived.'''

'6.27 Sleep deprived transportation workers. The National Sleep Foundation conducted a survey on\nthe sleep habits of randomly sampled transportation workers and a control sample of non-transportation\nworkers. The results of the survey are shown below.\nConduct a hypothesis test to evaluate if these data provide evidence of a difference between the proportions\nof truck drivers and non-transportation workers (the control group) who get less than 6 hours of sleep per\nday, i.e. are considered sleep deprived.'

In [217]:
'''6.28 Prenatal vitamins and Autism. Researchers studying the link between prenatal vitamin use and
autism surveyed the mothers of a random sample of children aged 24 - 60 months with autism and conducted
another separate random sample for children with typical development. The table below shows the number of
mothers in each group who did and did not use prenatal vitamins during the three months before pregnancy
(periconceptional period).
(a) State appropriate hypotheses to test for independence of use of prenatal vitamins during the three
months before pregnancy and autism.
(b) Complete the hypothesis test and state an appropriate conclusion. (Reminder: Verify any necessary
conditions for the test.)
(c) A New York Times article reporting on this study was titled \Prenatal Vitamins MayWard Off Autism".
Do you nd the title of this article to be appropriate? Explain your answer. Additionally, propose an
alternative title.30'''

'6.28 Prenatal vitamins and Autism. Researchers studying the link between prenatal vitamin use and\nautism surveyed the mothers of a random sample of children aged 24 - 60 months with autism and conducted\nanother separate random sample for children with typical development. The table below shows the number of\nmothers in each group who did and did not use prenatal vitamins during the three months before pregnancy\n(periconceptional period).\n(a) State appropriate hypotheses to test for independence of use of prenatal vitamins during the three\nmonths before pregnancy and autism.\n(b) Complete the hypothesis test and state an appropriate conclusion. (Reminder: Verify any necessary\nconditions for the test.)\n(c) A New York Times article reporting on this study was titled \\Prenatal Vitamins MayWard Off Autism".\nDo you \nnd the title of this article to be appropriate? Explain your answer. Additionally, propose an\nalternative title.30'

In [215]:
'''6.29 HIV in sub-Saharan Africa. In July 2008 the US National Institutes of Health announced that it was
stopping a clinical study early because of unexpected results. The study population consisted of HIV-infected
women in sub-Saharan Africa who had been given single dose Nevaripine (a treatment for HIV) while giving
birth, to prevent transmission of HIV to the infant. The study was a randomized comparison of continued
treatment of a woman (after successful childbirth) with Nevaripine vs Lopinavir, a second drug used to treat
HIV. 240 women participated in the study; 120 were randomized to each of the two treatments. Twenty-
four weeks after starting the study treatment, each woman was tested to determine if the HIV infection was
becoming worse (an outcome called virologic failure). Twenty-six of the 120 women treated with Nevaripine
experienced virologic failure, while 10 of the 120 women treated with the other drug experienced virologic
failure.
(a) Create a two-way table presenting the results of this study.
(b) State appropriate hypotheses to test for difference in virologic failure rates between treatment groups.
(c) Complete the hypothesis test and state an appropriate conclusion. (Reminder: Verify any necessary
conditions for the test.)'''

'6.29 HIV in sub-Saharan Africa. In July 2008 the US National Institutes of Health announced that it was\nstopping a clinical study early because of unexpected results. The study population consisted of HIV-infected\nwomen in sub-Saharan Africa who had been given single dose Nevaripine (a treatment for HIV) while giving\nbirth, to prevent transmission of HIV to the infant. The study was a randomized comparison of continued\ntreatment of a woman (after successful childbirth) with Nevaripine vs Lopinavir, a second drug used to treat\nHIV. 240 women participated in the study; 120 were randomized to each of the two treatments. Twenty-\nfour weeks after starting the study treatment, each woman was tested to determine if the HIV infection was\nbecoming worse (an outcome called virologic failure). Twenty-six of the 120 women treated with Nevaripine\nexperienced virologic failure, while 10 of the 120 women treated with the other drug experienced virologic\nfailure.\n(a) Create a two-way table 

In [218]:
'''6.30 An apple a day keeps the doctor away. A physical education teacher at a high school wanting to
increase awareness on issues of nutrition and health asked her students at the beginning of the semester
whether they believed the expression "an apple a day keeps the doctor away", and 40% of the students
responded yes. Throughout the semester she started each class with a brief discussion of a study highlighting
positive eects of eating more fruits and vegetables. She conducted the same apple-a-day survey at the end
of the semester, and this time 60% of the students responded yes. Can she used a two-proportion method
from this section for this analysis? Explain your reasoning.'''

'6.30 An apple a day keeps the doctor away. A physical education teacher at a high school wanting to\nincrease awareness on issues of nutrition and health asked her students at the beginning of the semester\nwhether they believed the expression "an apple a day keeps the doctor away", and 40% of the students\nresponded yes. Throughout the semester she started each class with a brief discussion of a study highlighting\npositive e\nects of eating more fruits and vegetables. She conducted the same apple-a-day survey at the end\nof the semester, and this time 60% of the students responded yes. Can she used a two-proportion method\nfrom this section for this analysis? Explain your reasoning.'