## One Sample t Test

Analyze if college students get 7.2 hours of sleep, on average, based on a sample of students: alpha=0.05

H0: mu = 7.2

Ha: mu != 7.2

In [4]:
import pandas as pd
import scipy.stats as stats
import math

In [2]:
df = pd.read_csv('students.csv')

In [3]:
df.head()

Unnamed: 0,ID,Gender,Classification,Height,Shoe Size,Phone Time,# of Shoes,Birth order,Pets,Happy,...,Exercise,Stat Pre,Stat Post,Phone Type,Sleep,Social Media,Impact of SocNetworking,Political,Animal,Superhero
0,1,male,senior,67.75,7.0,12.0,12.0,youngest,5.0,0.8,...,360,3.0,,iPhone,7.0,180.0,worse,Democrat,Dog person,Batman
1,2,male,freshman,71.0,7.5,1.5,5.0,middle,4.0,0.75,...,200,9.0,,Android smartphone,7.0,20.0,better,Democrat,Dog person,Batman
2,3,female,freshman,64.0,6.0,25.0,15.0,oldest,8.0,0.9,...,30,7.0,5.0,Android smartphone,8.0,60.0,better,Republican,Dog person,Batman
3,4,female,freshman,63.0,6.5,30.0,30.0,middle,12.0,0.98,...,180,6.0,7.0,iPhone,6.0,60.0,better,Republican,Both,Superman
4,5,male,senior,69.0,6.5,23.0,8.0,oldest,4.0,0.75,...,180,4.0,7.0,iPhone,5.5,60.0,worse,Independent,Dog person,Superman


In [4]:
onesample = stats.ttest_1samp(df['Sleep'], 7.2)

In [5]:
onesample.statistic

-1.92552134000487

In [6]:
onesample.pvalue

0.05795525591903326

In [7]:
df['Sleep'].mean()

6.8618421052631575

In [8]:
print(f'p-value for two sided test: {onesample.pvalue:.4f}')

p-value for two sided test: 0.0580


In [9]:
alpha = 0.05
p_value = onesample.pvalue

if p_value<alpha:
    print('At {} level of significance, we can reject the null hypothesis in favor of Ha.'.format(alpha))
else:
    print('At {} level of significance, we fail to reject the null hypothesis.'.format(alpha))

At 0.05 level of significance, we fail to reject the null hypothesis.


The principal of the school thinks that the average hours of sleep is at most 7.2

H0: mu = 7.2

Ha: mu < 7.2

In [10]:
print(f'p-value for one sided test: {onesample.pvalue/2:.4f}')

p-value for one sided test: 0.0290


In [11]:
alpha = 0.05
p_value = onesample.pvalue/2

if p_value<alpha:
    print('At {} level of significance, we can reject the null hypothesis in favor of Ha.'.format(alpha))
else:
    print('At {} level of significance, we fail to reject the null hypothesis.'.format(alpha))

At 0.05 level of significance, we can reject the null hypothesis in favor of Ha.


## Tests on the Difference in Means of Two Normal Distributions, Variances Unknown and Equal

In [12]:
df = pd.read_csv("catalysts.csv")

In [13]:
df

Unnamed: 0,Observation Number,Catalyst1,Catalyst2
0,1,91.5,89.19
1,2,94.18,90.95
2,3,92.18,90.46
3,4,95.39,93.21
4,5,91.79,97.19
5,6,89.07,97.04
6,7,94.72,91.07
7,8,89.21,92.75


In [24]:
xbar1 = df['Catalyst1'].mean()
xbar2 = df['Catalyst2'].mean()

s1 = df['Catalyst1'].std()
s2 = df['Catalyst2'].std()

In [25]:
print('xbar1 = {:.3f}'.format(xbar1))
print('xbar2 = {:.3f}'.format(xbar2))

print('s1 = {:.3f}'.format(s1))
print('s2 = {:.3f}'.format(s2))

xbar1 = 92.255
xbar2 = 92.732
s1 = 2.385
s2 = 2.983


In [28]:
s_pooled = math.sqrt(((len(df['Catalyst1']) - 1) * (s1 ** 2) + (len(df['Catalyst2']) - 1) * (s2 ** 2)) / (len(df['Catalyst1']) - 1 + len(df['Catalyst2']) - 1))
print('spooled = {:.3f}'.format(s_pooled))

spooled = 2.701


In [29]:
t_statistic = (xbar1-xbar2)/(s_pooled*math.sqrt(1/len(df['Catalyst1']) + 1/len(df['Catalyst2'])))

print ('t_statistic = {:.3f}'.format(t_statistic))

t_statistic = -0.354


In [31]:
# df = n1+n2 - 2

p_value = 2*stats.t.cdf(-0.35, 14)

print ('p_value = {:.3f}'.format(p_value))

p_value = 0.732


In [6]:
stats.norm.cdf(-2)

0.022750131948179195

In [32]:
alpha = 0.05

if p_value<alpha:
    print('At {} level of significance, we can reject the null hypothesis in favor of Ha.'.format(alpha))
else:
    print('At {} level of significance, we fail to reject the null hypothesis.'.format(alpha))

At 0.05 level of significance, we fail to reject the null hypothesis.


### Scipy.stats.ttest for 2 groups

In [34]:
twogroups = stats.ttest_ind(df['Catalyst1'], df['Catalyst2'], equal_var = True)

In [36]:
print(f't statistic for two sided: {twogroups.statistic:.4f}')
print(f'p-value for two sided: {twogroups.pvalue:.4f}')

t statistic for two sided: -0.3536
p-value for two sided: 0.7289


## Arsenic Example

In [37]:
df = pd.read_csv("arsenic.csv")

In [38]:
df

Unnamed: 0,Metro Phoenix,x1,Rural Arizona,x2
0,Phoenix,3,Rimrock,48
1,Chandler,7,Goodyear,44
2,Gilbert,25,New River,40
3,Glendale,10,Apache Junction,38
4,Mesa,15,Buckeye,33
5,Paradise Valley,6,Nogales,21
6,Peoria,12,Black Canyon City,20
7,Scottsdale,25,Sedona,12
8,Tempe,15,Payson,1
9,Sun City,7,Casa Grande,18


In [39]:
twogroups = stats.ttest_ind(df['x1'], df['x2'], equal_var = True)

In [40]:
print(f't statistic for two sided: {twogroups.statistic:.4f}')
print(f'p-value for two sided: {twogroups.pvalue:.4f}')

t statistic for two sided: -2.7669
p-value for two sided: 0.0127


In [41]:
alpha = 0.05
p_value = twogroups.pvalue

if p_value<alpha:
    print('At {} level of significance, we can reject the null hypothesis in favor of Ha. We can say that there is difference between means of two communities'.format(alpha))
else:
    print('At {} level of significance, we fail to reject the null hypothesis.'.format(alpha))

At 0.05 level of significance, we can reject the null hypothesis in favor of Ha. We can say that there is difference between means of two communities


In [42]:
df['x1'].mean()

12.5

In [43]:
df['x2'].mean()

27.5

## Paired Sample T Test

In [14]:
df = pd.read_csv('prozac.csv')
df

Unnamed: 0,moodpre,moodpost,difference
0,3,5,2
1,0,1,1
2,6,5,-1
3,7,7,0
4,4,10,6
5,3,9,6
6,2,7,5
7,1,11,10
8,4,8,4


In [15]:
df["difference"].mean()

3.6666666666666665

In [48]:
pairedtest = stats.ttest_rel(df['moodpre'], df['moodpost'])

In [52]:
onetailed_pvalue= pairedtest.pvalue / 2
onetailed_pvalue

0.006872912197394244

In [53]:
df = pd.read_csv('students.csv')

In [54]:
df['Height']

0     67.75
1     71.00
2     64.00
3     63.00
4     69.00
      ...  
71    65.00
72    62.00
73    65.00
74    65.00
75    65.00
Name: Height, Length: 76, dtype: float64

In [58]:
stats.shapiro(df['Happy'])

(nan, 1.0)

In [3]:
(10.5-10)/(1.5/math.sqrt(40))

2.1081851067789197

## EXERCISE 1. 

The hourly wages in a particular industry are normally distributed with mean $13.20 and standard deviation $2.50. A company in this industry employs 40 workers, paying them an average of $12.20 per hour. Can this company be accused of paying substandard wages? Use an α = .01 level test.

In [5]:
sample_std = 12.20/math.sqrt(40)

In [6]:
sample_std

1.9289893727027112

In [11]:
p_value = 1-stats.norm.cdf(1.9289)
p_value

0.026871638920348384

In [12]:
alpha = 0.01


if p_value<alpha:
    print('At {} level of significance, we can reject the null hypothesis in favor of Ha. We can say that there is difference between means of two communities'.format(alpha))
else:
    print('At {} level of significance, we fail to reject the null hypothesis.'.format(alpha))

At 0.01 level of significance, we fail to reject the null hypothesis.


## EXERCISE 2.


Shear strength measurements derived from unconfined compression tests for two types of soils gave the results shown in the following document (measurements in tons per square foot). Do the soils appear to differ with respect to average shear strength, at the 1% significance level?

In [13]:
df = pd.read_csv("soil.csv")

In [14]:
df

Unnamed: 0,Soil1,Soil2
0,1.442,1.364
1,1.943,1.878
2,1.11,1.337
3,1.912,1.828
4,1.553,1.371
5,1.641,1.428
6,1.499,1.119
7,1.347,1.373
8,1.685,1.589
9,1.578,1.714


In [27]:
df = df[0:30]

In [42]:
mean1 = df["Soil1"].mean()
std1 = df["Soil1"].std()
print(f"std2 = {std1}\nmean2 = {mean1}")

std2 = 0.20690284243310084
mean2 = 1.6918


In [41]:
mean2 = df["Soil2"].mean()
std2 = df["Soil2"].std()
print(f"std2 = {std2}\nmean2 = {mean2}")

std2 = 0.22151243186616942
mean2 = 1.4076333333333335


In [45]:
s_pooled = math.sqrt(((len(df['Soil1']) - 1) * (std1 ** 2) + (len(df['Soil2']) - 1) * (std2 ** 2)) / (len(df['Soil1']) - 1 + len(df['Soil2']) - 1))
print('spooled = {:.3f}'.format(s_pooled))

spooled = 0.214


In [46]:
t_statistic = (mean1-mean2)/(s_pooled*math.sqrt(1/len(df['Soil1']) + 1/len(df['Soil2'])))

print ('t_statistic = {:.3f}'.format(t_statistic))

t_statistic = 5.135


In [47]:
twogroups = stats.ttest_ind(df["Soil1"], df['Soil2'], equal_var = True)

In [49]:
twogroups

Ttest_indResult(statistic=5.134893443609085, pvalue=3.440204643633666e-06)

In [54]:
1- stats.norm.cdf(5.13489)

1.4115454705088126e-07