In [31]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import t, ttest_rel, ttest_ind, norm

In [2]:
DATA_DIR = "../Datasets/"

#### Background
A study was conducted at a large state university in order to compare the sleeping habits of undergraduate students to those of graduate students. Random samples of 75 undergraduate students and 50 graduate students were chosen and each of the subjects was asked to report the number of hours he or she sleeps in a typical day. The thought was that since undergraduate students are generally younger and party more during their years in school, they sleep less, on average, than graduate students. Do the data support this hypothesis? The following figure summarizes the problem:

In [3]:
data = pd.read_excel(os.path.join(DATA_DIR, "sleep2.xls"))

In [5]:
data.head()

Unnamed: 0,undergraduate,graduate
0,6,8.0
1,5,5.0
2,6,6.0
3,6,6.0
4,8,6.0


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 2 columns):
undergraduate    75 non-null int64
graduate         50 non-null float64
dtypes: float64(1), int64(1)
memory usage: 1.2 KB


In [8]:
data.describe()

Unnamed: 0,undergraduate,graduate
count,75.0,50.0
mean,6.186667,6.42
std,1.048723,1.031939
min,3.0,4.0
25%,6.0,6.0
50%,6.0,6.0
75%,7.0,7.0
max,9.0,8.0


In [26]:
y1_bar = data["undergraduate"].mean()
y2_bar = data["graduate"].mean()
s1 = data["undergraduate"].std()
s2 = data["graduate"].std()
n1 = data.describe()["undergraduate"]["count"]
n2 = data.describe()["graduate"]["count"]

In [27]:
y1_bar, y2_bar, s1, s2, n1, n2

(6.1866666666666665, 6.42, 1.0487229471217934, 1.031938931906183, 75.0, 50.0)

In [28]:
print("Samples were random, so both samples are independent.")
print("Sample size of population 1 is {} which is greater than 30.".format(n1))
print("Sample size of population 2 is {} which is also greater than 30.".format(n2))
print("Since above conditions are met so we can move ahead with t-test")

Samples were random, so both samples are independent.
Sample size of population 1 is 75.0 which is greater than 30.
Sample size of population 2 is 50.0 which is also greater than 30.
Since above conditions are met so we can move ahead with t-test


In [29]:
t_score = ((y1_bar - y2_bar) - 0) / np.sqrt((np.square(s1) / n1) + (np.square(s2) / n2))

In [30]:
t_score

-1.2304203850632731

#### For calculating two sample t-test we can use scipy.stats.ttest_rel method

In [60]:
test_stats, p_value = ttest_ind(data["undergraduate"].values, data["graduate"].values, nan_policy="omit")

In [61]:
print("Test statistics : ", test_stats)
print("p value : ", p_value)

Test statistics :  -1.2264248015515442
p value :  0.22238149461453863


In [62]:
# Since ttest_ind is used for two-sided test, we have to divide it by 2 for one-sided test
# And the given problem looks if sample of population 1's mean is smaller than sample of population 2's mean.
# So, we will have here one-sided test, so dividing p_value by 2
print("One-sided p-value : ", p_value / 2)

One-sided p-value :  0.11119074730726931
