### Calculate the p-value using the t class. 

In [4]:
from scipy.stats import t

# Calculate the formula
t_value = -5.25

# By using the formula (df = no of samples - 1). 
degree_of_freedom = 58

p_value = t.cdf(t_value, degree_of_freedom)
print(p_value)
print(p_value * 2)

1.1281848734666122e-06
2.2563697469332243e-06


### Python Case Study 1 (1-Sample T-test)

In [2]:
import pandas as pd 
import numpy as np    

titanic_df = pd.read_csv('./Datasets/Titanic.csv')

In [3]:
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [39]:
pop = titanic_df['Age'].dropna()
pop

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
885    39.0
886    27.0
887    19.0
889    26.0
890    32.0
Name: Age, Length: 714, dtype: float64

In [41]:
pop_mean = pop.mean()
pop_mean

np.float64(29.69911764705882)

In [42]:
sample = pop.sample(25)
sample

610    39.00
127    24.00
518    36.00
686    14.00
725    20.00
821    27.00
305     0.92
509    26.00
353    25.00
635    28.00
340     2.00
139    24.00
69     26.00
504    16.00
290    26.00
765    51.00
885    39.00
477    29.00
749    31.00
803     0.42
882    22.00
122    32.50
838    32.00
625    61.00
163    17.00
Name: Age, dtype: float64

In [43]:
sample_mean = sample.mean()
sample_mean

np.float64(25.9536)

In [55]:
# Null Hypothesis -> The mean age is 35. 
pop_mean = 35
# Alternate Hypothesis -> The mean age is not 35.  

In [45]:
# and we have the value of the alpha (significance value). 

In [64]:
# used Shapiro Wilk test for normalality check. 
from scipy.stats import shapiro

normality_or_not = shapiro(sample)

# if p_value > significance_value then the sample is Normally distributed. 
print(normality_or_not)

ShapiroResult(statistic=np.float64(0.9477079296658132), pvalue=np.float64(0.22248296482364976))


In [65]:
from scipy import stats 

# alternative also has these two values --> 'less' and 'greater'
t_statistic, p_value = stats.ttest_1samp(sample, pop_mean, alternative = 'two-sided')

print('t_statistic - ', t_statistic)
print('P_value - ', p_value)

t_statistic -  -3.2493744983891943
P_value -  0.0034071460514494487


In [66]:
# significance Value. 
alpha = 0.05 

if p_value < alpha: 
    print('Reject the Null Hypothesis.')
else: 
    print("Can't Reject the Null Hypothesis.")

Reject the Null Hypothesis.


In [81]:
# One more dataset called Salary_data. 

salary_df = pd.read_csv('./Datasets/Salary_Data.csv')

In [118]:
salary_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   YearsExperience  30 non-null     float64
 1   Salary           30 non-null     float64
dtypes: float64(2)
memory usage: 612.0 bytes


In [83]:
salary_df.isnull().sum()

YearsExperience    0
Salary             0
dtype: int64

In [124]:
# Null Hypothesis : The average salary is 50,000 
# Alternate Hypothesis : The average salary is greater than 50,000

# we have alpha predefined(Here it is alpha = 0.01). 

In [119]:
sample = salary_df['Salary'].sample(25).values
sample

array([ 56642., 101302.,  98273., 109431.,  60150.,  67938.,  55794.,
        37731.,  66029.,  63218., 116969.,  39343.,  56957.,  46205.,
       122391.,  39891.,  93940.,  81363.,  91738.,  57189.,  54445.,
       121872., 112635.,  43525.,  64445.])

In [120]:
sample_mean = sample.mean() 
sample_mean

np.float64(74376.64)

In [125]:
# check for normality of the sample using Shapiro Wilk test. 
from scipy.stats import shapiro

normality_or_not = shapiro(sample)

# if p_value > significance_value(0.01) then the sample is Normally distributed.
print(normality_or_not)

ShapiroResult(statistic=np.float64(0.9045687125853292), pvalue=np.float64(0.023099391615180608))


In [128]:
# perform the t-test. 
from scipy import stats 

total_salary_mean = 50000

t_statistic, p_value = stats.ttest_1samp(sample, total_salary_mean, alternative='greater')

print('t-statistic - ', t_statistic)
print('p_value - ', p_value)

t-statistic -  4.3554717525770865
p_value -  0.00010695527235780635


In [129]:
# significance Value. 
alpha = 0.01

if p_value < alpha: 
    print('Reject the Null Hypothesis.')
else: 
    print("Can't Reject the Null Hypothesis.")

Reject the Null Hypothesis.


In [130]:
# One more new Dataset. 

healthcare_df = pd.read_csv('./Datasets/healthcare_dataset.csv')
healthcare_df.head()

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Bobby JacksOn,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal
1,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,DaNnY sMitH,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal
3,andrEw waTtS,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78241,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,adrIENNE bEll,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal


In [131]:
numeric_columns_df = healthcare_df.select_dtypes(include=['float', 'int'])

In [132]:
numeric_columns_df

Unnamed: 0,Age,Billing Amount,Room Number
0,30,18856.281306,328
1,62,33643.327287,265
2,76,27955.096079,205
3,28,37909.782410,450
4,43,14238.317814,458
...,...,...,...
55495,42,2650.714952,417
55496,61,31457.797307,316
55497,38,27620.764717,347
55498,43,32451.092358,321


In [134]:
# Null Hypothesis : Average age of patient is 40. 
# Alternate Hypothesis : Average age is greater than 40. 

# set a predetermined value of alpha(0.05). 

In [135]:
numeric_columns_df.isnull().sum()

Age               0
Billing Amount    0
Room Number       0
dtype: int64

In [149]:
sample = numeric_columns_df['Age'].sample(40).values 
sample

array([62, 25, 46, 58, 80, 27, 60, 49, 32, 71, 33, 35, 69, 52, 65, 66, 36,
       68, 48, 80, 61, 51, 55, 43, 32, 39, 77, 35, 35, 41, 51, 27, 60, 72,
       84, 23, 39, 68, 68, 60])

In [150]:
sample_mean = sample.mean()
sample_mean

np.float64(52.075)

In [152]:
# normality check using Shapiro wilk test. 
from scipy.stats import shapiro 

normality_or_not = shapiro(sample)

# sample is normal becoz p_value > alpha
print(normality_or_not)

ShapiroResult(statistic=np.float64(0.9593835646631304), pvalue=np.float64(0.15955639454559073))


In [153]:
# perform the t-test. 
from scipy import stats 

pop_mean = 40
t_statistic, p_value = stats.ttest_1samp(sample, pop_mean, alternative = 'greater')

print('t-statistic - ', t_statistic)
print('p_value - ', p_value)

t-statistic -  4.442559068605289
p_value -  3.568970752227182e-05


In [154]:
# significance Value. 
alpha = 0.05

if p_value < alpha: 
    print('Reject the Null Hypothesis.')
else: 
    print("Can't Reject the Null Hypothesis.")

Reject the Null Hypothesis.


### Python Case study 2 (2-Sample t-test (Independent))

In [155]:
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [161]:
male_pop = titanic_df[titanic_df['Sex'] == 'male']['Age'].dropna()
female_pop = titanic_df[titanic_df['Sex'] == 'female']['Age'].dropna()

In [162]:
male_pop

0      22.0
4      35.0
6      54.0
7       2.0
12     20.0
       ... 
883    28.0
884    25.0
886    27.0
889    26.0
890    32.0
Name: Age, Length: 453, dtype: float64

In [163]:
female_pop

1      38.0
2      26.0
3      35.0
8      27.0
9      14.0
       ... 
879    56.0
880    25.0
882    22.0
885    39.0
887    19.0
Name: Age, Length: 261, dtype: float64

In [164]:
# H0 - Mean age of male and female are similar
# H1 - Mean age of male is higher than female    

# significance value(alpha = 0.05)

In [165]:
male_sample = male_pop.sample(25)
female_sample = female_pop.sample(25)

In [168]:
# check the normality of both the sample using shapiro wilk test. 
from scipy.stats import shapiro 

male_normality = shapiro(male_sample)
female_normality = shapiro(female_sample)

print('Male Normality - ', male_normality)
print('Female Normality - ', female_normality)

Male Normality -  ShapiroResult(statistic=np.float64(0.9460649192024171), pvalue=np.float64(0.2040970312827715))
Female Normality -  ShapiroResult(statistic=np.float64(0.9728314035027668), pvalue=np.float64(0.7172531959765615))


In [169]:
# here we see p_value is greater than alpha so the sample distribution is normal. 

In [176]:
# check for the variance(it is same or not) using the levene test also called f-test. 

from scipy import stats 

result_levene = stats.levene(male_sample, female_sample)

# as you see that p_value is greater than the alpha so vaiance is same for both the sample distribution.
print(result_levene)

LeveneResult(statistic=np.float64(0.06211265092487989), pvalue=np.float64(0.8042508431338997))


In [179]:
# perform the t-test. 
from scipy import stats 

total_salary_mean = 50000

t_statistic, p_value = stats.ttest_ind(male_sample, female_sample, alternative = 'two-sided')

print('t-statistic - ', t_statistic)
print('p_value - ', p_value)

t-statistic -  0.8732482311307138
p_value -  0.3868756190105058


In [180]:
alpha = 0.05 

if p_value < alpha: 
    print('Reject the Null Hypothesis')
else: 
    print("Can't Reject the Null Hypothesis")

Can't Reject the Null Hypothesis


### Paired 2-sample T-test.