In [1]:
import numpy as np
import pandas as pd
heart=pd.read_csv('heart.csv')
heart

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


## Null hypothesis(Ho): The average cholestrol level is less than or equal to 200 mg/dl
## Alternative hypothesis: The average cholestrol level is greater than 200 mg/dl

In [2]:
import scipy.stats as stats
# One tailed test
cholestrol_levels=heart['chol']

# set the sample size
sample_size=200 #n

# random sample 100 cholestrol levels
sample=cholestrol_levels.sample(n=sample_size,random_state=42)#random_state=42 is used to pick the same set of random numbers 

#define the population mean (hypothesized mean)
population_mean =200 #mu
sample

179    276
228    288
111    126
246    409
60     265
      ... 
150    228
265    212
233    246
247    246
11     275
Name: chol, Length: 200, dtype: int64

In [3]:
# calculate statistics
sample_mean=sample.mean() # mean of the sample (Xbar)
sample_std=sample.std(ddof=1) # sample standard deviation (Sigma)

# Z-test calculation

z_score=(sample_mean-population_mean)/(sample_std/(sample_size**0.5))

In [4]:
stats.norm.ppf(0.975) # critcal value for 2 tailed

1.959963984540054

In [5]:
stats.norm.ppf(0.95) # critcal value for 1 tailed

1.6448536269514722

In [6]:
#percent point function(ppf) gives the value below which a certain percentage of data falls
critical_value=stats.norm.ppf(0.95)
results={'Sample_Mean':sample_mean,"z-score":z_score,"Critical_value ci=95%":critical_value}
print(results)

{'Sample_Mean': 249.515, 'z-score': 13.372692854915593, 'Critical_value ci=95%': 1.6448536269514722}


In [7]:
decision={'Critical value comparison':'Accept Ha' if z_score>critical_value else 'Accept Ho' }
decision

{'Critical value comparison': 'Accept Ha'}

In [8]:
stats.norm.ppf(0.95)

1.6448536269514722

### import scipy
scipy.__version__

In [9]:
#Z test using p-value
#CDF stands for cumm dist func. For a norm dist, the cdf tells u the area under the curve to the left of a given z_score
alpha=0.05
p_value=1-stats.norm.cdf(z_score)
if p_value<alpha:
    print(f"accept alt hyp(p_value:{p_value})")
else:
    print(f"accept null hyp(p_value:{p_value})")

accept alt hyp(p_value:0.0)


## T-test
we can compare the mean cholestrol levels between two groups
grp1- Patients with heart disease (target=1)
grp2- Patients without heart disease (target=0)

## Null hyp: There is no diff in the mean chol levels between the 2 groups
## Alt hyp: There is significant diff in the mean chol levels btwn the two groups

In [27]:
sample_size=25 #n<30
grp1=heart[heart['target']==1]['chol'].sample(n=sample_size,random_state=42)
grp2=heart[heart['target']==0]['chol'].sample(n=sample_size,random_state=42)


In [11]:
# perform  an independent t-test
t_stat,p_value=stats.ttest_ind(grp1,grp2) # here ttest_ind means independent t test
alpha=0.05
print(f'T statistic:{t_stat}')
print(f'P-value:{p_value}')

T statistic:-0.05645685011974626
P-value:0.9552122280739332


In [12]:
if p_value<alpha:
    print('accept alternate hypothesis: There is a significant difference in mean cholestrol levels:')
else:
    print('accept null hypothesis : no significant difference in mean cholestrol levels')

accept null hypothesis : no significant difference in mean cholestrol levels


## paired T test

Null Hypothesis: treatment had no effect on patients


Alternative Hypothesis: treatment has made changes in patients 

In [13]:
bef=np.array([100,102,98,105,110,99,101,103,97,100])
aft=np.array([102,100,99,107,108,98,102,101,96,101])

In [14]:
# perform paired t test
t_stat,p_value=stats.ttest_rel(bef,aft) # here rel indicates the paired data

print(f'T-statistic:{t_stat}')
print(f'P-value:{p_value}')



T-statistic:0.19011727515734334
P-value:0.853436208125729


In [15]:
alpha=0.05

# hyp test condition based on the P-value alone

if p_value<alpha:
    print('accept alternate hypothesis: There is a significant difference between the paired samples')
else:
    print("accept null hypothesis: No significant difference between the paired samples")

accept null hypothesis: No significant difference between the paired samples


In [16]:
# One way ANOVA to compare the mean cholestrol levels (chol)
# across these different values of cp
# chest pain types (although the exact definitions can be vary by dataset)
# 0 : Typical angina (chest pain due to reduced blood flow to the heart muscle)
# 1 : Atypocal angina (chest pain that doesnt fit the classica description of angina )
# 2 : Non- anginal pain (pain related to angina)
# 3 : Asymptomatic (no chest pain)

In [17]:
heart['cp'].unique()

array([3, 2, 1, 0], dtype=int64)

In [18]:
heart

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [19]:
heart[heart['cp']==0]['chol']

4      354
5      192
10     239
18     247
20     234
      ... 
296    197
297    176
298    241
300    193
301    131
Name: chol, Length: 143, dtype: int64

In [20]:
# sample size for each 'cp' type(ensure n<30)
sample_size=10 # adjust as needed
samples=[]

# get unique 'cp' values
cp_values=heart['cp'].unique()

for cp in cp_values:
    #randomly sample from the 'chol' column for each 'cp' type
    sample=heart[heart['cp']==cp]['chol'].sample(n=sample_size,random_state=42) # will have 40 random samples from each category (0,1,2,3)
    samples.append(sample)
print(samples)

[152    227
83     298
0      233
62     186
228    288
106    234
13     211
117    193
24     199
14     283
Name: chol, dtype: int64, 203    274
1      250
54     252
48     216
33     273
163    175
28     417
40     308
15     219
164    175
Name: chol, dtype: int64, 67     234
161    342
132    295
248    283
77     221
287    232
118    204
114    262
134    306
81     308
Name: chol, dtype: int64, 268    286
91     207
221    217
242    212
189    172
57     260
288    335
200    197
201    258
89     248
Name: chol, dtype: int64]


In [21]:
f_statistic,p_value=stats.f_oneway(*samples)
#print the results
print(f'F-statistic: {f_statistic}')
print(f'p_value: {p_value}')

# set the significance level
alpha=0.05

F-statistic: 0.8671400266189693
p_value: 0.46703363060086867


In [22]:
# hypothesis test conclusion based on p-value
if p_value<alpha:
    print('accept alt hyp : there is a sig diff in mean chol levels among chest pain')
else:
    print('accept null hyp : there is no sig diff in mean chol levels among chest pain')

#chol level and cp column are uncorrelated

accept null hyp : there is no sig diff in mean chol levels among chest pain


In [23]:
print('accept alt hyp : there is a sig diff in mean chol levels among chest pain')

accept alt hyp : there is a sig diff in mean chol levels among chest pain
