# Python 实现假设检验

In [None]:
import pandas as pd 
import numpy as np 
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
# Load the sample data from the CSV file next to the notebook and preview
# the first rows. Later cells read the Temperature, HeartRate and Gender
# columns of this frame.
df: pd.DataFrame = pd.read_csv(filepath_or_buffer='test.csv')
df.head(n=5)


In [None]:
df.shape # (rows, columns) of the frame; a sample size > 30 is treated as a large sample

In [None]:
# Histogram with a KDE overlay for each variable, side by side.
# sns.distplot is deprecated (and removed in recent seaborn releases);
# sns.histplot with kde=True is its documented replacement.
fig,ax=plt.subplots(1,2,figsize=(8,6))
for axis,column in zip(ax,['Temperature','HeartRate']):
    # stat='density' keeps the y-axis on the density scale distplot used,
    # so the histogram bars and the KDE curve share one scale.
    sns.histplot(df[column],kde=True,stat='density',ax=axis)
    axis.set_title(column)
fig.tight_layout()

从直方图的直观形态上看，体温和心率的样本大致服从正态分布（体温的正态性在后文会用 Shapiro-Wilk 检验和 K-S 检验进一步验证）。

# 人体体温均值是否为98.6华氏度？

$$
H_0: \mu=98.6;\ H_1:\mu \ne 98.6
$$

检验统计量（总体标准差 $\sigma$ 未知，大样本下用样本标准差 $s$ 代替）
$$
z=\frac{\bar{x}-\mu_0}{\sigma/\sqrt{n}}
$$

In [None]:
# Manual two-sided z-test of H0: mu = 98.6 against H1: mu != 98.6.
# 95% confidence level, i.e. significance level alpha = 0.05.
alpha=0.05
mu=98.6
x_bar=np.mean(df['Temperature'])
# The sample standard deviation (ddof=1) stands in for the unknown
# population sigma in the test statistic.
sigma=np.std(df['Temperature'],ddof=1)
n=df.shape[0]
z=(x_bar-mu)/(sigma/np.sqrt(n))

# H1 is two-sided, so alpha is split across both tails: the critical value
# is the (1 - alpha/2) quantile of the standard normal, not the (1 - alpha) one.
z_score=stats.norm.ppf(1-alpha/2)
# Two-sided p-value: probability mass in both tails beyond |z|.
p_value=2*stats.norm.sf(abs(z))

print('z: ',z)
print('z score: ',z_score)
print('p: ',p_value)

# Reject H0 only when the statistic falls in either rejection region,
# i.e. when |z| exceeds the critical value.
if abs(z)>z_score:
    print('Null Hypothesis Rejected')
else:
    print('Null Hypothesis cannot be Rejected')



In [None]:
# Same hypothesis checked with scipy.stats: one-sample t-test of the
# Temperature column against the hypothesised mean `mu` set in the
# manual-computation cell.
result=stats.ttest_1samp(df['Temperature'],popmean=mu)
t,p_value=result
print('t: ',t,' p: ',p_value)


# 人体的温度是否服从正态分布?

可以使用以下两种检验：

```py
# Shapiro-Wilk检验
stats.shapiro()
# K-S检验
stats.kstest()
```

In [None]:
# Shapiro-Wilk normality test on Temperature; H0 is kept when p > alpha.
alpha=0.05
stat,p=stats.shapiro(df['Temperature'])
verdict=(
    'Sample looks Gaussian(Fail to reject H0)'
    if p>alpha
    else 'Sample looks not Gaussian(reject H0)'
)
print(verdict)
# Author's note: the result was consistent with a normal distribution.

In [None]:
# Kolmogorov-Smirnov test of Temperature against a normal distribution whose
# location and scale are the sample mean and sample standard deviation.
# NOTE(review): the parameters are estimated from the same sample, so the
# standard K-S p-value is likely conservative (the Lilliefors situation) —
# treat it as indicative only.
# `alpha` is the significance level set in the Shapiro-Wilk cell.
stat,p=stats.kstest(df['Temperature'],'norm',args=(
    df['Temperature'].mean(),df['Temperature'].std()
))
print('p: ',p)
if p>alpha:
    # Typo fixed in the message: 'Gaussiian' -> 'Gaussian'.
    print('Sample looks Gaussian(Fail to reject H0)')
else:
    print('Sample looks not Gaussian(reject H0)')


# 人体体温中存在的异常数据是哪些？

sklearn中的`neighbors.LocalOutlierFactor`可以检测异常值

具体用法请看[这里](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html#sklearn.neighbors.LocalOutlierFactor)

In [None]:
from sklearn.neighbors import LocalOutlierFactor

# Local Outlier Factor on the single Temperature feature.
clf=LocalOutlierFactor(n_neighbors=20)
predict=clf.fit_predict(df[['Temperature']])
# fit_predict labels inliers as 1 and outliers as -1 (not 0),
# so the rows flagged as anomalous are exactly those labelled -1.
df.loc[predict==-1]



# 男女体温是否存在明显差异？

- H0: 没有差异（男女体温均值相等）
- H1: 有差异（男女体温均值不相等）

In [None]:
# One-way ANOVA comparing Temperature between the two Gender groups.
# H0 of the test: both groups have the same mean temperature.
# NOTE(review): presumably Gender 1 = male and 2 = female, as the variable
# names suggest — confirm against the dataset's codebook.
boy=df.loc[
    df['Gender']==1,'Temperature'
]
girl=df.loc[
    df['Gender']==2,'Temperature'
]
stat,p=stats.f_oneway(boy,girl)

print('p: ',p)
# `alpha` is the significance level set in an earlier cell.
if p>alpha:
    # Failing to reject H0 means no significant difference was detected;
    # the original message wrongly claimed a difference in this branch.
    print('Cannot reject H0: no significant difference between genders')
else:
    print('Reject H0: There is difference')

# 体温与心率间的相关性(强？弱？中等?)

In [None]:
# Scatter of Temperature (y) against HeartRate (x) to eyeball the relationship.
sns.scatterplot(data=df,x='HeartRate',y='Temperature')

In [None]:
# Pairwise Pearson correlation matrix of the two variables.
columns_of_interest=['Temperature','HeartRate']
df[columns_of_interest].corr(method='pearson')


Pearson系数显示两者相关度不是很高，我们做一个回归看看

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


# Simple linear regression of Temperature on HeartRate.
X=df[['HeartRate']]
y=df['Temperature']
# A fixed random_state makes the split, and therefore the reported scores,
# reproducible across Restart & Run All.
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42)

model=LinearRegression().fit(X_train,y_train)
# Report R^2 on both the training data and the held-out test data;
# the original only scored the training data and never used the test split.
{
    'train_r2':model.score(X_train,y_train),
    'test_r2':model.score(X_test,y_test),
}


拟合效果不佳：对一元线性回归而言，在同一份数据上 $R^2$ 就等于 Pearson 相关系数的平方，因此较低的 $R^2$ 与前面“两者相关性较弱”的结论是一致的。