In [None]:
%matplotlib inline
import os
import glob
import operator
import numpy as np
from pylab import *
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.stats.weightstats as st
from sklearn.preprocessing import StandardScaler,RobustScaler,MinMaxScaler

In [None]:
# Load the feature table used by every later cell and show it.
df = pd.read_csv(filepath_or_buffer="./feature_dist.csv")
display(df)


In [None]:

# Standardize every column from position 2 onward with StandardScaler and
# rebuild a DataFrame that carries the original 'label' column in front.
scaler = StandardScaler()
X_data = scaler.fit_transform(df.iloc[:, 2:])
X_df = pd.DataFrame(X_data, columns=df.columns[2:])
# insert() aligns on the index; both frames are assumed to share the same
# index here — TODO confirm if `df` is ever re-indexed upstream.
X_df.insert(0, 'label', df['label'])
display(X_df)

### 正态性检验

In [None]:
# Shapiro-Wilk normality test on every column of X_df after 'label'.
for feature in X_df.columns[1:]:
    print(feature, ' :')
    shapiro_res = stats.shapiro(X_df[feature])
    alpha = 0.05
    stat_text = "正态性检验的统计量为：" + str(shapiro_res.statistic)
    p_text = "正态性检验的P值为：" + str(shapiro_res.pvalue)
    # Normality is rejected only when the p-value is strictly below alpha.
    verdict = (
        'P<0.05，拒绝原假设，不符合正态分布'
        if shapiro_res.pvalue < alpha
        else 'P>0.05，不能拒绝原假设，符合正态分布'
    )
    print(stat_text, p_text, verdict)

### 曼惠特尼U检验（两独立样本差异性检验）

In [None]:
# Two-sided Mann-Whitney U test per feature, comparing the label == 1 rows
# against the label == 0 rows of X_df.
for i in X_df.columns[1:]:
    print('特征',i,'：')
    group_pos = X_df.loc[X_df["label"] == 1, i]
    group_neg = X_df.loc[X_df["label"] == 0, i]
    fc = stats.mannwhitneyu(group_pos, group_neg, alternative='two-sided')
    print("Mann-Whitney 检验的统计量为" + str(fc.statistic))
    # Print the p-value as a float: an integer format ("%d") truncates every
    # p-value below 1 to "0", which hides the actual result.
    print("Mann-Whitney 检验的P值为" + str(fc.pvalue))

    alpha = 0.05
    if(fc.pvalue < alpha):
        print('P<0.05，拒绝原假设，两总体有显著差异。')
    else:
        print('P>0.05，不能拒绝原假设，两总体无显著差异。')

In [None]:
# Inspect the distribution of each feature: one small KDE plot per column,
# drawn from the label == 1 rows only.
positive_rows = X_df["label"] == 1
for feature in X_df.columns[1:]:
    plt.figure(figsize=(3, 3))
    sns.kdeplot(X_df.loc[positive_rows, feature], legend=False)
    plt.xlabel(feature, fontsize=12)
    plt.show()

### 方差齐性检验

In [None]:
# Levene's test (centered on the mean) for equal variances between the
# label == 1 and label == 0 groups, one feature at a time.
for feature in X_df.columns[1:]:
    print('特征', feature)
    is_positive = X_df["label"] == 1
    is_negative = X_df["label"] == 0
    W, levene_P = stats.levene(
        X_df.loc[is_positive, feature],
        X_df.loc[is_negative, feature],
        center='mean',
    )
    print("Levene's方差齐性检验的P值为" + str(levene_P))

    alpha = 0.05
    # Equal variances are rejected only when the p-value is strictly below alpha.
    verdict = (
        'P<0.05，拒绝原假设，两总体方差有显著差异，不满足方差齐性。'
        if levene_P < alpha
        else 'P>0.05，不能拒绝原假设，两总体方差无显著差异，满足方差齐性。'
    )
    print(verdict)

### t检验

In [None]:
# Independent two-sample t-test per feature (label == 1 vs. label == 0).
# NOTE(review): only columns 1..10 are tested here, while the other test
# cells iterate over every feature column — confirm the slice is intended.
for i in X_df.columns[1:11]:
    print('特征',i,'：')
    group_pos = X_df.loc[X_df["label"] == 1, i]
    group_neg = X_df.loc[X_df["label"] == 0, i]
    # ttest_ind returns (t statistic, p-value, degrees of freedom). The third
    # value is bound to `dof`, not `df`, so the DataFrame named `df` that the
    # earlier cells read from is not overwritten by a number.
    t, p_two, dof = st.ttest_ind(group_pos, group_neg)

    print('P值=' + str(p_two))

    alpha = 0.05
    if(p_two < alpha):
        print('P<0.05，拒绝原假设，平均值有显著差异。')
    else:
        print('P>0.05，不能拒绝原假设，平均值无显著差异。')

In [None]:
# Difference tests: for each method, compute one p-value per feature
# (label == 1 vs. label == 0), rank the features by ascending p-value,
# display the ranking and save it to ./img_dist/<method>.csv.
# After the loop, `a` and `max_p` hold the results of the LAST method in
# `methods`; the bar-plot cell reads those two names.
methods = ['M-U检验', "t检验", "K-W检验", '方差齐性检验', '曼惠特尼U检验']

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('./img_dist', exist_ok=True)

for method in methods:
    p_values=[]
    for i in X_df.columns[1:]:
        group_pos = X_df.loc[X_df["label"] == 1, i]
        group_neg = X_df.loc[X_df["label"] == 0, i]

        if method == 'M-U检验':
            # Mann-Whitney U with SciPy's default `alternative`.
            m, p = stats.mannwhitneyu(group_pos, group_neg)
        elif method == "t检验":
            # ttest_ind returns (t, p-value, degrees of freedom); the third
            # value is named `dof` so the DataFrame `df` is not overwritten.
            t, p, dof = st.ttest_ind(group_pos, group_neg)
        elif method == "K-W检验":
            # Kruskal-Wallis H test.
            s, p = stats.kruskal(group_pos, group_neg)
        elif method == '方差齐性检验':
            # Levene's test for equal variances, centered on the mean.
            W, p = stats.levene(group_pos, group_neg, center='mean')
        elif method == '曼惠特尼U检验':
            # Mann-Whitney U, explicitly two-sided.
            fc = stats.mannwhitneyu(group_pos, group_neg, alternative='two-sided')
            p = fc.pvalue
        else:
            raise NotImplementedError('no such method')

        p_values.append(p)

    a = pd.DataFrame(p_values,index=X_df.columns[1:],columns=["p_value"])
    a = a.sort_values(by='p_value',ascending=True,axis=0)
    # Last entry of the ascending ranking, i.e. the largest p-value.
    max_p = list(a['p_value'])[-1]
    display(a)
    a.to_csv(f'./img_dist/{method}.csv')

In [None]:
# Bar chart of the per-feature p-values in `a` (the ranking left by the
# difference-test loop), with the y axis capped at the largest p-value.
plt.figure(figsize=(12, 5))
sns.set(style="ticks")
ax = sns.barplot(x=a.index, y="p_value", data=a, palette=["#6495ED"])
# Hide the right and top frame lines.
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
ax.set_ylim(0, max_p)
plt.xticks(rotation=60, ha='right', fontsize=15)
ax.set_ylabel('P_value', fontsize=15)

### 特征取值热图

In [None]:
# Heat map of the feature values in X_df: one row per sample, one column
# per feature (the 'label' column is excluded).
xlabel=list(X_df.columns[1:])
plt.figure(figsize = (20,5))
sns.set()
# NOTE(review): the colour scale is clipped to [0, 1] while X_df is built from
# StandardScaler output, so values outside that range saturate — confirm
# this range is intended.
ax=sns.heatmap(X_df.iloc[0:,1:],vmin=0, vmax=1,xticklabels=xlabel,yticklabels=False,cmap="coolwarm")
# heatmap() already draws a colour bar (cbar=True by default). Reuse that one
# instead of calling figure.colorbar() again, which would add a duplicate.
cb = ax.collections[0].colorbar
cb.ax.tick_params(labelsize=18)  # colour-bar tick label size
plt.xticks(rotation=60,fontsize=20)  # x tick label size and rotation
plt.yticks(rotation=360,fontsize=20)  # y tick label size
plt.xlabel('feature',fontsize=20)  # x axis label size
plt.ylabel('value',fontsize=20)
plt.show()