In [None]:
%matplotlib inline
import os
import glob
import xgboost
import numpy as np
from pylab import *
import pandas as pd
import seaborn as sns
import SimpleITK as sitk
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression,Lasso,LassoCV, LassoLarsCV, LassoLarsIC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier, AdaBoostClassifier,VotingClassifier
from sklearn.model_selection import GridSearchCV,KFold, cross_val_predict,StratifiedKFold,train_test_split,cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score,plot_confusion_matrix,auc,plot_roc_curve,ConfusionMatrixDisplay,mean_squared_error
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler,RobustScaler,MinMaxScaler
from sklearn.impute import SimpleImputer
import missingno as msno
from sklearn.decomposition import PCA

# Shared random seed: passed as random_state to train_test_split and to the
# Lasso estimators in later cells.
RANDOM_NUM = 24


In [None]:
# Load the data: read the feature table from the Excel workbook into `df`.
df = pd.read_excel("./features_total.xlsx")
# Bare expression so the notebook renders the frame.
df

In [None]:
# Outlier handling
def outliers(x):
    """Overwrite 3-sigma outliers of a numeric Series with NaN.

    A value is treated as an outlier when it lies strictly below
    ``mean - 3 * std`` or strictly above ``mean + 3 * std``, with mean and
    standard deviation taken from ``x`` itself.

    Parameters
    ----------
    x : pandas.Series
        Numeric column to clean. It is modified in place.

    Returns
    -------
    pandas.Series
        The same ``x`` object after masking, so callers may either rely on
        the in-place change or assign the result back to their frame.
    """
    mean_value = x.mean()
    std_value = x.std()
    # Comparisons against NaN are False on both sides, so values that are
    # already missing are simply left as they are.
    rule = (x < mean_value - 3 * std_value) | (x > mean_value + 3 * std_value)
    # Mask by boolean position rather than Series.replace(..., inplace=True):
    # an in-place replace returns None, which would leave nothing to return.
    x.mask(rule, other=np.nan, inplace=True)
    return x

# Run the 3-sigma cleaning over the columns at positions 1..117, but leave the
# column at position 2 alone: a later cell takes that column as the
# classification target, and on an imbalanced 0/1 label the rare class itself
# can fall outside mean +/- 3*std and would be blanked out.
target_position = 2
for i in range(1, 118):
    if i == target_position:
        continue
    # The call is used for its in-place effect on the column.
    outliers(df[df.columns[i]])

In [None]:
# Missing-value imputation: every NaN is replaced by the mean of its column.
# The imputer is fitted on the whole of `df`, i.e. before any train/test split.
# NOTE(review): the next cell re-binds `df_miss` to the training split of `df`,
# so the frame built here is only displayed, not modelled.
df_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
df_miss = pd.DataFrame(df_mean.fit_transform(df),columns=df.columns)
df_miss

In [None]:
# Split into train / test parts, then separate the target from the features.
df_train, df_test = train_test_split(df, test_size=0.3, random_state=RANDOM_NUM, stratify=df['label'])

df_miss = df_train
target=df_miss[list(df_miss.columns)[2:3]]    # column 2 only, kept as a one-column frame
features=df_miss[list(df_miss.columns)[3:]]   # every column from position 3 onwards

# Fill missing feature values with the training-set column means. The split
# above is taken from `df`, not from the frame imputed in the previous cell,
# so without this step any NaN in the features would pass through
# StandardScaler (which keeps NaN on transform) and reach the estimators.
# Fitting on the training rows only also keeps test rows out of the statistics.
train_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
features_filled = train_imputer.fit_transform(features)

# Standardise the features (zero mean, unit variance).
scaler = StandardScaler()
#scaler = MinMaxScaler()
X_train = scaler.fit_transform(features_filled)
# Wrap the scaled array back into a frame; note it gets a fresh 0..n-1 index,
# so it must be aligned with y_train by position (.iloc), not by label.
X_train=pd.DataFrame(X_train,columns=df.columns[3:])
y_train = target

In [None]:
# Stratified k-fold cross-validation of one classifier (10 folds by default)
def Kfold_model_confusion(model, X, y, name, n_splits=10):
    """Cross-validate ``model`` and return its mean scores as a one-row table.

    Parameters
    ----------
    model : estimator
        Classifier handed to scikit-learn's cross-validation.
    X, y :
        Features and labels, passed through unchanged.
    name : str
        Row label of the returned frame.
    n_splits : int, default 10
        Number of stratified folds.

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``name`` with the columns 'Accuracy', 'Auc',
        'Recall', 'Precision' and 'F1 score', each the mean over the folds.
    """
    # Imported locally: the notebook's import cell does not bring this name in.
    from sklearn.model_selection import cross_validate

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1234)
    # Output column -> scikit-learn scorer name (insertion order = column order).
    scorers = {
        'Accuracy': 'accuracy',
        'Auc': 'roc_auc',
        'Recall': 'recall',
        'Precision': 'precision',
        'F1 score': 'f1',
    }
    # One pass computes every metric on the same fitted fold models, instead
    # of refitting the estimator once per metric.
    fold_scores = cross_validate(model, X, y, cv=cv,
                                 scoring=list(scorers.values()), n_jobs=-1)
    result_dict = {column: [fold_scores['test_' + scorer].mean()]
                   for column, scorer in scorers.items()}
    return pd.DataFrame(result_dict, index=[name])

# Build the candidate classifiers.
# Estimators that draw random numbers and had no seed are given RANDOM_NUM so
# that repeated runs of the notebook report the same scores.
lr_clf = LogisticRegression()
knn_clf = KNeighborsClassifier()
svm_clf = SVC(probability=True, random_state=RANDOM_NUM)
forest_clf = RandomForestClassifier(random_state=0,n_jobs=-1,class_weight="balanced")
gra_clf = GradientBoostingClassifier(n_estimators=500, random_state=RANDOM_NUM)
ada_clf = AdaBoostClassifier(n_estimators=500, random_state=RANDOM_NUM)
dt_clf = DecisionTreeClassifier(random_state=1234)
lgbm_clf = LGBMClassifier(n_estimators=500, random_state=RANDOM_NUM)
xgb_clf = xgboost.XGBClassifier(n_estimators=500, learning_rate=0.2,
                                gamma=0.5, max_depth=20, verbosity=0,
                                random_state=RANDOM_NUM)
# Soft-voting ensemble; the weights follow the estimator order rf, gb, xgb, lgbm.
en_clf = VotingClassifier(estimators=[('rf', forest_clf), ('gb', gra_clf), ('xgb', xgb_clf), ('lgbm', lgbm_clf)],
                         voting='soft',weights=[4, 3, 4, 5])


# Cross-validate every classifier on the full feature set and stack the
# one-row score tables, in this order, into a single comparison table.
result3 = pd.concat([
    Kfold_model_confusion(estimator, X_train, y_train, row_label)
    for estimator, row_label in (
        (lr_clf, 'Logistic Regression'),
        (knn_clf, 'KNN'),
        (svm_clf, 'SVM'),
        (forest_clf, 'Random Forest'),
        (gra_clf, 'Gradient Boosting'),
        (ada_clf, 'AdaBoosting'),
        (dt_clf, 'Decision Tree'),
        (lgbm_clf, 'LGBM'),
        (xgb_clf, 'XGBoost'),
        (en_clf, 'Ensemble'),
    )
])
result3

In [None]:
# Confusion matrix of the ensemble, summed over the cross-validation folds.
clf=en_clf
cons = 0                      # running sum; becomes an array after the first fold
class_names=['0','1']         # display labels for the two classes
cv = StratifiedKFold(n_splits=10,shuffle=True,random_state=1)
for i, (train, test) in enumerate(cv.split(X_train, y_train)):
    # Fit once on the training part of the fold, then score the held-out part.
    clf.fit(X_train.iloc[train], y_train.iloc[train])
    a=confusion_matrix(y_train.iloc[test], clf.predict(X_train.iloc[test]))
    cons=cons+a
disp = ConfusionMatrixDisplay(confusion_matrix=cons,display_labels=class_names)
disp.plot(cmap = 'Blues')
plt.show()

In [None]:
# Mean ROC curve of `clf` (bound in the confusion-matrix cell) over the folds.
cv = StratifiedKFold(n_splits=10,shuffle=True,random_state=1)
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)   # common FPR grid every fold is interpolated onto
fig, ax = plt.subplots()
ax.set_ylabel("True Positive Rate")
ax.set_xlabel("False Positive Rate")
for i, (train, test) in enumerate(cv.split(X_train, y_train)):
    clf.fit(X_train.iloc[train], y_train.iloc[train])
    viz = plot_roc_curve(clf, X_train.iloc[test],y_train.iloc[test])
    # Called without `ax`, plot_roc_curve draws on a brand-new figure. Only
    # its fpr / tpr / roc_auc are used here, so close that figure instead of
    # letting one extra plot per fold be rendered under the cell.
    plt.close(viz.figure_)
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0               # force the curve through (0, 0)
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)
# Chance-level diagonal.
ax.plot([0, 1], [0, 1], linestyle='--', lw=2,color='r',
         alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0                    # force the curve through (1, 1)
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)

ax.plot(mean_fpr, mean_tpr,
        label=r'AUC = %0.2f'% (mean_auc),
        lw=2, alpha=.8)

ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
       title="Receiver operating characteristic")
ax.legend(loc="lower right")

plt.show()

In [None]:
# Fit LassoCV on the training data and plot the cross-validated mean squared
# error against alpha. The fitted object stays bound to `model`, which later
# cells read for `model.alpha_`.
import time

# Small constant added to every alpha before it is plotted or printed.
EPSILON = 1e-4
# LassoCV: coordinate descent

# Compute paths
print("Computing regularization path using the coordinate descent lasso...")
t1 = time.time()
model = LassoCV(cv=10).fit(X_train, y_train)
# Elapsed fitting time; not used again in this cell.
t_lasso_cv = time.time() - t1

# Display results
plt.figure()
# Set the font size
matplotlib.rcParams.update({'font.size': 15})
# Only referenced by the commented-out plt.ylim call at the end of the cell.
ymin, ymax = 2300, 3800
# One dotted line per fold ...
plt.semilogx(model.alphas_ + EPSILON, model.mse_path_, ":")
# ... and the mean across folds as a solid black line.
plt.plot(
    model.alphas_ + EPSILON,
    model.mse_path_.mean(axis=-1),
    "k",
    label="Average across the folds",
    linewidth=2,
)
# Vertical marker at the alpha chosen by cross-validation.
plt.axvline(
    model.alpha_ + EPSILON, linestyle="--", color="k", label="alpha: CV estimate"
)
print(model.alpha_+ EPSILON)
plt.legend()

plt.xlabel('alpha')
plt.ylabel("Mean square error")
plt.title(
    "Mean square error on each fold"
    
)
plt.axis("tight")
#plt.ylim(ymin, ymax)

In [None]:
# Refit a plain Lasso for 200 alphas between 1e-5 and 1e5 and plot, side by
# side, the coefficient paths and the training mean squared error. Both panels
# mark the alpha chosen by the LassoCV object `model` from the previous cell.
model_lasso = Lasso()
coeficents = []      # one coefficient vector per alpha
errors_train = []    # training MSE per alpha
#errors_test = []

alphas = np.logspace(-5, 5, 200)
for a in alphas:
    # The same estimator object is reused; only its alpha changes.
    model_lasso.set_params(alpha=a)
    model_lasso.fit(X_train, y_train)
    coeficents.append(model_lasso.coef_)
    errors_train.append(mean_squared_error(y_train, model_lasso.predict(X_train)))
#    errors_test.append(mean_squared_error(diabetes_y_test,model_lasso.predict(diabetes_X_test)))

plt.figure(figsize=(20, 6))
matplotlib.rcParams.update({'font.size': 15})
# Left panel: coefficient paths.
plt.subplot(121)
ax = plt.gca()
ax.plot(alphas, coeficents)
ax.set_xscale('log')
plt.axvline(model.alpha_ + EPSILON, linestyle="--", color="k", label="alpha: CV estimate")
plt.xlabel('alpha')
plt.ylabel('Coefficients')
plt.title('Lasso Coeficients (training)')
plt.axis('tight')

# Right panel: training error.
plt.subplot(122)
ax = plt.gca()
ax.plot(alphas, errors_train,linestyle="-", label="Train")
#ax.plot(alphas, errors_test,linestyle="--", label="Test")
ax.set_xscale('log')
plt.axvline(model.alpha_ + EPSILON, linestyle="--", color="k", label="alpha: CV estimate")
plt.xlabel('alpha')
plt.ylabel('Mean Squared Error')
plt.title('Lasso - Training Errors')
plt.axis('tight')

plt.show()

In [None]:
# Lasso-based feature selection at the alpha chosen by LassoCV (`model`).
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso

# fit() returns the selector itself, so the fitted object is bound directly.
feature_selection = SelectFromModel(
    Lasso(alpha=model.alpha_, random_state=RANDOM_NUM)
).fit(X_train, y_train)
# Column names for which the selector's support mask is True.
selected_feat = X_train.columns[feature_selection.get_support()]
print('total features: {}'.format(X_train.shape[1]))
print('selected features: {}'.format(len(selected_feat)))
selected_feat

In [None]:
# Horizontal bar chart of the 24 smallest and the 24 largest Lasso
# coefficients at the cross-validated alpha.
# SimHei is set as the sans-serif font (presumably so the Chinese feature
# names render - confirm the font is installed).
mpl.rcParams.update({'font.sans-serif': ['SimHei'],
                     'axes.unicode_minus': False})
matplotlib.rcParams.update({'font.size': 12})
plt.figure(figsize=(10,14))
model_lasso = Lasso(alpha=model.alpha_, random_state=RANDOM_NUM)
model_lasso.fit(X_train, y_train)
coef = pd.Series(model_lasso.coef_, index=X_train.columns)
# Both ends of the ascending ordering, lowest values first.
imp_coef = pd.concat([part(24) for part in (coef.sort_values().head,
                                            coef.sort_values().tail)])
imp_coef.plot(kind='barh')

In [None]:
# Export: the first three columns of `df` followed by the Lasso-selected
# feature columns, shown in the notebook and written to CSV without the index.
select_df = pd.concat(
    [df[list(df.columns)[:3]], df.loc[:, selected_feat]],
    axis=1,
)
display(select_df)
select_df.to_csv('./Selected_features.csv', index=False)

In [None]:
# Restrict the scaled training features to the Lasso-selected columns.
X_train_lasso=X_train.loc[:, selected_feat]
X_train_lasso

In [None]:
# Stratified k-fold cross-validation of one classifier (5 folds by default)
def Kfold_model_confusion(model, X, y, name, n_splits=5):
    """Cross-validate ``model`` and return its mean scores as a one-row table.

    Parameters
    ----------
    model : estimator
        Classifier handed to scikit-learn's cross-validation.
    X, y :
        Features and labels, passed through unchanged.
    name : str
        Row label of the returned frame.
    n_splits : int, default 5
        Number of stratified folds.

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``name`` with the columns 'Accuracy', 'Auc',
        'Recall', 'Precision' and 'F1 score', each the mean over the folds.
    """
    # Imported locally: the notebook's import cell does not bring this name in.
    from sklearn.model_selection import cross_validate

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1234)
    # Output column -> scikit-learn scorer name (insertion order = column order).
    scorers = {
        'Accuracy': 'accuracy',
        'Auc': 'roc_auc',
        'Recall': 'recall',
        'Precision': 'precision',
        'F1 score': 'f1',
    }
    # One pass computes every metric on the same fitted fold models, instead
    # of refitting the estimator once per metric.
    fold_scores = cross_validate(model, X, y, cv=cv,
                                 scoring=list(scorers.values()), n_jobs=-1)
    result_dict = {column: [fold_scores['test_' + scorer].mean()]
                   for column, scorer in scorers.items()}
    return pd.DataFrame(result_dict, index=[name])

# Build the candidate classifiers.
# Estimators that draw random numbers and had no seed are given RANDOM_NUM so
# that repeated runs of the notebook report the same scores.
lr_clf = LogisticRegression()
knn_clf = KNeighborsClassifier()
svm_clf = SVC(probability=True, random_state=RANDOM_NUM)
forest_clf = RandomForestClassifier(random_state=0,n_jobs=-1,class_weight="balanced")
gra_clf = GradientBoostingClassifier(n_estimators=500, random_state=RANDOM_NUM)
ada_clf = AdaBoostClassifier(n_estimators=500, random_state=RANDOM_NUM)
dt_clf = DecisionTreeClassifier(random_state=1234)
lgbm_clf = LGBMClassifier(n_estimators=500, random_state=RANDOM_NUM)
xgb_clf = xgboost.XGBClassifier(n_estimators=500, learning_rate=0.2,
                                gamma=0.5, max_depth=20, verbosity=0,
                                random_state=RANDOM_NUM)
# Soft-voting ensemble; the weights follow the estimator order rf, gb, xgb, lgbm.
en_clf = VotingClassifier(estimators=[('rf', forest_clf), ('gb', gra_clf), ('xgb', xgb_clf), ('lgbm', lgbm_clf)],
                         voting='soft',weights=[4, 3, 4, 5])


# Cross-validate every classifier on the Lasso-selected features and stack the
# one-row score tables, in this order, into a single comparison table.
result3 = pd.concat([
    Kfold_model_confusion(estimator, X_train_lasso, y_train, row_label)
    for estimator, row_label in (
        (lr_clf, 'Logistic Regression'),
        (knn_clf, 'KNN'),
        (svm_clf, 'SVM'),
        (forest_clf, 'Random Forest'),
        (gra_clf, 'Gradient Boosting'),
        (ada_clf, 'AdaBoosting'),
        (dt_clf, 'Decision Tree'),
        (lgbm_clf, 'LGBM'),
        (xgb_clf, 'XGBoost'),
        (en_clf, 'Ensemble'),
    )
])
result3

In [None]:
# Confusion matrix of the ensemble on the Lasso-selected features, summed over
# the cross-validation folds.
clf=en_clf
cons = 0                      # running sum; becomes an array after the first fold
class_names=['0','1']         # display labels for the two classes
cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=1)
for i, (train, test) in enumerate(cv.split(X_train_lasso, y_train)):
    # Fit once on the training part of the fold, then score the held-out part.
    clf.fit(X_train_lasso.iloc[train], y_train.iloc[train])
    a=confusion_matrix(y_train.iloc[test], clf.predict(X_train_lasso.iloc[test]))
    cons=cons+a
disp = ConfusionMatrixDisplay(confusion_matrix=cons,display_labels=class_names)
disp.plot(cmap = 'Blues')
plt.show()

In [None]:
# Mean ROC curve of `clf` (bound in the confusion-matrix cell) on the
# Lasso-selected features.
# NOTE(review): this cell uses 10 folds while the confusion-matrix cell for the
# same feature set uses 5 - confirm which is intended.
cv = StratifiedKFold(n_splits=10,shuffle=True,random_state=1)
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)   # common FPR grid every fold is interpolated onto
fig, ax = plt.subplots()
ax.set_ylabel("True Positive Rate")
ax.set_xlabel("False Positive Rate")
for i, (train, test) in enumerate(cv.split(X_train_lasso, y_train)):
    clf.fit(X_train_lasso.iloc[train], y_train.iloc[train])
    viz = plot_roc_curve(clf, X_train_lasso.iloc[test],y_train.iloc[test])
    # Called without `ax`, plot_roc_curve draws on a brand-new figure. Only
    # its fpr / tpr / roc_auc are used here, so close that figure instead of
    # letting one extra plot per fold be rendered under the cell.
    plt.close(viz.figure_)
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0               # force the curve through (0, 0)
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)
# Chance-level diagonal.
ax.plot([0, 1], [0, 1], linestyle='--', lw=2,color='r',
         alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0                    # force the curve through (1, 1)
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)

ax.plot(mean_fpr, mean_tpr,
        label=r'AUC = %0.2f'% (mean_auc),
        lw=2, alpha=.8)

ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
       title="Receiver operating characteristic")
ax.legend(loc="lower right")

plt.show()

In [None]:
# ANOVA F-test on the Lasso-selected features using SelectKBest.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2,f_classif
# Cap k at the number of available columns: a k larger than the input width is
# rejected (or warned about, depending on the scikit-learn version).
# NOTE: this re-binds `model`, replacing the LassoCV object of the same name
# that earlier cells read `alpha_` from.
model = SelectKBest(f_classif, k=min(44, X_train_lasso.shape[1]))
select_feature=model.fit_transform(X_train_lasso,y_train)
scores=model.pvalues_
# Column positions ordered from the largest p-value to the smallest.
indices=np.argsort(scores)[::-1]
print('Features ANOVA p_value')
#"%0.2f%s" % (scores[indices[i]],
# Print each feature name quoted and comma-terminated, ready to paste into a list.
for i in range(len(scores)):
    print("'"+X_train_lasso.columns[indices[i]]+"'"+",")


In [None]:
# Hard-coded list of 33 feature names used for the final models. It replaces
# the array bound to `select_feature` by the SelectKBest cell.
# NOTE(review): presumably pasted and pruned from that cell's printout - confirm.
select_feature=['月经间隔天数',
'嗜碱性粒细胞绝对值',
'高血压史',
'谷酰转肽酶',
'睡眠时间',
'晚睡',
'血糖值',
'是否绝经',
'WHR',
'臀围',
'胡萝卜',
'运动量',
'白蛋白',
'白蛋白球蛋白比',
'噩梦',
'淋巴细胞比例',
'肌酐',
'中性粒细胞比例',
'失眠',
'空腹血糖.葡萄糖.',
'乳腺增生',
'二手烟',
'中性粒细胞绝对值',
'早醒',
'怀孕次数',
'母乳喂养月数',
'奶制品',
'孩子个数',
'城市/农村ur',
'生活满意度',
'经济状况',
'家庭月均收入',
'行为预防得分']

In [None]:
# Restrict the scaled training features to the hand-picked `select_feature` columns.
X_train_imp=X_train.loc[:, select_feature]
X_train_imp

In [None]:
# Stratified k-fold cross-validation of one classifier (10 folds by default)
def Kfold_model_confusion(model, X, y, name, n_splits=10):
    """Cross-validate ``model`` and return its mean scores as a one-row table.

    Parameters
    ----------
    model : estimator
        Classifier handed to scikit-learn's cross-validation.
    X, y :
        Features and labels, passed through unchanged.
    name : str
        Row label of the returned frame.
    n_splits : int, default 10
        Number of stratified folds.

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``name`` with the columns 'Accuracy', 'Auc',
        'Recall', 'Precision' and 'F1 score', each the mean over the folds.
    """
    # Imported locally: the notebook's import cell does not bring this name in.
    from sklearn.model_selection import cross_validate

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1234)
    # Output column -> scikit-learn scorer name (insertion order = column order).
    scorers = {
        'Accuracy': 'accuracy',
        'Auc': 'roc_auc',
        'Recall': 'recall',
        'Precision': 'precision',
        'F1 score': 'f1',
    }
    # One pass computes every metric on the same fitted fold models, instead
    # of refitting the estimator once per metric.
    fold_scores = cross_validate(model, X, y, cv=cv,
                                 scoring=list(scorers.values()), n_jobs=-1)
    result_dict = {column: [fold_scores['test_' + scorer].mean()]
                   for column, scorer in scorers.items()}
    return pd.DataFrame(result_dict, index=[name])

# Build the candidate classifiers.
# Estimators that draw random numbers and had no seed are given RANDOM_NUM so
# that repeated runs of the notebook report the same scores.
lr_clf = LogisticRegression()
knn_clf = KNeighborsClassifier()
svm_clf = SVC(probability=True, random_state=RANDOM_NUM)
forest_clf = RandomForestClassifier(random_state=0,n_jobs=-1,class_weight="balanced")
gra_clf = GradientBoostingClassifier(n_estimators=500, random_state=RANDOM_NUM)
ada_clf = AdaBoostClassifier(n_estimators=500, random_state=RANDOM_NUM)
dt_clf = DecisionTreeClassifier(random_state=1234)
lgbm_clf = LGBMClassifier(n_estimators=500, random_state=RANDOM_NUM)
xgb_clf = xgboost.XGBClassifier(n_estimators=500, learning_rate=0.2,
                                gamma=0.5, max_depth=20, verbosity=0,
                                random_state=RANDOM_NUM)
# Soft-voting ensemble; the weights follow the estimator order rf, gb, xgb, lgbm.
en_clf = VotingClassifier(estimators=[('rf', forest_clf), ('gb', gra_clf), ('xgb', xgb_clf), ('lgbm', lgbm_clf)],
                         voting='soft',weights=[4, 3, 4, 5])


# Cross-validate every classifier on the hand-picked features and stack the
# one-row score tables, in this order, into a single comparison table.
result3 = pd.concat([
    Kfold_model_confusion(estimator, X_train_imp, y_train, row_label)
    for estimator, row_label in (
        (lr_clf, 'Logistic Regression'),
        (knn_clf, 'KNN'),
        (svm_clf, 'SVM'),
        (forest_clf, 'Random Forest'),
        (gra_clf, 'Gradient Boosting'),
        (ada_clf, 'AdaBoosting'),
        (dt_clf, 'Decision Tree'),
        (lgbm_clf, 'LGBM'),
        (xgb_clf, 'XGBoost'),
        (en_clf, 'Ensemble'),
    )
])
result3