In [None]:
import pandas as pd
# Load the heart dataset from the local data directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer="./data/heart.csv")
df.head(n=5)

In [None]:
# Count the missing values in each column.
df.isna().sum(axis=0)

In [None]:
# Missing-value handling: discard every row that has at least one missing
# value, then print a summary (dtypes, non-null counts) of what remains.
df = df.dropna(axis=0, how="any")
df.info()

In [None]:
# Recode two columns in place:
#   * `thal`   : raw codes 3, 6, 7 become 0, 1, 2
#   * `target` : values 1-4 are collapsed to 1 (binary label); other values,
#                including 0, are not in the map and stay untouched
recode_rules = {
    "thal": {3: 0, 6: 1, 7: 2},
    "target": {1: 1, 2: 1, 3: 1, 4: 1},
}
for column_name, value_map in recode_rules.items():
    df.replace({column_name: value_map}, inplace=True)

In [None]:
# Feature engineering
# Rename the columns. The labels are assigned positionally, so this list must
# have exactly one entry per existing column, in the existing column order.
descriptive_names = [
    'age',
    'sex',
    'chest_pain_type',
    'resting_blood_pressure',
    'cholesterol',
    'fasting_blood_sugar',
    'rest_ecg',
    'max_heart_rate',
    'exercise_induced_angina',
    'st_depression',
    'st_slope',
    'num_major_vessels',
    'thalassemia',
    'target',
]
df.columns = descriptive_names
df.head()

In [None]:
# Convert the integer-coded nominal features into their descriptive string
# labels.
#
# This used to be written as chained indexing (`df[col][mask] = value`), which
# assigns through an intermediate Series: older pandas emits
# SettingWithCopyWarning, and with Copy-on-Write enabled `df` is not modified
# at all. A single nested-dict DataFrame.replace performs the same per-column
# substitutions and returns a new frame. Values that are not listed for a
# column are left unchanged, exactly as with the mask-based assignments.
#
# The `thalassemia` codes 0/1/2 are the ones produced by the earlier recoding
# of the raw `thal` values 3/6/7.
category_labels = {
    'sex': {0: 'female', 1: 'male'},
    'chest_pain_type': {
        1: 'typical_angina',
        2: 'atypical_angina',
        3: 'non-angina_pain',
        4: 'asymptomic',
    },
    'fasting_blood_sugar': {0: '<120mg/ml', 1: '>120mg/ml'},
    'rest_ecg': {0: 'normal', 1: 'ST-T', 2: 'left_heart_big'},
    'exercise_induced_angina': {0: 'no', 1: 'yes'},
    'st_slope': {1: 'up', 2: 'normal', 3: 'down'},
    'thalassemia': {0: 'normal', 1: 'fixed', 2: 'reversable'},
}
df = df.replace(category_labels)

df.head()

In [None]:
# One-hot encode the discrete (nominal / ordinal) features. With no `columns`
# argument, pd.get_dummies expands the string-valued columns into indicator
# columns and passes numeric columns through unchanged.
df = pd.get_dummies(data=df)
df.head()

In [None]:
# Split the data into a training set and a test set.
import sklearn
from sklearn.model_selection import train_test_split

# Features are every column except the label; the label is `target`.
X = df.drop(columns=['target'])
y = df['target']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees, each limited to depth 5; the seed fixes the
# randomness of the forest so repeated runs build the same model.
forest_settings = {"n_estimators": 100, "max_depth": 5, "random_state": 5}
model = RandomForestClassifier(**forest_settings)
model.fit(X_train, y_train)

In [None]:
# Mean accuracy on the training data itself (not the held-out test split).
model.score(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

# Predict hard class labels for the test split and report, in order:
# accuracy, precision, recall and F1 score.
y_pred = model.predict(X_test)
metric_table = (
    ("准确率：", accuracy_score),
    ("精确率：", precision_score),
    ("召回率：", recall_score),
    ("F1值： ", f1_score),
)
for caption, metric_fn in metric_table:
    print(caption + str(metric_fn(y_test, y_pred)))

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# roc_curve sweeps a decision threshold over a continuous score, so it must be
# given a per-sample score rather than hard 0/1 predictions: with hard labels
# the "curve" collapses to a single operating point and the AUC only describes
# that one threshold. Use the predicted probability of the positive class.
# Column 1 of predict_proba corresponds to model.classes_[1]; this assumes the
# fitted classes are [0, 1] -- TODO confirm against model.classes_.
y_score = model.predict_proba(X_test)[:, 1]

# y_test: true labels, y_score: predicted positive-class probabilities.
fpr, tpr, thresholds = roc_curve(y_test, y_score)
# auc() integrates the curve; roc_auc is only used for the legend and title.
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, lw=1, label='ROC(area = %0.2f)' % (roc_auc))
plt.xlabel("FPR (False Positive Rate)")
plt.ylabel("TPR (True Positive Rate)")
plt.title("Receiver Operating Characteristic, ROC(AUC = %0.2f)"% (roc_auc))
# The line's label is only drawn when a legend is requested.
plt.legend(loc="lower right")
plt.show()