# 心脏病致病因素数据科学项目


# 介绍
心脏病是男女死亡的主要原因。在美国，每年有61万人死于心脏病，占所有死亡人数的四分之一。冠心病是最常见的一种心脏病，它会导致流向心脏的血液减少，每年夺去37万人的生命。一些生理因素导致心脏病，如高胆固醇和高血压(CDC, 2015)。
该数据集包含患者及各种变量的信息，以及一个目标变量，用于指明患者是否存在通往心脏的动脉闭塞超过50%的情况。50%的阈值是明显狭窄的指示，是心脏病学家和其他内科医生进一步诊断的标志。
本分析将探索数据集中的不同变量，并回答以下问题:
有多少个体变量与直径变窄相关?(相关结果以蓝色显示)
哪些变量在男性和女性之间存在差异?(相关结果以黄色标示)
此外，本分析还将呈现并比较使用决策树预测患者是否存在>50%狭窄的结果。

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sci
import seaborn as sns
import math
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc

from IPython.display import Image  
from sklearn.tree import export_graphviz
%matplotlib inline
import matplotlib.pyplot as plt  
# Configure matplotlib text rendering in one call:
#   - 'font.sans-serif': put SimHei first in the sans-serif font list
#     (presumably so the Chinese labels used in later plots render correctly).
#   - 'axes.unicode_minus': use the ASCII hyphen for negative tick labels
#     instead of the Unicode minus sign.
plt.rcParams.update({
    'font.sans-serif': [u'SimHei'],
    'axes.unicode_minus': False,
})

In [2]:
# pip install graphviz
from graphviz import Source

In [3]:
# Load the dataset from heart.csv in the current working directory, print its
# (rows, columns) shape, then preview the first five records.
heart = pd.read_csv("./heart.csv")
print(heart.shape)
heart.head(n=5)

(303, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
# Summary statistics per column: count, mean, std, min, quartiles and max.
heart.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [5]:
# List the column labels of the loaded DataFrame.
heart.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

这个数据集中有14列，它们的定义如下:
'age':年龄,
'sex':人的性别(1 =男性，0 =女性),
'cp':胸痛类型(值1:典型心绞痛，值2:非典型心绞痛，值3:非心绞痛，值4:无症状；注意:本数据集中该列实际取值为0-3),
'trestbps':患者入院时的静息血压(mm Hg),
'chol':人的胆固醇含量以毫克/分升为单位,
'fbs':空腹血糖(> 120 mg/dl，1 = 真;0 = 假),
'restecg':静息心电图测量(0 =正常，1 = ST-T波异常，2 =根据Estes标准显示可能或明确的左室肥厚),
'thalach':人的最大心率,
'exang':运动诱发心绞痛(1 = 有;0 = 无),
'oldpeak':运动相对于休息引起的ST抑制(“ST”与心电图上的位置有关),
'slope':峰值运动ST段的坡度(值1:上升，值2:平缓，值3:下降；注意:本数据集中该列实际取值为0-2),
'ca':主要血管数量(0-3条；注意:本数据集中该列的最大值为4),
'thal':一种叫做地中海贫血的血液疾病(1 =固定缺陷，2 =正常，3 =可逆缺陷),
'target':血管造影疾病状态(0 = >50%直径变窄，1 = <50%直径变窄),
  

In [6]:
# Show the storage dtype of every column.
heart.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

In [7]:
# Check for missing values: count the nulls in each column.
heart.isnull().sum(axis=0)

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

注意:sex、cp、fbs、restecg、exang、slope、ca、thal和target虽然以int64存储，但实际上都是分类(类别型)变量。

In [9]:
# Check for duplicated rows. `duplicated()` flags each row that repeats an
# earlier one; the first occurrence itself is not flagged.
# NOTE: despite its name, `dup_count` is a boolean mask (one flag per row),
# not a count. The name is kept in case later cells refer to it.
dup_count = heart.duplicated()
# Display only the flagged rows (the cell shows just this last expression).
heart[dup_count]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
164,38,1,2,138,175,0,1,173,0,0.0,2,4,2,1


In [10]:
# Select every record with age 38 so the flagged duplicate can be viewed
# next to the other rows that share that age.
dup = heart[heart['age'] == 38]
dup

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
163,38,1,2,138,175,0,1,173,0,0.0,2,4,2,1
164,38,1,2,138,175,0,1,173,0,0.0,2,4,2,1
259,38,1,3,120,231,0,1,182,1,3.8,1,0,3,0


# 163和164是重复行，由于相邻可能是录入重复  删除164行