In [1]:
# In this analysis, I have cleaned dataset, prepared it to tran algorithms and then I tested the scores for different 
# Classification Algorithms.

# Dataset can be found on https://archive.ics.uci.edu/ml/datasets.php

# Data Set Information:

# This database contains 76 attributes, but all published experiments refer to using a subset of 14 of them. In particular,
# the Cleveland database is the only one that has been used by ML researchers to this date. We have two datasets, Hungarian
# and Switzerland. Approach for both are the same. I have done the coding on Hungarian dataset only.
# The "goal" field refers to the presence of heart disease in the patient.   

In [2]:
# Attribute Information:
#(age) 
#(sex) (1 = male; 0 = female)
#(cp) : chest pain type
#Value 1: typical angina
#Value 2: atypical angina
#Value 3: non-anginal pain
#Value 4: asymptomatic

#(trestbps): resting blood pressure (in mm Hg on admission to the hospital)
#(chol) : serum cholestoral in mg/dl
#(fbs): (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
#(restecg):  resting electrocardiographic results
#Value 0: normal
#Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
#Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria

#(thalach): maximum heart rate achieved
#(exang): exercise induced angina (1 = yes; 0 = no)
#(oldpeak): ST depression induced by exercise relative to rest
#(slope): the slope of the peak exercise ST segment
#Value 1: upsloping
#Value 2: flat
#Value 3: downsloping

#(ca): number of major vessels (0-3) colored by flourosopy
#(thal): 3 = normal; 6 = fixed defect; 7 = reversable defect
#(num) (the predicted attribute) diagnosis of heart disease (angiographic disease status)
#Value 0: < 50% diameter narrowing: Absense
#Value 1: > 50% diameter narrowing: Presense

In [3]:
import pandas as pd

In [4]:
# Load the dataset, remove headers, provide names to the attributes
df =pd.read_csv("d:/uhungarian.data", header = None, names=['age','sex','cp','trestbps','chol','fbs','restecg','thalach',
                                                             'exang','oldpeak','slope','ca','thal','num'])
df.head(2)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,28,1,2,130,132,0,2,185,0,0.0,?,?,?,0
1,29,1,2,120,243,0,0,160,0,0.0,?,?,?,0


In [5]:
df.shape

(294, 14)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 14 columns):
age         294 non-null int64
sex         294 non-null int64
cp          294 non-null int64
trestbps    294 non-null object
chol        294 non-null object
fbs         294 non-null object
restecg     294 non-null object
thalach     294 non-null object
exang       294 non-null object
oldpeak     294 non-null float64
slope       294 non-null object
ca          294 non-null object
thal        294 non-null object
num         294 non-null int64
dtypes: float64(1), int64(4), object(9)
memory usage: 32.3+ KB


In [7]:
df.isnull().sum() # no specific null values but we need to check other values/symbols which is not of out interest

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64

In [8]:
# We will check each attribute, clean it if required and we will encode it to get it in desired input form for an algorithm
df['trestbps'].value_counts() # found just one '?' symbol, as we will replace it with mean value  

120    65
130    54
140    50
150    23
110    21
160    20
125     8
100     6
180     6
170     5
145     5
135     5
112     3
124     2
118     2
122     2
115     2
200     1
190     1
108     1
113     1
132     1
106     1
142     1
138     1
105     1
155     1
136     1
92      1
?       1
128     1
98      1
Name: trestbps, dtype: int64

In [9]:
df['trestbps'] = df['trestbps'].replace("?",'NaN')
df['trestbps'] = df['trestbps'].astype('float')
mean_trestbps = round(df['trestbps'].mean())
df['En_trestbps'] = df['trestbps'].fillna(mean_trestbps) #to persists the change, we have to save it in another column
print(mean_trestbps)
df['En_trestbps'].value_counts()

133


120.0    65
130.0    54
140.0    50
150.0    23
110.0    21
160.0    20
125.0     8
100.0     6
180.0     6
145.0     5
170.0     5
135.0     5
112.0     3
118.0     2
122.0     2
124.0     2
115.0     2
105.0     1
98.0      1
155.0     1
190.0     1
128.0     1
132.0     1
133.0     1
108.0     1
113.0     1
92.0      1
106.0     1
200.0     1
138.0     1
136.0     1
142.0     1
Name: En_trestbps, dtype: int64

In [10]:
df['chol'].value_counts() #As we have majority values as '?', we can replace it with the mode value

?      23
275     5
246     5
230     5
215     4
       ..
234     1
285     1
173     1
217     1
303     1
Name: chol, Length: 154, dtype: int64

In [11]:
from sklearn.preprocessing import Imputer # used to replace NaN values with desired values
df['chol'] = df['chol'].replace("?",'NaN') # Make "?" as NaN
imp=Imputer(missing_values="NaN", strategy="most_frequent" ) 
df["En_chol"]=imp.fit_transform(df[["chol"]]).ravel()
df["En_chol"].value_counts()



230.0    28
246.0     5
275.0     5
216.0     4
211.0     4
         ..
392.0     1
233.0     1
242.0     1
255.0     1
132.0     1
Name: En_chol, Length: 153, dtype: int64

In [12]:
df['fbs'].value_counts() # '?' found, less in counts, so we will replace it with mean value

0    266
1     20
?      8
Name: fbs, dtype: int64

In [13]:
df['fbs'] = df['fbs'].replace("?", 'NaN')
df['fbs'] = df['fbs'].astype('float')
mean_fbs = round(df['fbs'].mean())
df['En_fbs'] = df['fbs'].fillna(mean_fbs)
df['En_fbs'].value_counts()

0.0    274
1.0     20
Name: En_fbs, dtype: int64

In [14]:
df['restecg'].value_counts() # just one '?', replace with 0 as it has the max frequency

0    235
1     52
2      6
?      1
Name: restecg, dtype: int64

In [15]:
df['restecg'] = df['restecg'].replace("?",'NaN')
df['restecg'] = df['restecg'].astype("float")
mean_rest = round(df['restecg'].mean())
df['En_restecg'] = df['restecg'].fillna(mean_rest)
df['En_restecg'].value_counts()

0.0    236
1.0     52
2.0      6
Name: En_restecg, dtype: int64

In [16]:
df['thalach'].value_counts() # one '?', replace with mean

150    29
140    21
130    17
170    14
160    13
       ..
105     1
176     1
82      1
129     1
102     1
Name: thalach, Length: 72, dtype: int64

In [17]:
df['thalach'] = df['thalach'].replace("?",'NaN')
df['thalach'] = df['thalach'].astype('float')
mean_thalach= round(df['thalach'].mean())

df['En_thalach'] = df['thalach'].fillna(mean_thalach)
df['En_thalach'].value_counts()

150.0    29
140.0    21
130.0    17
170.0    14
160.0    13
         ..
162.0     1
127.0     1
176.0     1
119.0     1
166.0     1
Name: En_thalach, Length: 71, dtype: int64

In [18]:
df['exang'].value_counts() # one "?", replace with mean

0    204
1     89
?      1
Name: exang, dtype: int64

In [19]:
df['exang'] = df['exang'].replace("?",'NaN')
df['exang'] = df['exang'].astype('float')
mean_exang= round(df['exang'].mean())

df['En_exang'] = df['exang'].fillna(mean_exang)
df['En_exang'].value_counts()

0.0    205
1.0     89
Name: En_exang, dtype: int64

In [20]:
df['oldpeak'].value_counts() # perfect 

0.0    189
1.0     41
2.0     31
1.5     16
3.0      9
2.5      3
0.5      2
0.8      1
5.0      1
4.0      1
Name: oldpeak, dtype: int64

In [21]:
df['slope'].value_counts() # MAximum '?', use Imputer

?    190
2     91
1     12
3      1
Name: slope, dtype: int64

In [22]:
df['slope'] = df['slope'].replace("?",'NaN') # Make "?" as NaN
imp=Imputer(missing_values="NaN", strategy="mean" ) 
df["En_slope"]=imp.fit_transform(df[["slope"]]).ravel()
df["En_slope"].value_counts()



1.894231    190
2.000000     91
1.000000     12
3.000000      1
Name: En_slope, dtype: int64

In [23]:
df['ca'].value_counts() # most of all are '?', only 3 are different, we can drop this column also but we will prefer to keep it

?    291
0      3
Name: ca, dtype: int64

In [24]:
df['ca'] = df['ca'].replace("?",'NaN')
df['ca'] = df['ca'].astype('float')
mean_ca= round(df['ca'].mean())

df['En_ca'] = df['ca'].fillna(mean_ca)
df['En_ca'].value_counts()

0.0    294
Name: En_ca, dtype: int64

In [25]:
df['thal'].value_counts() #replac '?' with mean

?    266
7     11
6     10
3      7
Name: thal, dtype: int64

In [26]:
df['thal'] = df['thal'].replace("?",'NaN')
df['thal'] = df['thal'].astype('float')
mean_thal= round(df['thal'].mean())

df['En_thal'] = df['thal'].fillna(mean_thal)
df['En_thal'].value_counts()

6.0    276
7.0     11
3.0      7
Name: En_thal, dtype: int64

In [27]:
# Prepare final dataframe by removing the unwanted attributes keeping all encoded attributes
df1=df.drop(['trestbps','chol','fbs','restecg','thalach','exang','slope','ca','thal'], axis=1)
# Final encoded dataset 
df1.head(2)

Unnamed: 0,age,sex,cp,oldpeak,num,En_trestbps,En_chol,En_fbs,En_restecg,En_thalach,En_exang,En_slope,En_ca,En_thal
0,28,1,2,0.0,0,130.0,132.0,0.0,2.0,185.0,0.0,1.894231,0.0,6.0
1,29,1,2,0.0,0,120.0,243.0,0.0,0.0,160.0,0.0,1.894231,0.0,6.0


In [28]:
# prepare input and output
dfi = df1.iloc[:,df1.columns!='num']
dfo = df1['num']

In [29]:
#split in train and test
from sklearn.model_selection import train_test_split

In [30]:
X_train, X_test, y_train, y_test = train_test_split(dfi,dfo, test_size=0.30)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(205, 13) (205,) (89, 13) (89,)


In [31]:
# Fit and get scores
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

algos = {'Logistic Regression':LogisticRegression(), 'Decision Tree':DecisionTreeClassifier(), 'Random Forest':RandomForestClassifier(),
            'Naive Bayes':GaussianNB(),'KNN':KNeighborsClassifier(), 'SVC_Rbf':SVC(kernel='rbf') }

In [32]:
lsnames, lsscores=[],[]
for i, j in algos.items():
    j.fit(X_train, y_train)
    
    lsnames.append(i)
    lsscores.append(j.score(X_train, y_train))
        
print('_____________Algorithm Scores______________')
for a in range(len(lsnames)):
    print(lsnames[a]," : ", lsscores[a])
    print('===========================================')
    

_____________Algorithm Scores______________
Logistic Regression  :  0.8634146341463415
Decision Tree  :  1.0
Random Forest  :  0.9853658536585366
Naive Bayes  :  0.8439024390243902
KNN  :  0.7365853658536585
SVC_Rbf  :  1.0




In [33]:
# Comparing the scores, the best suited algorithm can be decided. Further more, confusion matrix and classification report
# can also be compared, if required.