In [1]:
# Import libraries
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [2]:
# A) Load the data file "Heart_s.csv" from the course GitHub repository and
# keep it in a pandas DataFrame named `df`.
heart_csv_url = "https://github.com/mpourhoma/CS4661/raw/master/Heart_s.csv"
df = pd.read_csv(heart_csv_url)

In [3]:
# B) Inspect the dataset. It mixes contextual and biological features
# (e.g. age, gender, vital signs, ...). The last column, "AHD", is the label:
# "Yes" means the subject has heart disease, "No" means the subject does not.
# The bare expression below makes the notebook render the DataFrame as the cell output.
df

Unnamed: 0,Age,Gender,ChestPain,RestBP,Chol,RestECG,MaxHR,Oldpeak,Thal,AHD
0,63,f,typical,145,233,2,150,2.3,fixed,No
1,67,f,asymptomatic,160,286,2,108,1.5,normal,Yes
2,67,f,asymptomatic,120,229,2,129,2.6,reversable,Yes
3,37,f,nonanginal,130,250,0,187,3.5,normal,No
4,41,m,nontypical,130,204,2,172,1.4,normal,No
...,...,...,...,...,...,...,...,...,...,...
296,45,f,typical,110,264,0,132,1.2,reversable,Yes
297,68,f,asymptomatic,144,193,0,141,3.4,reversable,Yes
298,57,f,asymptomatic,130,131,0,115,1.2,reversable,Yes
299,57,m,nontypical,130,236,2,174,0.0,normal,Yes


In [4]:
# C) The dataset has at least three categorical features (Gender, ChestPain, Thal).
# They are ignored for now: the feature matrix X keeps only the numerical columns
# and the label vector y is the "AHD" column.
feature_cols = [
    'Age',
    'RestBP',
    'Chol',
    'RestECG',
    'MaxHR',
    'Oldpeak',
]
X = df.loc[:, feature_cols]
y = df.loc[:, 'AHD']

In [5]:
# D) Split into training and testing sets: 25% of the rows are held out for
# testing, and random_state=6 fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=6,
)

In [6]:
# E) Predict heart disease from the numerical features with three classifiers,
# using the train/test split built in part (d), then compare their test accuracy:
#   * KNN with k=3
#   * Decision Tree with random_state=5 (this seed belongs to the tree itself and
#     is unrelated to the random_state used for the data split in part (d))
#   * Logistic Regression
# The best and worst classifiers are reported at the end.

# KNN
k = 3
knn = KNeighborsClassifier(n_neighbors=k)
# Train only on the training set:
knn.fit(X_train, y_train)
# Predict only on the testing set:
y_predict_knn = knn.predict(X_test)
# Fraction of test labels predicted correctly:
accuracy_knn = accuracy_score(y_test, y_predict_knn)
print('knn accuracy:', accuracy_knn)

# DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=5)
dt.fit(X_train, y_train)
y_predict_dt = dt.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_predict_dt)
print('Decision Tree accuracy:', accuracy_dt)

# LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_predict_lr = logreg.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_predict_lr)
print('Logistic Regression Classifier accuracy:', accuracy_lr)

# Derive the conclusion from the measured accuracies instead of hard-coding it,
# so the sentence cannot go stale if the data, the split, or a model changes.
# On a tie, max()/min() return the first matching entry in this dict's order.
accuracy_by_model = {
    'KNN': accuracy_knn,
    'Decision Tree': accuracy_dt,
    'Logistic Regression': accuracy_lr,
}
best_model = max(accuracy_by_model, key=accuracy_by_model.get)
worst_model = min(accuracy_by_model, key=accuracy_by_model.get)
print('{} Classifier has the best accuracy while the {} Classifier has the worst accuracy'.format(best_model, worst_model))

knn accuracy: 0.6447368421052632
Decision Tree accuracy: 0.618421052631579
Logistic Regression Classifier accuracy: 0.6710526315789473
Logistic Regression Classifier has the best accuracy while the Decision Tree Classifier has the worst accuracy


In [7]:
# F) Bring the categorical features in as well by one-hot encoding them.
# Each categorical feature is replaced by one dummy column per possible value,
# and for every row exactly one of those dummy columns holds 1 while the rest
# hold 0. Example: "Gender" takes the values "m" and "f", so it is replaced by
# two columns; a male subject gets 1 in the "m" column and 0 in the "f" column,
# a female subject gets 0 in the "m" column and 1 in the "f" column.
# (Hint: "ChestPain" needs 4 dummy columns and "Thal" needs 3.)
#
# pd.get_dummies names the new columns "<feature>_<value>" (e.g. "Gender_f").
# dtype=int pins the 0/1 encoding described above: on pandas 2.0+ the default
# dummy dtype is bool, which would produce True/False columns instead.
ohe_df = pd.get_dummies(df, columns=['Gender', 'ChestPain', 'Thal'], dtype=int)
ohe_df

Unnamed: 0,Age,RestBP,Chol,RestECG,MaxHR,Oldpeak,AHD,Gender_f,Gender_m,ChestPain_asymptomatic,ChestPain_nonanginal,ChestPain_nontypical,ChestPain_typical,Thal_fixed,Thal_normal,Thal_reversable
0,63,145,233,2,150,2.3,No,1,0,0,0,0,1,1,0,0
1,67,160,286,2,108,1.5,Yes,1,0,1,0,0,0,0,1,0
2,67,120,229,2,129,2.6,Yes,1,0,1,0,0,0,0,0,1
3,37,130,250,0,187,3.5,No,1,0,0,1,0,0,0,1,0
4,41,130,204,2,172,1.4,No,0,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,45,110,264,0,132,1.2,Yes,1,0,0,0,0,1,0,0,1
297,68,144,193,0,141,3.4,Yes,1,0,1,0,0,0,0,0,1
298,57,130,131,0,115,1.2,Yes,1,0,1,0,0,0,0,0,1
299,57,130,236,2,174,0.0,Yes,0,1,0,0,1,0,0,1,0


In [8]:
# G) Redo parts (d) and (e) on the one-hot-encoded table from part (f) and see
# how the prediction accuracy of each method changes.

# Numerical features first, then the dummy columns of each categorical feature.
feature_cols_ohe = [
    'Age', 'RestBP', 'Chol', 'RestECG', 'MaxHR', 'Oldpeak',
    'Gender_f', 'Gender_m',
    'ChestPain_asymptomatic', 'ChestPain_nonanginal', 'ChestPain_nontypical', 'ChestPain_typical',
    'Thal_fixed', 'Thal_normal', 'Thal_reversable',
]
X_ohe = ohe_df.loc[:, feature_cols_ohe]
y_ohe = ohe_df.loc[:, 'AHD']

# Same split parameters as part (d): 25% test rows, random_state=6.
X_train_ohe, X_test_ohe, y_train_ohe, y_test_ohe = train_test_split(
    X_ohe,
    y_ohe,
    test_size=0.25,
    random_state=6,
)

# KNN
k = 3
# fit() returns the estimator itself, so construction and training can be chained.
knn_ohe = KNeighborsClassifier(n_neighbors=k).fit(X_train_ohe, y_train_ohe)
# Predict on the held-out rows only, then score against their true labels.
y_predict_knn_ohe = knn_ohe.predict(X_test_ohe)
accuracy_knn_ohe = accuracy_score(y_true=y_test_ohe, y_pred=y_predict_knn_ohe)
print('knn accuracy:', accuracy_knn_ohe)

# DecisionTreeClassifier
dt_ohe = DecisionTreeClassifier(random_state=5).fit(X_train_ohe, y_train_ohe)
y_predict_dt_ohe = dt_ohe.predict(X_test_ohe)
accuracy_dt_ohe = accuracy_score(y_true=y_test_ohe, y_pred=y_predict_dt_ohe)
print('Decision Tree accuracy:', accuracy_dt_ohe)

# LogisticRegression
# NOTE: max_iter is raised only to silence ConvergenceWarnings; per the original
# author's note the results are the same with or without it.
logreg_ohe = LogisticRegression(max_iter=766).fit(X_train_ohe, y_train_ohe)
y_predict_logreg_ohe = logreg_ohe.predict(X_test_ohe)
accuracy_logreg_ohe = accuracy_score(y_true=y_test_ohe, y_pred=y_predict_logreg_ohe)
print('Logistic Regression Classifier accuracy:', accuracy_logreg_ohe)

print('The knn accuracy did not see a change, while both the Decision Tree accuracy and Regression Classifier accuracy saw an increase in accuracy')

knn accuracy: 0.6447368421052632
Decision Tree accuracy: 0.7368421052631579
Logistic Regression Classifier accuracy: 0.7763157894736842
The knn accuracy did not see a change, while both the Decision Tree accuracy and Regression Classifier accuracy saw an increase in accuracy


In [9]:
# H) Redo part (e) on the one-hot-encoded data from part (f), this time with
# cross-validation: instead of a single train/test split, every classifier is
# evaluated with 10-fold cross-validation and the mean fold accuracy is reported.
n_folds = 10

# KNN
k = 3
knn = KNeighborsClassifier(n_neighbors=k)
accuracy_list_knn = cross_val_score(knn, X_ohe, y_ohe, scoring='accuracy', cv=n_folds)
accuracy_knn = accuracy_list_knn.mean()
print('Average for KNN Classifier:', accuracy_knn)

# DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=5)
accuracy_list_dt = cross_val_score(dt, X_ohe, y_ohe, scoring='accuracy', cv=n_folds)
accuracy_dt = accuracy_list_dt.mean()
print('Average for Decision Tree Classifier', accuracy_dt)

# LogisticRegressionClassifier
# NOTE: max_iter is raised only to silence ConvergenceWarnings; per the original
# author's note the results are the same with or without it.
logreg = LogisticRegression(max_iter=1241)
accuracy_list_lr = cross_val_score(logreg, X_ohe, y_ohe, scoring='accuracy', cv=n_folds)
accuracy_lr = accuracy_list_lr.mean()
print('Average for Logistic Regression Classifier', accuracy_lr)

Average for KNN Classifier: 0.6343010752688172
Average for Decision Tree Classifier 0.750752688172043
Average for Logistic Regression Classifier 0.810752688172043
