# Supervised Learning: Classification

## Pre-processing

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('../res/heart_trt.csv', sep=';', encoding='utf-8')

### Transforming nominal categorical variables into ordinal categorical variables

In [3]:
df2 = pd.DataFrame.copy(df)

In [4]:
df2.Sex.replace({'M':0, 'F':1}, inplace=True)
df2.ChestPainType.replace({'TA':0, 'ATA':1, 'NAP':2, 'ASY':3}, inplace=True)
df2.RestingECG.replace({'Normal':0, 'ST': 1, 'LVH': 2}, inplace=True)
df2.ExerciseAngina.replace({'N':0, 'Y':1}, inplace=True)
df2.ST_Slope.replace({'Up':0, 'Flat': 1, 'Down':2}, inplace=True) 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df2.Sex.replace({'M':0, 'F':1}, inplace=True)
  df2.Sex.replace({'M':0, 'F':1}, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df2.ChestPainType.replace({'TA':0, 'ATA':1, 'NAP':2, 'ASY':3}, inplace=True)
  df2.ChestPainType.replace({'TA':0, 'ATA':1, 'NAP':2, 'ASY':

In [5]:
df2.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,1,140,289.0,0,0,172,0,0.0,0,0
1,49,1,2,160,180.0,0,0,156,0,1.0,1,1
2,37,0,1,130,283.0,0,1,98,0,0.0,0,0
3,48,1,3,138,214.0,0,0,108,1,1.5,1,1
4,54,0,2,150,195.0,0,0,122,0,0.0,0,0


### Forecasters and target

In [6]:
forecasters = df2.iloc[:, 0:11].values
target = df2.iloc[:, 11].values

### Feature scaling analysis

In [7]:
df2.describe()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
count,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0
mean,53.509269,0.210469,2.251908,132.540894,244.635389,0.23337,0.604144,136.789531,0.40458,0.886696,0.63795,0.55289
std,9.437636,0.407864,0.931502,17.999749,53.347125,0.423206,0.806161,25.467129,0.491078,1.06696,0.60727,0.497466
min,28.0,0.0,0.0,80.0,85.0,0.0,0.0,60.0,0.0,-2.6,0.0,0.0
25%,47.0,0.0,2.0,120.0,214.0,0.0,0.0,120.0,0.0,0.0,0.0,0.0
50%,54.0,0.0,3.0,130.0,244.635389,0.0,0.0,138.0,0.0,0.6,1.0,1.0
75%,60.0,0.0,3.0,140.0,267.0,0.0,1.0,156.0,1.0,1.5,1.0,1.0
max,77.0,1.0,3.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,2.0,1.0


* Standardization (Uses mean and std deviation as references)
* Normalization (Uses min and max as references) 
In this case, I will use standardization.

In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
forecasters_scaled = StandardScaler().fit_transform(forecasters)

In [10]:
forecasters_df = pd.DataFrame(forecasters_scaled)
forecasters_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
count,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0
mean,1.859654e-16,7.748558e-18,1.046055e-16,7.767929e-16,-1.86934e-16,4.649135e-17,0.0,-5.114048e-16,-1.046055e-16,7.748558000000001e-17,-3.8742790000000005e-17
std,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546
min,-2.704405,-0.5163086,-2.418822,-2.920572,-2.994023,-0.5517333,-0.749818,-3.016886,-0.8243101,-3.269662,-1.051095
25%,-0.6900904,-0.5163086,-0.2705801,-0.6971063,-0.5745784,-0.5517333,-0.749818,-0.6596226,-0.8243101,-0.8315022,-1.051095
50%,0.05202558,-0.5163086,0.803541,-0.1412398,0.0,-0.5517333,-0.749818,0.04755658,-0.8243101,-0.26885,0.5965186
75%,0.688125,-0.5163086,0.803541,0.4146267,0.4194568,-0.5517333,0.491306,0.7547357,1.213136,0.5751284,0.5965186
max,2.490407,1.936826,0.803541,3.749826,6.721265,1.81247,1.73243,2.561971,1.213136,4.982571,2.244132


### Creating Dummy Variables

In [11]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder

In [12]:
forecasters2 = df.iloc[:, 0:11].values
forecasters2[:,1] = LabelEncoder().fit_transform(forecasters[:,1])

In [13]:
forecasters2[:,2] = LabelEncoder().fit_transform(forecasters[:,2])
forecasters2[:,6] = LabelEncoder().fit_transform(forecasters[:,6])
forecasters2[:,8] = LabelEncoder().fit_transform(forecasters[:,8])
forecasters2[:,10] = LabelEncoder().fit_transform(forecasters[:,10])

In [14]:
forecasters3 = ColumnTransformer(transformers=[('OneHot', OneHotEncoder(), [1,2,6,8,10])], remainder='passthrough').fit_transform(forecasters2)

In [15]:
forecasters3df = pd.DataFrame(forecasters3)
forecasters3df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,40,140,289.0,0,172,0.0
1,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,49,160,180.0,0,156,1.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,37,130,283.0,0,98,0.0
3,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,48,138,214.0,0,108,1.5
4,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,54,150,195.0,0,122,0.0


### Scaling the features

In [16]:
forecasters3_esc = StandardScaler().fit_transform(forecasters3)
forecasters3df = pd.DataFrame(forecasters3_esc)
forecasters3df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.516309,-0.516309,-0.22981,2.073784,-0.531524,-1.085425,0.815013,-0.490781,-0.507826,0.82431,-0.82431,1.149573,-1.001091,-0.271607,-1.432206,0.414627,0.832075,-0.551733,1.383339,-0.831502
1,-1.936826,1.936826,-0.22981,-0.48221,1.881384,-1.085425,0.815013,-0.490781,-0.507826,0.82431,-0.82431,-0.869888,0.99891,-0.271607,-0.478057,1.52636,-1.212261,-0.551733,0.754736,0.106251
2,0.516309,-0.516309,-0.22981,2.073784,-0.531524,-1.085425,-1.226974,2.037569,-0.507826,0.82431,-0.82431,1.149573,-1.001091,-0.271607,-1.750256,-0.14124,0.719543,-0.551733,-1.523953,-0.831502
3,-1.936826,1.936826,-0.22981,-0.48221,-0.531524,0.921298,0.815013,-0.490781,-0.507826,-1.213136,1.213136,-0.869888,0.99891,-0.271607,-0.584074,0.303453,-0.574578,-0.551733,-1.131075,0.575128
4,0.516309,-0.516309,-0.22981,-0.48221,1.881384,-1.085425,0.815013,-0.490781,-0.507826,0.82431,-0.82431,1.149573,-1.001091,-0.271607,0.052026,0.970493,-0.930931,-0.551733,-0.581047,-0.831502


In [17]:
forecasters3df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
count,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0,917.0
mean,-1.472226e-16,1.084798e-16,6.973702000000001e-17,-3.8742790000000005e-17,3.8742790000000005e-17,1.937139e-17,-9.298269e-17,1.549712e-17,0.0,-4.2617070000000006e-17,4.2617070000000006e-17,-3.8742790000000005e-17,0.0,8.523413e-17,1.859654e-16,7.884157e-16,3.014189e-15,-1.549712e-17,-5.114048e-16,-1.859654e-16
std,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546,1.000546
min,-1.936826,-0.5163086,-0.2298105,-0.4822104,-0.5315237,-1.085425,-1.226974,-0.490781,-0.507826,-1.213136,-0.8243101,-0.8698879,-1.001091,-0.2716072,-2.704405,-2.920572,-2.994023,-0.5517333,-3.016886,-3.269662
25%,0.5163086,-0.5163086,-0.2298105,-0.4822104,-0.5315237,-1.085425,-1.226974,-0.490781,-0.507826,-1.213136,-0.8243101,-0.8698879,-1.001091,-0.2716072,-0.6900904,-0.6971063,-0.5745784,-0.5517333,-0.6596226,-0.8315022
50%,0.5163086,-0.5163086,-0.2298105,-0.4822104,-0.5315237,0.9212982,0.8150134,-0.490781,-0.507826,0.8243101,-0.8243101,-0.8698879,0.99891,-0.2716072,0.05202558,-0.1412398,3.19836e-15,-0.5517333,0.04755658,-0.26885
75%,0.5163086,-0.5163086,-0.2298105,-0.4822104,-0.5315237,0.9212982,0.8150134,-0.490781,-0.507826,0.8243101,1.213136,1.149573,0.99891,-0.2716072,0.688125,0.4146267,0.4194568,-0.5517333,0.7547357,0.5751284
max,0.5163086,1.936826,4.351412,2.073784,1.881384,0.9212982,0.8150134,2.037569,1.969177,0.8243101,1.213136,1.149573,0.99891,3.681787,2.490407,3.749826,6.721265,1.81247,2.561971,4.982571


### Summarizing pre-processing

* target = variable we want to get to (has or has not heart disease)
* forecasters = set of features with categoric variables transformed manually, no scalling
* forecasters_scaled = set of features with categoric variables transformed manually, scaled.
* forecasters2 = set of features with categoric variables transformed by labelencoder.
* forecasters3 = set of features with categoric variables transformed by labelencoder and onehotencoder
* forecasters3_esc = set of features transformed by labelencoder and onehot encoder, scaled.

### Separating Training Database from Testing Database

In [19]:
from sklearn.model_selection import train_test_split

In [226]:
x_train, x_test, y_train, y_test = train_test_split(forecasters3_esc, target, test_size=0.3, random_state=0)

### Support Vector Machine (SVM)

In [18]:
from sklearn.svm import SVC

In [140]:
svm = SVC(kernel='rbf', C=2, random_state=1)
svm.fit(x_train, y_train)

In [141]:
predictions_svm = svm.predict(x_test)
predictions_svm

array([1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1])

In [108]:
y_test

array([1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1])

In [24]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [142]:
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictions_svm) * 100))

Accuracy: 86.23%


In [143]:
confusion_matrix(y_test, predictions_svm)

array([[ 99,  22],
       [ 16, 139]])

In [144]:
print(classification_report(y_test, predictions_svm))

              precision    recall  f1-score   support

           0       0.86      0.82      0.84       121
           1       0.86      0.90      0.88       155

    accuracy                           0.86       276
   macro avg       0.86      0.86      0.86       276
weighted avg       0.86      0.86      0.86       276



#### Analyzing training data

In [145]:
predictions_training = svm.predict(x_train)
predictions_training

array([1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1,

In [146]:
accuracy_score(y_train, predictions_training)

0.9282371294851794

In [147]:
confusion_matrix(y_train, predictions_training)

array([[257,  32],
       [ 14, 338]])

#### Cross-Validation

In [148]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [149]:
kfold = KFold(n_splits=30, random_state=5, shuffle=True)

In [162]:
model = SVC(kernel='rbf', C=2, random_state=1)
result = cross_val_score(model, forecasters3_esc, target, cv=kfold)

print("Avg. Accuracy: %.2f%%" % (result.mean() * 100))

Avg. Accuracy: 85.72%


### K-Nearest Neighbors (KNN)

In [163]:
from sklearn.neighbors import KNeighborsClassifier

In [230]:
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
knn.fit(x_train, y_train)

In [233]:
predictions_knn = knn.predict(x_test)
 

In [231]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [234]:
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictions_knn) * 100))

Accuracy: 84.78%


In [190]:
confusion_matrix(y_test, predictions_knn)

array([[ 98,  23],
       [ 19, 136]])

In [235]:
print(classification_report(y_test, predictions_knn))

              precision    recall  f1-score   support

           0       0.84      0.81      0.82       121
           1       0.86      0.88      0.87       155

    accuracy                           0.85       276
   macro avg       0.85      0.84      0.84       276
weighted avg       0.85      0.85      0.85       276



#### Analyzing training data

In [236]:
predictions_training = knn.predict(x_train)

In [237]:
accuracy_score(y_train, predictions_training)

0.8939157566302652

In [238]:
confusion_matrix(y_train, predictions_training)

array([[249,  40],
       [ 28, 324]])

#### Cross-Validation

In [239]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [240]:
kfold = KFold(n_splits=30, random_state=5, shuffle=True)

In [241]:
model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
result = cross_val_score(model, forecasters3_esc, target, cv=kfold)

print("Avg. Accuracy: %.2f%%" % (result.mean() * 100))

Avg. Accuracy: 85.83%
