<b>Import modules</b>

In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import jaccard_score
import os
import requests
import colorthief

<b>To read CSV file</b>

In [2]:
input = pd.read_csv(r"C:\Users\salim\Desktop\my_codes\heart.csv")

<b>Classifying data with feature columns</b>

In [3]:
output_label = input.columns[-1]
input_features = input.columns[:-1]

<b>Get the first 10 rows</b>

In [4]:
input.head(10)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
5,39,M,NAP,120,339,0,Normal,170,N,0.0,Up,0
6,45,F,ATA,130,237,0,Normal,170,N,0.0,Up,0
7,54,M,ATA,110,208,0,Normal,142,N,0.0,Up,0
8,37,M,ASY,140,207,0,Normal,130,Y,1.5,Flat,1
9,48,F,ATA,120,284,0,Normal,120,N,0.0,Up,0


In [5]:
input.shape

(918, 12)

In [6]:
X = input[input_features]
Y = input[[output_label]]

<b>Input features</b>

In [7]:
X

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up
...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat


<b>Output labels</b>

In [8]:
Y

Unnamed: 0,HeartDisease
0,0
1,1
2,0
3,1
4,0
...,...
913,1
914,1
915,1
916,1


<b>Train-Test Split</b>

In [9]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1)

In [10]:
x_train

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
555,58,M,NAP,150,219,0,ST,118,Y,0.0,Flat
776,62,F,ASY,150,244,0,Normal,154,Y,1.4,Flat
369,63,M,ASY,150,0,0,Normal,86,Y,2.0,Flat
463,51,M,ASY,128,0,0,Normal,107,N,0.0,Up
408,59,M,ASY,110,0,1,Normal,94,N,0.0,Flat
...,...,...,...,...,...,...,...,...,...,...,...
451,64,M,ASY,144,0,0,ST,122,Y,1.0,Flat
699,57,M,ASY,110,201,0,Normal,126,Y,1.5,Flat
523,59,M,ASY,124,160,0,Normal,117,Y,1.0,Flat
352,56,M,ASY,120,0,0,ST,100,Y,-1.0,Down


<b>And we apply the get_dummies() function</b>

In [11]:
processed_x_train = pd.get_dummies(x_train[["Sex", "ChestPainType", "RestingECG", "ExerciseAngina", "ST_Slope"]])

<b>Combine two Series</b>

In [12]:
final_x_train = pd.concat([x_train, processed_x_train], axis="columns")

In [13]:
final_x_train = final_x_train.drop(["Sex", "ChestPainType", "RestingECG", "ExerciseAngina", "ST_Slope"], axis=1)

In [14]:
final_x_train

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
555,58,150,219,0,118,0.0,0,1,0,0,1,0,0,0,1,0,1,0,1,0
776,62,150,244,0,154,1.4,1,0,1,0,0,0,0,1,0,0,1,0,1,0
369,63,150,0,0,86,2.0,0,1,1,0,0,0,0,1,0,0,1,0,1,0
463,51,128,0,0,107,0.0,0,1,1,0,0,0,0,1,0,1,0,0,0,1
408,59,110,0,1,94,0.0,0,1,1,0,0,0,0,1,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
451,64,144,0,0,122,1.0,0,1,1,0,0,0,0,0,1,0,1,0,1,0
699,57,110,201,0,126,1.5,0,1,1,0,0,0,0,1,0,0,1,0,1,0
523,59,124,160,0,117,1.0,0,1,1,0,0,0,0,1,0,0,1,0,1,0
352,56,120,0,0,100,-1.0,0,1,1,0,0,0,0,0,1,0,1,1,0,0


<b>Standardize features by removing the mean and scaling to unit variance</b>

In [15]:
final_x_train = preprocessing.StandardScaler().fit(final_x_train).transform(final_x_train)

In [16]:
final_x_train[0]

array([ 0.47122414,  0.9316057 ,  0.19405592, -0.55590762, -0.73044645,
       -0.81778728, -0.51283227,  0.51283227, -1.0966616 , -0.47680213,
        1.88107989, -0.22558942, -0.50529115, -1.24474203,  2.07240725,
       -1.20761473,  1.20761473, -0.27481054,  1.01218072, -0.8789381 ])

In [17]:
y_train

Unnamed: 0,HeartDisease
555,1
776,1
369,1
463,0
408,1
...,...
451,1
699,0
523,1
352,1


<b>Model building</b>

In [18]:
log_reg = LogisticRegression()
log_reg.fit(final_x_train, y_train)

  return f(*args, **kwargs)


LogisticRegression()

In [19]:
log_reg.coef_

array([[ 0.11310861,  0.0377977 , -0.39254646,  0.4730324 , -0.11356114,
         0.42978858, -0.29223788,  0.29223788,  0.5045942 , -0.33163924,
        -0.24112025, -0.10459066,  0.03455238, -0.00816015, -0.02535472,
        -0.23246808,  0.23246808, -0.05978984,  0.60899883, -0.58322223]])

In [20]:
log_reg.intercept_

array([0.38603602])

<b>Evaluate model-build on validation set</b>

In [21]:
processed_x_test = pd.get_dummies(x_test[["Sex", "ChestPainType", "RestingECG", "ExerciseAngina", "ST_Slope"]])
final_x_test = pd.concat([x_test, processed_x_test], axis="columns")
final_x_test = final_x_test.drop(["Sex", "ChestPainType", "RestingECG", "ExerciseAngina", "ST_Slope"], axis=1)

In [22]:
final_x_test = preprocessing.StandardScaler().fit(final_x_test).transform(final_x_test)
final_x_test

array([[-0.93046106, -0.71991879,  0.1089263 , ..., -0.23973165,
        -1.14017543,  1.27615494],
       [ 0.20952271,  0.03846894,  1.36393784, ..., -0.23973165,
         0.87705802, -0.78360391],
       [ 0.41679248,  0.54406076, -1.96703263, ..., -0.23973165,
         0.87705802, -0.78360391],
       ...,
       [-1.13773083, -0.71991879,  0.29764984, ...,  4.17133072,
        -1.14017543, -0.78360391],
       [-1.03409594, -1.03591368,  0.89212899, ..., -0.23973165,
         0.87705802, -0.78360391],
       [ 1.76404602,  1.5552444 ,  0.34483073, ..., -0.23973165,
        -1.14017543,  1.27615494]])

In [23]:
final_x_test.shape

(92, 20)

In [24]:
model_predictions = log_reg.predict(final_x_test)

In [25]:
jaccard_score(y_test, model_predictions)

0.8392857142857143