In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# There are 13 attributes

# age: age in years
# sex: sex (1 = male; 0 = female)
# cp: chest pain type
# -- Value 0: typical angina
# -- Value 1: atypical angina
# -- Value 2: non-anginal pain
# -- Value 3: asymptomatic
# trestbps: resting blood pressure (in mm Hg on admission to the hospital)
# chol: serum cholestoral in mg/dl
# fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
# restecg: resting electrocardiographic results
# -- Value 0: normal
# -- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
# -- Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
# thalach: maximum heart rate achieved
# exang: exercise induced angina (1 = yes; 0 = no)
# oldpeak = ST depression induced by exercise relative to rest
# slope: the slope of the peak exercise ST segment
# -- Value 0: upsloping
# -- Value 1: flat
# -- Value 2: downsloping
# ca: number of major vessels (0-3) colored by flourosopy
# thal: 0 = normal; 1 = fixed defect; 2 = reversable defect
# and the label
# condition: 0 = no disease, 1 = disease
# Acknowledgements
# Data posted on Kaggle: https://www.kaggle.com/ronitf/heart-disease-uci
# Description of the data above: https://www.kaggle.com/ronitf/heart-disease-uci/discussion/105877
# Original data https://archive.ics.uci.edu/ml/datasets/Heart+Disease

# Creators:
# Hungarian Institute of Cardiology. Budapest: Andras Janosi, M.D.
# University Hospital, Zurich, Switzerland: William Steinbr
# Creators:
# Hungarian Institute of Cardiology. Budapest: Andras Janosi, M.D.
# University Hospital, Zurich, Switzerland: William Steinbrunn, M.D.
# University Hospital, Basel, Switzerland: Matthias Pfisterer, M.D.
# V.A. Medical Center, Long Beach and Cleveland Clinic Foundation: Robert Detrano, M.D., Ph.D.
# Donor: David W. Aha (aha '@' ics.uci.edu) (714) 856-8779


In [3]:
# Data collection

In [2]:
df = pd.read_csv('heart_cleveland_upload.csv')

In [3]:
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,40,1,3,152,223,0,0,181,0,0.0,0,0,2,1
293,39,1,3,118,219,0,0,140,0,1.2,1,0,2,1
294,35,1,3,120,198,0,0,130,1,1.6,1,0,2,1
295,35,0,3,138,183,0,0,182,0,1.4,0,0,0,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297 entries, 0 to 296
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        297 non-null    int64  
 1   sex        297 non-null    int64  
 2   cp         297 non-null    int64  
 3   trestbps   297 non-null    int64  
 4   chol       297 non-null    int64  
 5   fbs        297 non-null    int64  
 6   restecg    297 non-null    int64  
 7   thalach    297 non-null    int64  
 8   exang      297 non-null    int64  
 9   oldpeak    297 non-null    float64
 10  slope      297 non-null    int64  
 11  ca         297 non-null    int64  
 12  thal       297 non-null    int64  
 13  condition  297 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 32.6 KB


In [5]:
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
count,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0
mean,54.542088,0.676768,2.158249,131.693603,247.350168,0.144781,0.996633,149.599327,0.326599,1.055556,0.602694,0.676768,0.835017,0.461279
std,9.049736,0.4685,0.964859,17.762806,51.997583,0.352474,0.994914,22.941562,0.469761,1.166123,0.618187,0.938965,0.95669,0.49934
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,2.0,120.0,211.0,0.0,0.0,133.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,56.0,1.0,2.0,130.0,243.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,0.0,0.0
75%,61.0,1.0,3.0,140.0,276.0,0.0,2.0,166.0,1.0,1.6,1.0,1.0,2.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,3.0,2.0,1.0


In [6]:
df['condition'].value_counts()

condition
0    160
1    137
Name: count, dtype: int64

In [7]:
df = df.rename(columns={"condition": "target"})

In [8]:
# Splitting the dataset into train and test data

In [9]:
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,40,1,3,152,223,0,0,181,0,0.0,0,0,2,1
293,39,1,3,118,219,0,0,140,0,1.2,1,0,2,1
294,35,1,3,120,198,0,0,130,1,1.6,1,0,2,1
295,35,0,3,138,183,0,0,182,0,1.4,0,0,0,0


In [10]:
x = df.drop('target',axis=1)
y= df['target']

In [11]:
x

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,40,1,3,152,223,0,0,181,0,0.0,0,0,2
293,39,1,3,118,219,0,0,140,0,1.2,1,0,2
294,35,1,3,120,198,0,0,130,1,1.6,1,0,2
295,35,0,3,138,183,0,0,182,0,1.4,0,0,0


In [12]:
y

0      0
1      0
2      0
3      1
4      0
      ..
292    1
293    1
294    1
295    0
296    1
Name: target, Length: 297, dtype: int64

In [15]:
from sklearn.preprocessing import StandardScaler

In [16]:
scaler = StandardScaler()

In [17]:
scaler_df = scaler.fit_transform(x)

In [18]:
scaler_df

array([[ 1.60030243,  0.69109474, -2.24062879, ...,  0.6437811 ,
         0.34482438, -0.87429153],
       [ 1.60030243, -1.44697961, -2.24062879, ..., -0.97658319,
         1.41162482, -0.87429153],
       [ 1.26824154, -1.44697961, -2.24062879, ...,  2.26414539,
        -0.72197605, -0.87429153],
       ...,
       [-2.16305433,  0.69109474,  0.87388018, ...,  0.6437811 ,
        -0.72197605,  1.21977769],
       [-2.16305433, -1.44697961,  0.87388018, ..., -0.97658319,
        -0.72197605, -0.87429153],
       [-2.16305433,  0.69109474,  0.87388018, ..., -0.97658319,
        -0.72197605,  1.21977769]])

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
x_train,x_test,y_train,y_test = train_test_split(x,y,shuffle=True,stratify=y,random_state=42,test_size=0.2)

In [15]:
from sklearn.linear_model import LogisticRegression

In [16]:
linear_model = LogisticRegression()

In [17]:
linear_model.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
from sklearn.metrics import accuracy_score

In [19]:
y_train_pred = linear_model.predict(x_train)
print(f"Accuracy of training data is : {round(accuracy_score(y_train,y_train_pred)*100,2)}")

Accuracy of training data is : 85.23


In [20]:
y_test_pred = linear_model.predict(x_test)
print(f"Accuracy of testing data is : {round(accuracy_score(y_test,y_test_pred)*100,2)}")

Accuracy of testing data is : 90.0


In [21]:
# Let's predict any random value  and see how our model perform

In [22]:
input_ = (35,1,3,120,198,0,0,130,1,1.6,1,0,2)
np_input = np.asarray(input_)

In [23]:
np_input

array([ 35. ,   1. ,   3. , 120. , 198. ,   0. ,   0. , 130. ,   1. ,
         1.6,   1. ,   0. ,   2. ])

In [24]:
np_input_scale = np_input.reshape(1,-1)
np_input_scale.shape

(1, 13)

In [25]:
np_input_scale

array([[ 35. ,   1. ,   3. , 120. , 198. ,   0. ,   0. , 130. ,   1. ,
          1.6,   1. ,   0. ,   2. ]])

In [35]:
np_input_scaled = StandardScaler().fit_transform(np_input_scale)

In [37]:
np_input_scaled

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [26]:
linear_model.predict(np_input_scale)



array([1], dtype=int64)

In [27]:
import pickle

In [28]:
with open('training_model.svg','wb') as model:
    pickle.dump(linear_model,model)