In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle

In [10]:
# 1. Load the raw data
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [12]:
# 2. Drop unnecessary columns
df = df.drop(['id', 'dataset', 'ca', 'slope', 'thal'], axis=1)

In [13]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,num
0,63,Male,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,0
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,2
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,1
3,37,Male,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,0
4,41,Female,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,0


In [14]:
# 3. EXPLICIT MAPPING (Converting Text to Numbers)
# Sex
df['sex'] = df['sex'].map({'Male': 1, 'Female': 0})

# Chest Pain
df['cp'] = df['cp'].map({
    'typical angina': 0, 
    'atypical angina': 1, 
    'non-anginal': 2, 
    'asymptomatic': 3
})

# FBS (True/False)
df['fbs'] = df['fbs'].map({True: 1, False: 0})

# Exang (True/False)
df['exang'] = df['exang'].map({True: 1, False: 0})

# Resting ECG
df['restecg'] = df['restecg'].map({
    'normal': 0, 
    'st-t abnormality': 1, 
    'lv hypertrophy': 2
})

In [15]:
#4 Force all columns to be numbers. If text is found, turn it into NaN.
df = df.apply(pd.to_numeric, errors='coerce')

# Fill ALL empty boxes (NaN) with 0. 
# This ensures the model never sees a NaN.
df.fillna(0, inplace=True)

In [16]:
# 5. Fix Target Variable
df['target'] = df['num'].apply(lambda x: 1 if x > 0 else 0)
df.drop('num', axis=1, inplace=True)

In [17]:
# 6. Train the Model
X = df.drop('target', axis=1)
Y = df['target']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, Y_train)

print("Training Successful!")

Training Successful!


In [18]:
pred = model.predict(X_test)
print(f"Accuracy Score: {accuracy_score(Y_test, pred)}")

Accuracy Score: 0.7989130434782609


In [19]:
pickle.dump(model, open('heart_disease_model.sav', 'wb'))
print("Model saved as 'heart_disease_model.sav'")

Model saved as 'heart_disease_model.sav'
