<H2>Creating a model as a pickle file</H2>

In [1]:
pip cache purge

Files removed: 41
Note: you may need to restart the kernel to use updated packages.


In [2]:
!pip install scikit-learn==1.3.0

Collecting scikit-learn==1.3.0
  Downloading scikit_learn-1.3.0-cp311-cp311-win_amd64.whl (9.2 MB)
                                              0.0/9.2 MB ? eta -:--:--
                                              0.0/9.2 MB 1.4 MB/s eta 0:00:07
                                              0.1/9.2 MB 1.7 MB/s eta 0:00:06
                                              0.2/9.2 MB 1.5 MB/s eta 0:00:07
                                              0.2/9.2 MB 1.5 MB/s eta 0:00:07
                                              0.2/9.2 MB 1.5 MB/s eta 0:00:07
                                              0.2/9.2 MB 737.3 kB/s eta 0:00:13
                                              0.2/9.2 MB 737.3 kB/s eta 0:00:13
                                              0.2/9.2 MB 737.3 kB/s eta 0:00:13
                                              0.2/9.2 MB 518.8 kB/s eta 0:00:18
                                              0.2/9.2 MB 474.7 kB/s eta 0:00:19
                                        

In [3]:
import pandas as pd
import numpy as np
import pickle
import sklearn
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

  from pandas.core import (


<H2>Creating and saving the model as a pickle file</H2>

In [4]:
# Function to load and preprocess data
def load_and_preprocess_data(url):
    """Load a header-less 14-column heart-disease CSV and return a numeric frame.

    Parameters
    ----------
    url : str, path or file-like object
        Anything ``pd.read_csv`` accepts. The file must have no header row and
        its columns must follow the order given in ``column_names`` below.

    Returns
    -------
    pandas.DataFrame
        One row per labeled record, with:
        * every column numeric,
        * ``'?'`` placeholders in feature columns replaced by that column's mean,
        * ``target`` collapsed to 0 (value == 0) or 1 (value > 0) as int64.

    Raises
    ------
    ValueError
        If a column contains a non-numeric token other than ``'?'``.

    Notes
    -----
    Rows whose ``target`` is missing are dropped, because an unlabeled row
    cannot be used for training or evaluation. Previously a ``'?'`` in the
    target column raised ``TypeError`` because the comparison ran on raw strings.
    """
    column_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang',
                    'oldpeak', 'slope', 'ca', 'thal', 'target']

    # Treat '?' as missing at parse time so affected columns are read as floats
    # instead of strings.
    data = pd.read_csv(url, header=None, names=column_names, na_values='?')

    # Still coerce explicitly: any other stray non-numeric token should fail loudly.
    data = data.apply(pd.to_numeric)

    # Drop unlabeled rows before imputing so they do not influence the feature means.
    data = data.dropna(subset=['target']).reset_index(drop=True)

    # Binarise the label: any positive value means disease is present.
    data['target'] = (data['target'] > 0).astype('int64')

    # Mean-impute the remaining gaps (feature columns only; target has none left).
    data = data.fillna(data.mean())
    return data

# Fetch the Cleveland data set and run it through the preprocessing helper
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
data = load_and_preprocess_data(url)

# Integer-encode each categorical column into a sibling '<name>_encoded' column.
# One LabelEncoder is refit per column, so afterwards it only holds the mapping
# of the last column processed.
label_encoder = LabelEncoder()
categorical_columns = ['sex', 'cp', 'fbs', 'restecg', 'exang']
for column in categorical_columns:
    data[f'{column}_encoded'] = label_encoder.fit_transform(data[column])

# Features are everything except the label and the raw (un-encoded) categoricals
X = data.drop(columns=['target'] + categorical_columns)
y = data['target']

# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scaling features: statistics are learned from the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Training the RandomForest model on the scaled training features
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Saving the model and the scaler, one pickle file each
for filename, artifact in (('heart_disease_model.pkl', model), ('scaler.pkl', scaler)):
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)

print("Model and scaler saved as 'heart_disease_model.pkl' and 'scaler.pkl'")

Model and scaler saved as 'heart_disease_model.pkl' and 'scaler.pkl'


<H2>Model Prediction</H2>

In [5]:
# Reload the pickled classifier from disk and score the held-out rows.
# (Only unpickle files you created yourself; pickle can execute arbitrary code.)
with open('heart_disease_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

predictions = loaded_model.predict(X_test_scaled)
comparison = predictions == y_test.values

# Results table: scaled features plus predicted label, true label and a match flag
df = pd.DataFrame(X_test_scaled, columns=X_train.columns).assign(
    Prediction=predictions,
    Actual=y_test.values,
    Correct=comparison,
)

print(df.head(15))

         age  trestbps      chol   thalach   oldpeak     slope        ca  \
0  -0.198627 -0.125982  0.007893  1.021242 -0.873573 -0.963432  2.636335   
1  -0.088632 -1.226617 -0.822617 -1.877375 -0.873573  0.655669  0.393763   
2   0.131357 -0.401140  0.070182 -0.271987  0.138740  0.655669  0.393763   
3   0.351347 -1.776934 -0.241260  0.263142 -0.789214 -0.963432  0.393763   
4  -0.418617 -0.125982  1.232896 -0.361175  0.138740  0.655669 -0.727523   
5  -0.198627  0.424336 -0.884905  0.218548  1.741568  2.274769 -0.727523   
6   1.121311  0.149177  0.173995 -1.030087  1.488490  0.655669  0.393763   
7  -0.198627 -0.511204  0.755353 -2.457098  0.813615  0.655669  1.515049   
8  -1.628560  1.084716 -0.469650  1.377994 -0.873573 -0.963432 -0.727523   
9   0.461342  0.149177 -0.241260  0.486112 -0.451776  0.655669 -0.727523   
10  0.131357 -0.676299 -0.199734  1.244212 -0.198698 -0.963432 -0.727523   
11 -2.288529 -0.786363 -1.320923  1.065836 -0.873573 -0.963432 -0.727523   
12  0.351347

<H2>Model Evaluation</H2>

In [6]:
# Compute all evaluation metrics for the held-out predictions first...
conf_matrix = confusion_matrix(y_test, predictions)
accuracy = accuracy_score(y_test, predictions)
class_report = classification_report(
    y_test,
    predictions,
    target_names=['No Heart Disease', 'Heart Disease'],
)

# ...then report them: confusion matrix, accuracy, per-class report
print("Confusion Matrix:")
print(conf_matrix)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(class_report)


Confusion Matrix:
[[27  2]
 [ 6 26]]
Accuracy: 0.8689
Classification Report:
                  precision    recall  f1-score   support

No Heart Disease       0.82      0.93      0.87        29
   Heart Disease       0.93      0.81      0.87        32

        accuracy                           0.87        61
       macro avg       0.87      0.87      0.87        61
    weighted avg       0.88      0.87      0.87        61

