Dataset Summary

In [4]:
import pandas as pd

# Load the dataset
file_path = '/content/Cardiovascular_Disease_Dataset.csv'
data = pd.read_csv(file_path)

# Display basic information about the dataset
dataset_info = {
    "Shape": data.shape,
    "Columns": data.columns.tolist(),
    "Data Types": data.dtypes.to_dict(),
    "Missing Values": data.isnull().sum().to_dict(),
    "Sample Data": data.head()
}

dataset_info


{'Shape': (1000, 14),
 'Columns': ['patientid',
  'age',
  'gender',
  'chestpain',
  'restingBP',
  'serumcholestrol',
  'fastingbloodsugar',
  'restingrelectro',
  'maxheartrate',
  'exerciseangia',
  'oldpeak',
  'slope',
  'noofmajorvessels',
  'target'],
 'Data Types': {'patientid': dtype('int64'),
  'age': dtype('int64'),
  'gender': dtype('int64'),
  'chestpain': dtype('int64'),
  'restingBP': dtype('int64'),
  'serumcholestrol': dtype('int64'),
  'fastingbloodsugar': dtype('int64'),
  'restingrelectro': dtype('int64'),
  'maxheartrate': dtype('int64'),
  'exerciseangia': dtype('int64'),
  'oldpeak': dtype('float64'),
  'slope': dtype('int64'),
  'noofmajorvessels': dtype('int64'),
  'target': dtype('int64')},
 'Missing Values': {'patientid': 0,
  'age': 0,
  'gender': 0,
  'chestpain': 0,
  'restingBP': 0,
  'serumcholestrol': 0,
  'fastingbloodsugar': 0,
  'restingrelectro': 0,
  'maxheartrate': 0,
  'exerciseangia': 0,
  'oldpeak': 0,
  'slope': 0,
  'noofmajorvessels': 0,
  

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Separate features and target
X = data.drop(columns=['patientid', 'target'])  # Dropping 'patientid' as it's irrelevant
y = data['target']

# Encoding categorical columns if any (like 'gender', 'chestpain', etc.)
categorical_columns = ['gender', 'chestpain', 'fastingbloodsugar', 'restingrelectro', 'exerciseangia', 'slope']
X[categorical_columns] = X[categorical_columns].apply(LabelEncoder().fit_transform)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Prepare a template for machine learning models
model_code = """
# Template for implementing machine learning models

from sklearn.metrics import classification_report, accuracy_score

# Example Model (Logistic Regression)
from sklearn.linear_model import LogisticRegression

# Initialize the model
model = LogisticRegression()

# Train the model
model.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\\n", classification_report(y_test, y_pred))
"""

# Display preprocessed data details and code
{
    "Processed Feature Shape": X_train_scaled.shape,
    "Code Template": model_code
}


{'Processed Feature Shape': (800, 12),
 'Code Template': '\n# Template for implementing machine learning models\n\nfrom sklearn.metrics import classification_report, accuracy_score\n\n# Example Model (Logistic Regression)\nfrom sklearn.linear_model import LogisticRegression\n\n# Initialize the model\nmodel = LogisticRegression()\n\n# Train the model\nmodel.fit(X_train_scaled, y_train)\n\n# Predict on the test set\ny_pred = model.predict(X_test_scaled)\n\n# Evaluate the model\nprint("Accuracy:", accuracy_score(y_test, y_pred))\nprint("Classification Report:\\n", classification_report(y_test, y_pred))\n'}

implementing a Logistic Regression model:

In [6]:
# Template for implementing machine learning models

from sklearn.metrics import classification_report, accuracy_score

# Example Model (Logistic Regression)
from sklearn.linear_model import LogisticRegression

# Initialize the model
model = LogisticRegression()

# Train the model
model.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.965
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.95      0.96        83
           1       0.97      0.97      0.97       117

    accuracy                           0.96       200
   macro avg       0.96      0.96      0.96       200
weighted avg       0.96      0.96      0.96       200



Random Forest, Support Vector Machine (SVM), and a simple Neural Network

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score

# Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_scaled, y_train)
rf_pred = rf_model.predict(X_test_scaled)
print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))
print("Random Forest Classification Report:\n", classification_report(y_test, rf_pred))

# Support Vector Machine
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_scaled, y_train)
svm_pred = svm_model.predict(X_test_scaled)
print("\nSVM Accuracy:", accuracy_score(y_test, svm_pred))
print("SVM Classification Report:\n", classification_report(y_test, svm_pred))

# Neural Network
nn_model = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42)
nn_model.fit(X_train_scaled, y_train)
nn_pred = nn_model.predict(X_test_scaled)
print("\nNeural Network Accuracy:", accuracy_score(y_test, nn_pred))
print("Neural Network Classification Report:\n", classification_report(y_test, nn_pred))


Random Forest Accuracy: 0.99
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99        83
           1       0.98      1.00      0.99       117

    accuracy                           0.99       200
   macro avg       0.99      0.99      0.99       200
weighted avg       0.99      0.99      0.99       200


SVM Accuracy: 0.96
SVM Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.95      0.95        83
           1       0.97      0.97      0.97       117

    accuracy                           0.96       200
   macro avg       0.96      0.96      0.96       200
weighted avg       0.96      0.96      0.96       200


Neural Network Accuracy: 0.975
Neural Network Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97        83
           1       0.99      0.97      0.98       1

feature selection and hyperparameter tuning

In [8]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

# ====================
# Step 1: Feature Selection
# ====================

# Train a preliminary Random Forest model
feature_selector = RandomForestClassifier(random_state=42)
feature_selector.fit(X_train_scaled, y_train)

# Get feature importances and select top features
importances = feature_selector.feature_importances_
selected_features = SelectFromModel(feature_selector, threshold="mean")
X_train_selected = selected_features.fit_transform(X_train_scaled, y_train)
X_test_selected = selected_features.transform(X_test_scaled)

# Feature Names
selected_columns = X.columns[selected_features.get_support()]
print("Selected Features:", selected_columns.tolist())

# ====================
# Step 2: Hyperparameter Tuning
# ====================

# Define the parameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2
)

# Perform the grid search
grid_search.fit(X_train_selected, y_train)

# Best Parameters
print("Best Parameters:", grid_search.best_params_)

# ====================
# Step 3: Final Model Evaluation
# ====================

# Train the final model with the best parameters
best_model = grid_search.best_estimator_

# Evaluate the model on the test set
y_pred = best_model.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Results
print("Optimized Random Forest Accuracy:", accuracy)
print("Classification Report:\n", report)


Selected Features: ['chestpain', 'restingBP', 'slope']
Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best Parameters: {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Optimized Random Forest Accuracy: 0.95
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.96      0.94        83
           1       0.97      0.94      0.96       117

    accuracy                           0.95       200
   macro avg       0.95      0.95      0.95       200
weighted avg       0.95      0.95      0.95       200



k-fold cross-validation

In [9]:
from sklearn.model_selection import cross_val_score

cross_val_scores = cross_val_score(rf_model, X_train_selected, y_train, cv=5)
print("Cross-validation scores:", cross_val_scores)
print("Average cross-validation score:", np.mean(cross_val_scores))


Cross-validation scores: [0.975   0.96875 0.94375 0.95    0.94375]
Average cross-validation score: 0.95625


Model Explainability

In [10]:
import joblib

# Save the model
joblib.dump(rf_model, 'final_random_forest_model.pkl')


['final_random_forest_model.pkl']

In [11]:
pip install shap




In [12]:
from flask import Flask, request, jsonify
import joblib

app = Flask(__name__)
model = joblib.load('final_random_forest_model.pkl')

@app.route('/predict', methods=['POST'])
def predict():
    data = request.get_json()
    prediction = model.predict([data['features']])
    return jsonify({'prediction': prediction.tolist()})

if __name__ == '__main__':
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat


In [13]:
pip install gradio


Collecting gradio
  Downloading gradio-5.7.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.5-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.5.0 (from gradio)
  Downloading gradio_client-1.5.0-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart==0.0.12 (from gradio)
  Downloading python_multipart-0.0.12-py3-none-any.whl.metadata (1.9 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.8.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metad

In [4]:
import pandas as pd

# Load your dataset
# Assuming your dataset is already loaded into a DataFrame called `df`
df = pd.read_csv('Cardiovascular_Disease_Dataset.csv')

# Get a list of features (column names) and their data types
feature_info = df.dtypes

# Display feature names and their corresponding data types
print(feature_info)


patientid              int64
age                    int64
gender                 int64
chestpain              int64
restingBP              int64
serumcholestrol        int64
fastingbloodsugar      int64
restingrelectro        int64
maxheartrate           int64
exerciseangia          int64
oldpeak              float64
slope                  int64
noofmajorvessels       int64
target                 int64
dtype: object


In [6]:
import gradio as gr
import joblib

# Load the trained Random Forest model
model = joblib.load('final_random_forest_model.pkl')

# Define the function for prediction
def predict(age, gender, chestpain, restingBP, serumcholestrol, fastingbloodsugar,
            restingrelectro, maxheartrate, exerciseangia, oldpeak, slope,
            noofmajorvessels):
    # Collecting the input features into a list for prediction
    features = [age, gender, chestpain, restingBP, serumcholestrol, fastingbloodsugar,
                restingrelectro, maxheartrate, exerciseangia, oldpeak, slope,
                noofmajorvessels]
    prediction = model.predict([features])
    return prediction[0]

# Create the Gradio interface for the features
iface = gr.Interface(
    fn=predict,  # Function to call for prediction
    inputs=[
        gr.Number(label="Age"),  # Numeric input for age
        gr.Dropdown(label="Gender", choices=[0, 1], type="value"),  # Assuming gender is encoded as 0/1
        gr.Dropdown(label="Chest Pain", choices=[0, 1, 2, 3], type="value"),  # Example for chest pain type
        gr.Number(label="Resting Blood Pressure"),  # Numeric input
        gr.Number(label="Serum Cholesterol"),  # Numeric input
        gr.Dropdown(label="Fasting Blood Sugar", choices=[0, 1], type="value"),  # 0 or 1
        gr.Dropdown(label="Resting Electrocardiographic Results", choices=[0, 1, 2], type="value"),  # Example choices
        gr.Number(label="Max Heart Rate"),  # Numeric input
        gr.Dropdown(label="Exercise Angina", choices=[0, 1], type="value"),  # 0 or 1 for exercise angina
        gr.Number(label="Old Peak (Depression of ST segment)"),  # Numeric input
        gr.Dropdown(label="Slope of the Peak Exercise ST Segment", choices=[0, 1, 2], type="value"),  # Example choices
        gr.Dropdown(label="Number of Major Vessels Colored by Fluoroscopy", choices=[0, 1, 2, 3], type="value")  # Example choices
    ],
    outputs="text",  # Text output for the prediction
    live=True,  # Live prediction as the user types
    title="Cardiovascular Disease Prediction",  # Title of the application
    description="Enter the feature values to predict the presence of cardiovascular disease."  # Description
)

# Launch the Gradio interface
if __name__ == "__main__":
    iface.launch(debug=True)


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://5d6e63063ce669467f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://5d6e63063ce669467f.gradio.live
