In [253]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import joblib
import google.generativeai as genai
import os
from ctgan import CTGAN


In [254]:
# Load the dataset
df = pd.read_excel("heart.xlsx", sheet_name="Heart2")

In [255]:
# Clean the data - handle missing values if any
df = df.dropna()

In [256]:
from sklearn.preprocessing import LabelEncoder

all_chest_pain_types = ['typical', 'atypical', 'non-anginal', 'asymptomatic']
all_thal_types = ['normal', 'fixed defect', 'reversible defect']

# Initialize LabelEncoders with all possible categories
chest_pain_encoder = LabelEncoder().fit(all_chest_pain_types)
thal_encoder = LabelEncoder().fit(all_thal_types)
le = LabelEncoder()
le.fit(all_chest_pain_types)  # Fit with all possible categories
df['AHD'] = le.fit_transform(df['AHD'])
df['ChestPain'] = le.fit_transform(df['ChestPain'])
df['Thal'] = le.fit_transform(df['Thal'].astype(str))  # Handle NA values

In [257]:
import numpy as np

# Domain-Specific Augmentation
def medical_augmentation(df):
    augmented = []
    
    for _, row in df.iterrows():
        # Create variations based on clinical relationships
        for _ in range(2):  # Create 2 augmented samples per original
            new_row = row.copy()
            new_row['Age'] += np.random.randint(-15, 16)
            # If patient has high cholesterol, likely higher blood pressure
            if new_row['Chol'] > 240:
                new_row['RestBP'] += np.random.randint(5, 15)
                
            # If patient has exercise induced angina, likely higher ST depression
            if new_row['ExAng'] == 1:
                new_row['Oldpeak'] += np.random.uniform(0.1, 0.5)
                
            new_row['RestBP'] += np.random.randint(-10, 11)
            new_row['Oldpeak'] += np.random.uniform(-0.5, 0.5)
            new_row['MaxHR'] += np.random.randint(-10, 11)
            new_row['AHD'] = np.random.choice([0, 1])  # Randomly assign AHD
            
            augmented.append(new_row)
            
			# Randomize age within a realistic range
			
    return pd.concat([df, pd.DataFrame(augmented)], ignore_index=True)

df = medical_augmentation(df)
df.tail(10)


Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
881,298.0,69.0,0.0,0.0,152.0,241.0,0.0,0.0,132.0,1.0,0.798408,2.0,0.0,2.0,1.0
882,298.0,64.0,0.0,0.0,155.0,241.0,0.0,0.0,120.0,1.0,0.267793,2.0,0.0,2.0,0.0
883,299.0,53.0,1.0,3.0,122.0,264.0,0.0,0.0,139.0,0.0,1.218203,2.0,0.0,2.0,1.0
884,299.0,58.0,1.0,3.0,130.0,264.0,0.0,0.0,122.0,0.0,1.466379,2.0,0.0,2.0,1.0
885,300.0,71.0,1.0,0.0,145.0,193.0,1.0,0.0,134.0,0.0,3.587416,2.0,2.0,2.0,1.0
886,300.0,73.0,1.0,0.0,146.0,193.0,1.0,0.0,151.0,0.0,3.473871,2.0,2.0,2.0,0.0
887,301.0,52.0,1.0,0.0,140.0,131.0,0.0,0.0,111.0,1.0,1.222904,2.0,1.0,2.0,1.0
888,301.0,62.0,1.0,0.0,138.0,131.0,0.0,0.0,114.0,1.0,1.792998,2.0,1.0,2.0,0.0
889,302.0,46.0,0.0,2.0,120.0,236.0,0.0,2.0,169.0,0.0,-0.132198,2.0,1.0,1.0,1.0
890,302.0,67.0,0.0,2.0,124.0,236.0,0.0,2.0,164.0,0.0,-0.41758,2.0,1.0,1.0,0.0


In [258]:
def ctgan_augmentation(df, target_column='AHD', augment_size=None):
    """
    Generate synthetic samples using CTGAN.
    
    Args:
        df: Original DataFrame
        target_column: The target column to condition on
        augment_size: Number of synthetic samples to generate (default: same as original)
    
    Returns:
        Augmented DataFrame
    """
    # Initialize CTGAN model
    model = CTGAN(epochs=100, verbose=True)
    print("Training CTGAN model...")

    # Fit the model to the original data
    model.fit(df)
    
    # Generate synthetic samples
    if augment_size is None:
        augment_size = len(df)
    
    synthetic_data = model.sample(augment_size)
    
    return pd.concat([df, synthetic_data], ignore_index=True)

In [259]:
# Apply CTGAN augmentation
ctgan_augmented_df = ctgan_augmentation(df)

# Display the augmented data
print("Original data size:", len(df))
print("CTGAN augmented data size:", len(ctgan_augmented_df))
#df = ctgan_augmented_df

Training CTGAN model...


Gen. (-1.13) | Discrim. (0.09): 100%|██████████| 100/100 [00:07<00:00, 12.50it/s]


Original data size: 891
CTGAN augmented data size: 1782


In [260]:
# Select features and target
features = ['Age', 'Sex', 'ChestPain', 'RestBP', 'Chol', 'Fbs', 'RestECG', 
            'MaxHR', 'ExAng', 'Oldpeak', 'Slope', 'Ca', 'Thal']
X = df[features]
y = df['AHD']


In [261]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=64)

In [262]:
# Train a model
ml_model = RandomForestClassifier(random_state=64)
ml_model.fit(X_train, y_train)

In [263]:
# Evaluate
y_pred = ml_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.60      0.62      0.61       102
         1.0       0.47      0.45      0.46        77

    accuracy                           0.55       179
   macro avg       0.54      0.54      0.54       179
weighted avg       0.55      0.55      0.55       179



In [264]:
# Save the model
joblib.dump(ml_model, 'heart_disease_model.pkl')
joblib.dump(le, 'label_encoder.pkl')


['label_encoder.pkl']

In [265]:
df.tail()

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
886,300.0,73.0,1.0,0.0,146.0,193.0,1.0,0.0,151.0,0.0,3.473871,2.0,2.0,2.0,0.0
887,301.0,52.0,1.0,0.0,140.0,131.0,0.0,0.0,111.0,1.0,1.222904,2.0,1.0,2.0,1.0
888,301.0,62.0,1.0,0.0,138.0,131.0,0.0,0.0,114.0,1.0,1.792998,2.0,1.0,2.0,0.0
889,302.0,46.0,0.0,2.0,120.0,236.0,0.0,2.0,169.0,0.0,-0.132198,2.0,1.0,1.0,1.0
890,302.0,67.0,0.0,2.0,124.0,236.0,0.0,2.0,164.0,0.0,-0.41758,2.0,1.0,1.0,0.0


In [266]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  891 non-null    float64
 1   Age         891 non-null    float64
 2   Sex         891 non-null    float64
 3   ChestPain   891 non-null    float64
 4   RestBP      891 non-null    float64
 5   Chol        891 non-null    float64
 6   Fbs         891 non-null    float64
 7   RestECG     891 non-null    float64
 8   MaxHR       891 non-null    float64
 9   ExAng       891 non-null    float64
 10  Oldpeak     891 non-null    float64
 11  Slope       891 non-null    float64
 12  Ca          891 non-null    float64
 13  Thal        891 non-null    float64
 14  AHD         891 non-null    float64
dtypes: float64(15)
memory usage: 104.5 KB


In [267]:
# Gaussian Noise Addition
def add_gaussian_noise(df, noise_level=0.05):
    numerical_cols = ['Age', 'RestBP', 'Chol', 'MaxHR', 'Oldpeak']
    noisy_df = df.copy()
    for col in numerical_cols:
        noise = np.random.normal(0, noise_level*df[col].std(), size=len(df))
        noisy_df[col] = df[col] + noise
    return noisy_df

gauss_augmented_df = add_gaussian_noise(df)

In [268]:
gauss_augmented_df.tail()

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
886,300.0,73.065507,1.0,0.0,146.690033,188.196564,1.0,0.0,150.960274,0.0,3.500059,2.0,2.0,2.0,0.0
887,301.0,52.05385,1.0,0.0,140.6753,125.983283,0.0,0.0,111.603105,1.0,1.356256,2.0,1.0,2.0,1.0
888,301.0,62.379734,1.0,0.0,138.041753,130.997956,0.0,0.0,114.423272,1.0,1.739462,2.0,1.0,2.0,0.0
889,302.0,46.832593,0.0,2.0,119.26716,240.405829,0.0,2.0,167.770991,0.0,-0.094436,2.0,1.0,1.0,1.0
890,302.0,67.473779,0.0,2.0,123.02592,231.052055,0.0,2.0,162.73817,0.0,-0.467981,2.0,1.0,1.0,0.0


In [269]:
# Feature Value Perturbation
def perturb_features(df, perturbation_factor=0.1):
    numerical_cols = ['Age', 'RestBP', 'Chol', 'MaxHR', 'Oldpeak']
    perturbed_samples = []
    
    for _, row in df.iterrows():
        new_row = row.copy()
        for col in numerical_cols:
            # Perturb within ±10% of original value
            perturbation = np.random.uniform(-perturbation_factor, perturbation_factor)
            new_row[col] = row[col] * (1 + perturbation)
        perturbed_samples.append(new_row)
    
    return pd.concat([df, pd.DataFrame(perturbed_samples)], ignore_index=True)

perturb_augmented_df = perturb_features(df)

In [270]:
perturb_augmented_df.tail()

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
1777,300.0,69.952105,1.0,0.0,149.859525,186.10663,1.0,0.0,139.785993,0.0,3.654098,2.0,2.0,2.0,0.0
1778,301.0,53.389061,1.0,0.0,132.977519,129.870592,0.0,0.0,118.72257,1.0,1.232104,2.0,1.0,2.0,1.0
1779,301.0,66.216836,1.0,0.0,140.704857,137.319161,0.0,0.0,110.89984,1.0,1.634691,2.0,1.0,2.0,0.0
1780,302.0,45.501158,0.0,2.0,129.413865,227.81861,0.0,2.0,157.814061,0.0,-0.127138,2.0,1.0,1.0,1.0
1781,302.0,72.51394,0.0,2.0,117.472387,230.882784,0.0,2.0,170.871485,0.0,-0.404891,2.0,1.0,1.0,0.0


In [271]:
# Configure Gemini
genai.configure(api_key='GEMINI_API_KEY')

# Initialize the model
model_name = 'gemini-2.0-flash'
generation_config = {
    "temperature": 0.7,
    "top_p": 1,
    "top_k": 1,
    "max_output_tokens": 2048,
}


gemini_model = genai.GenerativeModel(model_name=model_name,
                            generation_config=generation_config)


In [272]:
import streamlit as st

# Load the saved model and encoder
model = joblib.load('heart_disease_model.pkl')
le = joblib.load('label_encoder.pkl')

In [273]:
def predict_ahd(input_data):
    """Predict AHD based on input features"""
    try:
        # Convert input to DataFrame
        input_df = pd.DataFrame([input_data])
        
        # Convert categorical variables
        input_df['ChestPain'] = chest_pain_encoder.transform([input_data['ChestPain']])[0]
        input_df['Thal'] = thal_encoder.transform([str(input_data['Thal'])])[0]
        
        # Make prediction
        prediction = model.predict(input_df)[0]
        probability = model.predict_proba(input_df)[0][1]
        
        return prediction, probability
    except Exception as e:
        print(f"Error in prediction: {e}")
        return None, None

def generate_explanation(input_data, prediction, probability):
    """Generate natural language explanation using Gemini"""
    prompt = f"""
    A patient with the following characteristics:
    - Age: {input_data['Age']}
    - Sex: {'Male' if input_data['Sex'] == 1 else 'Female'}
    - Chest Pain Type: {input_data['ChestPain']}
    - Resting Blood Pressure: {input_data['RestBP']} mmHg
    - Cholesterol: {input_data['Chol']} mg/dl
    - Fasting Blood Sugar > 120 mg/dl: {'Yes' if input_data['Fbs'] == 1 else 'No'}
    - Resting ECG Results: {input_data['RestECG']}
    - Maximum Heart Rate Achieved: {input_data['MaxHR']}
    - Exercise Induced Angina: {'Yes' if input_data['ExAng'] == 1 else 'No'}
    - ST Depression Induced by Exercise: {input_data['Oldpeak']}
    - Slope of Peak Exercise ST Segment: {input_data['Slope']}
    - Number of Major Vessels Colored by Fluoroscopy: {input_data['Ca']}
    - Thalassemia: {input_data['Thal']}
    
    The prediction is of {prediction}
    Has a {'high' if probability > 0.7 else 'moderate' if probability > 0.5 else 'low'} probability ({probability*100:.1f}%) of having angiographic heart disease (AHD).
    
    Please provide a detailed explanation in simple terms for a non-medical person about what this prediction means, which factors contributed most to this prediction, and what they should do next.
    """
    
    response = gemini_model.generate_content(prompt)
    return response.text