In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib
import google.generativeai as genai
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the dataset
df = pd.read_excel("heart.xlsx", sheet_name="Heart2")

In [4]:
# Clean the data - handle missing values if any
df = df.dropna()

In [5]:
import numpy as np

# Domain-Specific Augmentation
def medical_augmentation(df):
    augmented = []
    
    for _, row in df.iterrows():
        # Create variations based on clinical relationships
        for _ in range(2):  # Create 2 augmented samples per original
            new_row = row.copy()
            
            # If patient has high cholesterol, likely higher blood pressure
            if new_row['Chol'] > 240:
                new_row['RestBP'] += np.random.randint(5, 15)
                
            # If patient has exercise induced angina, likely higher ST depression
            if new_row['ExAng'] == 1:
                new_row['Oldpeak'] += np.random.uniform(0.1, 0.5)
                
            augmented.append(new_row)
    
    return pd.concat([df, pd.DataFrame(augmented)], ignore_index=True)

df = medical_augmentation(df)
df = df.dropna()


In [6]:
df = df.drop_duplicates()
df.tail(10)

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
873,294,63,1,asymptomatic,140,187,0,2,144,1,4.399999,1,2.0,reversable,Yes
874,294,63,1,asymptomatic,140,187,0,2,144,1,4.117165,1,2.0,reversable,Yes
875,295,63,0,asymptomatic,124,197,0,0,136,1,0.413978,2,0.0,normal,Yes
876,295,63,0,asymptomatic,124,197,0,0,136,1,0.119624,2,0.0,normal,Yes
881,298,57,0,asymptomatic,151,241,0,0,123,1,0.469509,2,0.0,reversable,Yes
882,298,57,0,asymptomatic,150,241,0,0,123,1,0.374597,2,0.0,reversable,Yes
883,299,45,1,typical,116,264,0,0,132,0,1.2,2,0.0,reversable,Yes
884,299,45,1,typical,121,264,0,0,132,0,1.2,2,0.0,reversable,Yes
887,301,57,1,asymptomatic,130,131,0,0,115,1,1.532385,2,1.0,reversable,Yes
888,301,57,1,asymptomatic,130,131,0,0,115,1,1.506042,2,1.0,reversable,Yes


In [7]:
from sklearn.preprocessing import LabelEncoder

# At the beginning of your script (after loading data)
all_chest_pain_types = ['typical', 'atypical', 'non-anginal', 'asymptomatic']
all_thal_types = ['normal', 'fixed defect', 'reversible defect']

# Initialize LabelEncoders with all possible categories
chest_pain_encoder = LabelEncoder().fit(all_chest_pain_types)
thal_encoder = LabelEncoder().fit(all_thal_types)
le = LabelEncoder()
le.fit(all_chest_pain_types)  # Fit with all possible categories
df['AHD'] = le.fit_transform(df['AHD'])
df['ChestPain'] = le.fit_transform(df['ChestPain'])
df['Thal'] = le.fit_transform(df['Thal'].astype(str))  # Handle NA values

In [8]:
# Select features and target
features = ['Age', 'Sex', 'ChestPain', 'RestBP', 'Chol', 'Fbs', 'RestECG', 
            'MaxHR', 'ExAng', 'Oldpeak', 'Slope', 'Ca', 'Thal']
X = df[features]
y = df['AHD']


In [9]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Train a model
ml_model = RandomForestClassifier(random_state=42)
ml_model.fit(X_train, y_train)

In [11]:
# Evaluate
y_pred = ml_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96        58
           1       1.00      0.93      0.97        76

    accuracy                           0.96       134
   macro avg       0.96      0.97      0.96       134
weighted avg       0.97      0.96      0.96       134



In [12]:
# Save the model
joblib.dump(ml_model, 'heart_disease_model.pkl')
joblib.dump(le, 'label_encoder.pkl')


['label_encoder.pkl']

In [13]:
df.tail()

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
882,298,57,0,0,150,241,0,0,123,1,0.374597,2,0.0,2,1
883,299,45,1,3,116,264,0,0,132,0,1.2,2,0.0,2,1
884,299,45,1,3,121,264,0,0,132,0,1.2,2,0.0,2,1
887,301,57,1,0,130,131,0,0,115,1,1.532385,2,1.0,2,1
888,301,57,1,0,130,131,0,0,115,1,1.506042,2,1.0,2,1


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 670 entries, 0 to 888
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  670 non-null    int64  
 1   Age         670 non-null    int64  
 2   Sex         670 non-null    int64  
 3   ChestPain   670 non-null    int64  
 4   RestBP      670 non-null    int64  
 5   Chol        670 non-null    int64  
 6   Fbs         670 non-null    int64  
 7   RestECG     670 non-null    int64  
 8   MaxHR       670 non-null    int64  
 9   ExAng       670 non-null    int64  
 10  Oldpeak     670 non-null    float64
 11  Slope       670 non-null    int64  
 12  Ca          670 non-null    float64
 13  Thal        670 non-null    int64  
 14  AHD         670 non-null    int64  
dtypes: float64(2), int64(13)
memory usage: 83.8 KB


In [15]:
# SMOTE (Synthetic Minority Oversampling Technique)
from imblearn.over_sampling import SMOTE

# Separate features and target
X = df.drop('AHD', axis=1)
y = df['AHD']

# Apply SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Check new class distribution
print(y_res.value_counts())

AHD
0    346
1    346
Name: count, dtype: int64


In [16]:
# Gaussian Noise Addition
def add_gaussian_noise(df, noise_level=0.05):
    numerical_cols = ['Age', 'RestBP', 'Chol', 'MaxHR', 'Oldpeak']
    noisy_df = df.copy()
    for col in numerical_cols:
        noise = np.random.normal(0, noise_level*df[col].std(), size=len(df))
        noisy_df[col] = df[col] + noise
    return noisy_df

gauss_augmented_df = add_gaussian_noise(df)

In [17]:
gauss_augmented_df.tail()

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
882,298,56.755464,0,0,150.43433,242.040196,0,0,123.713415,1,0.283562,2,0.0,2,1
883,299,45.398873,1,3,114.356027,264.702742,0,0,131.487381,0,1.321008,2,0.0,2,1
884,299,44.401213,1,3,119.444773,267.673467,0,0,130.536618,0,1.24507,2,0.0,2,1
887,301,56.805288,1,0,129.249096,126.655665,0,0,114.372241,1,1.606775,2,1.0,2,1
888,301,57.304238,1,0,130.970508,138.514223,0,0,117.648792,1,1.382226,2,1.0,2,1


In [18]:
# Feature Value Perturbation
def perturb_features(df, perturbation_factor=0.1):
    numerical_cols = ['Age', 'RestBP', 'Chol', 'MaxHR', 'Oldpeak']
    perturbed_samples = []
    
    for _, row in df.iterrows():
        new_row = row.copy()
        for col in numerical_cols:
            # Perturb within ±10% of original value
            perturbation = np.random.uniform(-perturbation_factor, perturbation_factor)
            new_row[col] = row[col] * (1 + perturbation)
        perturbed_samples.append(new_row)
    
    return pd.concat([df, pd.DataFrame(perturbed_samples)], ignore_index=True)

perturb_augmented_df = perturb_features(df)

In [19]:
perturb_augmented_df.tail()

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
1335,298.0,54.458933,0.0,0.0,137.463154,257.242223,0.0,0.0,130.24207,1.0,0.399512,2.0,0.0,2.0,1.0
1336,299.0,43.168038,1.0,3.0,107.877583,272.928734,0.0,0.0,134.933978,0.0,1.140941,2.0,0.0,2.0,1.0
1337,299.0,44.240183,1.0,3.0,113.345431,249.369513,0.0,0.0,121.47976,0.0,1.132114,2.0,0.0,2.0,1.0
1338,301.0,61.629332,1.0,0.0,142.289561,135.96754,0.0,0.0,115.510389,1.0,1.643178,2.0,1.0,2.0,1.0
1339,301.0,61.238015,1.0,0.0,129.7952,133.584041,0.0,0.0,120.728567,1.0,1.634602,2.0,1.0,2.0,1.0


In [20]:
# Domain-Specific Augmentation
def medical_augmentation(df):
    augmented = []
    
    for _, row in df.iterrows():
        # Create variations based on clinical relationships
        for _ in range(2):  # Create 2 augmented samples per original
            new_row = row.copy()
            
            # If patient has high cholesterol, likely higher blood pressure
            if new_row['Chol'] > 240:
                new_row['RestBP'] += np.random.randint(5, 15)
                
            # If patient has exercise induced angina, likely higher ST depression
            if new_row['ExAng'] == 1:
                new_row['Oldpeak'] += np.random.uniform(0.1, 0.5)
                
            augmented.append(new_row)
    
    return pd.concat([df, pd.DataFrame(augmented)], ignore_index=True)

domain_augmented_df = medical_augmentation(df)

In [21]:
domain_augmented_df.tail(10)

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
2000,298.0,57.0,0.0,0.0,156.0,241.0,0.0,0.0,123.0,1.0,0.515318,2.0,0.0,2.0,1.0
2001,298.0,57.0,0.0,0.0,158.0,241.0,0.0,0.0,123.0,1.0,0.603677,2.0,0.0,2.0,1.0
2002,299.0,45.0,1.0,3.0,129.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,2.0,1.0
2003,299.0,45.0,1.0,3.0,124.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,2.0,1.0
2004,299.0,45.0,1.0,3.0,133.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,2.0,1.0
2005,299.0,45.0,1.0,3.0,132.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,2.0,1.0
2006,301.0,57.0,1.0,0.0,130.0,131.0,0.0,0.0,115.0,1.0,1.898247,2.0,1.0,2.0,1.0
2007,301.0,57.0,1.0,0.0,130.0,131.0,0.0,0.0,115.0,1.0,1.662741,2.0,1.0,2.0,1.0
2008,301.0,57.0,1.0,0.0,130.0,131.0,0.0,0.0,115.0,1.0,1.674238,2.0,1.0,2.0,1.0
2009,301.0,57.0,1.0,0.0,130.0,131.0,0.0,0.0,115.0,1.0,1.893081,2.0,1.0,2.0,1.0


In [22]:
domain_augmented_df = domain_augmented_df.dropna()
domain_augmented_df.tail(10)

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
2000,298.0,57.0,0.0,0.0,156.0,241.0,0.0,0.0,123.0,1.0,0.515318,2.0,0.0,2.0,1.0
2001,298.0,57.0,0.0,0.0,158.0,241.0,0.0,0.0,123.0,1.0,0.603677,2.0,0.0,2.0,1.0
2002,299.0,45.0,1.0,3.0,129.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,2.0,1.0
2003,299.0,45.0,1.0,3.0,124.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,2.0,1.0
2004,299.0,45.0,1.0,3.0,133.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,2.0,1.0
2005,299.0,45.0,1.0,3.0,132.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,2.0,1.0
2006,301.0,57.0,1.0,0.0,130.0,131.0,0.0,0.0,115.0,1.0,1.898247,2.0,1.0,2.0,1.0
2007,301.0,57.0,1.0,0.0,130.0,131.0,0.0,0.0,115.0,1.0,1.662741,2.0,1.0,2.0,1.0
2008,301.0,57.0,1.0,0.0,130.0,131.0,0.0,0.0,115.0,1.0,1.674238,2.0,1.0,2.0,1.0
2009,301.0,57.0,1.0,0.0,130.0,131.0,0.0,0.0,115.0,1.0,1.893081,2.0,1.0,2.0,1.0


In [23]:
# Save the model
joblib.dump(ml_model, 'heart_disease_model.pkl')
joblib.dump(le, 'label_encoder.pkl')


['label_encoder.pkl']

In [24]:
# Configure Gemini
genai.configure(api_key='GEMINI_API_KEY')

# Initialize the model
model_name = 'gemini-2.0-flash'
generation_config = {
    "temperature": 0.7,
    "top_p": 1,
    "top_k": 1,
    "max_output_tokens": 2048,
}


gemini_model = genai.GenerativeModel(model_name=model_name,
                            generation_config=generation_config)


In [25]:
import streamlit as st

# Load the saved model and encoder
model = joblib.load('heart_disease_model.pkl')
le = joblib.load('label_encoder.pkl')

In [26]:
def predict_ahd(input_data):
    """Predict AHD based on input features"""
    try:
        # Convert input to DataFrame
        input_df = pd.DataFrame([input_data])
        
        # Convert categorical variables
        input_df['ChestPain'] = chest_pain_encoder.transform([input_data['ChestPain']])[0]
        input_df['Thal'] = thal_encoder.transform([str(input_data['Thal'])])[0]
        
        # Make prediction
        prediction = model.predict(input_df)[0]
        probability = model.predict_proba(input_df)[0][1]
        
        return prediction, probability
    except Exception as e:
        print(f"Error in prediction: {e}")
        return None, None

def generate_explanation(input_data, prediction, probability):
    """Generate natural language explanation using Gemini"""
    prompt = f"""
    A patient with the following characteristics:
    - Age: {input_data['Age']}
    - Sex: {'Male' if input_data['Sex'] == 1 else 'Female'}
    - Chest Pain Type: {input_data['ChestPain']}
    - Resting Blood Pressure: {input_data['RestBP']} mmHg
    - Cholesterol: {input_data['Chol']} mg/dl
    - Fasting Blood Sugar > 120 mg/dl: {'Yes' if input_data['Fbs'] == 1 else 'No'}
    - Resting ECG Results: {input_data['RestECG']}
    - Maximum Heart Rate Achieved: {input_data['MaxHR']}
    - Exercise Induced Angina: {'Yes' if input_data['ExAng'] == 1 else 'No'}
    - ST Depression Induced by Exercise: {input_data['Oldpeak']}
    - Slope of Peak Exercise ST Segment: {input_data['Slope']}
    - Number of Major Vessels Colored by Fluoroscopy: {input_data['Ca']}
    - Thalassemia: {input_data['Thal']}
    
    Has a {'high' if probability > 0.7 else 'moderate' if probability > 0.5 else 'low'} probability ({probability*100:.1f}%) of having angiographic heart disease (AHD).
    
    Please provide a detailed explanation in simple terms for a non-medical person about what this prediction means, which factors contributed most to this prediction, and what they should do next.
    """
    
    response = gemini_model.generate_content(prompt)
    return response.text