In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib
import google.generativeai as genai
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the dataset
df = pd.read_excel("../heart.xlsx", sheet_name="Heart2")

In [4]:
# Clean the data - handle missing values if any
df = df.dropna()

In [6]:
import numpy as np

# Domain-Specific Augmentation
def medical_augmentation(df):
    augmented = []
    
    for _, row in df.iterrows():
        # Create variations based on clinical relationships
        for _ in range(2):  # Create 2 augmented samples per original
            new_row = row.copy()
            
            # If patient has high cholesterol, likely higher blood pressure
            if new_row['Chol'] > 240:
                new_row['RestBP'] += np.random.randint(5, 15)
                
            # If patient has exercise induced angina, likely higher ST depression
            if new_row['ExAng'] == 1:
                new_row['Oldpeak'] += np.random.uniform(0.1, 0.5)
                
            augmented.append(new_row)
    
    return pd.concat([df, pd.DataFrame(augmented)], ignore_index=True)

df = medical_augmentation(df)
df = df.dropna()


In [7]:
df = df.drop_duplicates()
df.tail(10)

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
873,294,63,1,asymptomatic,140,187,0,2,144,1,4.24004,1,2.0,reversable,Yes
874,294,63,1,asymptomatic,140,187,0,2,144,1,4.315216,1,2.0,reversable,Yes
875,295,63,0,asymptomatic,124,197,0,0,136,1,0.130953,2,0.0,normal,Yes
876,295,63,0,asymptomatic,124,197,0,0,136,1,0.486728,2,0.0,normal,Yes
881,298,57,0,asymptomatic,150,241,0,0,123,1,0.398728,2,0.0,reversable,Yes
882,298,57,0,asymptomatic,149,241,0,0,123,1,0.576449,2,0.0,reversable,Yes
883,299,45,1,typical,122,264,0,0,132,0,1.2,2,0.0,reversable,Yes
884,299,45,1,typical,123,264,0,0,132,0,1.2,2,0.0,reversable,Yes
887,301,57,1,asymptomatic,130,131,0,0,115,1,1.492756,2,1.0,reversable,Yes
888,301,57,1,asymptomatic,130,131,0,0,115,1,1.480909,2,1.0,reversable,Yes


In [None]:
from sklearn.preprocessing import LabelEncoder

# At the beginning of your script (after loading data)
all_chest_pain_types = ['typical', 'atypical', 'non-anginal', 'asymptomatic']
all_thal_types = ['normal', 'fixed defect', 'reversible defect']

# Initialize LabelEncoders with all possible categories
chest_pain_encoder = LabelEncoder().fit(all_chest_pain_types)
thal_encoder = LabelEncoder().fit(all_thal_types)
le = LabelEncoder()
le.fit(all_chest_pain_types)  # Fit with all possible categories
df['AHD'] = le.fit_transform(df['AHD'])
df['ChestPain'] = le.fit_transform(df['ChestPain'])
df['Thal'] = le.fit_transform(df['Thal'].astype(str))  # Handle NA values

In [None]:
# Select features and target
features = ['Age', 'Sex', 'ChestPain', 'RestBP', 'Chol', 'Fbs', 'RestECG', 
            'MaxHR', 'ExAng', 'Oldpeak', 'Slope', 'Ca', 'Thal']
X = df[features]
y = df['AHD']


In [None]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Train a model
ml_model = RandomForestClassifier(random_state=42)
ml_model.fit(X_train, y_train)

In [None]:
# Evaluate
y_pred = ml_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.98      0.95        56
           1       0.99      0.94      0.96        79

    accuracy                           0.96       135
   macro avg       0.95      0.96      0.95       135
weighted avg       0.96      0.96      0.96       135



In [None]:
# Save the model
joblib.dump(ml_model, 'heart_disease_model.pkl')
joblib.dump(le, 'label_encoder.pkl')


['label_encoder.pkl']

In [None]:
df.tail()

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
882,298,57,0,0,151,241,0,0,123,1,0.576836,2,0.0,2,1
883,299,45,1,3,119,264,0,0,132,0,1.2,2,0.0,2,1
884,299,45,1,3,115,264,0,0,132,0,1.2,2,0.0,2,1
887,301,57,1,0,130,131,0,0,115,1,1.653636,2,1.0,2,1
888,301,57,1,0,130,131,0,0,115,1,1.410271,2,1.0,2,1


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 672 entries, 0 to 888
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  672 non-null    int64  
 1   Age         672 non-null    int64  
 2   Sex         672 non-null    int64  
 3   ChestPain   672 non-null    int64  
 4   RestBP      672 non-null    int64  
 5   Chol        672 non-null    int64  
 6   Fbs         672 non-null    int64  
 7   RestECG     672 non-null    int64  
 8   MaxHR       672 non-null    int64  
 9   ExAng       672 non-null    int64  
 10  Oldpeak     672 non-null    float64
 11  Slope       672 non-null    int64  
 12  Ca          672 non-null    float64
 13  Thal        672 non-null    int64  
 14  AHD         672 non-null    int64  
dtypes: float64(2), int64(13)
memory usage: 84.0 KB


In [None]:
# SMOTE (Synthetic Minority Oversampling Technique)
from imblearn.over_sampling import SMOTE

# Separate features and target
X = df.drop('AHD', axis=1)
y = df['AHD']

# Apply SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Check new class distribution
print(y_res.value_counts())

AHD
0    344
1    344
Name: count, dtype: int64


In [None]:
# Gaussian Noise Addition
def add_gaussian_noise(df, noise_level=0.05):
    numerical_cols = ['Age', 'RestBP', 'Chol', 'MaxHR', 'Oldpeak']
    noisy_df = df.copy()
    for col in numerical_cols:
        noise = np.random.normal(0, noise_level*df[col].std(), size=len(df))
        noisy_df[col] = df[col] + noise
    return noisy_df

gauss_augmented_df = add_gaussian_noise(df)

In [None]:
gauss_augmented_df.tail()

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
882,298,56.830176,0,0,149.364815,239.052322,0,0,123.537299,1,0.561563,2,0.0,2,1
883,299,45.764927,1,3,119.118723,265.830104,0,0,131.094472,0,1.205863,2,0.0,2,1
884,299,45.181562,1,3,115.257561,264.222601,0,0,130.683624,0,1.181024,2,0.0,2,1
887,301,56.881016,1,0,127.926228,128.196177,0,0,114.938299,1,1.769028,2,1.0,2,1
888,301,56.857741,1,0,129.027234,131.536941,0,0,115.07984,1,1.331046,2,1.0,2,1


In [None]:
# Feature Value Perturbation
def perturb_features(df, perturbation_factor=0.1):
    numerical_cols = ['Age', 'RestBP', 'Chol', 'MaxHR', 'Oldpeak']
    perturbed_samples = []
    
    for _, row in df.iterrows():
        new_row = row.copy()
        for col in numerical_cols:
            # Perturb within ±10% of original value
            perturbation = np.random.uniform(-perturbation_factor, perturbation_factor)
            new_row[col] = row[col] * (1 + perturbation)
        perturbed_samples.append(new_row)
    
    return pd.concat([df, pd.DataFrame(perturbed_samples)], ignore_index=True)

perturb_augmented_df = perturb_features(df)

In [None]:
perturb_augmented_df.tail()

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
1339,298.0,61.055621,0.0,0.0,148.846866,226.983662,0.0,0.0,132.944282,1.0,0.625284,2.0,0.0,2.0,1.0
1340,299.0,42.976061,1.0,3.0,120.050753,278.513355,0.0,0.0,137.692543,0.0,1.113538,2.0,0.0,2.0,1.0
1341,299.0,48.046678,1.0,3.0,120.489288,245.731175,0.0,0.0,140.51235,0.0,1.261445,2.0,0.0,2.0,1.0
1342,301.0,58.98152,1.0,0.0,124.336776,136.397789,0.0,0.0,125.265707,1.0,1.721207,2.0,1.0,2.0,1.0
1343,301.0,51.957374,1.0,0.0,117.564289,131.164994,0.0,0.0,108.855716,1.0,1.410226,2.0,1.0,2.0,1.0


In [None]:
# Domain-Specific Augmentation
def medical_augmentation(df):
    augmented = []
    
    for _, row in df.iterrows():
        # Create variations based on clinical relationships
        for _ in range(2):  # Create 2 augmented samples per original
            new_row = row.copy()
            
            # If patient has high cholesterol, likely higher blood pressure
            if new_row['Chol'] > 240:
                new_row['RestBP'] += np.random.randint(5, 15)
                
            # If patient has exercise induced angina, likely higher ST depression
            if new_row['ExAng'] == 1:
                new_row['Oldpeak'] += np.random.uniform(0.1, 0.5)
                
            augmented.append(new_row)
    
    return pd.concat([df, pd.DataFrame(augmented)], ignore_index=True)

domain_augmented_df = medical_augmentation(df)

In [None]:
domain_augmented_df.tail(10)

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
2006,298.0,57.0,0.0,0.0,161.0,241.0,0.0,0.0,123.0,1.0,0.792354,2.0,0.0,2.0,1.0
2007,298.0,57.0,0.0,0.0,162.0,241.0,0.0,0.0,123.0,1.0,1.067831,2.0,0.0,2.0,1.0
2008,299.0,45.0,1.0,3.0,125.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,2.0,1.0
2009,299.0,45.0,1.0,3.0,129.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,2.0,1.0
2010,299.0,45.0,1.0,3.0,121.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,2.0,1.0
2011,299.0,45.0,1.0,3.0,120.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,2.0,1.0
2012,301.0,57.0,1.0,0.0,130.0,131.0,0.0,0.0,115.0,1.0,1.76588,2.0,1.0,2.0,1.0
2013,301.0,57.0,1.0,0.0,130.0,131.0,0.0,0.0,115.0,1.0,1.854552,2.0,1.0,2.0,1.0
2014,301.0,57.0,1.0,0.0,130.0,131.0,0.0,0.0,115.0,1.0,1.599756,2.0,1.0,2.0,1.0
2015,301.0,57.0,1.0,0.0,130.0,131.0,0.0,0.0,115.0,1.0,1.694116,2.0,1.0,2.0,1.0


In [None]:
domain_augmented_df = domain_augmented_df.dropna()
domain_augmented_df.tail(10)

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
2006,298.0,57.0,0.0,0.0,161.0,241.0,0.0,0.0,123.0,1.0,0.792354,2.0,0.0,2.0,1.0
2007,298.0,57.0,0.0,0.0,162.0,241.0,0.0,0.0,123.0,1.0,1.067831,2.0,0.0,2.0,1.0
2008,299.0,45.0,1.0,3.0,125.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,2.0,1.0
2009,299.0,45.0,1.0,3.0,129.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,2.0,1.0
2010,299.0,45.0,1.0,3.0,121.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,2.0,1.0
2011,299.0,45.0,1.0,3.0,120.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,2.0,1.0
2012,301.0,57.0,1.0,0.0,130.0,131.0,0.0,0.0,115.0,1.0,1.76588,2.0,1.0,2.0,1.0
2013,301.0,57.0,1.0,0.0,130.0,131.0,0.0,0.0,115.0,1.0,1.854552,2.0,1.0,2.0,1.0
2014,301.0,57.0,1.0,0.0,130.0,131.0,0.0,0.0,115.0,1.0,1.599756,2.0,1.0,2.0,1.0
2015,301.0,57.0,1.0,0.0,130.0,131.0,0.0,0.0,115.0,1.0,1.694116,2.0,1.0,2.0,1.0


In [None]:
# Save the model
joblib.dump(ml_model, 'heart_disease_model.pkl')
joblib.dump(le, 'label_encoder.pkl')


['label_encoder.pkl']

In [None]:
# Configure Gemini
genai.configure(api_key='GEMINI_API_KEY')

# Initialize the model
model_name = 'gemini-2.0-flash'
generation_config = {
    "temperature": 0.7,
    "top_p": 1,
    "top_k": 1,
    "max_output_tokens": 2048,
}


gemini_model = genai.GenerativeModel(model_name=model_name,
                            generation_config=generation_config)


In [None]:
import streamlit as st

# Load the saved model and encoder
model = joblib.load('heart_disease_model.pkl')
le = joblib.load('label_encoder.pkl')

In [None]:
def predict_ahd(input_data):
    """Predict AHD based on input features"""
    try:
        # Convert input to DataFrame
        input_df = pd.DataFrame([input_data])
        
        # Convert categorical variables
        input_df['ChestPain'] = chest_pain_encoder.transform([input_data['ChestPain']])[0]
        input_df['Thal'] = thal_encoder.transform([str(input_data['Thal'])])[0]
        
        # Make prediction
        prediction = model.predict(input_df)[0]
        probability = model.predict_proba(input_df)[0][1]
        
        return prediction, probability
    except Exception as e:
        print(f"Error in prediction: {e}")
        return None, None

def generate_explanation(input_data, prediction, probability):
    """Generate natural language explanation using Gemini"""
    prompt = f"""
    A patient with the following characteristics:
    - Age: {input_data['Age']}
    - Sex: {'Male' if input_data['Sex'] == 1 else 'Female'}
    - Chest Pain Type: {input_data['ChestPain']}
    - Resting Blood Pressure: {input_data['RestBP']} mmHg
    - Cholesterol: {input_data['Chol']} mg/dl
    - Fasting Blood Sugar > 120 mg/dl: {'Yes' if input_data['Fbs'] == 1 else 'No'}
    - Resting ECG Results: {input_data['RestECG']}
    - Maximum Heart Rate Achieved: {input_data['MaxHR']}
    - Exercise Induced Angina: {'Yes' if input_data['ExAng'] == 1 else 'No'}
    - ST Depression Induced by Exercise: {input_data['Oldpeak']}
    - Slope of Peak Exercise ST Segment: {input_data['Slope']}
    - Number of Major Vessels Colored by Fluoroscopy: {input_data['Ca']}
    - Thalassemia: {input_data['Thal']}
    
    Has a {'high' if probability > 0.7 else 'moderate' if probability > 0.5 else 'low'} probability ({probability*100:.1f}%) of having angiographic heart disease (AHD).
    
    Please provide a detailed explanation in simple terms for a non-medical person about what this prediction means, which factors contributed most to this prediction, and what they should do next.
    """
    
    response = gemini_model.generate_content(prompt)
    return response.text