<a href="https://colab.research.google.com/github/myself-rajarajan/LIL/blob/main/ml_for_combined_Dataset_lil.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [14]:
import os

# Define the path to your folder
my_folder_path = '/content/drive/My Drive/Colab_Notebooks'

# List the files inside that folder to confirm access
print(os.listdir(my_folder_path))

['Heart Disease Predictor', 'reports', 'ml_for_combined_Dataset_lil.ipynb']


In [18]:
# =============================================================================
# 1. IMPORT LIBRARIES
# =============================================================================
import os
import pandas as pd
import numpy as np
import joblib
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from fpdf import FPDF
from ucimlrepo import fetch_ucirepo  # <-- LIBRARY FOR NEW DATASET LOADING

# =============================================================================
# 2. LOAD AND PREPARE THE DATA (USING UCIMLREPO)
# =============================================================================
print("Fetching and combining datasets...")

# --- Part A: Fetch dataset using ucimlrepo ---
# This fetches a consolidated version of the heart disease dataset
heart_disease = fetch_ucirepo(id=45)
X_data = heart_disease.data.features
y_data = heart_disease.data.targets
df_repo = pd.concat([X_data, y_data], axis=1)
df_repo.rename(columns={'num': 'target'}, inplace=True)

# --- Part B: Fetch the specific Cleveland dataset from URL ---
cleveland_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
column_names = ["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach",
                "exang", "oldpeak", "slope", "ca", "thal", "target"]
df_cleveland = pd.read_csv(cleveland_url, names=column_names, na_values='?')

# --- Part C: Combine the two dataframes ---
# We combine both sources to create a more comprehensive dataset
df = pd.concat([df_repo, df_cleveland], ignore_index=True)
# Remove any duplicate rows that might exist between the two sources
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True) # Reset index after dropping rows

# --- Data Cleaning (These steps are still essential) ---
# The data fetched contains missing values (NaNs), which must be removed.
df.dropna(inplace=True)

# Ensure all columns have a numeric type for the models
df = df.astype(float)

# The 'target' column has values from 0 (no disease) to 4.
# We convert this into a binary classification problem: 0 = No Disease, 1 = Has Disease.
df["target"] = df["target"].apply(lambda x: 1 if x > 0 else 0)
print("Dataset prepared successfully.")

# =============================================================================
# 3. FEATURE ENGINEERING AND MODEL TRAINING
# =============================================================================
# Separate the cleaned features (X) and target (y)
X = df.drop(columns=["target"])
y = df["target"]

# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale the features to standardize their range, improving model performance
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Train Machine Learning Models ---
print("Training models...")
log_reg = LogisticRegression(random_state=42, solver='liblinear')
log_reg.fit(X_train_scaled, y_train)

xgb_model = xgb.XGBClassifier(eval_metric="logloss", use_label_encoder=False, random_state=42)
xgb_model.fit(X_train_scaled, y_train)

nb_model = GaussianNB()
nb_model.fit(X_train_scaled, y_train)
print("Models trained successfully.")

# --- Save Models and Scaler for future use ---
joblib.dump(log_reg, "log_reg_model.pkl")
joblib.dump(xgb_model, "xgb_model.pkl")
joblib.dump(nb_model, "nb_model.pkl")
joblib.dump(scaler, "scaler.pkl")

# =============================================================================
# 4. REPORT GENERATION FUNCTIONS
# =============================================================================
# (This entire section is identical to your original code)
feature_descriptions = {
    "age": "Age (years)",
    "sex": "Sex (1 = Male, 0 = Female)",
    "cp": "Chest Pain Type (1-4)",
    "trestbps": "Resting Blood Pressure (mmHg)",
    "chol": "Serum Cholesterol (mg/dL)",
    "fbs": "Fasting Blood Sugar > 120 mg/dL (1=True, 0=False)",
    "restecg": "Resting ECG Results (0-2)",
    "thalach": "Maximum Heart Rate Achieved",
    "exang": "Exercise-Induced Angina (1=Yes, 0=No)",
    "oldpeak": "ST Depression Induced by Exercise",
    "slope": "Slope of Peak Exercise ST Segment (1-3)",
    "ca": "Number of Major Vessels Colored by Fluoroscopy (0-3)",
    "thal": "Thalassemia Type (3=Normal, 6=Fixed, 7=Reversible)"
}

diet_chart = [
    ["Food Type", "Recommended Items"],
    ["Fruits", "Apples, Berries, Oranges"],
    ["Vegetables", "Spinach, Carrots, Broccoli"],
    ["Proteins", "Fish, Chicken, Beans"],
    ["Grains", "Oats, Brown Rice, Whole Wheat"],
    ["Dairy", "Low-fat Milk, Yogurt"],
    ["Healthy Fats", "Olive Oil, Nuts, Avocado"]
]

def risk_level(prob):
    if prob < 0.4: return "Low Risk"
    elif 0.4 <= prob < 0.7: return "Moderate Risk"
    else: return "High Risk"

def plot_risk_chart(prob_log, prob_xgb, prob_nb, report_filename):
    risks = [prob_log * 100, prob_xgb * 100, prob_nb * 100]
    labels = ["Logistic Regression", "XGBoost", "Naïve Bayes"]
    colors = ['green' if r < 40 else 'gold' if r < 70 else 'red' for r in risks]
    plt.figure(figsize=(8, 5))
    sns.barplot(x=labels, y=risks, palette=colors)
    plt.ylabel("Heart Disease Probability (%)")
    plt.title("Heart Disease Risk Analysis by Model")
    plt.ylim(0, 100)
    for index, value in enumerate(risks):
        plt.text(index, value + 1, f'{value:.2f}%', ha='center', va='bottom')
    os.makedirs("reports", exist_ok=True)
    chart_path = f"reports/{report_filename}_chart.png"
    plt.savefig(chart_path)
    plt.close()
    return chart_path

def generate_pdf_report(risk, prob_log, prob_xgb, prob_nb, user_inputs):
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    report_filename = f"Heart_Disease_Report_{timestamp}"
    chart_path = plot_risk_chart(prob_log, prob_xgb, prob_nb, report_filename)
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", "B", 16)
    pdf.cell(0, 10, "Machine Learning-Based Heart Disease Prediction Report", ln=True, align='C')
    pdf.ln(10)
    pdf.set_font("Arial", "B", 12)
    pdf.cell(0, 10, "Patient's Health Data:", ln=True)
    pdf.set_font("Arial", size=10)
    col_width = pdf.w / 2.2
    for key, value in user_inputs.items():
        pdf.cell(col_width, 8, str(key), border=1)
        pdf.cell(col_width, 8, str(value), border=1, ln=True)
    pdf.ln(5)
    pdf.image(chart_path, x=15, w=pdf.w - 30)
    pdf.ln(5)
    pdf.set_font("Arial", "B", 14)
    pdf.cell(0, 10, f"Overall Predicted Risk Level: {risk}", ln=True, align='C')
    pdf.ln(5)
    pdf.set_font("Arial", "B", 12)
    pdf.cell(0, 10, "Prediction Details:", ln=True)
    pdf.set_font("Arial", size=11)
    pdf.cell(0, 8, f"- Logistic Regression Risk: {prob_log*100:.2f}%", ln=True)
    pdf.cell(0, 8, f"- XGBoost Risk: {prob_xgb*100:.2f}%", ln=True)
    pdf.cell(0, 8, f"- Naïve Bayes Risk: {prob_nb*100:.2f}%", ln=True)
    pdf.ln(10)
    pdf.add_page()
    pdf.set_font("Arial", "B", 14)
    pdf.cell(0, 10, "General Heart-Healthy Diet Plan", ln=True, align='C')
    pdf.ln(5)
    pdf.set_font("Arial", "B", 11)
    pdf.cell(col_width, 8, diet_chart[0][0], border=1, align='C')
    pdf.cell(col_width, 8, diet_chart[0][1], border=1, ln=1, align='C')
    pdf.set_font("Arial", "", 10)
    for row in diet_chart[1:]:
        pdf.cell(col_width, 8, row[0], border=1)
        pdf.cell(col_width, 8, row[1], border=1, ln=1)
    pdf.ln(10)
    pdf.set_font("Arial", "I", 9)
    disclaimer = ("Disclaimer: This report is generated by a machine learning model and is for informational purposes only. "
                  "It is not a substitute for professional medical advice, diagnosis, or treatment. "
                  "Always seek the advice of your physician or other qualified health provider with any questions you may have regarding a medical condition.")
    pdf.multi_cell(0, 5, disclaimer)
    report_path = f"/content/drive/My Drive/Colab_Notebooks/reports/{report_filename}.pdf"
    pdf.output(report_path)
    print(f"\n✅ Report generated successfully: {report_path}")

# =============================================================================
# 5. MAIN EXECUTION BLOCK
# =============================================================================
if __name__ == "__main__":
    print("\n--- Heart Disease Prediction System ---")
    print("Please enter the following health data to generate a report.")
    user_inputs = {}
    for key, desc in feature_descriptions.items():
        while True:
            try:
                user_inputs[desc] = float(input(f"-> {desc}: "))
                break
            except ValueError:
                print("   Invalid input. Please enter a valid number.")
    input_df = pd.DataFrame([list(user_inputs.values())], columns=X.columns)
    input_scaled = scaler.transform(input_df)
    prob_log = log_reg.predict_proba(input_scaled)[:, 1][0]
    prob_xgb = xgb_model.predict_proba(input_scaled)[:, 1][0]
    prob_nb = nb_model.predict_proba(input_scaled)[:, 1][0]
    avg_prob = (prob_log + prob_xgb + prob_nb) / 3
    risk = risk_level(avg_prob)
    generate_pdf_report(risk, prob_log, prob_xgb, prob_nb, user_inputs)



Fetching and combining datasets...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Dataset prepared successfully.
Training models...
Models trained successfully.

--- Heart Disease Prediction System ---
Please enter the following health data to generate a report.
-> Age (years): 54
-> Sex (1 = Male, 0 = Female): 1
-> Chest Pain Type (1-4): 4
-> Resting Blood Pressure (mmHg): 140
-> Serum Cholesterol (mg/dL): 210
-> Fasting Blood Sugar > 120 mg/dL (1=True, 0=False): 140
-> Resting ECG Results (0-2): 2
-> Maximum Heart Rate Achieved: 145
-> Exercise-Induced Angina (1=Yes, 0=No): 0
-> ST Depression Induced by Exercise: 2.3
-> Slope of Peak Exercise ST Segment (1-3): 3
-> Number of Major Vessels Colored by Fluoroscopy (0-3): 2
-> Thalassemia Type (3=Normal, 6=Fixed, 7=Reversible): 7



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=labels, y=risks, palette=colors)



✅ Report generated successfully: /content/drive/My Drive/Colab_Notebooks/reports/Heart_Disease_Report_20250906_054434.pdf
