# 1. Importing Libraries
# This section imports all the necessary Python libraries for data manipulation, visualization,
# machine learning, and model persistence.

In [7]:
# AI_Predictive_System/notebooks/model_training.ipynb

from pathlib import Path

import joblib # For saving the model
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer # For handling missing values
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder



In [None]:
# --- Phase 2: Data Preprocessing and EDA ---

# Load CSV - Make sure heart_disease_uci.csv is in the data/ folder.
# The path is relative to the directory the notebook is run from.
try:
    df = pd.read_csv('../data/heart_disease_uci.csv')
except FileNotFoundError:
    print("Error: heart_disease_uci.csv not found. Please ensure it's in the 'data/' directory.")
    # Re-raise rather than calling exit(): exit() is an interactive helper that
    # signals a clean shutdown instead of a failure and discards the traceback.
    # Propagating the error stops "Run All" at the real cause instead of
    # letting later cells trip over an undefined `df`.
    raise
else:
    print("Dataset loaded successfully.")

# Quick structural overview of the raw frame before any cleaning.
print("\nDataset Head:")
print(df.head())

print("\nDataset Info:")
df.info()

# Per-column count of values pandas already recognises as missing.
print("\nMissing Values (before handling):")
print(df.isnull().sum())


In [None]:
# --- Data Cleaning and Initial Transformation ---
# Turns the raw frame loaded above into a modelling frame: drops identifier
# columns, derives a binary `target`, normalises missing-value markers and
# converts flag/count columns to numeric dtypes. Ends by defining X and y.

# Drop 'id' and 'dataset' columns as they are not features for prediction
df = df.drop(['id', 'dataset'], axis=1)

# Rename 'num' column to 'target' for consistency with project requirements
# This dataset's 'num' column indicates stages (0 to 4).
# For binary classification (heart disease or not), we'll convert num > 0 to 1.
# Vectorised comparison; a missing 'num' compares False and therefore maps to 0,
# exactly as the row-wise `1 if x > 0 else 0` rule would.
df['target'] = df['num'].gt(0).astype('int64')
df = df.drop('num', axis=1) # Drop the original 'num' column

# Handle inconsistent '?' values by replacing them with NaN
# This dataset often uses '?' for missing values, which pandas reads as strings
df = df.replace('?', np.nan)

# Convert boolean/object columns that should be numerical to appropriate types
# Convert 'sex' from 'Male'/'Female' to 1/0
df['sex'] = df['sex'].map({'Male': 1, 'Female': 0})

# 'fbs' and 'exang' are True/False flags. Depending on how the CSV was parsed
# they may hold real booleans (pandas converts TRUE/FALSE text to bool while
# reading) or the strings 'True'/'False'. Series.map() yields NaN for any value
# that is not a key, so a string-only mapping would silently blank out a
# boolean column; accept both spellings.
# NOTE(review): the CSV is not visible from this notebook - confirm which form
# the columns actually arrive in.
flag_to_number = {
    True: 1, False: 0,
    'True': 1, 'False': 0,
    'TRUE': 1, 'FALSE': 0,
}
for flag_col in ('fbs', 'exang'):
    # Float (not int) so that missing flags can stay NaN until imputation.
    df[flag_col] = df[flag_col].map(flag_to_number).astype(float)

# Convert remaining object columns to numerical or handle them as categorical for OneHotEncoding
# 'cp', 'restecg', 'slope', 'thal' are categorical
# 'ca' is numerical but has missing values and possibly non-integer '?' values, so convert to float
df['ca'] = pd.to_numeric(df['ca'], errors='coerce')
df['oldpeak'] = pd.to_numeric(df['oldpeak'], errors='coerce') # Ensure oldpeak is numeric

print("\nMissing Values (after initial cleaning):")
print(df.isnull().sum())

print("\nDataset Info (after initial cleaning):")
df.info()

print("\nDescriptive Statistics (after initial cleaning):")
print(df.describe())

# Separate features (X) and target (y)
X = df.drop('target', axis=1)
y = df['target']



In [None]:
# Identify categorical and numerical features for preprocessing
# Re-evaluate based on the cleaned dataset
# Ensure 'ca' and 'oldpeak' are treated as numerical since they were coerced to float
numerical_features = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca'] # 'ca' is often treated as categorical, but here it's numerical after cleaning
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']

# --- EDA after initial cleaning (before imputation for visuals) ---
# Everything below is for visualisation only. The temporary frames built here
# are not fed to the model; the modelling pipeline does its own imputation
# and encoding.

# Temporarily fill numerical NaNs with the column mean so the correlation
# matrix has no gaps.
df_numeric_for_corr = df[numerical_features].fillna(df[numerical_features].mean())

# Label-encode the categorical columns so they can appear in the heatmap.
# Integer codes for nominal categories are arbitrary, so read those
# correlations as a rough indication only.
temp_df = df_numeric_for_corr.copy()
for col in categorical_features:
    # Only encode if the column exists in the current df for heatmap
    if col in df.columns:
        le = LabelEncoder()
        # Give missing entries their own explicit label, then stringify so the
        # encoder sees one homogeneous type per column.
        labelled = (
            df[col]
            .astype(object)
            .where(df[col].notna(), 'Missing_Category')
            .astype(str)
        )
        temp_df[col] = le.fit_transform(labelled)

print("\nCorrelation Matrix (Note: Categorical features are label encoded for visualization):")
# Combine temp_df with the target column (which is already numerical 0/1)
full_temp_df = pd.concat([temp_df, df['target']], axis=1)
fig_corr, ax_corr = plt.subplots(figsize=(12, 8))
sns.heatmap(full_temp_df.corr(numeric_only=True), annot=True, cmap='coolwarm', fmt=".2f", ax=ax_corr)
ax_corr.set_title('Correlation Matrix (after initial cleaning, before pipeline imputation)')
plt.show()

# Histograms of every numeric column in the cleaned frame (DataFrame.hist
# creates its own figure). Missing values are left out of the bins.
df.hist(figsize=(15, 10), bins=20)
plt.suptitle('Histograms of Features (after initial cleaning)')
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

# Distribution of target variable (now correctly named 'target' and binary)
fig_target, ax_target = plt.subplots(figsize=(6, 4))
sns.countplot(x='target', data=df, ax=ax_target)
ax_target.set_title('Distribution of Heart Disease (0: No, 1: Yes)')
ax_target.set_xlabel('Heart Disease (0: No, 1: Yes)')
ax_target.set_ylabel('Count')
plt.show()

# Count once and use .get() so an absent class reports 0 instead of raising
# KeyError.
target_counts = df['target'].value_counts()
print(f"Count of target 0: {target_counts.get(0, 0)}")
print(f"Count of target 1: {target_counts.get(1, 0)}")


In [None]:
# --- Phase 3: Model Development ---

# Hold out 20% of the rows for testing. The split is stratified on the target
# so both partitions keep the same class balance, and seeded so it is
# reproducible. Uses the X and y defined after initial cleaning.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"\nTraining data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

# Numerical branch: fill gaps with the column mean, then standardise.
numerical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. Categories not seen during fitting are ignored at predict time.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each feature list to its branch. Any column that appears in neither
# list is passed through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='passthrough',
)

# Model Chosen: Random Forest Classifier
# End-to-end estimator: preprocessing followed by the classifier. The step
# names are referenced by the hyperparameter grid ('classifier__...').
model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)



In [None]:
# Hyperparameter tuning via GridSearchCV
# Define parameter grid for RandomForestClassifier. Keys use the
# '<step>__<param>' form to address the 'classifier' step of the pipeline.
param_grid = {
    'classifier__n_estimators': [100, 200], # Reduced for faster execution in initial testing
    'classifier__max_features': ['sqrt', 'log2'],
    'classifier__max_depth': [4, 6, None], # Reduced options
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2]
}

print("\nStarting GridSearchCV for Hyperparameter Tuning...")
# 5-fold cross-validation over the grid, scored by accuracy, using all cores.
grid_search = GridSearchCV(model_pipeline, param_grid, cv=5, verbose=1, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train, y_train) # Fit the grid search on the training data

print(f"\nBest Parameters found: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

# Get the best model
best_model = grid_search.best_estimator_

# Evaluate the best model on the test set
y_pred = best_model.predict(X_test)

# Evaluation Metrics: Accuracy, Precision, Recall, F1-score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("\nModel Evaluation on Test Set:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No Heart Disease', 'Heart Disease'], yticklabels=['No Heart Disease', 'Heart Disease'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Save model as heart_model.pkl
model_save_path = '../model/heart_model.pkl'
# Create the target directory first: without it the dump raises
# FileNotFoundError and the result of the whole grid search is lost.
Path(model_save_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, model_save_path)
print(f"\nTrained model saved to {model_save_path}")

print("\nModel Training and Evaluation Complete.")