In [3]:
# ============================
# 1. Stroke Prediction
# ============================

# Import required libraries for data manipulation and visualization

import pandas as pd  # For handling data in tabular format (DataFrames)
import numpy as np  # For numerical operations and array handling

import matplotlib.pyplot as plt  # For creating static visualizations
import seaborn as sns  # For enhanced statistical data visualization

# Import scikit-learn modules for model building and evaluation

from sklearn.model_selection import train_test_split, cross_val_score  # For splitting data and cross-validation
from sklearn.model_selection import cross_validate  # Multi-metric cross-validation in a single pass

from sklearn.linear_model import LogisticRegression  # Logistic Regression model
from sklearn.tree import DecisionTreeClassifier  # Decision Tree model
from sklearn.ensemble import RandomForestClassifier  # Random Forest model

from sklearn.metrics import classification_report, confusion_matrix  # For evaluating model performance

# Import SMOTE from imbalanced-learn to handle class imbalance
from imblearn.over_sampling import SMOTE  # Synthetic Minority Over-sampling Technique to balance minority class
from imblearn.pipeline import Pipeline as ImbPipeline  # Pipeline that applies samplers only while fitting

In [None]:
# ============================================
# 2. Load and Inspect Dataset
# ============================================

# Raw CSV file hosted on GitHub; read straight into a DataFrame.
url = 'https://raw.githubusercontent.com/monirulislammd/CIND820-Big-Data-Analytics-Project/main/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(url)

# Report the number of rows and columns.
print("\nDataset Shape:", df.shape, "\n")

# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that printed a stray "None" after the report).
df.info()

# Preview the first rows; as the last expression of the cell it is rendered
# as a rich table.
print("\n Example Dataset:")
df.head()

In [None]:
# ============================================
# 3. Data Cleaning and Preprocessing
# ============================================

# Count missing values per column before any cleaning.
print("Missing values per column:\n", df.isnull().sum())

# Impute missing BMI values with the median (robust to outliers).
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form operates on a temporary object,
# raises a FutureWarning on pandas 2.x and does not update `df` once
# Copy-on-Write is enabled.
df['bmi'] = df['bmi'].fillna(df['bmi'].median())

# Drop the identifier column if present; it carries no predictive signal.
df = df.drop(columns=['id'], errors='ignore')

# Remove rows where gender is 'Other' so the later one-hot encoding only
# produces columns for the remaining gender values. `.copy()` gives an
# independent frame so later column assignments do not warn about writing
# to a slice.
df = df[df['gender'] != 'Other'].copy()

# Fold the 'Unknown' smoking status into 'never smoked'.
# NOTE(review): this treats unknown smokers as non-smokers, which is a
# modelling assumption rather than a pure clean-up step - confirm it is intended.
df['smoking_status'] = df['smoking_status'].replace('Unknown', 'never smoked')

# Confirm that no missing values remain after cleaning.
print("\nMissing values after imputation:\n", df.isnull().sum())

# Summary statistics (count, mean, std, min, quartiles, max) of numeric columns.
print("\n\nSummary Statistics:\n")
df.describe()

In [None]:
# =========================================
# 4. Handle Outliers (IQR Clipping)
# =========================================

# Continuous features that are inspected and then capped.
numeric_cols = ['age', 'avg_glucose_level', 'bmi']

# Show the distributions before capping.
df[numeric_cols].hist(figsize=(10,5))
plt.suptitle("Histograms of Numeric Features with outliers\n")
plt.show()

# Cap each feature to the Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# Values outside the fences are replaced by the nearest fence; no rows are dropped.
for numeric_col in numeric_cols:
    first_quartile, third_quartile = df[numeric_col].quantile([0.25, 0.75])
    spread = third_quartile - first_quartile  # interquartile range
    lower_fence = first_quartile - 1.5 * spread
    upper_fence = third_quartile + 1.5 * spread
    df[numeric_col] = df[numeric_col].clip(lower_fence, upper_fence)

In [None]:
# =========================================
# 5. Exploratory Data Analysis (EDA)
# =========================================

# Histograms of the numeric features after the IQR step. That step caps
# values at the fences rather than deleting rows, so the title says "capping"
# (the previous "removing" wording misdescribed what the plot shows).
df[['age', 'avg_glucose_level', 'bmi']].hist(figsize=(10,5))
plt.suptitle("Histograms of Numeric Features after capping outliers\n")
plt.show()

# Categorical columns whose frequency distributions are plotted.
cat_cols = ['ever_married', 'gender', 'work_type', 'Residence_type', 'smoking_status']

# One count plot per categorical column; the title is set on the returned
# Axes so it always lands on the plot that was just drawn.
print("\n\nFrequency Distribution of categorical features:")
for c in cat_cols:
    ax = sns.countplot(x=c, data=df)  # bar chart of category frequencies
    ax.set_title(f"\n\nDistribution of {c}\n")
    plt.show()

In [None]:
# ==========================
# 6. Feature Encoding
# ==========================

# Lookup tables for the binary columns. Series.map turns any value that is
# not a key of the table into NaN.
yes_no_codes = {'No': 0, 'Yes': 1}
zero_one_codes = {0: 0, 1: 1}

# 'ever_married' is stored as text; convert it to 0/1.
df['ever_married'] = df['ever_married'].map(yes_no_codes)

# 'hypertension' and 'heart_disease' are already 0/1; the explicit mapping
# keeps them as-is.
for flag_col in ('hypertension', 'heart_disease'):
    df[flag_col] = df[flag_col].map(zero_one_codes)

# One-hot encode the multi-category columns. drop_first=True keeps k-1
# indicator columns per feature, avoiding perfectly collinear dummies.
multi_category_cols = ['gender', 'work_type', 'Residence_type', 'smoking_status']
df = pd.get_dummies(df, columns=multi_category_cols, drop_first=True)

# Preview the encoded frame.
print("Encoded Dataset:\n")
df.head()

In [None]:
# ============================================
# 7. Handle Class Imbalance (SMOTE)
# ============================================

# Split the encoded frame into the target vector and the feature matrix.
y = df['stroke']  # 0 = no stroke, 1 = stroke
X = df.drop(columns=['stroke'])

# Class proportions before resampling.
share_before = y.value_counts(normalize=True)
print("Before balancing:\n\n", share_before)

# Oversample the minority class with SMOTE; the fixed seed makes the
# synthetic samples reproducible.
# NOTE(review): X_res / y_res are generated from every row of the dataset, so
# any evaluation that holds out part of X_res will see synthetic points that
# were interpolated from its own training rows - treat such scores as optimistic.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

# Class proportions after resampling.
share_after = y_res.value_counts(normalize=True)
print("\n\nAfter balancing:\n\n", share_after)

In [None]:
# ============================================
# 8. Model Comparison with Cross-Validation
# ============================================

# Candidate models. The tree-based models are seeded so that a
# Restart & Run All reproduces the same ranking.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),  # Raised iteration cap for convergence
    'Decision Tree': DecisionTreeClassifier(random_state=42),  # Single decision tree
    'Random Forest': RandomForestClassifier(random_state=42)  # Ensemble of decision trees
}

# Display name -> scikit-learn scorer name.
metrics = {
    'Accuracy': 'accuracy',  # Overall correctness
    'Precision': 'precision',  # True Positives / (True Positives + False Positives)
    'Recall': 'recall',  # True Positives / (True Positives + False Negatives)
    'F1 Score': 'f1'  # Harmonic mean of precision and recall
}

# One result row per model.
results = []

# Evaluate every model with 10-fold cross-validation on the ORIGINAL data.
# SMOTE sits inside an imbalanced-learn pipeline so that it is fitted on the
# training part of each fold only; the validation part keeps its real class
# ratio and contains no synthetic rows. Cross-validating data that was
# oversampled beforehand leaks synthetic neighbours of validation rows into
# training and inflates every score.
for name, model in models.items():
    pipeline = ImbPipeline(steps=[
        ('smote', SMOTE(random_state=42)),
        ('model', model),
    ])

    # A single cross_validate call scores all metrics on the same fitted
    # folds instead of refitting the model once per metric. The estimator is
    # cloned internally, so the objects in `models` stay unfitted.
    cv_scores = cross_validate(pipeline, X, y, cv=10, scoring=metrics)

    model_result = {'Model': name}
    for metric_name in metrics:
        fold_scores = cv_scores[f'test_{metric_name}']  # one score per fold
        model_result[f'Mean {metric_name}'] = np.mean(fold_scores)
        model_result[f'Std {metric_name}'] = np.std(fold_scores)
    results.append(model_result)

# Build the results table and rank models by mean F1 (best first).
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='Mean F1 Score', ascending=False)

# Display cross-validation results
print("===== Cross-Validation Results =====\n")
print(results_df.round(4))

# Bar plot comparing mean F1 across models.
plt.figure(figsize=(8,5))
sns.barplot(data=results_df, x='Model', y='Mean F1 Score', palette='viridis')
plt.title('\n\nModel Comparison (Mean F1 Score - 10-Fold CV)\n')
plt.ylabel('Mean F1 Score')
plt.ylim(0, 1)  # F1 is bounded to [0, 1]
plt.show()

In [None]:
# =========================================
# 9. Train-Test Split & Best Model Training
# =========================================

# Split the ORIGINAL (not oversampled) data 80/20. Stratifying keeps the real
# class ratio in both parts, so the test set reflects the true class imbalance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Balance the training part only. Oversampling before the split would place
# synthetic points, interpolated from training rows, into the test set and
# make the evaluation below overly optimistic.
X_train_bal, y_train_bal = SMOTE(random_state=42).fit_resample(X_train, y_train)

# results_df is sorted by mean F1 (best first), so row 0 is the winner.
best_model_name = results_df.iloc[0]['Model']
print(f"\nBest Model: {best_model_name}")

# Fetch the corresponding estimator and fit it on the balanced training data.
best_model = models[best_model_name]
best_model.fit(X_train_bal, y_train_bal)

# Predict on the untouched test set.
y_pred = best_model.predict(X_test)

# Evaluate model performance on the test set
print("\nTest Set Evaluation:\n")
print(confusion_matrix(y_test, y_pred))  # Rows = true class, columns = predicted: [[TN, FP], [FN, TP]]
print(classification_report(y_test, y_pred))  # Precision, recall, F1 score, and support per class

# Feature importances are only exposed by some models (e.g. tree-based ones).
if hasattr(best_model, 'feature_importances_'):
    # Label each importance with its training column and sort descending.
    feat_imp = pd.Series(best_model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
    print("\nTop Features with absolute proportions:\n\n", feat_imp.head(10))

    # Rescale so the importances sum to 1 (share of total importance).
    feat_imp = feat_imp / feat_imp.sum()

    # Horizontal bars are drawn bottom-up, so reverse the top 10 to put the
    # most important feature at the top of the chart.
    feat_imp.head(10).iloc[::-1].plot(kind='barh')
    plt.xlabel("Normalized Importance")
    plt.title("\nTop 10 Important Features with their percentage proportion\n")
    plt.show()

In [None]:
# =========================================
# 10. Interactive Stroke Prediction Tool
# =========================================

def _prompt_for_feature(feature, allowed):
    """Prompt for one feature until the answer is valid; return None if the user types 'stop'."""
    while True:
        value = input(f"{feature}: ").strip()
        answer = value.lower()

        if answer == 'stop':
            return None

        if allowed == 'numeric':
            try:
                number = float(value)
            except ValueError:
                print("Invalid input. Enter a numeric value.")
                continue
            # float() also accepts 'nan' and 'inf', which a model cannot score.
            if not np.isfinite(number):
                print("Invalid input. Enter a numeric value.")
                continue
            return number

        if allowed == ['yes', 'no']:
            if answer in ('yes', 'no'):
                return 1 if answer == 'yes' else 0
            print("Enter yes or no.")
            continue

        # Multi-category feature: match case-insensitively, but return the
        # spelling from `allowed` so the dummy column name matches training.
        allowed_lower = [v.lower() for v in allowed]
        if answer in allowed_lower:
            return allowed[allowed_lower.index(answer)]
        print(f"Invalid input. Allowed: {allowed}")


def _encode_user_input(user_data, feature_info, training_columns):
    """One-hot encode a single user record and align it to the training columns."""
    new_data = pd.DataFrame([user_data])

    # Multi-category features are those that are neither numeric nor yes/no.
    categorical_features = [
        f for f, allowed in feature_info.items()
        if allowed != 'numeric' and allowed != ['yes', 'no']
    ]

    if categorical_features:
        # Do NOT pass drop_first=True here. A one-row frame has exactly one
        # category per feature, so drop_first would delete every dummy column
        # and each categorical answer would be encoded as the baseline.
        # The reindex below removes baseline-category columns instead.
        encoded = pd.get_dummies(new_data, columns=categorical_features)
    else:
        encoded = new_data.copy()

    # Keep exactly the training columns in training order: dummy columns the
    # model never saw are dropped, absent ones are filled with 0.
    return encoded.reindex(columns=training_columns, fill_value=0).astype(float)


def predict_stroke_users_interactive(model, feature_info, training_columns):
    """
    Interactive stroke prediction tool.

    Repeatedly asks the user for one value per feature, one-hot encodes the
    answers so they line up with `training_columns`, and prints the class and
    class probabilities predicted by `model`. Typing 'stop' at any prompt
    ends the session.

    Numeric answers are passed to the model as typed; no clipping or scaling
    is applied here.

    Parameters:
        model : fitted estimator exposing predict() and predict_proba()
        feature_info : dict of {feature_name: allowed}, where allowed is
            'numeric', ['yes', 'no'] (encoded as 1/0), or a list of category
            labels (one-hot encoded as '<feature>_<label>')
        training_columns : column names, in order, of the matrix the model
            was fitted on (after one-hot encoding)

    Returns:
        None. All results are printed.
    """

    # Welcome message and instructions
    print("=== Stroke Prediction Tool (One-Hot Encoded) ===")
    print("\nType 'stop' at any time to exit.\n")
    print("Enter your data in the provided box \n")

    while True:
        user_data = {}  # answers for the record currently being entered

        for feature, allowed in feature_info.items():
            value = _prompt_for_feature(feature, allowed)
            if value is None:  # user typed 'stop'
                print("\nExiting prediction tool.")
                return
            user_data[feature] = value

        new_data_encoded = _encode_user_input(user_data, feature_info, training_columns)

        # Predict stroke class and probability
        pred_class = model.predict(new_data_encoded)[0]
        pred_proba = model.predict_proba(new_data_encoded)[0]

        # Display prediction results
        print("\n\n===== Prediction Result =====")
        print(f"\nPredicted Class: {pred_class}")  # 0 = No Stroke, 1 = Stroke
        print(f"\nProbability [Class 0, Class 1]: {pred_proba}\n")
        print("-" * 50 + "\n")


# Feature types and allowed values for user input.
# Category labels must be spelled exactly like the values in the training
# data, because the one-hot column names ('<feature>_<label>') are built from
# them; user input itself is matched case-insensitively.
feature_info = {
    # 'Other' is not offered: those rows were removed before training, so the
    # model has no column for it and would score it as the baseline gender.
    'gender': ['Male', 'Female'],
    'age': 'numeric',
    'hypertension': ['yes', 'no'],
    'heart_disease': ['yes', 'no'],
    'ever_married': ['yes', 'no'],
    # NOTE(review): assumes the dataset spells this category 'children'
    # (lowercase) - confirm against the raw CSV's work_type values.
    'work_type': ['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'],
    'Residence_type': ['Urban', 'Rural'],
    'avg_glucose_level': 'numeric',
    'bmi': 'numeric',
    'smoking_status': ['formerly smoked', 'never smoked', 'smokes']
}

# Launch the interactive prediction tool with the trained model and the
# encoded feature columns.
predict_stroke_users_interactive(best_model, feature_info, X_res.columns)