<a href="https://colab.research.google.com/github/naidk/Internship-Portfolio/blob/main/Data-Science/Pima_Diabetes_Predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Step 1: Initial Setup

In [None]:
# Print the installed scikit-learn version so results can be tied to a library release.
import sklearn
print("scikit-learn version:", sklearn.__version__)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
import warnings
# NOTE(review): this silences every Python warning for the rest of the session,
# including deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')


In [None]:
import pandas as pd
# Read the dataset from the current working directory (relative path).
df = pd.read_csv('diabetes.csv')

Step 2: Exploratory Data Analysis (EDA)

In [None]:
# Summary statistics. Printed explicitly: a bare expression that is not the last
# line of a cell produces no output, so the original call was silently discarded.
print(df.describe())

# Column dtypes and non-null counts (info() writes to stdout itself).
df.info()

# One box per column on a shared axis to eyeball outliers.
sns.boxplot(data=df)
plt.xticks(rotation=90)
plt.title("Boxplot for Outlier Detection")
plt.show()


Step 2 (continued): Full EDA and Data Cleaning

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

# Full EDA report: reloads the CSV, then prints/plots nine checks in order.
# Note that `df` is re-bound twice in this cell (fresh load, then de-duplicated).

# Load your dataset
df = pd.read_csv("diabetes.csv")  # Or replace with your actual path

# 1. Check for Missing Values
# Counts NaN cells per column (zeros are not counted as missing here).
print("🔹 Missing Values:")
print(df.isnull().sum())

# 2. Check Data Types
print("\n🔹 Data Types:")
print(df.dtypes)

# 3. Basic Descriptive Statistics
print("\n🔹 Descriptive Stats:")
print(df.describe())

# 4. Check for Duplicates
# Report the number of fully duplicated rows, then drop them.
# The surviving rows keep their original index labels (no reset_index).
duplicates = df.duplicated().sum()
print(f"\n🔹 Duplicates Found: {duplicates}")
df = df.drop_duplicates()

# 5. Class Balance Visualization
# Bar chart of row counts per Outcome value.
sns.countplot(x='Outcome', data=df)
plt.title("🔹 Class Balance")
plt.show()

# 6. Feature Distributions
# One 20-bin histogram per column of df.
df.hist(figsize=(12, 10), bins=20)
plt.suptitle("🔹 Feature Distributions", fontsize=16)
plt.show()

# 7. Correlation Heatmap
# Pairwise correlations over all columns, Outcome included.
plt.figure(figsize=(10,8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
plt.title("🔹 Correlation Heatmap")
plt.show()

# 8. Z-Score for Outlier Check
# Absolute z-score per feature column (Outcome excluded); counts cells with |z| > 3
# per column. Nothing is removed here — this is a report only.
z_scores = np.abs(zscore(df.drop('Outcome', axis=1)))
outliers = (z_scores > 3).sum(axis=0)
print("\n🔹 Z-Score Based Outlier Count Per Feature:")
print(outliers)

# 9. Skewness
# Per-feature skewness (Outcome excluded), most skewed first.
print("\n🔹 Skewness of Features:")
print(df.drop('Outcome', axis=1).skew().sort_values(ascending=False))


In [None]:
# Measurements for which a recorded 0 is treated as "not measured"
cols_with_zero_as_nan = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

# Swap the placeholder zeros for NaN, then write the result back into df
zero_masked = df[cols_with_zero_as_nan].replace(0, np.nan)
df[cols_with_zero_as_nan] = zero_masked


In [None]:
# Count the NaNs now present in each of the zero-as-missing columns.
print("Missing values after treating 0s as NaN:")
print(df[cols_with_zero_as_nan].isnull().sum())


In [None]:
# Fill each column's gaps with that column's own median
column_medians = df[cols_with_zero_as_nan].median()
df[cols_with_zero_as_nan] = df[cols_with_zero_as_nan].fillna(column_medians)


In [None]:
# Per-column NaN count over the whole frame after imputation.
print("Final missing values after imputation:")
print(df.isnull().sum())


Step 3A: Outlier Detection Using Z-Score Method

In [None]:
from scipy.stats import zscore

def remove_outliers_zscore(df, threshold=3):
    """Return a copy of ``df`` without the rows that contain a z-score outlier.

    A row is dropped when the absolute z-score of any of its numeric cells is
    greater than or equal to ``threshold``. Z-scores are computed per column
    with the population standard deviation (ddof=0).

    Cells whose z-score is undefined are never treated as outliers. This covers
    missing values (ignored when computing mean/std) and constant columns
    (zero standard deviation). Previously either case produced NaN z-scores,
    and because ``NaN < threshold`` is False every row was discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    threshold : float, default 3
        Absolute z-score at or above which a cell counts as an outlier.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the retained rows, with the original index
        labels and all columns (numeric or not).
    """
    numeric = df.select_dtypes(include=[np.number])
    if numeric.empty:
        # No rows or no numeric columns: nothing can be an outlier.
        return df.copy()

    values = numeric.to_numpy(dtype=float, na_value=np.nan)
    center = np.nanmean(values, axis=0)
    spread = np.nanstd(values, axis=0)

    # 0/0 (constant column) and NaN inputs yield NaN z-scores; comparisons with
    # NaN are False, so those cells are simply not flagged.
    with np.errstate(divide="ignore", invalid="ignore"):
        z_scores = np.abs((values - center) / spread)
        is_outlier = z_scores >= threshold

    keep = ~is_outlier.any(axis=1)
    # Return a real copy so callers can assign columns without chained-assignment warnings.
    return df.loc[keep].copy()


In [None]:
# Build the outlier-free variant using the function's default threshold (3)
# and report how many rows/columns remain.
df_no_outliers = remove_outliers_zscore(df)
print("Shape after outlier removal:", df_no_outliers.shape)


In [None]:
df_with_outliers = df.copy()                  # Cleaned for zeros but keeps outliers
# Copy rather than alias: later cells modify df_without_outliers in place, and an
# alias would silently change df_no_outliers as well.
df_without_outliers = df_no_outliers.copy()   # Cleaned for zeros AND outliers


Log1p Transformation: apply log(1 + x) to the skewed features (well-defined for zero values)

In [None]:
import numpy as np

skewed_features = ["Insulin", "DiabetesPedigreeFunction", "Age", "Pregnancies"]

# Rebuild both working frames from their untransformed sources first. Transforming
# the existing frames in place would apply log1p a second time whenever this cell
# is re-run.
df_with_outliers = df.copy()
df_without_outliers = df_no_outliers.copy()

# log1p(x) = log(1 + x) is defined at 0, so no positivity check is needed for
# these non-negative features.
df_with_outliers[skewed_features] = np.log1p(df_with_outliers[skewed_features])
df_without_outliers[skewed_features] = np.log1p(df_without_outliers[skewed_features])



In [None]:
# Re-check skewness after transformation
# Only the transformed columns (skewed_features) are reported, once per dataset variant.
print("🔍 Skewness after log1p transformation (with outliers):")
print(df_with_outliers[skewed_features].skew())

print("\n🔍 Skewness after log1p transformation (without outliers):")
print(df_without_outliers[skewed_features].skew())


In [None]:
# Re-assign the target column from the cleaned frame `df`.
# "Outcome" is not in skewed_features, so the log1p cell did not transform it.
# The assignment aligns on the row index, so rows missing from
# df_without_outliers are not re-introduced.
df_with_outliers["Outcome"] = df["Outcome"]
df_without_outliers["Outcome"] = df["Outcome"]


In [None]:
from sklearn.preprocessing import StandardScaler

# Split each dataset variant into predictors and target
X_with_outliers = df_with_outliers.drop(columns="Outcome")
y_with_outliers = df_with_outliers["Outcome"]

X_without_outliers = df_without_outliers.drop(columns="Outcome")
y_without_outliers = df_without_outliers["Outcome"]

# One scaler per variant, each fitted on that variant's complete feature matrix
scaler_with, scaler_without = StandardScaler(), StandardScaler()

X_with_outliers_scaled = scaler_with.fit(X_with_outliers).transform(X_with_outliers)
X_without_outliers_scaled = scaler_without.fit(X_without_outliers).transform(X_without_outliers)

print("Shapes:", X_with_outliers_scaled.shape, X_without_outliers_scaled.shape)


Step 6: Train-Test Split (on both datasets)

In [None]:
from sklearn.model_selection import train_test_split

# Predictors and target for each dataset variant
X_with, y_with = df_with_outliers.drop(columns="Outcome"), df_with_outliers["Outcome"]
X_without, y_without = df_without_outliers.drop(columns="Outcome"), df_without_outliers["Outcome"]

# Identical hold-out fraction and seed for both variants
split_options = {"test_size": 0.2, "random_state": 42}
X_train_with, X_test_with, y_train_with, y_test_with = train_test_split(X_with, y_with, **split_options)
X_train_wo, X_test_wo, y_train_wo, y_test_wo = train_test_split(X_without, y_without, **split_options)


In [None]:
# 🔄 Step: Handle Class Imbalance using SMOTE
from imblearn.over_sampling import SMOTE

# SMOTE for 'with outliers' data
# Only the training split is resampled; the test split is left as drawn.
# NOTE(review): resampling is done on unscaled features (scaling happens in a later
# cell). SMOTE works from nearest neighbours, so confirm this ordering is intended.
smote_with = SMOTE(random_state=42)
X_train_with_smote, y_train_with_smote = smote_with.fit_resample(X_train_with, y_train_with)

# SMOTE for 'without outliers' data
smote_wo = SMOTE(random_state=42)
X_train_wo_smote, y_train_wo_smote = smote_wo.fit_resample(X_train_wo, y_train_wo)

# Check class distribution after SMOTE
print("🔹 Class Distribution After SMOTE (With Outliers):")
print(y_train_with_smote.value_counts())
print("\n🔹 Class Distribution After SMOTE (Without Outliers):")
print(y_train_wo_smote.value_counts())


Step 7: Standard Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Create scaler objects
# Note: this re-binds `scaler_with`, which an earlier cell already created and
# fitted on the full (unsplit) data.
scaler_with = StandardScaler()
scaler_wo = StandardScaler()

# Scale after SMOTE
# Each scaler is fitted on the SMOTE-resampled training features only and then
# applied unchanged to the corresponding test features.
X_train_with_scaled = scaler_with.fit_transform(X_train_with_smote)
X_test_with_scaled = scaler_with.transform(X_test_with)  # use original test set

X_train_wo_scaled = scaler_wo.fit_transform(X_train_wo_smote)
X_test_wo_scaled = scaler_wo.transform(X_test_wo)        # use original test set


 Step 8: Model Training (Start with Logistic Regression)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score

# Fit on the SMOTE-balanced, scaled training set (fit() returns the estimator itself)
lr = LogisticRegression().fit(X_train_with_scaled, y_train_with_smote)

# Hard labels and positive-class probabilities for the untouched test set
y_pred_with = lr.predict(X_test_with_scaled)
lr_with_scores = lr.predict_proba(X_test_with_scaled)[:, 1]

# Report: accuracy, confusion matrix, per-class report, ROC-AUC
print("🔹 Logistic Regression (With Outliers + SMOTE):")
for metric_name, metric_value in (
    ("Accuracy:", accuracy_score(y_test_with, y_pred_with)),
    ("Confusion Matrix:\n", confusion_matrix(y_test_with, y_pred_with)),
    ("Classification Report:\n", classification_report(y_test_with, y_pred_with)),
    ("ROC-AUC Score:", roc_auc_score(y_test_with, lr_with_scores)),
):
    print(metric_name, metric_value)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score

# Fit on the SMOTE-balanced, scaled training set of the outlier-free variant
lr_wo = LogisticRegression().fit(X_train_wo_scaled, y_train_wo_smote)

# Hard labels and positive-class probabilities for the untouched test set
y_pred_wo = lr_wo.predict(X_test_wo_scaled)
lr_wo_scores = lr_wo.predict_proba(X_test_wo_scaled)[:, 1]

# Report: accuracy, confusion matrix, per-class report, ROC-AUC
print("🔹 Logistic Regression (Without Outliers + SMOTE):")
for metric_name, metric_value in (
    ("Accuracy:", accuracy_score(y_test_wo, y_pred_wo)),
    ("Confusion Matrix:\n", confusion_matrix(y_test_wo, y_pred_wo)),
    ("Classification Report:\n", classification_report(y_test_wo, y_pred_wo)),
    ("ROC-AUC Score:", roc_auc_score(y_test_wo, lr_wo_scores)),
):
    print(metric_name, metric_value)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Seeded random forest fitted on the SMOTE-balanced, scaled training set
rf_with = RandomForestClassifier(random_state=42).fit(X_train_with_scaled, y_train_with_smote)

# Hard labels and positive-class probabilities for the untouched test set
y_pred_with_rf = rf_with.predict(X_test_with_scaled)
rf_with_scores = rf_with.predict_proba(X_test_with_scaled)[:, 1]

# Report: accuracy, confusion matrix, per-class report, ROC-AUC
print("🔹 Random Forest (With Outliers + SMOTE):")
for metric_name, metric_value in (
    ("Accuracy:", accuracy_score(y_test_with, y_pred_with_rf)),
    ("Confusion Matrix:\n", confusion_matrix(y_test_with, y_pred_with_rf)),
    ("Classification Report:\n", classification_report(y_test_with, y_pred_with_rf)),
    ("ROC-AUC Score:", roc_auc_score(y_test_with, rf_with_scores)),
):
    print(metric_name, metric_value)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Seeded random forest fitted on the SMOTE-balanced, scaled outlier-free training set
rf_wo = RandomForestClassifier(random_state=42).fit(X_train_wo_scaled, y_train_wo_smote)

# Hard labels and positive-class probabilities for the untouched test set
y_pred_wo_rf = rf_wo.predict(X_test_wo_scaled)
rf_wo_scores = rf_wo.predict_proba(X_test_wo_scaled)[:, 1]

# Report: accuracy, confusion matrix, per-class report, ROC-AUC
print("\n🔹 Random Forest (Without Outliers + SMOTE):")
for metric_name, metric_value in (
    ("Accuracy:", accuracy_score(y_test_wo, y_pred_wo_rf)),
    ("Confusion Matrix:\n", confusion_matrix(y_test_wo, y_pred_wo_rf)),
    ("Classification Report:\n", classification_report(y_test_wo, y_pred_wo_rf)),
    ("ROC-AUC Score:", roc_auc_score(y_test_wo, rf_wo_scores)),
):
    print(metric_name, metric_value)


In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Seeded XGBoost classifier fitted on the SMOTE-balanced, scaled training set
xgb_with = xgb.XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
).fit(X_train_with_scaled, y_train_with_smote)

# Hard labels and positive-class probabilities for the untouched test set
y_pred_xgb_with = xgb_with.predict(X_test_with_scaled)
xgb_with_scores = xgb_with.predict_proba(X_test_with_scaled)[:, 1]

# Report: accuracy, confusion matrix, per-class report, ROC-AUC
print("🔹 XGBoost (With Outliers + SMOTE):")
for metric_name, metric_value in (
    ("Accuracy:", accuracy_score(y_test_with, y_pred_xgb_with)),
    ("Confusion Matrix:\n", confusion_matrix(y_test_with, y_pred_xgb_with)),
    ("Classification Report:\n", classification_report(y_test_with, y_pred_xgb_with)),
    ("ROC-AUC Score:", roc_auc_score(y_test_with, xgb_with_scores)),
):
    print(metric_name, metric_value)


In [None]:
# Seeded XGBoost classifier fitted on the SMOTE-balanced, scaled outlier-free training set
xgb_wo = xgb.XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
).fit(X_train_wo_scaled, y_train_wo_smote)

# Hard labels and positive-class probabilities for the untouched test set
y_pred_xgb_wo = xgb_wo.predict(X_test_wo_scaled)
xgb_wo_scores = xgb_wo.predict_proba(X_test_wo_scaled)[:, 1]

# Report: accuracy, confusion matrix, per-class report, ROC-AUC
print("\n🔹 XGBoost (Without Outliers + SMOTE):")
for metric_name, metric_value in (
    ("Accuracy:", accuracy_score(y_test_wo, y_pred_xgb_wo)),
    ("Confusion Matrix:\n", confusion_matrix(y_test_wo, y_pred_xgb_wo)),
    ("Classification Report:\n", classification_report(y_test_wo, y_pred_xgb_wo)),
    ("ROC-AUC Score:", roc_auc_score(y_test_wo, xgb_wo_scores)),
):
    print(metric_name, metric_value)


In [None]:
# At this point X_train_wo_scaled was built from the SMOTE-resampled training set,
# so the labels that belong to it are y_train_wo_smote, not the un-resampled y_train_wo.
print(X_train_wo_scaled.shape, y_train_wo_smote.shape)
print(X_test_wo_scaled.shape, y_test_wo.shape)

# Check for NaNs in features and labels
import numpy as np
print("NaNs in X_train:", np.isnan(X_train_wo_scaled).sum())
print("NaNs in X_test:", np.isnan(X_test_wo_scaled).sum())
print("NaNs in y_train:", y_train_wo_smote.isna().sum())
print("NaNs in y_test:", y_test_wo.isna().sum())


In [None]:
# Install lazypredict for the model sweep in the next cell.
# NOTE(review): the install is unpinned and sits mid-notebook; consider moving it to the
# top and pinning a version so a fresh run resolves the same package.
!pip install lazypredict


In [None]:
from lazypredict.Supervised import LazyClassifier
import warnings
warnings.filterwarnings("ignore")

# Initialize LazyClassifier
lazy_clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

# Fit and evaluate on scaled dataset.
# X_train_wo_scaled comes from the SMOTE-resampled training set here, so it must be
# paired with y_train_wo_smote; y_train_wo has a different number of rows and every
# model fit would fail on inconsistent sample counts.
models, predictions = lazy_clf.fit(X_train_wo_scaled, X_test_wo_scaled, y_train_wo_smote, y_test_wo)

# Display Top 10 Models
print("🔝 Top Performing Models:")
print(models.head(10))


In [None]:
# X_train_wo_scaled still holds the SMOTE-resampled rows at this point, so compare it
# with the matching resampled labels.
print(X_train_wo_scaled.shape, y_train_wo_smote.shape)
print(X_test_wo_scaled.shape, y_test_wo.shape)


In [None]:
# Correct split again from raw df_without_outliers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Rebuild the outlier-free split directly from df_without_outliers, without SMOTE.
# The split arguments (test_size=0.2, random_state=42) match the earlier split cell.
X_wo = df_without_outliers.drop("Outcome", axis=1)
y_wo = df_without_outliers["Outcome"]

X_train_wo, X_test_wo, y_train_wo, y_test_wo = train_test_split(X_wo, y_wo, test_size=0.2, random_state=42)

# NOTE: from here on X_train_wo_scaled / X_test_wo_scaled are the arrays produced by
# `scaler` (fitted on the un-resampled X_train_wo). They replace the SMOTE-based arrays
# of the same name that were produced with `scaler_wo`.
scaler = StandardScaler()
X_train_wo_scaled = scaler.fit_transform(X_train_wo)
X_test_wo_scaled = scaler.transform(X_test_wo)

# Confirm shapes now
print(X_train_wo_scaled.shape, y_train_wo.shape)
print(X_test_wo_scaled.shape, y_test_wo.shape)


In [None]:
from lazypredict.Supervised import LazyClassifier

# Instantiate and run
# Uses the scaled train/test arrays and labels currently bound to the *_wo names.
lazy_clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = lazy_clf.fit(X_train_wo_scaled, X_test_wo_scaled, y_train_wo, y_test_wo)

# Show top models
# Prints the first ten rows of the returned results table.
print("🔝 Top Performing Models:")
print(models.head(10))


Feature Importance Visualization

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Feature names and the fitted coefficients of the outlier-free logistic regression
features = X_train_wo.columns
coefficients = lr_wo.coef_[0]

# Tabulate and order by absolute coefficient size, largest first
lr_importance = (
    pd.DataFrame({'Feature': features, 'Importance': coefficients})
    .sort_values(by='Importance', key=abs, ascending=False)
)

# Horizontal bars; the y-axis is flipped so the largest coefficient is on top
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(lr_importance['Feature'], lr_importance['Importance'])
ax.set_title("🔹 Logistic Regression - Feature Importance (Coefficients)")
ax.set_xlabel("Coefficient Value")
ax.invert_yaxis()
plt.show()


In [None]:
# Random-forest importances labelled by feature name, smallest first
rf_importance = pd.Series(
    rf_wo.feature_importances_, index=X_train_wo.columns
).sort_values(ascending=True)

# Horizontal bar chart on an explicit axes object
fig, ax = plt.subplots(figsize=(10, 6))
rf_importance.plot(kind='barh', ax=ax)
ax.set_title("🔹 Random Forest - Feature Importance")
ax.set_xlabel("Importance Score")
plt.show()


In [None]:
# XGBoost importances labelled by feature name, smallest first
xgb_importance = pd.Series(
    xgb_wo.feature_importances_, index=X_train_wo.columns
).sort_values(ascending=True)

# Horizontal bar chart on an explicit axes object
fig, ax = plt.subplots(figsize=(10, 6))
xgb_importance.plot(kind='barh', ax=ax)
ax.set_title("🔹 XGBoost - Feature Importance")
ax.set_xlabel("Importance Score")
plt.show()


Tune Top Models Using Optuna

In [None]:
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix

# Step 1: Define Optuna tuning function
def tune_logistic(trial):
    """Optuna objective for logistic regression.

    Samples C (log scale, 0.01 to 10), the penalty (only "l2") and the solver
    ("lbfgs" or "liblinear") from ``trial``, then returns the mean 5-fold
    ROC-AUC of that model on the notebook-level X_train_wo_scaled / y_train_wo.
    """
    search_point = {
        "C": trial.suggest_float("C", 0.01, 10.0, log=True),
        "penalty": trial.suggest_categorical("penalty", ["l2"]),
        "solver": trial.suggest_categorical("solver", ["lbfgs", "liblinear"]),
    }
    candidate = LogisticRegression(max_iter=1000, **search_point)
    fold_auc = cross_val_score(candidate, X_train_wo_scaled, y_train_wo, cv=5, scoring="roc_auc")
    return fold_auc.mean()

# Step 2: Run the optimization
# The sampler is seeded so repeated runs explore the same trials and report the same
# best parameters (the default sampler is unseeded).
study_lr = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
study_lr.optimize(tune_logistic, n_trials=20)

# Step 3: Get best parameters and print them
best_lr_params = study_lr.best_params
print("✅ Best Logistic Regression Parameters:", best_lr_params)

# Step 4: Train model using best parameters
# max_iter matches the value used inside the objective.
lr_best = LogisticRegression(**best_lr_params, max_iter=1000)
lr_best.fit(X_train_wo_scaled, y_train_wo)

# Step 5: Evaluate model on the held-out test split
y_pred_lr = lr_best.predict(X_test_wo_scaled)
y_proba_lr = lr_best.predict_proba(X_test_wo_scaled)[:, 1]

print("\n📊 Evaluation on Test Set (Logistic Regression):")
print("Accuracy:", accuracy_score(y_test_wo, y_pred_lr))
print("ROC-AUC Score:", roc_auc_score(y_test_wo, y_proba_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test_wo, y_pred_lr))
print("Classification Report:\n", classification_report(y_test_wo, y_pred_lr))


In [None]:
from xgboost import XGBClassifier
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# 🔍 Step 1: Define objective function for Optuna
def tune_xgb(trial):
    """Optuna objective for XGBClassifier.

    Samples n_estimators (50-200), max_depth (3-7), learning_rate (0.01-0.3) and
    subsample (0.5-1.0) from ``trial`` and returns the mean 5-fold ROC-AUC of
    that model on the notebook-level X_train_wo_scaled / y_train_wo.
    """
    n_estimators = trial.suggest_int("n_estimators", 50, 200)
    max_depth = trial.suggest_int("max_depth", 3, 7)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
    subsample = trial.suggest_float("subsample", 0.5, 1.0)

    candidate = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        use_label_encoder=False,
        eval_metric="logloss",
    )
    fold_auc = cross_val_score(candidate, X_train_wo_scaled, y_train_wo, cv=5, scoring="roc_auc")
    return fold_auc.mean()

# 🔁 Step 2: Run optimization
# The sampler is seeded so repeated runs explore the same trials and report the same
# best parameters (the default sampler is unseeded).
study_xgb = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
study_xgb.optimize(tune_xgb, n_trials=20)

# ✅ Step 3: Best parameters
best_xgb_params = study_xgb.best_params
print("✅ Best XGB Parameters:", best_xgb_params)

# 🎯 Step 4: Train model with best parameters
xgb_best = XGBClassifier(**best_xgb_params, use_label_encoder=False, eval_metric="logloss")
xgb_best.fit(X_train_wo_scaled, y_train_wo)

# 🧪 Step 5: Evaluation on the held-out test split
y_pred_xgb = xgb_best.predict(X_test_wo_scaled)
y_proba_xgb = xgb_best.predict_proba(X_test_wo_scaled)[:, 1]

print("\n📊 Evaluation on Test Set (XGBClassifier):")
print("Accuracy:", accuracy_score(y_test_wo, y_pred_xgb))
print("ROC-AUC Score:", roc_auc_score(y_test_wo, y_proba_xgb))
print("Classification Report:\n", classification_report(y_test_wo, y_pred_xgb))


In [None]:
import optuna
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# 📌 Objective function to optimize var_smoothing
def objective(trial):
    """Optuna objective for Gaussian Naive Bayes.

    Samples ``var_smoothing`` on a log scale between 1e-12 and 1e-6 and returns
    the mean 5-fold ROC-AUC of the resulting model on the notebook-level
    X_train_wo_scaled / y_train_wo.
    """
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform and
    # matches the call style used by the logistic-regression objective.
    var_smoothing = trial.suggest_float('var_smoothing', 1e-12, 1e-6, log=True)
    model = GaussianNB(var_smoothing=var_smoothing)

    # 5-fold CV on training set
    score = cross_val_score(model, X_train_wo_scaled, y_train_wo,
                            scoring='roc_auc', cv=5).mean()
    return score

# 🔁 Run optimization
# The sampler is seeded so repeated runs explore the same trials and report the same
# best parameters (the default sampler is unseeded).
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)

# 📌 Best parameters and score
print("✅ Best Parameters:", study.best_params)
print("🔝 Best ROC-AUC Score:", study.best_value)

# 🎯 Train with best params
best_gnb = GaussianNB(var_smoothing=study.best_params['var_smoothing'])
best_gnb.fit(X_train_wo_scaled, y_train_wo)

# 🧪 Predict and Evaluate on the held-out test split
y_pred_gnb = best_gnb.predict(X_test_wo_scaled)
y_proba_gnb = best_gnb.predict_proba(X_test_wo_scaled)[:, 1]

print("\n📊 Evaluation on Test Set:")
print("Accuracy:", accuracy_score(y_test_wo, y_pred_gnb))
print("ROC-AUC Score:", roc_auc_score(y_test_wo, y_proba_gnb))
print("Classification Report:\n", classification_report(y_test_wo, y_pred_gnb))


In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB

# Best estimators, built from the parameters found by the three Optuna studies above.
# (`best_xgb` was never defined — the tuned XGBoost parameters live in `best_xgb_params`.
# The logistic-regression and Naive Bayes settings are likewise read from their studies
# instead of being pasted in from an earlier run.)
estimators = [
    ('lr', LogisticRegression(**best_lr_params, max_iter=1000)),
    ('xgb', XGBClassifier(**best_xgb_params, use_label_encoder=False, eval_metric='logloss')),
    ('gnb', GaussianNB(var_smoothing=study.best_params['var_smoothing']))
]

# Meta learner
final_estimator = LogisticRegression()

# Create StackingClassifier
# passthrough=True feeds the original features to the meta learner alongside the
# base-model predictions; cv=5 sets the internal folds used to produce them.
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=final_estimator,
    passthrough=True,
    cv=5,
    n_jobs=-1
)

# Fit model
stacking_clf.fit(X_train_wo_scaled, y_train_wo)

# Evaluate
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix

y_pred_stack = stacking_clf.predict(X_test_wo_scaled)
y_proba_stack = stacking_clf.predict_proba(X_test_wo_scaled)[:, 1]

print("\n📊 Stacking Ensemble Evaluation:")
print("Accuracy:", accuracy_score(y_test_wo, y_pred_stack))
print("ROC-AUC Score:", roc_auc_score(y_test_wo, y_proba_stack))
print("Confusion Matrix:\n", confusion_matrix(y_test_wo, y_pred_stack))
print("Classification Report:\n", classification_report(y_test_wo, y_pred_stack))


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated ROC-AUC of the stacking ensemble on the scaled training split.
cv_scores = cross_val_score(stacking_clf, X_train_wo_scaled, y_train_wo, cv=5, scoring='roc_auc')
print("✅ Mean ROC-AUC:", cv_scores.mean())
print("📉 Standard Deviation:", cv_scores.std())


In [None]:
from sklearn.model_selection import StratifiedKFold

# Same evaluation with an explicit shuffled, seeded stratified 5-fold splitter.
# Note: this re-binds cv_scores from the previous cell.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(stacking_clf, X_train_wo_scaled, y_train_wo, cv=skf, scoring='roc_auc')
print("Stratified ROC-AUC Mean:", cv_scores.mean())
print("Stratified ROC-AUC Std Dev:", cv_scores.std())


In [None]:
from sklearn.inspection import permutation_importance

# Fit your best model again
# NOTE(review): this re-fits the shared stacking_clf object on the *unscaled* DataFrame
# X_train_wo, whereas the stacking cell fitted it on X_train_wo_scaled. Any later cell
# that calls stacking_clf must pass unscaled features unless it re-fits the model.
stacking_clf.fit(X_train_wo, y_train_wo)

# Permutation Importance on Test Set
# Each feature is shuffled 10 times (seeded); importance is measured with ROC-AUC.
result = permutation_importance(stacking_clf, X_test_wo, y_test_wo, n_repeats=10, random_state=42, scoring='roc_auc')

# Plot
import matplotlib.pyplot as plt

# Order features by mean importance (ascending) and label the bars with column names.
sorted_idx = result.importances_mean.argsort()
plt.figure(figsize=(10, 6))
plt.barh(range(len(sorted_idx)), result.importances_mean[sorted_idx])
plt.yticks(range(len(sorted_idx)), np.array(X_test_wo.columns)[sorted_idx])
plt.title("Permutation Feature Importance (ROC-AUC)")
plt.xlabel("Mean Importance")
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Positive-class probabilities and hard labels from the stacking ensemble
y_probs = stacking_clf.predict_proba(X_test_wo)[:, 1]
y_preds = stacking_clf.predict(X_test_wo)

# Side-by-side table of truth, prediction and predicted probability
df_debug = pd.DataFrame({
    "True Label": y_test_wo,
    "Predicted Label": y_preds,
    "Prediction Confidence": y_probs
})

# Keep only the rows where prediction and truth disagree
is_wrong = df_debug["True Label"].ne(df_debug["Predicted Label"])
misclassified = df_debug.loc[is_wrong]

# Stacked histogram (with KDE) of predicted probability for the errors, split by true class
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(
    data=misclassified,
    x="Prediction Confidence",
    hue="True Label",
    multiple="stack",
    kde=True,
    palette="Set2",
    ax=ax,
)
ax.set_title("Confidence Distribution for Misclassified Samples")
ax.set_xlabel("Predicted Probability")
ax.set_ylabel("Count")
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
from sklearn.metrics import roc_curve, accuracy_score, confusion_matrix, classification_report, roc_auc_score

# 1. ROC curve points for the tuned XGBoost probabilities
fpr, tpr, thresholds = roc_curve(y_test_wo, y_proba_xgb)

# 2. Pick the threshold that maximises TPR - FPR
# NOTE(review): the threshold is chosen and evaluated on the same test split.
youden_j = tpr - fpr
optimal_idx = np.argmax(youden_j)
optimal_threshold = thresholds[optimal_idx]
print(f"✅ Optimal Threshold: {optimal_threshold:.2f}")

# 3. Turn probabilities into labels at that threshold
y_pred_opt = np.where(y_proba_xgb >= optimal_threshold, 1, 0)

# 4. Report (ROC-AUC does not depend on the threshold)
print("\n📊 Evaluation with Optimized Threshold:")
for metric_name, metric_value in (
    ("Accuracy:", accuracy_score(y_test_wo, y_pred_opt)),
    ("ROC-AUC:", roc_auc_score(y_test_wo, y_proba_xgb)),
    ("Confusion Matrix:\n", confusion_matrix(y_test_wo, y_pred_opt)),
    ("Classification Report:\n", classification_report(y_test_wo, y_pred_opt)),
):
    print(metric_name, metric_value)


In [None]:
from sklearn.metrics import roc_curve, accuracy_score, roc_auc_score, confusion_matrix, classification_report

# The permutation-importance cell re-fitted stacking_clf on the unscaled features, so
# scoring X_test_wo_scaled with it as-is would mix feature scales. Re-fit on the scaled
# training data first so the model and its inputs agree.
stacking_clf.fit(X_train_wo_scaled, y_train_wo)

# Get predicted probabilities from stacking ensemble
y_proba_stack = stacking_clf.predict_proba(X_test_wo_scaled)[:, 1]

# Find optimal threshold (maximises TPR - FPR)
# NOTE(review): the threshold is chosen and evaluated on the same test split.
fpr, tpr, thresholds = roc_curve(y_test_wo, y_proba_stack)
optimal_idx = (tpr - fpr).argmax()
optimal_threshold = thresholds[optimal_idx]
print(f"Optimal Threshold for Stacking: {optimal_threshold:.2f}")

# Apply threshold to get predicted labels
y_pred_stack_opt = (y_proba_stack >= optimal_threshold).astype(int)

# Evaluate the optimized ensemble predictions
print("\n📊 Evaluation with Optimized Threshold (Stacking):")
print("Accuracy:", accuracy_score(y_test_wo, y_pred_stack_opt))
print("ROC-AUC:", roc_auc_score(y_test_wo, y_proba_stack))
print("Confusion Matrix:\n", confusion_matrix(y_test_wo, y_pred_stack_opt))
print("Classification Report:\n", classification_report(y_test_wo, y_pred_stack_opt))


In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix

# Define base models with the parameters found by the Optuna studies above.
# (`best_xgb` was never defined — the tuned XGBoost parameters live in `best_xgb_params`.
# The logistic-regression and Naive Bayes settings are likewise read from their studies
# instead of being pasted in from an earlier run.)
log_reg = LogisticRegression(**best_lr_params, max_iter=1000)
xgb_clf = XGBClassifier(**best_xgb_params, use_label_encoder=False, eval_metric='logloss')
gnb_clf = GaussianNB(var_smoothing=study.best_params['var_smoothing'])

# Create soft voting ensemble (averages the base models' predicted probabilities)
voting_clf = VotingClassifier(
    estimators=[('lr', log_reg), ('xgb', xgb_clf), ('gnb', gnb_clf)],
    voting='soft',  # Use 'hard' for majority vote
    n_jobs=-1
)

# Fit the model
voting_clf.fit(X_train_wo_scaled, y_train_wo)

# Predict
y_pred_vote = voting_clf.predict(X_test_wo_scaled)
y_proba_vote = voting_clf.predict_proba(X_test_wo_scaled)[:, 1]

# Evaluate
print("\n📊 Voting Ensemble Evaluation:")
print("Accuracy:", accuracy_score(y_test_wo, y_pred_vote))
print("ROC-AUC Score:", roc_auc_score(y_test_wo, y_proba_vote))
print("Confusion Matrix:\n", confusion_matrix(y_test_wo, y_pred_vote))
print("Classification Report:\n", classification_report(y_test_wo, y_pred_vote))


In [None]:
from sklearn.metrics import roc_curve

# Step 1: Positive-class probabilities from the voting ensemble
y_proba_voting = voting_clf.predict_proba(X_test_wo_scaled)[:, 1]

# Step 2: Pick the threshold that maximises TPR - FPR
# NOTE(review): the threshold is chosen and evaluated on the same test split.
fpr, tpr, thresholds = roc_curve(y_test_wo, y_proba_voting)
youden_j = tpr - fpr
optimal_idx = np.argmax(youden_j)
optimal_threshold = thresholds[optimal_idx]
print(f"Optimal Threshold: {optimal_threshold:.2f}")

# Step 3: Turn probabilities into labels at that threshold
y_pred_voting_opt = np.where(y_proba_voting >= optimal_threshold, 1, 0)

# Step 4: Report (ROC-AUC does not depend on the threshold)
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix

print("\n📊 Evaluation with Optimized Threshold (Voting Ensemble):")
for metric_name, metric_value in (
    ("Accuracy:", accuracy_score(y_test_wo, y_pred_voting_opt)),
    ("ROC-AUC Score:", roc_auc_score(y_test_wo, y_proba_voting)),
    ("Confusion Matrix:\n", confusion_matrix(y_test_wo, y_pred_voting_opt)),
    ("Classification Report:\n", classification_report(y_test_wo, y_pred_voting_opt)),
):
    print(metric_name, metric_value)


In [None]:
import joblib
# Serialize the fitted voting ensemble to the working directory.
# NOTE(review): the next cell writes the same object again under a different file name.
joblib.dump(voting_clf, "voting_ensemble_final.pkl")


In [None]:
import joblib

# Save the VotingClassifier
joblib.dump(voting_clf, 'voting_model.pkl')

# Save the scaler that actually produced the model's training features.
# voting_clf was fitted on X_train_wo_scaled from the re-split cell, which was created
# with `scaler`; `scaler_wo` was fitted on the SMOTE-resampled data and does not match
# this model. At inference time, inputs also need the same log1p transform of the
# skewed features before this scaler is applied.
joblib.dump(scaler, 'scaler.pkl')


In [None]:
# Clone the portfolio repository and make it the working directory for any later cells.
# NOTE(review): running this cell twice fails on the clone because the target directory
# already exists — confirm it is only meant for a fresh Colab session.
!git clone https://github.com/naidk/Internship-Portfolio.git
%cd Internship-Portfolio
