<a href="https://colab.research.google.com/github/my-tech123/Assignment-4/blob/main/assignment_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ===============================================================
# ENSEMBLE MODELS FOR IMBALANCED DATA
# Techniques: Bagging, Boosting, Stacking + SMOTE
# Dataset: Credit Card Fraud Detection
# ===============================================================

# --- 1Ô∏è‚É£  Install dependencies ---
!pip install -q pandas numpy scikit-learn imbalanced-learn xgboost matplotlib seaborn

# --- 2. Import libraries ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score, RocCurveDisplay

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE

# --- 3. Load dataset (Credit Card Fraud Dataset) ---
# Point `url` at a local CSV path instead if the file is already downloaded.
url = "https://storage.googleapis.com/download.tensorflow.org/data/creditcard.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Sanity check: confirmation message, first rows, and the label balance.
print("‚úÖ Dataset loaded successfully.")
print(df.head(n=5))
print("\nClass Distribution:\n", df["Class"].value_counts())

# --- 4. Preprocessing ---
# Separate the features from the binary target column.
X = df.drop('Class', axis=1)
y = df['Class']

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# test rows influence the mean/std used to transform the training rows
# (data leakage). The split itself depends only on `y` and `random_state`,
# so the same rows land in train/test as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the held-out test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that any code still referring to `X_scaled` continues to work;
# it is now the full feature matrix scaled with training-set statistics.
X_scaled = scaler.transform(X)

# --- 5. Handle imbalance with SMOTE ---
# Oversample only the training split; the test split is left untouched.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X=X_train, y=y_train)

# Report the class counts after resampling.
print(f"\nAfter SMOTE: {y_res.value_counts()}")

# --- 6. Train Ensemble Models ---
# Each model is fit on the SMOTE-resampled training data and scored on the
# test split, which was not resampled.
models = {
    "Random Forest (Bagging)": RandomForestClassifier(n_estimators=100, random_state=42),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# Accumulates one (model name, AUC, F1) tuple per evaluated model.
results = []

for name, model in models.items():
    model.fit(X_res, y_res)

    # Hard 0/1 labels are what F1 and the classification report need.
    y_pred = model.predict(X_test)

    # ROC AUC measures ranking quality, so it must be computed from continuous
    # scores. Feeding it hard labels collapses the curve to a single operating
    # point and misreports the AUC. Column 1 is the probability of class 1.
    y_score = model.predict_proba(X_test)[:, 1]

    auc = roc_auc_score(y_test, y_score)
    f1 = f1_score(y_test, y_pred)
    results.append((name, auc, f1))
    print(f"\nüîπ {name}")
    print("AUC:", round(auc, 4))
    print("F1-Score:", round(f1, 4))
    print(classification_report(y_test, y_pred))

# --- 7. Stacking Ensemble ---
# Base learners whose outputs are combined by a logistic-regression
# meta-learner. Fresh estimator instances are created here rather than
# reusing the fitted ones in `models`.
estimators = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42))
]

stack_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    n_jobs=-1
)

stack_model.fit(X_res, y_res)

# Hard 0/1 labels for F1 and the classification report.
y_pred_stack = stack_model.predict(X_test)

# ROC AUC must be computed from continuous scores, not hard labels; with
# labels the curve has a single operating point and the AUC is misreported.
# Column 1 is the probability of class 1.
y_score_stack = stack_model.predict_proba(X_test)[:, 1]

auc_stack = roc_auc_score(y_test, y_score_stack)
f1_stack = f1_score(y_test, y_pred_stack)
results.append(("Stacking Ensemble", auc_stack, f1_stack))

print("\nüî∑ Stacking Ensemble Results:")
print("AUC:", round(auc_stack, 4))
print("F1-Score:", round(f1_stack, 4))
print(classification_report(y_test, y_pred_stack))

# --- 8. Compare model performance ---
# Turn the collected (name, AUC, F1) tuples into a table, one row per model.
results_df = pd.DataFrame(
    data=results,
    columns=["Model", "AUC", "F1-Score"],
)
print("\nüìä Model Performance Summary:\n")
print(results_df)

# --- 9. Visualization ---
# Grouped bar chart: one group per model, one bar per metric. The summary
# table is melted to long form so seaborn can colour the bars by metric name.
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(
    data=results_df.melt(id_vars='Model'),
    x='Model',
    y='value',
    hue='variable',
    ax=ax,
)
ax.set_title("Model Comparison (AUC & F1-Score)")
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

# --- 10. ROC Curve for best model ---
# The best model is the one with the highest AUC in the summary table.
best_model_name = results_df.sort_values(by="AUC", ascending=False)["Model"].iloc[0]
print(f"\nüèÜ Best Model: {best_model_name}")

# The stacking model is held in its own variable; every other candidate is
# looked up by name in the `models` dict.
best_model = stack_model if "Stacking" in best_model_name else models[best_model_name]

RocCurveDisplay.from_estimator(best_model, X_test, y_test)
plt.title(f"ROC Curve - {best_model_name}")
plt.show()


‚úÖ Dataset loaded successfully.
   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   
