# Part 4: Feature Engineering & Selection
**Student:** Nevin Nijanthan (IT24102335)

In [None]:
# ===============================
# Feature Engineering & EDA Notebook
# Author: Nevin Nijanthan
# ===============================

# --- 1. Imports ---
import pandas as pd
from sklearn.feature_selection import mutual_info_classif
import matplotlib.pyplot as plt
import seaborn as sns
import os

%matplotlib inline

# --- 2. Settings ---
# Team roster: member name -> student ID.
members = {
    "Kishan Ahamed": "IT24103829",
    "Abhinaya Kumar": "IT24103851",
    "Lafry": "IT24103834",
    "Nevin Nijanthan": "IT24102335",
    "Indhuwara": "IT24103022",
    "Sandali": "IT24103843",
}

# Output path for visualizations. Kept as a plain string with a trailing
# slash so that file names can be appended by simple concatenation.
output_viz_path = "../results/eda_visualizations/"
# exist_ok=True makes this idempotent on notebook re-runs and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_viz_path, exist_ok=True)

# Dataset path
data_path = "../data/raw/creditcard.csv"

# --- 3. Load Dataset ---
# A missing CSV is reported instead of raised, in which case `df` is not
# assigned by this section.
try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    print(f"❌ Error: {data_path} not found.")
else:
    # Only reached when the read succeeded.
    print("✅ Dataset loaded successfully! Shape:", df.shape)

# --- 4. Feature Engineering ---
# Adds an Amount / (Time + 1) ratio column when both inputs exist; the +1
# keeps the denominator non-zero for rows where Time is 0.
if 'df' in locals():
    if not {'Amount', 'Time'}.issubset(df.columns):
        print("⚠️ Skipping feature engineering (Amount/Time not found).")
    else:
        df['Amount_per_Time'] = df['Amount'].div(df['Time'].add(1))
        print("✅ Created new feature: Amount_per_Time")

# --- 5. Mutual Information for Feature Selection ---
def _save_show_close(filename):
    """Save the current figure into ``output_viz_path``, display it, close it.

    The save must come before ``plt.show()``: once a figure has been shown
    it may already be finalized, and a later ``plt.savefig`` can then write
    a blank image.
    """
    plt.savefig(f"{output_viz_path}{filename}")
    plt.show()
    plt.close()


if 'df' in locals() and 'Class' not in df.columns:
    # Without the target column neither MI scoring nor the class plot work.
    print("⚠️ Skipping feature selection and plots ('Class' column not found).")
elif 'df' in locals():
    # Student ID used as the filename prefix for every saved figure.
    author_id = members['Nevin Nijanthan']

    X_temp = df.drop('Class', axis=1)
    y_temp = df['Class']

    # Score every feature against the target; random_state fixes the
    # estimator's randomness so repeated runs give the same ranking.
    mi_scores = mutual_info_classif(X_temp, y_temp, random_state=42)
    mi_series = pd.Series(mi_scores, index=X_temp.columns).sort_values(ascending=False)

    print("🔹 Mutual Information Scores:")
    display(mi_series)

    # Keep only features that carry some information about the target.
    selected_features = mi_series[mi_series > 0].index.tolist()
    print("🔹 Selected features (MI > 0):", selected_features)

    # --- 6. Plot Mutual Information Scores ---
    plt.figure(figsize=(12, 6))
    # mi_series is already sorted in descending order above.
    mi_series.plot(kind='bar', color='skyblue')
    plt.title("Mutual Information Scores of Features")
    plt.ylabel("MI Score")
    plt.tight_layout()
    _save_show_close(f"{author_id}_feature_engineering.png")

    # --- 7. Class Distribution ---
    plt.figure(figsize=(6, 4))
    sns.countplot(x='Class', data=df)
    plt.title("Fraud (1) vs Non-Fraud (0) - Before SMOTE")
    _save_show_close(f"{author_id}_class_distribution_before_smote.png")

    if selected_features:
        # --- 8. Feature Histograms ---
        # DataFrame.hist creates its own figure, so no plt.figure() here.
        df[selected_features].hist(figsize=(20, 15), bins=30, edgecolor='black')
        plt.suptitle("Feature Histograms")
        _save_show_close(f"{author_id}_feature_histograms.png")

        # --- 9. Correlation Heatmap ---
        plt.figure(figsize=(12, 8))
        corr = df[selected_features].corr()
        sns.heatmap(corr, cmap='coolwarm', linewidths=0.5, annot=True)
        plt.title("Feature Correlation Heatmap")
        _save_show_close(f"{author_id}_feature_correlation_heatmap.png")
    else:
        # Nothing to plot: hist/heatmap would fail on an empty column list.
        print("⚠️ Skipping histograms and heatmap (no features with MI > 0).")
