# Integrated Preprocessing Pipeline
This notebook contains the complete, integrated preprocessing pipeline for the credit card fraud detection dataset. It combines the work of all group members.

In [5]:
# For inline plotting in Jupyter
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import mutual_info_classif

# Correct IT numbers and names
members = {
    "Kishan Ahamed": "IT24103829",
    "Abhinaya Kumar": "IT24103851",
    "Lafry": "IT24103834",
    "Nevin Nijanthan": "IT24102335",
    "Indhuwara": "IT24103022",
    "Sandali": "IT24103843"
}

# Define the path for saving visualizations
output_viz_path = "../results/eda_visualizations/"
if not os.path.exists(output_viz_path):
    os.makedirs(output_viz_path)

# 1. Load dataset
df = pd.read_csv("../data/raw/creditcard.csv")
print("\n=== Dataset Loaded ===")
df.head()

# ========================= Kishan Ahamed =========================
# 2. Handle missing values
print("\nMissing values:\n", df.isnull().sum())
# Always generate missing values heatmap
plt.figure(figsize=(12,6))
sns.heatmap(df.isnull(), cbar=False, yticklabels=False, cmap='viridis')
plt.title("Missing Values Heatmap (After Imputation / Placeholder)")
plt.savefig(f"{output_viz_path}{members['Kishan Ahamed']}_Kishan_Ahamed_missing_values.png")
plt.close()

# If missing values exist, fill them
if df.isnull().sum().sum() > 0:
    print("\nHandling missing values...")
    for col in df.columns:
        if df[col].dtype in ['float64', 'int64']:
            df[col] = df[col].fillna(df[col].median())
        else:
            df[col] = df[col].fillna(df[col].mode()[0])
    print("✅ Missing values imputed.")
else:
    print("✅ No missing values found.")

# ========================= Abhinaya Kumar =========================
# 3. Encode categorical variables
categorical_cols = df.select_dtypes(include=['object']).columns
if len(categorical_cols) > 0:
    print(f"\nEncoding categorical variables: {list(categorical_cols)}")
    le = LabelEncoder()
    for col in categorical_cols:
        df[col] = le.fit_transform(df[col])
        # EDA: value counts bar plot for categorical variable
        plt.figure(figsize=(8,4))
        sns.countplot(x=col, data=df)
        plt.title(f"Value Counts for {col} (After Encoding)")
        plt.savefig(f"{output_viz_path}{members['Abhinaya Kumar']}_Abhinaya_Kumar_{col}_encoding.png")
        plt.close()
else:
    print("✅ No categorical variables found.")
    # Placeholder plot
    plt.figure(figsize=(6,4))
    plt.bar(['No categorical columns'], [1], color='gray')
    plt.title("No Categorical Columns Found")
    plt.savefig(f"{output_viz_path}{members['Abhinaya Kumar']}_Abhinaya_Kumar_no_categorical.png")
    plt.close()

# ========================= Lafry =========================
# 4. Outlier handling using percentile clipping (1st–99th percentile)
num_cols = df.select_dtypes(include=['float64', 'int64']).columns.drop('Class', errors='ignore')
print(f"\nClipping outliers in {len(num_cols)} numeric features to 1st–99th percentile...")

# Histograms BEFORE clipping
df[num_cols].hist(figsize=(20, 15), bins=30, edgecolor='black')
plt.suptitle("Numeric Features Histograms BEFORE Outlier Clipping", fontsize=16)
plt.savefig(f"{output_viz_path}{members['Lafry']}_Lafry_outlier_histograms_before.png")
plt.close()

# Clipping outliers
for col in num_cols:
    lower = df[col].quantile(0.01)
    upper = df[col].quantile(0.99)
    df[col] = np.clip(df[col], lower, upper)

# Histograms AFTER clipping
df[num_cols].hist(figsize=(20, 15), bins=30, edgecolor='black')
plt.suptitle("Numeric Features Histograms AFTER Outlier Clipping", fontsize=16)
plt.savefig(f"{output_viz_path}{members['Lafry']}_Lafry_outlier_histograms_after.png")
plt.close()

print("✅ Outlier clipping complete.")

# ========================= Nevin Nijanthan =========================
# 5. Feature Engineering + Feature Selection
if 'Amount' in df.columns and 'Time' in df.columns:
    df['Amount_per_Time'] = df['Amount'] / (df['Time'] + 1)
    print("✅ Created new feature: Amount_per_Time")
else:
    print("⚠️ Skipping feature engineering (Amount/Time not found).")

# Mutual Information Feature Selection
X_temp = df.drop('Class', axis=1)
y_temp = df['Class']
mi_scores = mutual_info_classif(X_temp, y_temp, random_state=42)
mi_series = pd.Series(mi_scores, index=X_temp.columns).sort_values(ascending=False)
selected_features = mi_series[mi_series > 0].index.tolist()

# Bar plot of MI scores
plt.figure(figsize=(12,6))
mi_series.sort_values(ascending=False).plot(kind='bar')
plt.title("Mutual Information Scores of Features")
plt.ylabel("MI Score")
plt.tight_layout()
plt.savefig(f"{output_viz_path}{members['Nevin Nijanthan']}_Nevin_Nijanthan_feature_engineering.png")
plt.close()

X = df[selected_features]
y = df['Class']

# Class distribution BEFORE SMOTE
sns.countplot(x='Class', data=df)
plt.title("Fraud (1) vs Non-Fraud (0) - Before SMOTE")
plt.savefig(f"{output_viz_path}{members['Nevin Nijanthan']}_Nevin_Nijanthan_class_distribution_before_smote.png")
plt.close()

# Feature histograms
df[selected_features].hist(figsize=(20, 15), bins=30, edgecolor='black')
plt.suptitle("Feature Histograms")
plt.savefig(f"{output_viz_path}{members['Nevin Nijanthan']}_Nevin_Nijanthan_feature_histograms.png")
plt.close()

# Correlation heatmap
plt.figure(figsize=(12, 8))
corr = df[selected_features].corr()
sns.heatmap(corr, cmap='coolwarm', linewidths=0.5)
plt.title("Feature Correlation Heatmap")
plt.savefig(f"{output_viz_path}{members['Nevin Nijanthan']}_Nevin_Nijanthan_feature_correlation_heatmap.png")
plt.close()

# ========================= Indhuwara =========================
# 6. Train/Test Split + Scaling
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Boxplot BEFORE SMOTE
train_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
train_df['Class'] = y_train.values
plt.figure(figsize=(12,6))
sns.boxplot(x='Class', y='Amount', data=train_df)
plt.title("Boxplot of 'Amount' BEFORE SMOTE")
plt.savefig(f"{output_viz_path}{members['Indhuwara']}_Indhuwara_train_test_split.png")
plt.close()

# ========================= Sandali =========================
# 7. Apply SMOTE only on training set
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train_scaled, y_train)

# Boxplot AFTER SMOTE
train_res_df = pd.DataFrame(X_train_res, columns=X_train.columns)
train_res_df['Class'] = y_train_res
plt.figure(figsize=(12,6))
sns.boxplot(x='Class', y='Amount', data=train_res_df)
plt.title("Boxplot of 'Amount' AFTER SMOTE")
plt.savefig(f"{output_viz_path}{members['Sandali']}_Sandali_smote_boxplot_after.png")
plt.close()

# Class distributions
sns.countplot(x=y_train_res)
plt.title("Fraud (1) vs Non-Fraud (0) - After SMOTE (Train)")
plt.savefig(f"{output_viz_path}{members['Sandali']}_Sandali_smote_class_distribution.png")
plt.close()

# ========================= Shared =========================
# 8. Summary statistics
print("\nSummary statistics:\n", df.describe())

# 9. Preprocessing Quality Check
print("\n=== Preprocessing Quality Check ===")
print(f"Training set shape after SMOTE: {X_train_res.shape}, Labels: {y_train_res.shape}")
print("NaN values in training features:", np.isnan(X_train_res).sum())
print("Infinite values in training features:", np.isinf(X_train_res).sum())
feature_variance = np.var(X_train_res, axis=0)
low_var_features = np.where(feature_variance < 1e-6)[0]
if len(low_var_features) > 0:
    print(f"⚠️ Low variance features detected at indices: {low_var_features}")
else:
    print("✅ All features have acceptable variance.")

# 10. Multicollinearity check
corr_matrix = pd.DataFrame(X_train_res).corr()
high_corr = np.where((corr_matrix > 0.95) & (corr_matrix < 1.0))
if len(high_corr[0]) > 0:
    print("⚠️ Highly correlated features detected!")
else:
    print("✅ No problematic correlations detected.")



=== Dataset Loaded ===

Missing values:
 Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64
✅ No missing values found.
✅ No categorical variables found.

Clipping outliers in 30 numeric features to 1st–99th percentile...
✅ Outlier clipping complete.
✅ Created new feature: Amount_per_Time

Summary statistics:
                 Time             V1             V2             V3  \
count  284807.000000  284807.000000  284807.000000  284807.000000   
mean    94816.385366       0.038150       0.022670       0.021939   
std     47445.797516       1.712310       1.275830       1.370562   
min      2422.000000      -6.563199      -4.960300      -3.978377   
25%     5