# Integrated Preprocessing Pipeline
This notebook contains the complete, integrated preprocessing pipeline for the credit card fraud detection dataset. It combines the work of all group members.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import mutual_info_classif

# Define the path for saving visualizations
output_viz_path = "../results/eda_visualizations/"
if not os.path.exists(output_viz_path):
    os.makedirs(output_viz_path)

# 1. Load dataset
df = pd.read_csv("../data/raw/creditcard.csv")
print("
=== Dataset Loaded ===")
print(df.head())

# ========================= Kishan Ahamed =========================
# 2. Handle missing values
print("
Missing values:
", df.isnull().sum())
if df.isnull().sum().sum() > 0:
    print("
Handling missing values...")
    for col in df.columns:
        if df[col].dtype in ['float64', 'int64']:
            df[col] = df[col].fillna(df[col].median())
        else:
            df[col] = df[col].fillna(df[col].mode()[0])
else:
    print("✅ No missing values found.")

# ========================= Abhinaya Kumar =========================
# 3. Encode categorical variables
categorical_cols = df.select_dtypes(include=['object']).columns
if len(categorical_cols) > 0:
    print(f"
Encoding categorical variables: {list(categorical_cols)}")
    le = LabelEncoder()
    for col in categorical_cols:
        df[col] = le.fit_transform(df[col])
else:
    print("✅ No categorical variables found.")

# ========================= Lafry =========================
# 4. Outlier handling using percentile clipping (1st–99th percentile)
num_cols = df.select_dtypes(include=['float64', 'int64']).columns.drop('Class', errors='ignore')
print(f"
Clipping outliers in {len(num_cols)} numeric features to 1st–99th percentile...")
for col in num_cols:
    lower = df[col].quantile(0.01)
    upper = df[col].quantile(0.99)
    df[col] = np.clip(df[col], lower, upper)
print("✅ Outlier clipping complete.")

# ========================= Nevin Nijanthan =========================
# 5. Feature Engineering (before train/test split) + Feature Selection
if 'Amount' in df.columns and 'Time' in df.columns:
    df['Amount_per_Time'] = df['Amount'] / (df['Time'] + 1)
    print("✅ Created new feature: Amount_per_Time")
else:
    print("⚠️ Skipping feature engineering (Amount/Time not found).")

# Mutual Information Feature Selection
X_temp = df.drop('Class', axis=1)
y_temp = df['Class']
mi_scores = mutual_info_classif(X_temp, y_temp, random_state=42)
mi_series = pd.Series(mi_scores, index=X_temp.columns).sort_values(ascending=False)
print("
Mutual Information Scores:
", mi_series)
selected_features = mi_series[mi_series > 0].index.tolist()
print("
Selected features based on MI (>0):
", selected_features)
X = df[selected_features]
y = df['Class']

# Class distribution BEFORE SMOTE
print("
Class distribution BEFORE SMOTE:
", y.value_counts())
sns.countplot(x='Class', data=df)
plt.title("Fraud (1) vs Non-Fraud (0) - Before SMOTE")
plt.savefig(f"{output_viz_path}class_distribution_before_smote.png")
plt.close()

# Feature histograms
df[selected_features].hist(figsize=(20, 15), bins=30, edgecolor='black')
plt.suptitle("Feature Histograms", fontsize=16)
plt.savefig(f"{output_viz_path}feature_histograms.png")
plt.close()

# Correlation heatmap
plt.figure(figsize=(12, 8))
corr = df[selected_features].corr()
sns.heatmap(corr, cmap='coolwarm', linewidths=0.5)
plt.title("Feature Correlation Heatmap")
plt.savefig(f"{output_viz_path}feature_correlation_heatmap.png")
plt.close()

# ========================= Indhuwara =========================
# 6. Train/Test Split + Scaling
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Boxplot BEFORE SMOTE (example with 'Amount')
train_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
train_df['Class'] = y_train.values
plt.figure(figsize=(12,6))
sns.boxplot(x='Class', y='Amount', data=train_df)
plt.title("Boxplot of 'Amount' BEFORE SMOTE")
plt.savefig(f"{output_viz_path}amount_boxplot_before_smote.png")
plt.close()

# ========================= Sandali =========================
# 7. Apply SMOTE only on training set (Balancing Classes)
print("
Applying SMOTE to training set...")
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train_scaled, y_train)

# Boxplot AFTER SMOTE
train_res_df = pd.DataFrame(X_train_res, columns=X_train.columns)
train_res_df['Class'] = y_train_res
plt.figure(figsize=(12,6))
sns.boxplot(x='Class', y='Amount', data=train_res_df)
plt.title("Boxplot of 'Amount' AFTER SMOTE")
plt.savefig(f"{output_viz_path}amount_boxplot_after_smote.png")
plt.close()

# Check class distributions
print("
Class distribution BEFORE SMOTE (Train):
", y_train.value_counts())
print("
Class distribution AFTER SMOTE (Train):
", pd.Series(y_train_res).value_counts())
sns.countplot(x=y_train_res)
plt.title("Fraud (1) vs Non-Fraud (0) - After SMOTE (Train)")
plt.savefig(f"{output_viz_path}class_distribution_after_smote.png")
plt.close()

# ========================= Shared (Final Checks – Whole Team) =========================
# 8. Summary statistics
print("
Summary statistics:
", df.describe())

# 9. Preprocessing Quality Check
print("
=== Preprocessing Quality Check ===")
print(f"Training set shape after SMOTE: {X_train_res.shape}, Labels: {y_train_res.shape}")
print("NaN values in training features:", np.isnan(X_train_res).sum())
print("Infinite values in training features:", np.isinf(X_train_res).sum())
feature_variance = np.var(X_train_res, axis=0)
low_var_features = np.where(feature_variance < 1e-6)[0]
if len(low_var_features) > 0:
    print(f"⚠️ Low variance features detected at indices: {low_var_features}")
else:
    print("✅ All features have acceptable variance.")

# 10. Multicollinearity check
corr_matrix = pd.DataFrame(X_train_res).corr()
high_corr = np.where((corr_matrix > 0.95) & (corr_matrix < 1.0))
if len(high_corr[0]) > 0:
    print("⚠️ Highly correlated features detected!")
else:
    print("✅ No problematic correlations detected.")

SyntaxError: unterminated string literal (detected at line 18) (58217433.py, line 18)