# Part 5: Train/Test Split & Scaling
**Student:** Indhuwara (IT24103022)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_classif
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Team roster: member name -> student ID.
members = {
    "Kishan Ahamed": "IT24103829",
    "Abhinaya Kumar": "IT24103851",
    "Lafry": "IT24103834",
    "Nevin Nijanthan": "IT24102335",
    "Indhuwara": "IT24103022",
    "Sandali": "IT24103843",
}

# Directory that receives the figures written by this notebook part.
output_viz_path = "../results/eda_visualizations/"

# exist_ok=True creates the directory tree when it is missing and is a no-op
# when it already exists, which avoids the check-then-create race of a
# separate os.path.exists() test.
os.makedirs(output_viz_path, exist_ok=True)

# Load the raw dataset. Only the read itself is guarded, so a FileNotFoundError
# raised by any later step is never misreported as a missing CSV.
try:
    df = pd.read_csv("../data/raw/creditcard.csv")
except FileNotFoundError:
    # X and y are intentionally left undefined here; the split step below
    # checks for their existence before running.
    print("Error: creditcard.csv not found. Make sure it is in the 'data/raw' directory.")
else:
    print("=== Dataset Loaded ===")
    X_temp = df.drop('Class', axis=1)
    y_temp = df['Class']

    # Score features on the training rows only. Computing mutual information
    # on the full dataset would let test-set labels influence which features
    # are kept (data leakage). The stratified split is deterministic given the
    # labels, test_size and random_state, so using the same parameters as the
    # train/test split performed later in this script yields exactly the rows
    # that end up in the training set.
    X_fit, _, y_fit, _ = train_test_split(
        X_temp, y_temp, test_size=0.2, random_state=42, stratify=y_temp)

    mi_scores = mutual_info_classif(X_fit, y_fit, random_state=42)
    mi_series = pd.Series(mi_scores, index=X_temp.columns).sort_values(ascending=False)

    # Keep every feature with a strictly positive mutual-information score.
    selected_features = mi_series[mi_series > 0].index.tolist()
    X = df[selected_features]
    y = df['Class']

# Run only when the loading step succeeded and defined X and y.
if 'X' in locals() and 'y' in locals():
    # Stratified 80/20 split so both partitions keep the class ratio of y.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)

    # Fit the scaler on the training partition only and reuse its statistics
    # for the test partition.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # fit_transform returns a plain array; rebuild a frame with the original
    # column names and attach the labels positionally via .values.
    train_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
    train_df['Class'] = y_train.values

    # 'Amount' is only present if it survived feature selection; skip the plot
    # instead of crashing after the split and scaling have already succeeded.
    if 'Amount' in train_df.columns:
        plt.figure(figsize=(12,6))
        sns.boxplot(x='Class', y='Amount', data=train_df)
        plt.title("Boxplot of 'Amount' BEFORE SMOTE")
        # The plotted values come from the scaled training frame.
        plt.ylabel("Amount (standardized)")
        plt.savefig(f"{output_viz_path}{members['Indhuwara']}_Indhuwara_train_test_split.png")
        plt.close()
    else:
        print("Skipping 'Amount' boxplot: 'Amount' is not among the selected features.")

    print("✅ Train/Test split and scaling complete.")
    print(f"Training set shape: {X_train_scaled.shape}")
    print(f"Test set shape: {X_test_scaled.shape}")