# Credit Card Fraud Detection - Data Preprocessing

## HỌ VÀ TÊN: Cao Tấn Hoàng Huy
## MSSV: 23127051

In [None]:
# Import the required libraries
import numpy as np
import matplotlib.pyplot as plt

# Setup: seed NumPy's global random generator so repeated runs draw the same numbers
np.random.seed(42)

## Load data

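**Note:** Chạy notebook `01_data_exploration.ipynb` trước (trong cùng kernel) để có sẵn biến `data` và `column_names`; nếu chưa có, cell bên dưới sẽ tự đọc lại dữ liệu từ `../data/creditcard.csv`.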

In [None]:
# Reuse `data` / `column_names` when they already exist in this kernel;
# otherwise parse them from the CSV file.
try:
    print("Checking for data from previous notebook...")
    print(f"Data shape: {data.shape}")
    print(f"Column names: {len(column_names)}")
    print("✓ Data loaded from previous notebook successfully!")
except NameError:
    # Either name is undefined, so load everything fresh.
    print("Data not found in memory. Loading from CSV...")

    data_list = []
    error_lines = []  # (data row number, reason) for every rejected row

    # Stream the file row by row instead of holding every raw line in memory.
    with open('../data/creditcard.csv', 'r', encoding='utf-8') as f:
        header_line = f.readline()
        if not header_line.strip():
            raise ValueError("../data/creditcard.csv has no header row")

        # The header fixes both the column names and the expected row width.
        header = header_line.strip().replace('"', '').split(',')
        n_columns = len(header)

        # Rows are numbered from 1, not counting the header line.
        for i, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue

            try:
                float_values = [float(val.strip().strip('"')) for val in line.split(',')]
            except ValueError as e:
                # A non-numeric cell: remember why the row was rejected.
                error_lines.append((i, str(e)))
                continue

            if len(float_values) == n_columns:
                data_list.append(float_values)
            else:
                # Wrong number of cells: record the width that was found.
                error_lines.append((i, len(float_values)))

    data = np.array(data_list, dtype=np.float64)
    column_names = header

    print("✓ Data loaded from CSV successfully!")
    print(f"  Shape: {data.shape}")
    print(f"  Columns: {len(column_names)}")
    if error_lines:
        # Surface rejected rows instead of dropping them silently.
        print(f"  ⚠ Skipped {len(error_lines)} malformed line(s); first: {error_lines[0]}")

# DATA PREPROCESSING

## 1. Tách features và target

In [None]:
# Split the loaded matrix into the feature block X and the label vector y
print("Separating features and target variable...")

# Plain slicing: the last column is the Class label, everything before it is a feature
y = data[:, -1]
X = data[:, :-1]

# Names that line up with the columns of X (the trailing Class name is dropped)
feature_names = column_names[:-1]
n_feature_names = len(feature_names)

print("Features (X):")
print(f"  Shape: {X.shape}")
print(f"  Features: {n_feature_names}")

print("\nTarget (y):")
print(f"  Shape: {y.shape}")
print(f"  Unique values: {np.unique(y)}")

# Preview: one "index: name" line per feature, only the first 10 are shown
print("\nFeature names:")
preview_limit = 10
feature_info = np.array([f"  {idx:2d}: {col}" for idx, col in enumerate(feature_names)])
print('\n'.join(feature_info[:preview_limit]))
if n_feature_names > preview_limit:
    print(f"  ... and {n_feature_names - preview_limit} more features")

# Per-class counts and percentages from a single np.unique call
print("\nClass distribution:")
unique_classes, class_counts = np.unique(y, return_counts=True)
class_percentages = (class_counts / len(y)) * 100
class_labels = np.where(unique_classes == 0, "Normal", "Fraud")

# One summary line per class (this loop runs over classes, not samples)
summary_lines = [
    f"  {label} ({cls}): {count:>6,} samples ({pct:>5.2f}%)"
    for label, cls, count, pct in zip(class_labels, unique_classes, class_counts, class_percentages)
]
for summary_line in summary_lines:
    print(summary_line)

## 2. Standardization Implementation (Pure NumPy)

In [None]:
# StandardScaler implemented with NumPy only (no per-sample Python loops)
class StandardScaler:
    """Standardize features column-wise: ``(X - mean) / std``.

    ``fit`` learns the per-column mean and sample standard deviation
    (``ddof=1``); ``transform`` applies them through broadcasting and
    ``inverse_transform`` undoes the scaling.

    Attributes set by ``fit``:
        mean_        per-column mean, shape (n_features,)
        std_         per-column divisor, shape (n_features,); columns whose
                     standard deviation is 0 use 1.0 instead
        n_features_  number of columns seen during ``fit``
        fitted       True once ``fit`` has completed
    """

    def __init__(self):
        self.mean_ = None
        self.std_ = None
        self.n_features_ = None
        self.fitted = False

    @staticmethod
    def _as_2d_float(X):
        """Convert X to a float64 array and require it to be 2-D."""
        X = np.asarray(X, dtype=np.float64)
        if X.ndim != 2:
            raise ValueError(
                f"Expected a 2-D array (n_samples, n_features), got {X.ndim}-D input"
            )
        return X

    def fit(self, X):
        """Learn the per-column mean and sample standard deviation of X.

        Parameters:
            X: array-like of shape (n_samples, n_features).

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if X is not 2-D or contains no samples.
        """
        X = self._as_2d_float(X)
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("Cannot fit StandardScaler on an array with 0 samples")

        self.n_features_ = n_features
        self.mean_ = np.mean(X, axis=0)

        if n_samples > 1:
            std = np.std(X, axis=0, ddof=1)  # sample standard deviation
        else:
            # ddof=1 is undefined for a single sample (it would yield NaN),
            # so fall back to a neutral divisor.
            std = np.ones(n_features, dtype=np.float64)

        # Constant columns have std 0; divide by 1 so they map to 0, not NaN/inf.
        self.std_ = np.where(std == 0, 1.0, std)

        self.fitted = True
        return self

    def transform(self, X):
        """Standardize X with the statistics learned by ``fit``.

        Parameters:
            X: array-like of shape (n_samples, n_features_).

        Returns:
            float64 array with the same shape as X.

        Raises:
            ValueError: if the scaler is not fitted, X is not 2-D, or the
                number of columns differs from the fitted data.
        """
        if not self.fitted:
            raise ValueError("Scaler chưa được fit. Hãy gọi fit() trước.")

        X = self._as_2d_float(X)

        if X.shape[1] != self.n_features_:
            raise ValueError(f"Số features không khớp. Expected {self.n_features_}, got {X.shape[1]}")

        # Broadcasting: (n_samples, n_features) against (n_features,)
        return (X - self.mean_) / self.std_

    def fit_transform(self, X):
        """Fit on X, then return the standardized X."""
        return self.fit(X).transform(X)

    def inverse_transform(self, X_scaled):
        """Map standardized values back to the original scale.

        Parameters:
            X_scaled: array-like whose columns match the fitted data.

        Returns:
            float64 array ``X_scaled * std_ + mean_``.

        Raises:
            ValueError: if the scaler is not fitted, or a 2-D input has a
                different number of columns than the fitted data.
        """
        if not self.fitted:
            raise ValueError("Scaler chưa được fit.")

        X_scaled = np.asarray(X_scaled, dtype=np.float64)

        # Same guard as transform(): without it an (n, 1) input would be
        # silently broadcast to (n, n_features).
        if X_scaled.ndim == 2 and X_scaled.shape[1] != self.n_features_:
            raise ValueError(f"Số features không khớp. Expected {self.n_features_}, got {X_scaled.shape[1]}")

        return X_scaled * self.std_ + self.mean_

print("StandardScaler implementation completed (fully vectorized)!")

## 3. Train-Test Split Implementation (Pure NumPy)

In [None]:
def stratified_train_test_split(X, y, test_size=0.2, random_state=None):
    """Split X and y into train and test parts, class by class.

    For every distinct value in ``y`` the matching row indices are shuffled
    and the first ``round(n_class * test_size)`` of them go to the test set;
    the rest go to the training set. Both index sets are shuffled once more
    so the classes are interleaved. The only Python loop runs over classes.

    Parameters:
        X: array-like of samples; converted to float64.
        y: array-like of labels with the same length as X; converted to
            float64.
        test_size: fraction of each class placed in the test set, in [0, 1].
        random_state: seed for a private random generator. With ``None`` the
            global ``np.random`` state is used (and advanced).

    Returns:
        (X_train, X_test, y_train, y_test)

    Raises:
        ValueError: if X and y differ in length, the input is empty, or
            test_size lies outside [0, 1].

    Note:
        A class so small that ``round(n_class * test_size)`` is 0 contributes
        no rows to the test set.
    """
    X = np.asarray(X, dtype=np.float64)
    y = np.asarray(y, dtype=np.float64)

    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
        )
    if len(y) == 0:
        raise ValueError("Cannot split an empty dataset")
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(f"test_size must be within [0, 1], got {test_size}")

    # A private RandomState seeded with `random_state` yields the same draws
    # as seeding the global generator, without overwriting global state that
    # other code may rely on.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    train_parts = []
    test_parts = []

    for class_label in np.unique(y):
        class_indices = np.where(y == class_label)[0]
        n_test_samples = int(np.round(len(class_indices) * test_size))

        shuffled_indices = rng.permutation(class_indices)

        test_parts.append(shuffled_indices[:n_test_samples])
        train_parts.append(shuffled_indices[n_test_samples:])

    # Merge the per-class pieces, then shuffle so classes are interleaved.
    train_indices = rng.permutation(np.concatenate(train_parts))
    test_indices = rng.permutation(np.concatenate(test_parts))

    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]

print("Stratified train-test split implementation completed (vectorized)!")

## 4. Perform Train-Test Split

In [None]:
# Run the stratified split, then report and check the class mix of each part
print("Performing stratified train-test split...")

X_train, X_test, y_train, y_test = stratified_train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("\nSplit results:")
print(f"  Training set: {X_train.shape}")
print(f"  Test set:     {X_test.shape}")

# Largest tolerated absolute difference between a split's class proportion
# and the original one (0.01 = one percentage point).
_MAX_PROPORTION_DRIFT = 0.01


def _report_class_distribution(title, labels):
    """Print count and percentage per class in `labels`; return (classes, counts)."""
    print(title)
    classes, counts = np.unique(labels, return_counts=True)
    for cls, count in zip(classes, counts):
        pct = (count / len(labels)) * 100
        label = "Normal" if cls == 0 else "Fraud"
        print(f"  {label}: {count:>6,} ({pct:>5.2f}%)")
    return classes, counts


# Class distribution before and after the split
print("\nClass distribution verification:")
unique_orig, counts_orig = _report_class_distribution("Original dataset:", y)
unique_train, counts_train = _report_class_distribution("\nTraining set:", y_train)
unique_test, counts_test = _report_class_distribution("\nTest set:", y_test)


def _matches_original_mix(classes, counts):
    """Return True if a split has the original classes in near-equal proportions."""
    if not np.array_equal(classes, unique_orig):
        return False  # a class is missing from (or extra in) this split
    return bool(np.allclose(
        counts / counts.sum(),
        counts_orig / counts_orig.sum(),
        rtol=0.0,
        atol=_MAX_PROPORTION_DRIFT,
    ))


# Only claim success after actually comparing the distributions.
if _matches_original_mix(unique_train, counts_train) and _matches_original_mix(unique_test, counts_test):
    print("\nStratification successful! Class distributions are preserved.")
else:
    print("\nWARNING: class distributions differ between the original data and the splits!")

## 5. Feature Standardization

In [None]:
# Standardize the features using statistics learned from the training split only
print("Performing feature standardization...")

# Fit on the training data, then apply the same scaling to both splits
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nStandardization completed!")
print(f"  Training set scaled: {X_train_scaled.shape}")
print(f"  Test set scaled:     {X_test_scaled.shape}")

# Sanity check: mean and sample std of the first five scaled training columns
leading_columns = X_train_scaled[:, :5]
leading_means = np.mean(leading_columns, axis=0)
leading_stds = np.std(leading_columns, axis=0, ddof=1)
print("\nVerification of standardization:")
print(f"  Training set means (first 5 features): {leading_means}")
print(f"  Training set stds (first 5 features):  {leading_stds}")

# Before/after statistics for the first feature
raw_first = X_train[:, 0]
scaled_first = X_train_scaled[:, 0]
print("\nOriginal vs Scaled comparison (first feature):")
print(f"  Original - Mean: {np.mean(raw_first):.6f}, Std: {np.std(raw_first, ddof=1):.6f}")
print(f"  Scaled   - Mean: {np.mean(scaled_first):.6f}, Std: {np.std(scaled_first, ddof=1):.6f}")

print("\nFeature standardization completed successfully!")

## 6. Polynomial Features Implementation

In [None]:
def create_polynomial_features(X, degree=2, include_bias=False, interaction_only=False):
    """Expand X with polynomial terms up to the requested degree.

    Output columns are laid out in this order:
      1. a column of ones, if ``include_bias`` is True;
      2. the original columns of X, unless ``interaction_only`` is True and
         ``degree >= 2`` (in that case the linear terms are left out);
      3. for each d = 2 .. degree, every product of d columns, ordered by
         ascending column-index tuple. With ``interaction_only`` each
         product uses d distinct columns; otherwise repeated columns
         (squares, cubes, ...) are included as well.

    Products are built degree by degree: the degree-d terms that start with
    column i are column i multiplied (by broadcasting) with a contiguous
    slice of the degree-(d-1) terms, so Python loops run over columns and
    degrees only, never over samples.

    Parameters:
        X: array-like of shape (n_samples, n_features); converted to float64.
        degree: highest total degree to generate (>= 1).
        include_bias: whether to prepend a column of ones.
        interaction_only: whether to restrict products to distinct columns.

    Returns:
        float64 array of shape (n_samples, n_output_features). It has zero
        columns when the options select nothing (e.g. a single feature with
        ``interaction_only=True`` and ``degree >= 2``).

    Raises:
        ValueError: if ``degree < 1`` or X is not 2-D.
    """
    X = np.asarray(X, dtype=np.float64)
    n_samples, n_features = X.shape

    if degree < 1:
        raise ValueError("degree phải >= 1")

    poly_features = []

    if include_bias:
        poly_features.append(np.ones((n_samples, 1), dtype=np.float64))

    if not interaction_only or degree == 1:
        poly_features.append(X)

    # State for the previous degree: `prev_block` holds its terms, and
    # `prev_starts[i]` is the first column in it whose leading index is >= i
    # (with one sentinel entry at the end).
    prev_block = X
    prev_starts = list(range(n_features + 1))
    # When products must use distinct columns, column i may only extend
    # terms whose leading index is > i.
    shift = 1 if interaction_only else 0
    max_degree = int(degree)

    for current_degree in range(2, max_degree + 1):
        pieces = []
        starts = []
        offset = 0
        for i in range(n_features):
            starts.append(offset)
            lo = prev_starts[i + shift]
            width = prev_block.shape[1] - lo
            if width > 0:
                # (n_samples, 1) * (n_samples, width): all terms led by column i
                pieces.append(X[:, i:i + 1] * prev_block[:, lo:])
                offset += width
        starts.append(offset)

        if not pieces:
            break  # no terms at this degree, so none at higher degrees either

        poly_features.extend(pieces)
        if current_degree < max_degree:
            # Only materialize the block when a higher degree will build on it.
            prev_block = np.concatenate(pieces, axis=1)
            prev_starts = starts

    if not poly_features:
        # np.concatenate rejects an empty list; return a zero-column result.
        return np.empty((n_samples, 0), dtype=np.float64)

    return np.concatenate(poly_features, axis=1)

print("Polynomial features implementation completed (highly vectorized)!")

## 7. Create Enhanced Features

In [None]:
# Build polynomial features for the training and test sets
print("Creating polynomial features...")

# Identical options for both splits: degree 2, no bias column,
# interaction terms only (no squares)
poly_options = {
    "degree": 2,
    "include_bias": False,
    "interaction_only": True,
}
X_train_poly = create_polynomial_features(X_train_scaled, **poly_options)
X_test_poly = create_polynomial_features(X_test_scaled, **poly_options)

n_original_features = X_train_scaled.shape[1]
n_enhanced_features = X_train_poly.shape[1]

print("\nPolynomial feature creation completed!")
print(f"  Original features: {n_original_features}")
print(f"  Enhanced features: {n_enhanced_features}")
print(f"  Training set: {X_train_poly.shape}")
print(f"  Test set:     {X_test_poly.shape}")

print("\nFeature enhancement completed!")
print("Data is ready for modeling.")