# Fraud Detection - Machine Learning Pipeline

**Objective:** Membangun machine learning pipeline untuk memprediksi transaksi fraud (fraud detection)

**Catatan:** Untuk efisiensi komputasi, hanya sebagian data yang digunakan (100.000 baris pertama data train dan 50.000 baris pertama data test)

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')

## 2. Load Data (Sample)

In [None]:
# Load a subset of each CSV to keep the notebook cheap to run.
# NOTE: `nrows` reads the first N rows of the file; it is not a random sample.
# Set either constant to None to read the whole file.
SAMPLE_SIZE = 100000  # number of training rows to read
TEST_SAMPLE_SIZE = 50000  # number of test rows to read (only these rows get predictions)

train = pd.read_csv('dataset/train_transaction.csv', nrows=SAMPLE_SIZE)
test = pd.read_csv('dataset/test_transaction.csv', nrows=TEST_SAMPLE_SIZE)

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
# Mean of the 0/1 target column = share of rows flagged as fraud.
print(f"Fraud rate: {train['isFraud'].mean():.4f}")

## 3. Data Preprocessing

In [None]:
# Feature selection: keep every numeric column except the row identifier and
# the target. Filtering by membership (instead of list.remove) means a missing
# 'TransactionID' or 'isFraud' column does not raise ValueError here.
non_feature_cols = {'TransactionID', 'isFraud'}

# Columns with no observed value at all have a NaN median, so median
# imputation would leave them as NaN and the estimators fitted later would
# reject the input. Drop them from the feature list up front.
numeric_frame = train.select_dtypes(include=[np.number]).dropna(axis=1, how='all')
numeric_cols = [col for col in numeric_frame.columns if col not in non_feature_cols]

X = train[numeric_cols].copy()
y = train['isFraud'].copy()

# Handle missing values with the per-column median.
# NOTE(review): the medians are computed over all loaded training rows, i.e.
# before the train/validation split, so validation rows influence the fill
# values. Acceptable for a quick baseline; impute after the split for a
# strictly leak-free validation score.
X = X.fillna(X.median())

print(f"Features: {len(numeric_cols)}")
print(f"X shape: {X.shape}")

## 4. Train-Validation Split & Scaling

In [None]:
# Hold out 20% of the rows for validation; stratifying on y keeps the class
# ratio the same in both parts. The fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardize the features. The scaler learns its statistics from the
# training part only and is then applied unchanged to the validation part.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

print(f"Train: {X_train_scaled.shape}")
print(f"Val: {X_val_scaled.shape}")

## 5. Model Training - Logistic Regression

In [None]:
# Baseline linear classifier, fitted on the standardized training features.
# class_weight='balanced' is used because the target is imbalanced.
lr_model = LogisticRegression(
    max_iter=500,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
).fit(X_train_scaled, y_train)

# Hard labels feed precision/recall/F1 later; the second probability column
# (the second entry of lr_model.classes_) feeds ROC-AUC.
lr_pred = lr_model.predict(X_val_scaled)
lr_pred_proba = lr_model.predict_proba(X_val_scaled)[:, 1]
lr_auc = roc_auc_score(y_val, lr_pred_proba)

print(f"Logistic Regression ROC-AUC: {lr_auc:.4f}")

## 6. Model Training - Random Forest

In [None]:
# Tree ensemble: 100 trees, each limited to depth 10, fitted on the same
# standardized training features as the logistic regression.
# verbose=1 makes scikit-learn print progress while fitting and predicting.
rf_model = RandomForestClassifier(
    n_estimators=100, max_depth=10, class_weight='balanced',
    n_jobs=-1, random_state=42, verbose=1,
).fit(X_train_scaled, y_train)

# Hard labels for the threshold-based metrics, second-column probabilities
# for ROC-AUC.
rf_pred = rf_model.predict(X_val_scaled)
rf_pred_proba = rf_model.predict_proba(X_val_scaled)[:, 1]
rf_auc = roc_auc_score(y_val, rf_pred_proba)

print(f"Random Forest ROC-AUC: {rf_auc:.4f}")

## 7. Model Comparison

In [None]:
# Compare model performance on the validation set: one row per model, with
# ROC-AUC (from probabilities) and precision/recall/F1 (from hard labels).
comparison = pd.DataFrame([
    {
        'Model': model_name,
        'ROC-AUC': auc_value,
        'Precision': precision_score(y_val, val_pred),
        'Recall': recall_score(y_val, val_pred),
        'F1-Score': f1_score(y_val, val_pred),
    }
    for model_name, auc_value, val_pred in (
        ('Logistic Regression', lr_auc, lr_pred),
        ('Random Forest', rf_auc, rf_pred),
    )
])

# Highest ROC-AUC first, so the top row is the model reported as best.
comparison = comparison.sort_values(by='ROC-AUC', ascending=False)

print("=== PERBANDINGAN MODEL ===")
print(comparison.to_string(index=False))
print(f"\nModel terbaik: {comparison['Model'].iloc[0]}")

## 8. Prediksi Test Set

In [None]:
# Use the model with the higher validation ROC-AUC; on a tie the logistic
# regression is kept.
if rf_auc > lr_auc:
    best_model = rf_model
else:
    best_model = lr_model

# Build the test feature matrix with the same columns as training, fill gaps
# with the medians of the training feature frame X, and reuse the scaler that
# was fitted on the training split.
test_ids = test['TransactionID'].copy()
X_test = test[numeric_cols].fillna(X.median())
X_test_scaled = scaler.transform(X_test)

# Second probability column = score written to the submission as isFraud.
test_proba = best_model.predict_proba(X_test_scaled)[:, 1]

# Submission file: one row per loaded test transaction.
submission = pd.DataFrame({'TransactionID': test_ids, 'isFraud': test_proba})
submission.to_csv('submission_ml.csv', index=False)
print("Submission saved!")