In [None]:
import sys, platform
# Log interpreter and OS details so a run can be tied to its environment.
for label, value in (("Python:", sys.version), ("Platform:", platform.platform())):
    print(label, value)

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve, f1_score, precision_score, recall_score, confusion_matrix
import lightgbm as lgb
import tensorflow as tf
from tensorflow.keras.models import load_model
import joblib


In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Transactions CSV, located under the Drive mount point.
csv_path = "/content/drive/MyDrive/transactions.csv"

# Autoencoder artifacts (saved by an earlier run)
ae_model_path = "/content/drive/MyDrive/improved_autoencoder.h5"      # adjust to the filename you saved the model under
scaler_path   = "/content/drive/MyDrive/ae_scaler.pkl"       # adjust to the filename you saved the scaler under


In [None]:
# Input columns for the autoencoder: compute_anomaly_score() selects exactly
# these columns before scaling and reconstruction.
# NOTE(review): the list (and its order) presumably has to match what the saved
# scaler / autoencoder were fitted on — confirm against the AE training run.
AE_FEATURES = [
    "amount_ngn",
    "spending_deviation_score",
    "velocity_score",
    "user_avg_txn_amt",
    "user_std_txn_amt",
    "txn_hour",
    "is_night_txn",
    "user_txn_frequency_24h",
    "txn_count_last_1h",
    "avg_gap_between_txns",
    "device_seen_count",
    "is_device_shared",
    "new_device_transaction",
    "geospatial_velocity_anomaly"
]


In [None]:
# Feature columns for the baseline LightGBM model; X_train_base / X_test_base
# are sliced from the train/test frames with this list.
LGBM_BASE_FEATURES = [
    "amount_ngn",
    "user_avg_txn_amt",
    "user_std_txn_amt",
    "user_txn_frequency_24h",
    "txn_count_last_1h",
    "txn_count_last_24h",
    "total_amount_last_1h",
    "avg_gap_between_txns",
    "txn_hour",
    "is_weekend",
    "is_night_txn",
    "device_seen_count",
    "is_device_shared",
    "ip_seen_count",
    "is_ip_shared",
    "new_device_transaction",
    "geospatial_velocity_anomaly",
]


In [None]:
# Hybrid model input: every baseline feature followed by the autoencoder's anomaly score.
LGBM_HYBRID_FEATURES = [*LGBM_BASE_FEATURES, "anomaly_score"]


In [None]:
# Load the raw transactions and fail fast if any column used later is absent.
df = pd.read_csv(csv_path)
print("Dataset shape:", df.shape)

required = {*AE_FEATURES, *LGBM_BASE_FEATURES, "is_fraud", "timestamp"}
missing = sorted(required.difference(df.columns))
print("Missing:", missing[:20], "..." if len(missing) > 20 else "")
assert not missing, f"Missing columns: {missing}"


In [None]:
# Parse timestamps; values that cannot be parsed become NaT.
n_rows_before = len(df)
df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")

# Rows without a usable timestamp cannot be placed in the chronological split,
# so they are removed — report how many instead of discarding them silently.
n_bad_timestamps = int(df["timestamp"].isna().sum())
if n_bad_timestamps:
    print(f"Dropping {n_bad_timestamps} of {n_rows_before} rows with missing/unparseable timestamp")
df = df.dropna(subset=["timestamp"]).sort_values("timestamp").reset_index(drop=True)

# Chronological split: the earliest 80% of rows train, the latest 20% test.
split_ratio = 0.8
split_idx = int(len(df) * split_ratio)

df_train = df.iloc[:split_idx].copy()
df_test  = df.iloc[split_idx:].copy()

print("Train:", df_train.shape, "Test:", df_test.shape)
print("Train fraud rate:", df_train["is_fraud"].mean())
print("Test fraud rate:", df_test["is_fraud"].mean())


In [None]:
# The autoencoder is only used for inference (predict), so skip restoring the
# optimizer/loss: compile=False avoids load failures when the training config
# stored in a legacy .h5 file cannot be deserialized by the installed Keras.
ae_model = load_model(ae_model_path, compile=False)

# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load scaler artifacts that you produced yourself.
scaler = joblib.load(scaler_path)

print("AE loaded.")


In [None]:
def compute_anomaly_score(df_part: pd.DataFrame, batch_size: int = 4096) -> np.ndarray:
    """Return the autoencoder reconstruction error for each row of ``df_part``.

    The ``AE_FEATURES`` columns are transformed with the module-level
    ``scaler``, reconstructed by the module-level ``ae_model``, and scored as
    the mean squared difference between the scaled input and its
    reconstruction, averaged over axis 1.

    Args:
        df_part: Frame containing every column listed in ``AE_FEATURES``.
        batch_size: Batch size forwarded to ``ae_model.predict``.

    Returns:
        The per-row mean squared reconstruction error.
    """
    scaled_inputs = scaler.transform(df_part[AE_FEATURES].copy())
    reconstructed = ae_model.predict(scaled_inputs, batch_size=batch_size, verbose=0)
    residual = scaled_inputs - reconstructed
    return np.mean(np.square(residual), axis=1)

# Score both splits with the same loaded scaler and autoencoder.
for split_frame in (df_train, df_test):
    split_frame["anomaly_score"] = compute_anomaly_score(split_frame)

print("anomaly_score added.")
print(df_train["anomaly_score"].describe())


In [None]:
# Binary targets as integer arrays.
y_train, y_test = (
    frame["is_fraud"].astype(int).values for frame in (df_train, df_test)
)

# Baseline design matrices: LGBM_BASE_FEATURES only.
X_train_base, X_test_base = (
    frame[LGBM_BASE_FEATURES].copy() for frame in (df_train, df_test)
)

# Hybrid design matrices: base features plus the anomaly_score column.
X_train_hyb, X_test_hyb = (
    frame[LGBM_HYBRID_FEATURES].copy() for frame in (df_train, df_test)
)

print("Base:", X_train_base.shape, X_test_base.shape)
print("Hybrid:", X_train_hyb.shape, X_test_hyb.shape)


In [None]:
# Weight the positive class by the negative:positive ratio of the training
# labels; max(pos, 1) guards the division when there are no positives.
neg = np.sum(y_train == 0)
pos = np.sum(y_train == 1)
scale_pos_weight = neg / max(pos, 1)
print("neg:", neg, "pos:", pos, "scale_pos_weight:", scale_pos_weight)

# LightGBM settings shared by the baseline and hybrid models.
params = dict(
    objective="binary",
    metric="auc",
    learning_rate=0.05,
    num_leaves=64,
    max_depth=-1,
    min_data_in_leaf=200,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l2=1.0,
    scale_pos_weight=scale_pos_weight,
    verbosity=-1,
    n_jobs=-1,
)


In [None]:
# Early stopping must not monitor the test split: doing so tunes the number of
# boosting rounds on the very rows used for the final report and inflates the
# test metrics. Instead, hold out the most recent slice of the training rows
# (they are already time-ordered) as a validation set.
# Use the same validation protocol for every model that is being compared.
VALID_FRACTION = 0.1
fit_end = int(len(X_train_base) * (1 - VALID_FRACTION))

y_fit, y_valid = y_train[:fit_end], y_train[fit_end:]
# AUC is undefined on a single-class validation slice.
assert len(np.unique(y_valid)) == 2, "Validation slice must contain both classes"

dtrain = lgb.Dataset(X_train_base.iloc[:fit_end], label=y_fit)
dvalid = lgb.Dataset(X_train_base.iloc[fit_end:], label=y_valid, reference=dtrain)

baseline_model = lgb.train(
    params,
    dtrain,
    num_boost_round=2000,
    valid_sets=[dtrain, dvalid],
    valid_names=["train", "valid"],
    callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(50)]
)


In [None]:
# Early stopping must not monitor the test split: doing so tunes the number of
# boosting rounds on the very rows used for the final report and inflates the
# test metrics. Instead, hold out the most recent slice of the training rows
# (they are already time-ordered) as a validation set.
# Use the same validation protocol for every model that is being compared.
VALID_FRACTION = 0.1
fit_end = int(len(X_train_hyb) * (1 - VALID_FRACTION))

y_fit, y_valid = y_train[:fit_end], y_train[fit_end:]
# AUC is undefined on a single-class validation slice.
assert len(np.unique(y_valid)) == 2, "Validation slice must contain both classes"

dtrain_h = lgb.Dataset(X_train_hyb.iloc[:fit_end], label=y_fit)
dvalid_h = lgb.Dataset(X_train_hyb.iloc[fit_end:], label=y_valid, reference=dtrain_h)

hybrid_model = lgb.train(
    params,
    dtrain_h,
    num_boost_round=2000,
    valid_sets=[dtrain_h, dvalid_h],
    valid_names=["train", "valid"],
    callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(50)]
)


In [None]:
def evaluate(model, X_test, y_test, name="model"):
    """Score ``model`` on ``(X_test, y_test)``, print a summary, and return the metrics.

    The decision threshold is the point on the precision-recall curve with the
    highest F1. It is tuned on the same rows it is then scored on, so the
    reported F1 / precision / recall are optimistic; ROC AUC and PR AUC are
    computed from the raw scores and do not depend on the threshold.

    Args:
        model: Object exposing ``predict(X, num_iteration=...)`` and
            ``best_iteration``.
        X_test: Feature matrix passed to ``model.predict``.
        y_test: Binary ground-truth labels aligned with ``X_test``.
        name: Label printed in the report header.

    Returns:
        dict with keys ``auc``, ``ap``, ``best_thr``, ``f1``, ``precision``
        and ``recall``.
    """
    proba = model.predict(X_test, num_iteration=model.best_iteration)

    auc = roc_auc_score(y_test, proba)
    ap  = average_precision_score(y_test, proba)

    # Threshold selection: maximise F1.
    precision, recall, thresholds = precision_recall_curve(y_test, proba)
    f1s = (2 * precision * recall) / (precision + recall + 1e-12)
    if len(thresholds):
        # precision[i] / recall[i] describe predictions with score >= thresholds[i].
        # The final (precision=1, recall=0) point has no matching threshold, so
        # search only the first len(thresholds) entries and index thresholds
        # with the same position (no "- 1" shift).
        best_idx = int(np.argmax(f1s[:len(thresholds)]))
        best_thr = thresholds[best_idx]
    else:
        best_thr = 0.5  # safe fallback when the curve yields no thresholds
    y_pred = (proba >= best_thr).astype(int)

    f1 = f1_score(y_test, y_pred, zero_division=0)
    p  = precision_score(y_test, y_pred, zero_division=0)
    r  = recall_score(y_test, y_pred, zero_division=0)
    cm = confusion_matrix(y_test, y_pred)

    print(f"\n=== {name} ===")
    print("ROC AUC:", auc)
    print("PR AUC :", ap)
    print("Best thr (F1):", best_thr)
    print("F1:", f1, "Precision:", p, "Recall:", r)
    print("Confusion matrix:\n", cm)

    return {"auc": auc, "ap": ap, "best_thr": float(best_thr), "f1": f1, "precision": p, "recall": r}

# Evaluate both models against the same held-out labels.
baseline_metrics = evaluate(
    baseline_model, X_test_base, y_test, name="Baseline (Behavior only)"
)
hybrid_metrics = evaluate(
    hybrid_model, X_test_hyb, y_test, name="Hybrid (Behavior + anomaly_score)"
)


In [None]:
import pandas as pd

# Gain-based feature importance of the hybrid model, largest first.
gain_importance = hybrid_model.feature_importance(importance_type="gain")
imp = pd.DataFrame(
    {"feature": X_train_hyb.columns, "importance": gain_importance}
).sort_values("importance", ascending=False)

imp.head(20)


In [None]:
# Persist the hybrid LightGBM model.
joblib.dump(hybrid_model, "/content/lgbm_model.pkl")

# Bundle the metrics and feature lists into one report (kept for the thesis).
report = dict(
    baseline=baseline_metrics,
    hybrid=hybrid_metrics,
    lgbm_base_features=LGBM_BASE_FEATURES,
    lgbm_hybrid_features=LGBM_HYBRID_FEATURES,
    ae_features=AE_FEATURES,
)
joblib.dump(report, "/content/training_report.pkl")

print("Saved: lgbm_model.pkl, training_report.pkl")


In [None]:
from google.colab import files
# Trigger a browser download for each saved artifact, model first.
for artifact_path in ("/content/lgbm_model.pkl", "/content/training_report.pkl"):
    files.download(artifact_path)
