# 1. Load Data

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from xgboost.callback import EarlyStopping
from sklearn.metrics import classification_report
import joblib

# Load cleaned data
df = pd.read_csv("../data/processed/cleaned_data.csv")
print(df.shape)
df.head()


(590540, 422)


Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,-999.0,150.0,discover,142.0,...,missing,-999.0,missing,missing,missing,missing,missing,missing,missing,missing
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,missing,-999.0,missing,missing,missing,missing,missing,missing,missing,missing
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,missing,-999.0,missing,missing,missing,missing,missing,missing,missing,missing
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,missing,-999.0,missing,missing,missing,missing,missing,missing,missing,missing
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [11]:
# 3. Encode categorical columns 
for col in df.select_dtypes(include="object").columns:
    df[col] = df[col].astype("category").cat.codes

# 4. Separate features and target
X = df.drop(columns=["isFraud", "TransactionID"])
y = df["isFraud"]

# 5. Train/test split with stratification
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 6. Handle class imbalance
scale_weight = (y_train == 0).sum() / (y_train == 1).sum()

# 7. Initialize and train XGBoost model
model = XGBClassifier(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    scale_pos_weight=scale_weight,
    eval_metric="logloss",
    use_label_encoder=False,
    random_state=42
)

model.fit(
    X_train,
    y_train
)


# 8. Make predictions
y_pred = model.predict(X_val)
y_prob = model.predict_proba(X_val)[:, 1]

# 9. Evaluate model
print("✅ Classification Report:")
print(classification_report(y_val, y_pred))

# 10. Save model
joblib.dump(model, "../models/fraud_model.joblib")
print("✅ Model saved to models/fraud_model.joblib")

# 11. Export predictions DataFrame
preds_df = X_val.copy()
preds_df["TransactionID"] = df.loc[X_val.index, "TransactionID"]
preds_df["actual"] = y_val.values
preds_df["predicted"] = y_pred
preds_df["fraud_probability"] = y_prob

preds_df.to_csv("../data/processed/predictions.csv", index=False)
print("Predictions saved to data/processed/predictions.csv")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.87      0.92    113975
           1       0.18      0.78      0.29      4133

    accuracy                           0.86    118108
   macro avg       0.58      0.82      0.61    118108
weighted avg       0.96      0.86      0.90    118108

✅ Model saved to models/fraud_model.joblib
Predictions saved to data/processed/predictions.csv


In [16]:
#Create lightweight sample version for Git (under 100 mb)

df = pd.read_csv("../data/processed/cleaned_data.csv")
df_sample = df.sample(n=10000, random_state=42)
df_sample.to_csv("../data/processed/predictions_sample.csv", index=False)