# Import the **dependencies**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
import json

# **Load** and **split** the dataset

In [None]:
SEED = 42  # single random seed reused by every split below

# Load: read the CSV, drop rows lacking a message or a label, and store
# the label column as integers.
df = pd.read_csv("/content/anonymized_dataset.csv").dropna(subset=["message", "label"])
df["label"] = df["label"].astype(int)

# Split: hold out 20% as the test set, then take 12.5% of the remaining
# 80% (0.8 * 0.125 = 0.1 of the full data) as the validation set, giving
# roughly a 70/10/20 train/val/test partition. Both splits are stratified
# on the label.
train_val_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=SEED,
)
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.125,
    stratify=train_val_df["label"],
    random_state=SEED,
)

# Train the **BiLSTM** model

In [None]:
# train_bilstm.py
# Hyperparameters for the BiLSTM classifier.
MAX_LEN = 200      # length every sequence is padded / truncated to
MAX_WORDS = 50000  # passed to the tokenizer as num_words
EMBED_DIM = 100
BATCH_SIZE = 64
EPOCHS = 2

# Tokenize: the vocabulary is fitted on the training messages only.
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<UNK>")
tokenizer.fit_on_texts(train_df["message"])


def prep_texts(texts):
    """Turn raw texts into a padded matrix of token ids.

    Each text is mapped to a sequence of integer ids with the module-level
    ``tokenizer`` and then padded / truncated at the end ("post") to
    ``MAX_LEN`` entries.

    NOTE: ``tokenizer`` and ``MAX_LEN`` are looked up at call time, and both
    names are reassigned further down in this file, so call this function
    before those reassignments run.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=MAX_LEN,
        padding="post",
        truncating="post",
    )


# Feature matrices for the three splits.
X_train, X_val, X_test = (
    prep_texts(frame["message"]) for frame in (train_df, val_df, test_df)
)

# Label arrays for the three splits.
y_train = train_df["label"].values
y_val = val_df["label"].values
y_test = test_df["label"].values

# Model: embedding -> bidirectional LSTM -> dense head with a single
# sigmoid unit for binary classification.
bilstm_layers = [
    Embedding(input_dim=MAX_WORDS, output_dim=EMBED_DIM, input_length=MAX_LEN),
    Bidirectional(LSTM(128, return_sequences=False)),
    Dropout(0.5),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(1, activation="sigmoid"),
]
model = Sequential(bilstm_layers)
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Train, monitoring validation loss for early stopping.
# NOTE(review): with EPOCHS = 2, a patience of 2 probably never triggers an
# early stop — confirm the intended values.
es = EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)
fit_options = {
    "validation_data": (X_val, y_val),
    "batch_size": BATCH_SIZE,
    "epochs": EPOCHS,
    "callbacks": [es],
}
history = model.fit(X_train, y_train, **fit_options)

# Save
model.save("/content/bilstm_model.h5")
# Save tokenizer (serialized to JSON so evaluation can rebuild the same vocabulary)
tokenizer_json = tokenizer.to_json()
with open("/content/bilstm_tokenizer.json", "w", encoding="utf-8") as f:
    f.write(tokenizer_json)


# Train the **BERT** model

In [None]:
# train_bert.py
# Settings for fine-tuning the transformer classifier.
# NOTE: MAX_LEN, BATCH_SIZE, EPOCHS and tokenizer rebind names that the
# BiLSTM section earlier in this file also uses.
MODEL_NAME = "distilbert-base-uncased"
MAX_LEN = 256
BATCH_SIZE = 16
EPOCHS = 2
OUTPUT_DIR = "/content/bert_finetuned"

# Tokenization
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def preprocess(batch):
    """Tokenize the ``"message"`` entries of a batch.

    Every message is truncated and padded to exactly ``MAX_LEN`` tokens
    (``padding="max_length"``). Returns whatever the tokenizer call returns.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": MAX_LEN,
    }
    return tokenizer(batch["message"], **encode_options)

# Wrap each split in a Dataset and tokenize it batch-wise with preprocess().
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(frame).map(preprocess, batched=True)
    for frame in (train_df, val_df, test_df)
)

# Expose only these columns, formatted as torch tensors.
cols = ["input_ids", "attention_mask", "label"]
for _split in (train_ds, val_ds, test_ds):
    _split.set_format(type="torch", columns=cols)

# Model: pretrained encoder with a 2-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Training
# NOTE(review): metric_for_best_model="f1" is configured, but no
# compute_metrics function is passed to the Trainer below — confirm whether
# an F1 metric was meant to be wired in.
training_config = {
    "output_dir": OUTPUT_DIR,
    "learning_rate": 2e-5,
    "per_device_train_batch_size": BATCH_SIZE,
    "per_device_eval_batch_size": BATCH_SIZE,
    "num_train_epochs": EPOCHS,
    "weight_decay": 0.01,
    "metric_for_best_model": "f1",
    "save_total_limit": 2,
    "seed": SEED,
    # Mixed precision only when a CUDA device is present.
    "fp16": torch.cuda.is_available(),
}
args = TrainingArguments(**training_config)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
)

trainer.train()

# Store model and tokenizer in the same directory so evaluation can load
# both from OUTPUT_DIR.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)


# Train the **XGBoost** model

In [None]:
# train_xgboost.py

# TF-IDF over character n-grams (3 to 5 characters, "char_wb" analyzer),
# capped at 50,000 features. The vectorizer is fitted on the training
# messages only and then applied unchanged to the validation and test splits.
tfidf = TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5), max_features=50000)
X_train = tfidf.fit_transform(train_df["message"])
X_val = tfidf.transform(val_df["message"])
X_test = tfidf.transform(test_df["message"])

y_train, y_val, y_test = train_df["label"], val_df["label"], test_df["label"]

# Choose the training device at runtime instead of hard-requiring a GPU.
# tree_method="gpu_hist" and predictor="gpu_predictor" are deprecated since
# XGBoost 2.0; the supported spelling is tree_method="hist" together with
# the `device` parameter (requires XGBoost >= 2.0).
xgb_device = "cuda" if torch.cuda.is_available() else "cpu"

# Model
clf = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    n_estimators=50,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=SEED,
    n_jobs=-1,
    tree_method="hist",
    device=xgb_device,
)

# The validation split is passed as eval_set so logloss is reported per round.
clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

# Save the classifier together with the fitted vectorizer; evaluation needs
# both to transform and score raw messages.
joblib.dump(clf, "/content/xgb_model.joblib")
joblib.dump(tfidf, "/content/tfidf.joblib")


# Evaluate the **model metrics**

In [None]:
# evaluate_models.py

# Evaluate every model on the same first N raw test messages and labels.
N = 2000 # or any number you want
X_test = test_df["message"].tolist()[:N]
y_test = test_df["label"].values[:N]

# Metrics per model, keyed by model name.
results = {}

# ----------------
# 1. Evaluate BERT
# ----------------

print("Loading and evaluating BERT model...")
bert_path = "/content/bert_finetuned"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_path)
bert_model = AutoModelForSequenceClassification.from_pretrained(bert_path)
# Run inference on the GPU when one is available; from_pretrained leaves the
# model on the CPU otherwise.
bert_model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
bert_model.eval()

import time


def bert_predict(texts, batch_size=32, max_length=256):
    """Classify ``texts`` with the fine-tuned BERT model, batch by batch.

    Args:
        texts: list of raw message strings.
        batch_size: number of messages tokenized and scored per forward pass.
        max_length: maximum token length; longer messages are truncated.

    Returns:
        A tuple ``(preds, inference_time)`` where ``preds`` is an integer
        NumPy array with the argmax class index for every text and
        ``inference_time`` is the wall-clock duration in seconds of the whole
        loop (tokenization, forward passes and progress printing included).
    """
    preds = []
    # Inputs must live on the same device as the model weights.
    device = next(bert_model.parameters()).device
    total_batches = (len(texts) - 1) // batch_size + 1
    start_time = time.time()
    for batch_no, start in enumerate(range(0, len(texts), batch_size), start=1):
        print(f"BERT: Predicting batch {batch_no}/{total_batches}")
        batch_texts = texts[start:start + batch_size]
        inputs = bert_tokenizer(
            batch_texts,
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors="pt",
        )
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        with torch.no_grad():  # inference only, no gradients needed
            outputs = bert_model(**inputs)
        batch_preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        preds.extend(batch_preds)
    inference_time = time.time() - start_time
    # dtype=int keeps the result integer-typed even when texts is empty.
    return np.array(preds, dtype=int), inference_time


# Score the test messages with BERT and record quality and latency figures.
bert_preds, bert_time = bert_predict(X_test)

bert_metrics = {
    "accuracy": accuracy_score(y_test, bert_preds),
    "precision": precision_score(y_test, bert_preds),
    "recall": recall_score(y_test, bert_preds),
    "f1": f1_score(y_test, bert_preds),
    "inference_time (s)": bert_time,
    # Average latency per message, converted from seconds to milliseconds.
    "inference_time_per_message (ms)": (bert_time / len(X_test)) * 1000,
}
results["BERT"] = bert_metrics

print("\nBERT metrics:")
for metric_name, metric_value in bert_metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")

# ----------------
# 2. Evaluate XGBoost
# ----------------
print("\nLoading and evaluating XGBoost model...")
# Load the trained classifier and the vectorizer it was fitted with.
xgb_model = joblib.load("/content/xgb_model.joblib")
tfidf = joblib.load("/content/tfidf.joblib")

# The timed span covers both the TF-IDF transform and the prediction.
start_time = time.time()
X_test_tfidf = tfidf.transform(X_test)
xgb_preds = xgb_model.predict(X_test_tfidf)
end_time = time.time()
xgb_time = end_time - start_time

xgb_metrics = {
    "accuracy": accuracy_score(y_test, xgb_preds),
    "precision": precision_score(y_test, xgb_preds),
    "recall": recall_score(y_test, xgb_preds),
    "f1": f1_score(y_test, xgb_preds),
    "inference_time (s)": xgb_time,
    # Average latency per message, converted from seconds to milliseconds.
    "inference_time_per_message (ms)": (xgb_time / len(X_test)) * 1000,
}
results["XGBoost"] = xgb_metrics

print("\nXGBoost metrics:")
for metric_name, metric_value in xgb_metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")

# ----------------
# 3. Evaluate BiLSTM
# ----------------
print("\nLoading and evaluating BiLSTM model...")
bilstm_model = load_model("/content/bilstm_model.h5")

# Reload tokenizer from JSON. The file was written with encoding="utf-8",
# so read it back with the same encoding instead of the platform default.
with open("/content/bilstm_tokenizer.json", encoding="utf-8") as f:
    tokenizer_json = f.read()
bilstm_tokenizer = tokenizer_from_json(tokenizer_json)

# Must equal the padding length used when the BiLSTM was trained.
MAX_LEN = 200

# The timed span covers tokenization, padding and prediction.
start_time = time.time()
X_test_seq = bilstm_tokenizer.texts_to_sequences(X_test)
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN,
                           padding="post", truncating="post")

print("BiLSTM: Predicting...")
bilstm_probs = bilstm_model.predict(X_test_pad)
# Threshold the sigmoid outputs at 0.5 and flatten to a 1-D label vector.
bilstm_preds = (bilstm_probs > 0.5).astype(int).reshape(-1)
end_time = time.time()
bilstm_time = end_time - start_time

results["BiLSTM"] = {
    "accuracy": accuracy_score(y_test, bilstm_preds),
    "precision": precision_score(y_test, bilstm_preds),
    "recall": recall_score(y_test, bilstm_preds),
    "f1": f1_score(y_test, bilstm_preds),
    "inference_time (s)": bilstm_time,
    # Average latency per message, converted from seconds to milliseconds.
    "inference_time_per_message (ms)": (bilstm_time / len(X_test)) * 1000
}
print("\nBiLSTM metrics:")
for k, v in results["BiLSTM"].items():
    print(f"{k}: {v:.4f}")

# ----------------
# Print comparison
# ----------------
print("\nModel Comparison:\n")
# One row per model, one column per metric.
df_results = pd.DataFrame(results).transpose()
print(df_results)

# ----------------
# Plot confusion matrices
# ----------------


def plot_cm(y_true, y_pred, title):
    """Draw a confusion-matrix heatmap for binary predictions.

    Args:
        y_true: ground-truth labels (0 = legit, 1 = fraud).
        y_pred: predicted labels, same length as ``y_true``.
        title: text shown above the plot.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    class_names = ["Legit (0)", "Fraud (1)"]
    # Pin the label set so the matrix is always 2x2 and lines up with the
    # two tick labels, even when only one class occurs in the inputs.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    plt.figure(figsize=(4, 3))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title(title)
    plt.ylabel("True")
    plt.xlabel("Predicted")
    plt.tight_layout()
    plt.show()


# One confusion matrix per model, all against the same test labels.
for _model_preds, _plot_title in (
    (bert_preds, "BERT Confusion Matrix"),
    (xgb_preds, "XGBoost Confusion Matrix"),
    (bilstm_preds, "BiLSTM Confusion Matrix"),
):
    plot_cm(y_test, _model_preds, _plot_title)