In [14]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split


In [15]:
df = pd.read_csv("click_fraud_dataset.csv")  # update path if needed

df.head()

Unnamed: 0,click_id,timestamp,user_id,ip_address,device_type,browser,operating_system,referrer_url,page_url,click_duration,scroll_depth,mouse_movement,keystrokes_detected,ad_position,click_frequency,time_since_last_click,device_ip_reputation,VPN_usage,proxy_usage,bot_likelihood_score,is_fraudulent
0,d875835d-3a4a-4a20-b0d1-6cddf89afc6a,2024-08-23 02:47:39,65a2f621-707b-49be-9c3e-ccac0b1d89ef,141.36.49.37,Tablet,Safari,Android,https://evans-ford.com/,http://www.turner-stewart.com/,0.29,60,111,8,Bottom,7,72,Good,0,1,0.29,0
1,a2d3f028-7790-4be1-9f75-df1357edbbdb,2025-01-30 23:23:50,135e0114-76c5-43ea-bdef-80ab537dc009,216.29.19.201,Desktop,Opera,iOS,https://pierce-ferguson.net/,http://www.rodriguez.biz/,0.64,25,452,29,Bottom,9,201,Suspicious,0,0,0.74,0
2,36d787b2-fbce-43ef-8c02-7c8746d7e3db,2025-01-21 05:41:12,a6922984-78cb-4c01-9c88-bfe3a13a0aaf,167.133.41.231,Tablet,Safari,Linux,https://www.martinez.com/,https://beck.biz/,0.42,36,431,18,Bottom,9,326,Good,0,1,0.14,0
3,01fc0078-096b-4f90-82ae-aa8085b719ac,2024-10-12 08:18:14,d30788b2-4048-4770-a4b1-a9358788818f,216.146.33.78,Tablet,Edge,macOS,https://jones-mendoza.com/,https://www.alvarado.com/,4.29,29,472,37,Side,4,33,Suspicious,0,0,0.65,0
4,0afdf2af-0b48-47d5-bfb6-e087053e1eb9,2024-04-19 14:44:35,dfc42287-6325-4344-b373-b8e61ea6e5c1,146.37.54.245,Desktop,Opera,Windows,https://www.griffith-holloway.com/,http://gonzalez.com/,2.46,94,50,2,Side,7,97,Good,0,0,0.06,0


In [17]:
df.shape, df.columns


((5000, 21),
 Index(['click_id', 'timestamp', 'user_id', 'ip_address', 'device_type',
        'browser', 'operating_system', 'referrer_url', 'page_url',
        'click_duration', 'scroll_depth', 'mouse_movement',
        'keystrokes_detected', 'ad_position', 'click_frequency',
        'time_since_last_click', 'device_ip_reputation', 'VPN_usage',
        'proxy_usage', 'bot_likelihood_score', 'is_fraudulent'],
       dtype='object'))

In [18]:
FEATURES = [
    "click_frequency",
    "time_since_last_click",
    "VPN_usage",
    "proxy_usage",
    "bot_likelihood_score"
]

TARGET = "is_fraudulent"

X = df[FEATURES].copy()
y = df[TARGET]


In [19]:
X.head()
X.shape



(5000, 5)

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [21]:
X_train.shape, X_test.shape


((4000, 5), (1000, 5))

In [22]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [23]:
X_train_scaled.shape, X_test_scaled.shape


((4000, 5), (1000, 5))

In [25]:
import joblib

joblib.dump(scaler, "backend_scaler.pkl")


['backend_scaler.pkl']

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score
)


In [27]:
models = {
    "LogisticRegression": LogisticRegression(
        max_iter=1000,
        class_weight="balanced",
        random_state=42
    ),

    "RandomForest": RandomForestClassifier(
        n_estimators=300,
        max_depth=8,
        class_weight="balanced",
        random_state=42
    ),

    "XGBoost": XGBClassifier(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",
        random_state=42
    )
}


In [28]:
results = {}

for name, model in models.items():
    print(f"Training {name}...")

    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    results[name] = {
        "roc_auc": roc_auc_score(y_test, y_prob),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred)
    }


Training LogisticRegression...
Training RandomForest...
Training XGBoost...


In [29]:
results_df = pd.DataFrame(results).T.sort_values("roc_auc", ascending=False)
results_df


Unnamed: 0,roc_auc,precision,recall,f1
LogisticRegression,1.0,0.932331,1.0,0.964981
RandomForest,1.0,1.0,1.0,1.0
XGBoost,1.0,1.0,1.0,1.0


In [30]:
results_df


Unnamed: 0,roc_auc,precision,recall,f1
LogisticRegression,1.0,0.932331,1.0,0.964981
RandomForest,1.0,1.0,1.0,1.0
XGBoost,1.0,1.0,1.0,1.0


In [31]:
best_model_name = results_df.index[0]
best_model_name


'LogisticRegression'

In [32]:
from xgboost import XGBClassifier

best_model = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42
)

best_model.fit(X_train_scaled, y_train)


In [33]:
probs = best_model.predict_proba(X_test_scaled)[:, 1]

print("Probability stats:")
print(pd.Series(probs).describe())

print(
    "Fraud mean:",
    probs[y_test == 1].mean(),
    "Genuine mean:",
    probs[y_test == 0].mean()
)


Probability stats:
count    1000.000000
mean        0.247999
std         0.431353
min         0.000375
25%         0.000398
50%         0.000408
75%         0.000486
max         0.999027
dtype: float64
Fraud mean: 0.9987536 Genuine mean: 0.0004091001


In [34]:
check_df = X_test.copy()
check_df["true_label"] = y_test.values
check_df["fraud_prob"] = probs

check_df.sample(10)


Unnamed: 0,click_frequency,time_since_last_click,VPN_usage,proxy_usage,bot_likelihood_score,true_label,fraud_prob
2887,6,239,0,0,0.96,1,0.998749
1970,9,10,0,0,0.57,0,0.000424
3219,7,99,0,0,0.6,0,0.000407
4330,2,358,0,0,0.56,0,0.000412
2985,7,191,0,0,0.92,1,0.998762
2830,5,45,0,0,0.98,1,0.998722
2062,5,406,0,0,0.63,0,0.000396
1584,8,510,0,0,0.33,0,0.000407
2103,5,164,0,0,0.98,1,0.998734
4300,8,229,0,0,0.81,1,0.998749


In [35]:
# Save sklearn wrapper (for FastAPI)
joblib.dump(best_model, "xgb_backend_model.pkl")

# Save native booster (future-proof, optional but recommended)
best_model.get_booster().save_model("xgb_backend_model.json")


In [40]:
loaded_model = joblib.load("xgb_backend_model.pkl")
loaded_scaler = joblib.load("backend_scaler.pkl")

test_probs = loaded_model.predict_proba(
    loaded_scaler.transform(X_test.iloc[:5])
)[:, 1]

test_probs


array([3.9416357e-04, 9.9878103e-01, 3.9346510e-04, 4.0482945e-04,
       9.9874890e-01], dtype=float32)