In [5]:
import pandas as pd

# Read the click log from a CSV file in the current working directory.
df = pd.read_csv('click_fraud_dataset.csv')
# Preview the first five rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# row index that was written into the CSV — confirm, and consider index_col=0.
df.head()


Unnamed: 0,click_id,timestamp,user_id,ip_address,device_type,browser,operating_system,referrer_url,page_url,click_duration,...,mouse_movement,keystrokes_detected,ad_position,click_frequency,time_since_last_click,device_ip_reputation,VPN_usage,proxy_usage,bot_likelihood_score,is_fraudulent
0,d875835d-3a4a-4a20-b0d1-6cddf89afc6a,2024-08-23 02:47:39,65a2f621-707b-49be-9c3e-ccac0b1d89ef,141.36.49.37,Tablet,Safari,Android,https://evans-ford.com/,http://www.turner-stewart.com/,0.29,...,111,8,Bottom,7,72,Good,0,1,0.29,0
1,a2d3f028-7790-4be1-9f75-df1357edbbdb,2025-01-30 23:23:50,135e0114-76c5-43ea-bdef-80ab537dc009,216.29.19.201,Desktop,Opera,iOS,https://pierce-ferguson.net/,http://www.rodriguez.biz/,0.64,...,452,29,Bottom,9,201,Suspicious,0,0,0.74,0
2,36d787b2-fbce-43ef-8c02-7c8746d7e3db,2025-01-21 05:41:12,a6922984-78cb-4c01-9c88-bfe3a13a0aaf,167.133.41.231,Tablet,Safari,Linux,https://www.martinez.com/,https://beck.biz/,0.42,...,431,18,Bottom,9,326,Good,0,1,0.14,0
3,01fc0078-096b-4f90-82ae-aa8085b719ac,2024-10-12 08:18:14,d30788b2-4048-4770-a4b1-a9358788818f,216.146.33.78,Tablet,Edge,macOS,https://jones-mendoza.com/,https://www.alvarado.com/,4.29,...,472,37,Side,4,33,Suspicious,0,0,0.65,0
4,0afdf2af-0b48-47d5-bfb6-e087053e1eb9,2024-04-19 14:44:35,dfc42287-6325-4344-b373-b8e61ea6e5c1,146.37.54.245,Desktop,Opera,Windows,https://www.griffith-holloway.com/,http://gonzalez.com/,2.46,...,50,2,Side,7,97,Good,0,0,0.06,0


In [6]:
# Name of the label column the model is trained to predict.
target = 'is_fraudulent'

# Numeric inputs (passed through StandardScaler further down). The order of
# this list fixes the order of the numeric columns in the feature frame.
# VPN_usage / proxy_usage show up as 0/1 values in the preview rows above.
num_cols = [
    'click_duration', 'scroll_depth', 'mouse_movement',
    'keystrokes_detected', 'click_frequency', 'time_since_last_click',
    'bot_likelihood_score', 'VPN_usage', 'proxy_usage',
]

# Categorical inputs (passed through OneHotEncoder further down).
cat_cols = [
    'device_type', 'browser', 'operating_system',
    'ad_position', 'device_ip_reputation',
]


In [7]:
from sklearn.model_selection import train_test_split

# Feature frame: the numeric columns followed by the categorical ones.
X = df[[*num_cols, *cat_cols]]
# Label vector.
y = df[target]

# 80/20 hold-out split, stratified on the label, with a fixed seed so the
# same rows land in each split on every run.
split = train_test_split(X, y, stratify=y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split


In [8]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numeric columns are standardised.
numeric_step = ('num', StandardScaler(), num_cols)

# Categorical columns are one-hot encoded. With handle_unknown='ignore', a
# category that was not seen during fit is encoded as all zeros for that
# column group instead of raising an error.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)

# Apply each step to its own column group and concatenate the results.
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])


In [9]:
# Fit the preprocessor on the training rows only, then reuse the fitted
# transform on the held-out rows so no test information leaks into it.
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

# Width of the encoded matrix (numeric columns + one-hot columns).
n_features = X_train_prep.shape[1]
print("Final feature count:", n_features)


Final feature count: 28


In [10]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input

# Seed the Python, NumPy and TensorFlow random generators before any layer is
# created, so weight initialisation (and the batch shuffling done during fit)
# is repeatable across "Restart Kernel -> Run All".
tf.keras.utils.set_random_seed(42)

# Input width comes from the preprocessed training matrix.
n_features = X_train_prep.shape[1]

# Small fully connected binary classifier: two ReLU hidden layers and a single
# sigmoid unit that outputs a score in (0, 1). The input shape is declared with
# an explicit Input layer rather than an `input_shape=` kwarg on the first
# Dense layer.
ann = Sequential([
    Input(shape=(n_features,)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# as the reporting metric.
ann.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)







In [11]:
# Training configuration: 15 epochs in mini-batches of 256, with 10% of the
# training rows set aside to compute validation metrics.
fit_options = dict(epochs=15, batch_size=256, validation_split=0.1, verbose=1)

# Keep the object returned by fit so the training curves can be inspected later.
history = ann.fit(X_train_prep, y_train, **fit_options)


Epoch 1/15


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [12]:
# Score every held-out row; flatten() collapses the model output to 1-D.
probs = ann.predict(X_test_prep).flatten()

# Sanity check: score range plus a peek at the first ten scores.
summary = (
    ("Min probability:", probs.min()),
    ("Max probability:", probs.max()),
    ("Sample probabilities:", probs[:10]),
)
for label, value in summary:
    print(label, value)


Min probability: 3.888311e-08
Max probability: 0.99958587
Sample probabilities: [2.2244951e-05 9.9781668e-01 1.4267348e-02 1.8955499e-01 9.9946427e-01
 4.8588126e-04 5.7708682e-03 8.6661714e-01 3.8102069e-06 4.1620256e-04]


In [13]:
from sklearn.metrics import classification_report

# A click is labelled fraud (1) when its score is strictly above 0.5, else 0.
above_cutoff = probs > 0.5
preds = above_cutoff.astype(int)

# Per-class precision / recall / F1 on the held-out rows.
report = classification_report(y_test, preds)
print(report)


              precision    recall  f1-score   support

           0       0.99      0.98      0.99       752
           1       0.94      0.98      0.96       248

    accuracy                           0.98      1000
   macro avg       0.97      0.98      0.97      1000
weighted avg       0.98      0.98      0.98      1000



In [14]:
# Hand-written click used to probe the model. The values are written as a
# numeric part and a categorical part, then merged in the same key order as
# the training feature lists so the resulting columns line up.
numeric_part = {
    'click_duration': 0.4,
    'scroll_depth': 30,
    'mouse_movement': 420,
    'keystrokes_detected': 4,
    'click_frequency': 9,
    'time_since_last_click': 110,
    'bot_likelihood_score': 0.62,
    'VPN_usage': 1,
    'proxy_usage': 0,
}
categorical_part = {
    'device_type': 'Mobile',
    'browser': 'Chrome',
    'operating_system': 'Android',
    'ad_position': 'Top',
    'device_ip_reputation': 'Suspicious',
}

# A single record becomes a one-row DataFrame.
new_click = pd.DataFrame([{**numeric_part, **categorical_part}])


In [15]:
# Encode the hand-written click with the already-fitted preprocessor.
new_click_prep = preprocessor.transform(new_click)

# One input row and one sigmoid output unit: [0][0] picks out the scalar score.
prob = ann.predict(new_click_prep)[0][0]

print(f"Fraud Probability: {prob:.6f}")
# The emoji literals were mojibake (UTF-8 bytes decoded as cp1252); restored
# to the intended characters.
print("🚨 FRAUD" if prob > 0.5 else "✅ LEGIT")


Fraud Probability: 0.018944
✅ LEGIT


In [16]:
# Show the same score again without the 6-decimal formatting used above.
print(prob)

0.018944092


In [20]:
# Take 5 REAL fraud samples (already preprocessed)
is_fraud_row = (y_test == 1)
real_fraud = X_test_prep[is_fraud_row][:5]

# Score them; the bare expression below displays the resulting array.
fraud_probs = ann.predict(real_fraud).flatten()
fraud_probs




array([0.9978167 , 0.9994643 , 0.86661714, 0.99785376, 0.9993327 ],
      dtype=float32)

In [None]:
# Re-encode the current `new_click` frame with the fitted preprocessor.
new_click_prep = preprocessor.transform(new_click)

# Print the sum of every value in the encoded row.
# NOTE(review): presumably a quick check that the encoded row is not all
# zeros — confirm intent; this cell has no execution count.
print(new_click_prep.sum())


6.5432489310976605


In [21]:
# Compare legit vs fraud probabilities
legit_rows = X_test_prep[y_test == 0][:5]   # first five legitimate test rows
legit_probs = ann.predict(legit_rows).flatten()

# `fraud_probs` is whatever the most recently run fraud-scoring cell produced.
print("Fraud probs:", fraud_probs)
print("Legit probs:", legit_probs)


Fraud probs: [0.9978167  0.9994643  0.86661714 0.99785376 0.9993327 ]
Legit probs: [2.2244951e-05 1.4267348e-02 1.8955497e-01 4.8588126e-04 5.7708682e-03]


In [22]:
# Report the concrete types of the model, the encoded test matrix and the labels.
for obj in (ann, X_test_prep, y_test):
    print(type(obj))


<class 'keras.src.engine.sequential.Sequential'>
<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>


In [23]:
import numpy as np

# Row positions (not index labels) of the fraud rows within the test split;
# np.where on a 1-D condition returns a 1-tuple, unpacked here.
(fraud_indices,) = np.where(y_test == 1)
print("Number of fraud samples in test set:", fraud_indices.size)


Number of fraud samples in test set: 248


In [24]:
# Encoded rows for the first ten fraud positions.
first_ten_fraud = fraud_indices[:10]
fraud_X = X_test_prep[first_ten_fraud]

# Overwrites the earlier five-element `fraud_probs` with ten scores.
fraud_probs = ann.predict(fraud_X).flatten()

fraud_probs




array([0.9978167 , 0.9994643 , 0.86661714, 0.99785376, 0.9993327 ,
       0.98967516, 0.99767363, 0.9523466 , 0.9700246 , 0.9985507 ],
      dtype=float32)

In [25]:
# Row positions of the legitimate rows within the test split.
(legit_indices,) = np.where(y_test == 0)

# Score the first ten of them.
legit_X = X_test_prep[legit_indices[:10]]
legit_probs = ann.predict(legit_X).flatten()

# Side-by-side view of the two score groups.
for label, scores in (("Fraud probs:", fraud_probs), ("Legit probs:", legit_probs)):
    print(label, scores)


Fraud probs: [0.9978167  0.9994643  0.86661714 0.99785376 0.9993327  0.98967516
 0.99767363 0.9523466  0.9700246  0.9985507 ]
Legit probs: [2.2244951e-05 1.4267348e-02 1.8955499e-01 4.8588126e-04 5.7708682e-03
 3.8102069e-06 4.1620256e-04 5.2133696e-06 5.5518717e-06 7.2454261e-03]


In [26]:
# Compare a real fraud sample vs your manual input
real_fraud = X_test_prep[y_test == 1][:1]   # first fraud row, kept 2-D
manual_input = new_click_prep               # what you are using

# Score each one-row batch in turn and print its single output value.
for label, batch in (("Real fraud prob:", real_fraud), ("Manual input prob:", manual_input)):
    print(label, ann.predict(batch)[0][0])


Real fraud prob: 0.9978167
Manual input prob: 0.018944092


In [27]:
# simulate real-world prediction
fraud_labels = y_test[y_test == 1]
idx = fraud_labels.index[0]   # index label of the first fraud row in the test split
sample = X.loc[idx:idx]       # raw row; the label slice keeps it a DataFrame

# Run the raw row through the same preprocess -> predict path as new data.
sample_prep = preprocessor.transform(sample)
prob = ann.predict(sample_prep)[0][0]

verdict = "FRAUD" if prob > 0.5 else "LEGIT"
print(prob, verdict)


0.9978167 FRAUD


In [28]:
# A second hand-written click with more extreme values than the first one.
extreme_click = {
    'click_duration': 0.01,
    'scroll_depth': 0,
    'mouse_movement': 0,
    'keystrokes_detected': 0,
    'click_frequency': 50,
    'time_since_last_click': 1,
    'bot_likelihood_score': 0.95,
    'VPN_usage': 1,
    'proxy_usage': 1,
    'device_type': 'Mobile',
    'browser': 'Chrome',
    'operating_system': 'Android',
    'ad_position': 'Top',
    'device_ip_reputation': 'Bad',
}
new_click = pd.DataFrame([extreme_click])

# Encode, score, and show the raw score.
new_click_features = preprocessor.transform(new_click)
prob = ann.predict(new_click_features)[0][0]
print(prob)


0.40318152


In [29]:
# Persist the trained network to an HDF5 file; a later cell reloads it from
# this exact path.
# NOTE(review): the truncated output below looks like Keras's warning about the
# legacy HDF5 format — if switching to the native `.keras` format, update the
# load path in the reload cell too.
ann.save("ann_click_fraud_model.h5")


  saving_api.save_model(


In [30]:
import joblib
# Persist the fitted preprocessor next to the model so inference can apply the
# same scaling and encoding; a later cell reloads it from this path.
joblib.dump(preprocessor, "preprocessor.pkl")


['preprocessor.pkl']

In [34]:
import tensorflow as tf
import joblib
import pandas as pd

# Reload both artifacts from the paths they were saved to earlier.
model = tf.keras.models.load_model("ann_click_fraud_model.h5")
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load files you produced or trust. This also rebinds `preprocessor` to
# the deserialized copy.
preprocessor = joblib.load("preprocessor.pkl")

print("Model and preprocessor loaded successfully")


Model and preprocessor loaded successfully


In [35]:
# Load dataset again
df = pd.read_csv("click_fraud_dataset.csv")

# Pick ONE known fraud sample
is_fraud = df["is_fraudulent"] == 1
fraud_sample = df[is_fraud].iloc[[0]]   # list indexer keeps it a one-row DataFrame
fraud_sample


Unnamed: 0,click_id,timestamp,user_id,ip_address,device_type,browser,operating_system,referrer_url,page_url,click_duration,...,mouse_movement,keystrokes_detected,ad_position,click_frequency,time_since_last_click,device_ip_reputation,VPN_usage,proxy_usage,bot_likelihood_score,is_fraudulent
6,a8a187f8-186a-4e50-b50b-b0da9daebadd,2024-03-19 01:44:20,92ac6feb-7506-4a80-92f2-97299de012dc,103.139.41.163,Tablet,Edge,Android,http://www.figueroa.com/,https://www.barnes-bautista.net/,0.97,...,366,47,Bottom,3,263,Good,0,0,0.98,1


In [36]:
# Remove only the label; every other raw column is handed to the preprocessor.
X_fraud = fraud_sample.drop("is_fraudulent", axis="columns")

# Encode with the reloaded preprocessor and score with the reloaded model.
X_fraud_prep = preprocessor.transform(X_fraud)
prob = model.predict(X_fraud_prep)[0][0]

verdict = "FRAUD" if prob > 0.5 else "LEGIT"
print("Fraud probability:", prob)
print("Prediction:", verdict)


Fraud probability: 0.99941033
Prediction: FRAUD


In [37]:
# First row labelled as not fraudulent, kept as a one-row DataFrame.
is_legit = df["is_fraudulent"] == 0
legit_sample = df[is_legit].iloc[[0]]

# Remove only the label, then encode and score with the reloaded artifacts.
X_legit = legit_sample.drop("is_fraudulent", axis="columns")
X_legit_prep = preprocessor.transform(X_legit)

# Note: the value printed as "Legit probability" is the model's fraud score
# for this legitimate row.
prob_legit = model.predict(X_legit_prep)[0][0]

verdict = "FRAUD" if prob_legit > 0.5 else "LEGIT"
print("Legit probability:", prob_legit)
print("Prediction:", verdict)


Legit probability: 1.6992875e-05
Prediction: LEGIT


In [38]:
# The extreme hand-written click again, this time scored with the reloaded
# model and preprocessor.
extreme_click = {
    'click_duration': 0.01,
    'scroll_depth': 0,
    'mouse_movement': 0,
    'keystrokes_detected': 0,
    'click_frequency': 50,
    'time_since_last_click': 1,
    'bot_likelihood_score': 0.95,
    'VPN_usage': 1,
    'proxy_usage': 1,
    'device_type': 'Mobile',
    'browser': 'Chrome',
    'operating_system': 'Android',
    'ad_position': 'Top',
    'device_ip_reputation': 'Bad',
}
new_click = pd.DataFrame([extreme_click])

new_prep = preprocessor.transform(new_click)
prob_new = model.predict(new_prep)[0][0]

verdict = "FRAUD" if prob_new > 0.5 else "LEGIT"
print("Fraud probability:", prob_new)
print("Prediction:", verdict)


Fraud probability: 0.40318152
Prediction: LEGIT


In [None]:
# NOTE(review): "decision_tree.pkl" is not written by any cell visible in this
# notebook, and this cell has no execution count — confirm where the file
# comes from. The assignment rebinds `model`, which until here held the Keras
# network loaded from the .h5 file.
# joblib.load unpickles the file; only load files you trust.
model = joblib.load("decision_tree.pkl")

print("Model type:", type(model))
# NOTE(review): n_features_in_ is presumably the fitted scikit-learn
# estimator attribute — confirm the pickled object provides it.
print("Expected number of features:", model.n_features_in_)
