In [5]:
import pandas as pd

fraud = pd.read_csv("../data/processed/fraud_processed.csv")


# Define target and features

In [6]:
X = fraud.drop(columns=['class'])
y = fraud['class']


In [7]:
X = X.drop(columns=['user_id', 'device_id', 'ip_address'], errors='ignore')


# Encode categorical features

In [10]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load processed dataset
fraud = pd.read_csv("../data/processed/fraud_processed.csv")

# Separate features and target
X = fraud.drop(columns=['class'], errors='ignore')
y = fraud['class']

# Drop identifiers / unnecessary columns
X = X.drop(columns=[
    'user_id', 'signup_time', 'purchase_time', 
    'device_id', 'ip_address', 'lower_bound_ip_address', 'upper_bound_ip_address'
], errors='ignore')

# One-hot encode low-cardinality categorical features
X = pd.get_dummies(X, columns=['sex', 'browser'], drop_first=True)

# Label encode medium/high-cardinality categorical features
le = LabelEncoder()
for col in ['source', 'country']:
    X[col] = le.fit_transform(X[col].astype(str))

# Check shape
print("Features shape:", X.shape)


Features shape: (151112, 14)


# Train-test split (MANDATORY: stratified)

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


# Apply SMOTE

In [13]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


# Train model

In [14]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_resampled, y_train_resampled)


# Evaluate baseline model

In [15]:
from sklearn.metrics import (
    precision_recall_curve,
    f1_score,
    confusion_matrix,
    average_precision_score
)

y_pred = lr.predict(X_test)
y_prob = lr.predict_proba(X_test)[:, 1]

f1 = f1_score(y_test, y_pred)
auc_pr = average_precision_score(y_test, y_prob)

f1, auc_pr


(0.306938097216452, np.float64(0.2805674188837108))

In [16]:
confusion_matrix(y_test, y_pred)


array([[20035,  7358],
       [  983,  1847]])

# Train Random Forest

In [17]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train_resampled, y_train_resampled)


# Evaluate ensemble model

In [18]:
y_pred_rf = rf.predict(X_test)
y_prob_rf = rf.predict_proba(X_test)[:, 1]

f1_rf = f1_score(y_test, y_pred_rf)
auc_pr_rf = average_precision_score(y_test, y_prob_rf)

f1_rf, auc_pr_rf


(0.4787411106067484, np.float64(0.6157897741684646))

In [19]:
confusion_matrix(y_test, y_pred_rf)


array([[25196,  2197],
       [ 1248,  1582]])

# Cross-validation

In [21]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = cross_val_score(
    rf,
    X_train_resampled,
    y_train_resampled,
    scoring='f1',
    cv=cv
)

cv_scores.mean(), cv_scores.std()


(np.float64(0.8351109509935657), np.float64(0.0019587244893555576))

# Model comparison & selection

In [22]:
results = pd.DataFrame({
    "Model": ["Logistic Regression", "Random Forest"],
    "F1 Score": [f1, f1_rf],
    "AUC-PR": [auc_pr, auc_pr_rf]
})

results


Unnamed: 0,Model,F1 Score,AUC-PR
0,Logistic Regression,0.306938,0.280567
1,Random Forest,0.478741,0.61579


In [25]:
import joblib

joblib.dump(rf, "../models/random_forest.pkl")

['../models/random_forest.pkl']