### Data Modelling for Fraud Detection

In [None]:
import pandas as pd
import numpy as np

pd.set_option("mode.copy_on_write", True)
from sklearn.model_selection import train_test_split

In [2]:
fraud = pd.read_csv("complete_dataset.csv")
fraud.head()

Unnamed: 0,TransactionID,Timestamp,MerchantID,Amount,CustomerID,TransactionAmount,AnomalyScore,FraudIndicator,Category,MerchantName,MerchantLocation,CustomerName,CustomerAge,CustomerAddress
0,1,2022-01-01 00:00:00,2701,55.530334,1952,79.413607,0.686699,0,Other,Merchant 2701,Location 2701,Customer 1952,50,Address 1952
1,2,2022-01-01 01:00:00,2070,12.88118,1027,12.053087,0.081749,0,Online,Merchant 2070,Location 2070,Customer 1027,46,Address 1027
2,3,2022-01-01 02:00:00,2238,50.176322,1955,33.310357,0.023857,0,Travel,Merchant 2238,Location 2238,Customer 1955,34,Address 1955
3,4,2022-01-01 03:00:00,2879,41.634001,1796,46.121117,0.876994,0,Travel,Merchant 2879,Location 2879,Customer 1796,33,Address 1796
4,5,2022-01-01 04:00:00,2966,78.122853,1946,54.051618,0.034059,0,Other,Merchant 2966,Location 2966,Customer 1946,18,Address 1946


In [None]:
# converting timestamp to a datetime object

fraud["Timestamp"] = pd.to_datetime(fraud["Timestamp"])

In [None]:
# Extract useful time-based features
fraud["Hour"] = fraud["Timestamp"].dt.hour
fraud["Day"] = fraud["Timestamp"].dt.day
fraud["Month"] = fraud["Timestamp"].dt.month
fraud["Weekday"] = fraud["Timestamp"].dt.weekday
fraud["Year"] = fraud["Timestamp"].dt.year

In [None]:
fraud = fraud.drop(columns=["Timestamp"])

In [6]:
fraud.head()

Unnamed: 0,TransactionID,MerchantID,Amount,CustomerID,TransactionAmount,AnomalyScore,FraudIndicator,Category,MerchantName,MerchantLocation,CustomerName,CustomerAge,CustomerAddress,Hour,Day,Month,Weekday,Year
0,1,2701,55.530334,1952,79.413607,0.686699,0,Other,Merchant 2701,Location 2701,Customer 1952,50,Address 1952,0,1,1,5,2022
1,2,2070,12.88118,1027,12.053087,0.081749,0,Online,Merchant 2070,Location 2070,Customer 1027,46,Address 1027,1,1,1,5,2022
2,3,2238,50.176322,1955,33.310357,0.023857,0,Travel,Merchant 2238,Location 2238,Customer 1955,34,Address 1955,2,1,1,5,2022
3,4,2879,41.634001,1796,46.121117,0.876994,0,Travel,Merchant 2879,Location 2879,Customer 1796,33,Address 1796,3,1,1,5,2022
4,5,2966,78.122853,1946,54.051618,0.034059,0,Other,Merchant 2966,Location 2966,Customer 1946,18,Address 1946,4,1,1,5,2022


Columns to keep:
- Amount: Transaction Value
- Timestamp: Transformed into meaningful features (eg hour, day, weekend/weekday)
- Category: Provides transaction context
- CustomerAge: Demographic feature that might correlate with fraud behavior
- MerchantLocation: (Optional) if geographic patterns matter, transform into distance or region-based features

In [None]:
features_to_keep = [
    "Hour",
    "Day",
    "Month",
    "Weekday",
    "Year",
    "Amount",
    "Category",
    "CustomerAge",
    "FraudIndicator",
]
training_data = fraud[features_to_keep]

In [14]:
fraud["Category"].value_counts()

Category
Other     210
Food      204
Travel    198
Online    196
Retail    192
Name: count, dtype: int64

In [15]:
from sklearn.preprocessing import LabelEncoder

# converting Category to a factor variable
label_encoder = LabelEncoder()
training_data["Category"] = label_encoder.fit_transform(fraud["Category"])

In [16]:
training_data.dtypes

Hour                int32
Day                 int32
Month               int32
Weekday             int32
Year                int32
Amount            float64
Category            int64
CustomerAge         int64
FraudIndicator      int64
dtype: object

In [None]:
X = training_data.drop(columns=["FraudIndicator"])
y = fraud["FraudIndicator"]

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# initialize the model
rf_model = RandomForestClassifier(random_state=42)

# train the model
rf_model.fit(X_train, y_train)

# evaluate on validation data
y_val_pred = rf_model.predict(X_val)
print("Validation Performance:")
print(classification_report(y_val, y_val_pred))

Validation Performance:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98       143
           1       0.00      0.00      0.00         7

    accuracy                           0.95       150
   macro avg       0.48      0.50      0.49       150
weighted avg       0.91      0.95      0.93       150



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# define the hyperparameter grid
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

# set up GridSearchCV
grid_search = GridSearchCV(
    estimator=rf_model, param_grid=param_grid, scoring="f1", cv=3, verbose=2, n_jobs=1
)

# run grid search
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")

# update the model with the best parameters
best_rf_model = grid_search.best_estimator_

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.2s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   0.2s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   0.2s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   0.2s
[CV] END max_depth=10, min_sa

### Final Model Evaluation

Evaluating the best model on the test data to confirm performance

In [22]:
# evaluate on test data

y_test_pred = best_rf_model.predict(X_test)
print("Test performance:")
print(classification_report(y_test, y_test_pred))

Test performance:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       144
           1       0.00      0.00      0.00         6

    accuracy                           0.96       150
   macro avg       0.48      0.50      0.49       150
weighted avg       0.92      0.96      0.94       150



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Cross Validation

Using cross-validation to validate the model performance

In [None]:
from sklearn.model_selection import cross_val_score

# performing cross validation
cv_scores = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring="f1")

print(f"Cross-Validation F1 Scores: {cv_scores}")
print(f"Mean F1 Score: {cv_scores.mean()}")

Cross-Validation F1 Scores: [0. 0. 0. 0. 0.]
Mean F1 Score: 0.0


### Serializing the trained model

Saving the trained model for production deployment

In [24]:
import joblib

# saving the model
joblib.dump(best_rf_model, "random_forest_model.pkl")
print("Model saved to random_forest_model.pkl")

Model saved to random_forest_model.pkl
