### Data Modelling for Fraud Detection

In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import VotingClassifier

In [2]:
fraud = pd.read_csv("complete_dataset.csv")
fraud.head()

Unnamed: 0,TransactionID,Timestamp,MerchantID,Amount,CustomerID,TransactionAmount,AnomalyScore,FraudIndicator,Category,MerchantName,MerchantLocation,CustomerName,CustomerAge,CustomerAddress
0,1,2022-01-01 00:00:00,2701,55.530334,1952,79.413607,0.686699,0,Other,Merchant 2701,Location 2701,Customer 1952,50,Address 1952
1,2,2022-01-01 01:00:00,2070,12.88118,1027,12.053087,0.081749,0,Online,Merchant 2070,Location 2070,Customer 1027,46,Address 1027
2,3,2022-01-01 02:00:00,2238,50.176322,1955,33.310357,0.023857,0,Travel,Merchant 2238,Location 2238,Customer 1955,34,Address 1955
3,4,2022-01-01 03:00:00,2879,41.634001,1796,46.121117,0.876994,0,Travel,Merchant 2879,Location 2879,Customer 1796,33,Address 1796
4,5,2022-01-01 04:00:00,2966,78.122853,1946,54.051618,0.034059,0,Other,Merchant 2966,Location 2966,Customer 1946,18,Address 1946


In [3]:
# converting timestamp to a datetime object

fraud["Timestamp"] = pd.to_datetime(fraud["Timestamp"])

# Extract useful time-based features
fraud["Hour"] = fraud["Timestamp"].dt.hour
fraud["Day"] = fraud["Timestamp"].dt.day
fraud["Month"] = fraud["Timestamp"].dt.month
fraud["Weekday"] = fraud["Timestamp"].dt.weekday
fraud["Year"] = fraud["Timestamp"].dt.year

In [4]:
fraud = fraud.drop(columns=["Timestamp"])

In [5]:
fraud.head()

Unnamed: 0,TransactionID,MerchantID,Amount,CustomerID,TransactionAmount,AnomalyScore,FraudIndicator,Category,MerchantName,MerchantLocation,CustomerName,CustomerAge,CustomerAddress,Hour,Day,Month,Weekday,Year
0,1,2701,55.530334,1952,79.413607,0.686699,0,Other,Merchant 2701,Location 2701,Customer 1952,50,Address 1952,0,1,1,5,2022
1,2,2070,12.88118,1027,12.053087,0.081749,0,Online,Merchant 2070,Location 2070,Customer 1027,46,Address 1027,1,1,1,5,2022
2,3,2238,50.176322,1955,33.310357,0.023857,0,Travel,Merchant 2238,Location 2238,Customer 1955,34,Address 1955,2,1,1,5,2022
3,4,2879,41.634001,1796,46.121117,0.876994,0,Travel,Merchant 2879,Location 2879,Customer 1796,33,Address 1796,3,1,1,5,2022
4,5,2966,78.122853,1946,54.051618,0.034059,0,Other,Merchant 2966,Location 2966,Customer 1946,18,Address 1946,4,1,1,5,2022


In [None]:
# dropping unwanted columns
fraud = fraud.drop(
    [
        "TransactionID",
        "MerchantID",
        "CustomerID",
        "MerchantName",
        "MerchantLocation",
        "CustomerName",
        "CustomerAddress",
    ],
    axis=1,
)

In [7]:
fraud.head()

Unnamed: 0,Amount,TransactionAmount,AnomalyScore,FraudIndicator,Category,CustomerAge,Hour,Day,Month,Weekday,Year
0,55.530334,79.413607,0.686699,0,Other,50,0,1,1,5,2022
1,12.88118,12.053087,0.081749,0,Online,46,1,1,1,5,2022
2,50.176322,33.310357,0.023857,0,Travel,34,2,1,1,5,2022
3,41.634001,46.121117,0.876994,0,Travel,33,3,1,1,5,2022
4,78.122853,54.051618,0.034059,0,Other,18,4,1,1,5,2022


In [8]:
# split into features (X) and target (y)
X = fraud.drop(columns=["FraudIndicator"])
y = fraud["FraudIndicator"]

# train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
categorical_features = ["Category"]

numerical_features = [col for col in X.columns if col not in categorical_features]

In [None]:
# preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

In [11]:
# apply the preprocessing pipeline to the features
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)

In [12]:
# applying SMOTE for oversampling of minority classes
# the data has a lot more of non fraud cases than fraud ones
# so SMOTE helps with the imbalanced sampling
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_processed, y_train)

#### 1. Random Forest Classifier

In [13]:
rf = RandomForestClassifier(random_state=42)
print("Training Random Forest Classifier...")
rf.fit(X_train_res, y_train_res)
rf_pred = rf.predict(X_val_processed)

print("\nRandom Forest - Classification Report:")
print(classification_report(y_val, rf_pred))

Training Random Forest Classifier...

Random Forest - Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       193
           1       0.00      0.00      0.00         7

    accuracy                           0.96       200
   macro avg       0.48      0.50      0.49       200
weighted avg       0.93      0.96      0.95       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### 2. XGBoost Classifier

In [15]:
xgb = XGBClassifier(random_state=42)
print("Training XGBoost Classifier...")
xgb.fit(X_train_res, y_train_res)
xgb_pred = xgb.predict(X_val_processed)

print("\nXGBoost - Classification Report:")
print(classification_report(y_val, xgb_pred))

Training XGBoost Classifier...

XGBoost - Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.98       193
           1       0.00      0.00      0.00         7

    accuracy                           0.95       200
   macro avg       0.48      0.49      0.49       200
weighted avg       0.93      0.95      0.94       200



#### 3. Neural Network (MLPClassifier)

In [16]:
nn = MLPClassifier(random_state=42)
print("Training Neural Network (MLPClassifier)...")
nn.fit(X_train_res, y_train_res)
nn_pred = nn.predict(X_val_processed)

print("\nNeural Network - Classification Report:")
print(classification_report(y_val, nn_pred))

Training Neural Network (MLPClassifier)...

Neural Network - Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       193
           1       0.00      0.00      0.00         7

    accuracy                           0.94       200
   macro avg       0.48      0.49      0.49       200
weighted avg       0.93      0.94      0.94       200





#### 4. Logistic Regression with Class Weights

In [None]:
log_reg = LogisticRegression(class_weight="balanced", random_state=42)
print("Training Logistic Regression...")
log_reg.fit(X_train_res, y_train_res)
log_reg_pred = log_reg.predict(X_val_processed)

print("\nLogistic Regression - Classification Report:")
print(classification_report(y_val, log_reg_pred))

Training Logistic Regression...

Logistic Regression - Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.63      0.76       193
           1       0.03      0.29      0.05         7

    accuracy                           0.62       200
   macro avg       0.49      0.46      0.41       200
weighted avg       0.93      0.62      0.74       200



#### 5. SVM with Class Weights

In [None]:
svm = SVC(class_weight="balanced", random_state=42)
print("Training Support Vector Machine...")
svm.fit(X_train_res, y_train_res)
svm_pred = svm.predict(X_val_processed)

print("\nSVM - Classification Report:")
print(classification_report(y_val, svm_pred))

Training Support Vector Machine...

SVM - Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.94      0.96       193
           1       0.08      0.14      0.11         7

    accuracy                           0.92       200
   macro avg       0.53      0.54      0.53       200
weighted avg       0.94      0.92      0.93       200



#### 6. K-Nearest Neighbors

In [20]:
knn = KNeighborsClassifier()
print("Training K-Nearest Neighbors...")
knn.fit(X_train_res, y_train_res)
knn_pred = knn.predict(X_val_processed)

print("\nK-Nearest Neighbors - Classification Report:")
print(classification_report(y_val, knn_pred))

Training K-Nearest Neighbors...

K-Nearest Neighbors - Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.76      0.85       193
           1       0.04      0.29      0.07         7

    accuracy                           0.74       200
   macro avg       0.50      0.52      0.46       200
weighted avg       0.93      0.74      0.82       200



#### 7. Stacking Ensemble

In [None]:
# Define the base models for the ensemble
base_learners = [
    ("rf", RandomForestClassifier(random_state=42)),
    ("xgb", XGBClassifier(random_state=42)),
    ("log_reg", LogisticRegression(class_weight="balanced", random_state=42)),
]

# Define the meta-model for the stacking ensemble
meta_model = LogisticRegression()

# Create the stacking ensemble
stacking = StackingClassifier(estimators=base_learners, final_estimator=meta_model)

print("Training Stacking Ensemble...")
stacking.fit(X_train_res, y_train_res)
stacking_pred = stacking.predict(X_val_processed)

print("\nStacking Ensemble - Classification Report:")
print(classification_report(y_val, stacking_pred))

Training Stacking Ensemble...

Stacking Ensemble - Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       193
           1       0.00      0.00      0.00         7

    accuracy                           0.96       200
   macro avg       0.48      0.50      0.49       200
weighted avg       0.93      0.96      0.95       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### 8. CatBoost

In [None]:
# for CatBoost, the preprocessing will be different

preprocessor_cat = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features)  # Only scale numerical features
    ]
)

# applying preprocessing to numerical features
X_train_cat = preprocessor_cat.fit_transform(X_train)
X_val_cat = preprocessor_cat.transform(X_val)

# Apply SMOTE to numerical data
smote = SMOTE(random_state=42)
X_train_res_cat, y_train_res_cat = smote.fit_resample(X_train_cat, y_train)

# Convert resampled numerical data back to a DataFrame
X_train_res_cat_df = pd.DataFrame(X_train_res_cat, columns=numerical_features)

# Extract the indices of the resampled data
resampled_indices = smote.fit_resample(X_train_cat, y_train)[
    1
]  # Second output is resampled indices

# Map the categorical features to the resampled indices
categorical_resampled = (
    X_train[categorical_features].iloc[resampled_indices].reset_index(drop=True)
)

# Combine numerical and categorical features
X_train_res_cat_df[categorical_features] = categorical_resampled

# Prepare validation data
X_val_cat_df = pd.DataFrame(X_val_cat, columns=numerical_features)
X_val_cat_df[categorical_features] = X_val[categorical_features].reset_index(drop=True)

# Initialize and train CatBoost Classifier
catboost = CatBoostClassifier(
    cat_features=list(
        range(
            len(numerical_features), len(numerical_features) + len(categorical_features)
        )
    ),
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    random_state=42,
    verbose=200,
)

print("Training CatBoost Classifier...")
catboost.fit(X_train_res_cat_df, y_train_res_cat)

# Predict and evaluate performance
catboost_pred = catboost.predict(X_val_cat_df)

# Print classification report
print("\nCatBoost - Classification Report:")
print(classification_report(y_val, catboost_pred))

Training CatBoost Classifier...
0:	learn: 0.4327908	total: 66.6ms	remaining: 1m 6s
200:	learn: 0.0006879	total: 248ms	remaining: 984ms
400:	learn: 0.0006831	total: 366ms	remaining: 546ms
600:	learn: 0.0006778	total: 480ms	remaining: 319ms
800:	learn: 0.0006725	total: 600ms	remaining: 149ms
999:	learn: 0.0006683	total: 721ms	remaining: 0us

CatBoost - Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.20      0.33       193
           1       0.03      0.71      0.06         7

    accuracy                           0.21       200
   macro avg       0.49      0.46      0.19       200
weighted avg       0.92      0.21      0.32       200

