In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, f1_score, recall_score, precision_score, ConfusionMatrixDisplay
import xgboost as xgb
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline 
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import pickle
from lightgbm import LGBMClassifier
import mlflow
from hyperopt import fmin, hp, tpe, Trials, STATUS_OK
from hyperopt.pyll import scope

In [3]:
mlflow.set_experiment("fraud-detection-2")

<Experiment: artifact_location='file:///Users/aravindrajeshmenon/Documents/DataScienceProjects/Projects/fraud_detection/credit_card_fraud/notebook/mlruns/442187196940008915', creation_time=1760316890544, experiment_id='442187196940008915', last_update_time=1760316890544, lifecycle_stage='active', name='fraud-detection-2', tags={'mlflow.experimentKind': 'custom_model_development'}>

## Loading the data

In [6]:
df = pd.read_csv('../data/creditcard.csv')

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

Since there are no missing values or any categorical values, we can directly move on to modelling the data which simplifies our workload significantly. First, we will run a model with all the variables and then afterwards run a model with a lower number of features that have been removed through some sort of filter (low correlation with target variable, low variance etc.).

In [8]:
x = df.drop(['Class'], axis = 1)
y = df['Class']

We can now split the data into training and test set. We will then split the test set into the validation and test set to ensure our model gets enough data to train on.

In [9]:
x_train, x_ , y_train, y_ = train_test_split(x,y,train_size = 0.8, random_state = 42, stratify = y)
x_val, x_test, y_val, y_test = train_test_split(x_, y_, train_size = 0.5, random_state = 42, stratify = y_)


### Testing Logistic Regression model

In [10]:
solvers = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky']
for solver in solvers:
    with mlflow.start_run():
        mlflow.set_tag("model", "logistic_reg")
        mlflow.log_param("solver", solver)
        lr = LogisticRegression(max_iter = 5000, class_weight = 'balanced', solver = solver)
        lr.fit(x_train, y_train)
        y_pred = lr.predict(x_val)
        f1 = f1_score(y_val, y_pred)
        print(f1)
        recall = recall_score(y_val, y_pred)
        precision = precision_score(y_val, y_pred)
        report = classification_report(y_val, y_pred)
        print(report)
        mlflow.log_metric("f1", f1)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("precision", precision)

    
    


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=5000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.10449574726609964
              precision    recall  f1-score   support

           0       1.00      0.97      0.99     28432
           1       0.06      0.88      0.10        49

    accuracy                           0.97     28481
   macro avg       0.53      0.93      0.55     28481
weighted avg       1.00      0.97      0.99     28481

0.10643564356435643
              precision    recall  f1-score   support

           0       1.00      0.97      0.99     28432
           1       0.06      0.88      0.11        49

    accuracy                           0.97     28481
   macro avg       0.53      0.93      0.55     28481
weighted avg       1.00      0.97      0.99     28481

0.10696517412935323
              precision    recall  f1-score   support

           0       1.00      0.97      0.99     28432
           1       0.06      0.88      0.11        49

    accuracy                           0.97     28481
   macro avg       0.53      0.93      0.55     28481
weighted avg  

## Model Performance

The model demonstrates moderate fraud detection capability but suffers from a significant precision problem. With a recall of 89.3%, the model successfully identifies most fraudulent transactions, catching 67 out of 75 fraud cases. However, the precision of only 7% indicates that the model generates an excessive number of false positives with 880 legitimate transactions being flagged as fraudulent.

This imbalance results in a low F1-score of 0.13, suggesting the model is not really production-ready. 

To improve model performance, we need to address the class imbalance more effectively through techniques such as SMOTE or exploring ensemble methods. We will first try to employ other random forest, xgboost and lightgbm models to compare performances

In [10]:
best_result = fmin(
    fn = objective, 
    space = space, 
    algo = tpe.suggest, 
    max_evals = 20
)

100%|██████████| 20/20 [25:05<00:00, 75.29s/trial, best loss: -0.8260869565217391]


We see that the random forest method achieves a reasonably good f1 score of 0.82 which is already a massive improvement over the logistic regression model. Now, let us see the f1 scores achieved by the boosted tree methods. 

In order to use xgb.train, the dataset has to be converted into a DMatrix. We will first start with this.

In [11]:
train = xgb.DMatrix(data = x_train, label = y_train)
valid = xgb.DMatrix(data = x_val, label = y_val)
test = xgb.DMatrix(data = x_test, label = y_test)

In [13]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", 'xgboost')
        mlflow.log_params(params)
        booster = xgb.train(
            params = params, 
            dtrain = train, 
            num_boost_round = 500, 
            evals=[(train, "train"), (valid, "validation")],
            early_stopping_rounds = 50,
            verbose_eval = False
        )
        y_pred_probs = booster.predict(valid)
        y_pred = (y_pred_probs > 0.5).astype(int)
        f1 = f1_score(y_val, y_pred)
        mlflow.log_metric('f1', f1)
       
        return {'loss': -f1, 'status': STATUS_OK}
        
        



In [None]:
space = {
    'max_depth': scope.int(hp.uniform('max_depth', 4, 100)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),  # 0.0067 to 1.0
    'subsample': hp.uniform('subsample', 0.7, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.7, 1.0),
    'min_child_weight': hp.choice('min_child_weight', [1, 3, 5, 7]),
    'gamma': hp.uniform('gamma', 0, 0.3),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -2),  
    'reg_lambda': hp.loguniform('reg_lambda', -5, -2),
    'seed' : 42, 
    'objective' : 'binary:logistic'
}

In [15]:
trials = Trials()
best_result = fmin(
    fn = objective, 
    space = space, 
    algo = tpe.suggest, 
    max_evals = 150
)

100%|██████████| 150/150 [03:02<00:00,  1.21s/trial, best loss: -0.898876404494382] 


We see from the results that the best f1 score achieved across trials is 0.88 which is an even bigger improvement than the random forest model was able to achieve. Before finalising xgboost as the model of choice, let us first also test the lightgbm model. 

Now, let us try out LightGBM model. We will, as before, define a search space for hyperopt to minimise the loss function, here given by -F1. Then, we will log the parameters as well as the F1 score in each metric to find out the "best" parameters. We will then load this model and test on how well it performs on test data. 

In [16]:
space = {
    'max_depth': scope.int(hp.uniform('max_depth', 10, 25)),
    'num_leaves': scope.int(hp.uniform('num_leaves', 20, 100)),
    'min_child_samples': scope.int(hp.uniform('min_child_samples', 10, 50)),
    'learning_rate': hp.loguniform('learning_rate', -2.5, 0),
    'n_estimators': scope.int(hp.uniform('n_estimators', 100, 1000)),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -2.5),
    'reg_lambda': hp.loguniform('reg_lambda', -5, -3),
    'subsample': hp.uniform('subsample', 0.75, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.75, 1.0),
}
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag('model', 'lightgbm')
        mlflow.log_params(params)

        lgb_model = LGBMClassifier(
            class_weight='balanced',
            random_state=42,
            n_jobs=-1,
            verbose=-1,
            max_depth = params['max_depth'],
            num_leaves = params['num_leaves'],
            min_child_samples = params['min_child_samples'],
            n_estimators = params['n_estimators'],
            reg_alpha = params['reg_alpha'],
            reg_lambda = params['reg_lambda'],
            subsample = params['subsample'],
            colsample_bytree = params['colsample_bytree']
    )
        lgb_model.fit(x_train, y_train)
        y_pred = lgb_model.predict(x_val)
        f1 = f1_score(y_val, y_pred)
        mlflow.log_metric("f1", f1)
    return {'loss': -f1, 'status': STATUS_OK}

In [17]:
best_result = fmin(
    fn = objective, 
    space = space, 
    algo = tpe.suggest, 
    max_evals = 150
)

100%|██████████| 150/150 [09:48<00:00,  3.92s/trial, best loss: -0.8842105263157894]


## Results

Both Random Forest (F1: 0.84) and XGBoost (F1: 0.88) perform similarly and way better than logistic regression. XGBoost in particular iscatching fraud with high precision (92-98%) as well decently high recall as well.

**Next steps:**
- Try RandomUnderSampler and SMOTE to balance classes better. 

In [18]:
over = SMOTE(sampling_strategy=0.005)
under = RandomUnderSampler(sampling_strategy=0.002)

We keep the class imbalance intentionally because fraud is rare in real life. 
Balancing too much creates unrealistic data that won't work in production.

- Undersample to 0.002: This removes most of the majority class to cut down on 
  computation and noise, but keeps enough samples for the model to learn 
  normal patterns
  
- SMOTE (0.005): Adds minimal synthetic fraud cases (bringing fraud to 0.5% of majority)
  to slightly improve minority class representation without creating too many fake samples

This approach keeps synthetic data minimal and preserves the imbalanced 
nature of fraud, so the model learns realistic patterns.


In [19]:
x_train_undersampled, y_train_undersampled= under.fit_resample(x_train, y_train)

In [20]:
y_train_undersampled.value_counts()

Class
0    197000
1       394
Name: count, dtype: int64

In [21]:
x_train_resampled,  y_train_resampled = over.fit_resample(x_train_undersampled, y_train_undersampled)

In [22]:
y_train_resampled.value_counts()

Class
0    197000
1       985
Name: count, dtype: int64

In [23]:
smote_train = xgb.DMatrix(data = x_train_resampled, label = y_train_resampled)
valid = xgb.DMatrix(data = x_val, label = y_val)
test = xgb.DMatrix(data = x_test, label = y_test)

In [24]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", 'xgboost_smote')
        mlflow.log_params(params)
        booster = xgb.train(
            params = params, 
            dtrain = smote_train, 
            num_boost_round = 500, 
            evals=[(smote_train, "train"), (valid, "validation")],
            early_stopping_rounds = 50,
            verbose_eval = False
        )
        y_pred_probs = booster.predict(valid)
        y_pred = (y_pred_probs > 0.5).astype(int)
        f1 = f1_score(y_val, y_pred)
        mlflow.log_metric('f1', f1)
        return {'loss': -f1, 'status': STATUS_OK}

In [25]:
space = {
    'max_depth': scope.int(hp.uniform('max_depth', 25, 50 )),
    'learning_rate': hp.loguniform('learning_rate', -3, -1),  # 0.0067 to 1.0
    'subsample': hp.uniform('subsample', 0.7, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.7, 1.0),
    'min_child_weight': hp.choice('min_child_weight', [1, 3, 5, 7]),
    'gamma': hp.uniform('gamma', 0, 0.2),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -2),  
    'reg_lambda': hp.loguniform('reg_lambda', -5, -2), 
    'seed' : 42, 
    'objective' : 'binary:logistic'
}

In [26]:
trials = Trials()
best_result = fmin(
    fn = objective, 
    space = space, 
    algo = tpe.suggest, 
    max_evals = 150
)

100%|██████████| 150/150 [03:53<00:00,  1.55s/trial, best loss: -0.9010989010989011]


We see that hyperopt has been able to achieve a validation f1 score of 0.90 on the resampled data. Now, as the f1 scores of the best XGBoost model trained on the original data as well as the xgboost model trained on the resampled data are quite similar, we need to verify the better model by testing on the test data. In the next section, the best lightgbm model as well as the two XGboost models will be made to predict on the test set to compare their results.

## Testing the candidate models on the test set

### LightGBM model

In [28]:
best_params = {
    'reg_lambda': 0.006902186066790664,
    'max_depth': 32,
    'learning_rate': 0.13087526206252295,
    'n_estimators': 905,
    'min_child_samples': 18,
    'num_leaves': 43,
    'colsample_bytree': 0.8564403531085696,
    'reg_alpha': 0.008070446322648825,
    'subsample': 0.8931242253475022
}

In [29]:
lgb_model = LGBMClassifier(**best_params)
lgb_model.fit(x_train, y_train)

y_pred_lgb = lgb_model.predict(x_test)



In [30]:
print(classification_report(y_test, y_pred_lgb))
print(confusion_matrix(y_test, y_pred_lgb))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.72      0.84      0.77        49

    accuracy                           1.00     28481
   macro avg       0.86      0.92      0.89     28481
weighted avg       1.00      1.00      1.00     28481

[[28416    16]
 [    8    41]]


The model achieves 84% recall on fraud detection, successfully catching 41 out of 49 
fraudulent transactions. Precision is 72%, meaning 16 legitimate transactions were 
incorrectly flagged as fraud. Only 8 actual frauds were missed. Overall, the model 
performs well on the imbalanced dataset while keeping false alarms relatively low.

### XGBoost model (trained on original data)

In [34]:
best_params = {
    'reg_lambda': 0.02946709949617557,
    'gamma': 0.4035327095549419,
    'seed': 42,
    'max_depth': 76,
    'min_child_weight': 7,
    'learning_rate': 0.08284903106602773,
    'objective': 'binary:hinge',
    'colsample_bytree': 0.7141674994079779,
    'reg_alpha': 0.048323273989931845,
    'subsample': 0.855037126541903
}

with mlflow.start_run():
    mlflow.log_params(best_params)
    
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=200,
        evals=[(valid, "validation")],
        early_stopping_rounds=50
    )
    
    y_pred_proba = booster.predict(test)
    y_pred_xgb = (y_pred_proba > 0.5).astype(int)
    f1 = f1_score(y_test, y_pred_xgb)
    
    mlflow.xgboost.log_model(booster, "xgb_model")
    mlflow.log_metric("f1", f1)
    
print(f"F1 Score: {f1}")


[0]	validation-error:0.00067
[1]	validation-error:0.00060
[2]	validation-error:0.00056
[3]	validation-error:0.00056
[4]	validation-error:0.00060
[5]	validation-error:0.00056
[6]	validation-error:0.00053
[7]	validation-error:0.00056
[8]	validation-error:0.00049
[9]	validation-error:0.00046
[10]	validation-error:0.00046
[11]	validation-error:0.00046
[12]	validation-error:0.00046
[13]	validation-error:0.00046
[14]	validation-error:0.00046
[15]	validation-error:0.00049
[16]	validation-error:0.00049
[17]	validation-error:0.00049
[18]	validation-error:0.00049
[19]	validation-error:0.00049
[20]	validation-error:0.00049
[21]	validation-error:0.00049
[22]	validation-error:0.00049
[23]	validation-error:0.00049
[24]	validation-error:0.00049
[25]	validation-error:0.00042
[26]	validation-error:0.00046
[27]	validation-error:0.00042
[28]	validation-error:0.00046
[29]	validation-error:0.00042
[30]	validation-error:0.00042
[31]	validation-error:0.00042
[32]	validation-error:0.00042
[33]	validation-erro

  xgb_model.save_model(model_data_path)


F1 Score: 0.875


In [35]:
print(classification_report(y_test, y_pred_xgb))
print(confusion_matrix(y_test, y_pred_xgb))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.89      0.86      0.88        49

    accuracy                           1.00     28481
   macro avg       0.95      0.93      0.94     28481
weighted avg       1.00      1.00      1.00     28481

[[28427     5]
 [    7    42]]


XGBoost outperforms LightGBM on fraud detection with an F1 score of 0.88 compared to 0.77. 
It achieves 86% recall vs 84%, catching 42 out of 49 frauds instead of 41. Precision 
improves significantly to 89% from 72%, reducing false alarms from 16 to just 5. Missed 
frauds drop from 8 to 7. XGBoost provides better overall balance between catching fraud 
and minimizing customer disruption.

In [None]:
#with open('../models/xgb_boost.pkl', 'wb') as f_out: 
    #pickle.dump(booster, f_out)

In [36]:
best_params = {
    'reg_lambda': 0.026883404750849747,
    'gamma': 0.047305821238349866,
    'seed': 42,
    'max_depth': 32,
    'min_child_weight': 7,
    'learning_rate': 0.0913081674061314,
    'objective': 'binary:logistic',
    'colsample_bytree': 0.9044835863913875,
    'reg_alpha': 0.07544378959317678,
    'subsample': 0.8414064834751469
}

In [37]:
rs_booster = xgb.train(
    params = best_params, 
    dtrain = smote_train, 
    num_boost_round = 200, 
    evals = [(train, "train"), (valid, "validation")],
    early_stopping_rounds = 50,
    verbose_eval = False
)
y_pred_probs = rs_booster.predict(test)
y_pred_rs = (y_pred_probs > 0.5).astype(int)
f1 = f1_score(y_test, y_pred_rs)
mlflow.xgboost.log_model(booster, "xgb_model_rs")
mlflow.log_metric("f1", f1)
print(f1)

  xgb_model.save_model(model_data_path)


0.8541666666666666


In [38]:
print(classification_report(y_test, y_pred_rs))
print(confusion_matrix(y_test, y_pred_rs))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.87      0.84      0.85        49

    accuracy                           1.00     28481
   macro avg       0.94      0.92      0.93     28481
weighted avg       1.00      1.00      1.00     28481

[[28426     6]
 [    8    41]]


XGBoost trained on resampled data achieves 84% recall and 87% precision, with an F1 score 
of 0.85. It catches 41 out of 49 frauds with 6 false alarms and 8 missed frauds. While 
performance is solid, it's slightly worse than the original XGBoost trained on imbalanced 
data (F1: 0.88 vs 0.85).

Conclusion:
The original XGBoost model trained on imbalanced data will be deployed. It achieves the 
best F1 score (0.88), highest precision (89%), and fewest false alarms (5). Resampling 
didn't really improve performance, which kinda confirms that maintaining the natural class distribution 
produces better results for this fraud detection task.