In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, f1_score, recall_score, precision_score, ConfusionMatrixDisplay
import xgboost as xgb
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline 
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import pickle
from lightgbm import LGBMClassifier
import mlflow
from hyperopt import fmin, hp, tpe, Trials, STATUS_OK
from hyperopt.pyll import scope

In [2]:
mlflow.set_experiment("fraud-detection-2")

2025/10/13 01:54:50 INFO mlflow.tracking.fluent: Experiment with name 'fraud-detection-2' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///Users/aravindrajeshmenon/Documents/DataScienceProjects/Projects/fraud_detection/credit_card_fraud/notebook/mlruns/442187196940008915', creation_time=1760316890544, experiment_id='442187196940008915', last_update_time=1760316890544, lifecycle_stage='active', name='fraud-detection-2', tags={}>

In [3]:
import mlflow
print(mlflow.get_tracking_uri())

file:///Users/aravindrajeshmenon/Documents/DataScienceProjects/Projects/fraud_detection/credit_card_fraud/notebook/mlruns


In [4]:
df = pd.read_csv('../data/creditcard.csv')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

We see that since there are no missing values or any categorical values, we can directly move on to modelling the data which simplifies our workload significantly. First, we will run a model with all the variables and then afterwards run a model with a lower number of features that have been removed through some sort of filter (low correlation with target variable, low variance etc.).

In [6]:
x = df.drop(['Class'], axis = 1)
y = df['Class']

Since the data has been setup temporally, we can employ time-series splits to the data, using the first 80% of the data as training data and the remaining 20% of the data as test data. 

In [7]:
x_train, x_ , y_train, y_ = train_test_split(x,y,train_size = 0.8, random_state = 42, stratify = y)
x_val, x_test, y_val, y_test = train_test_split(x_, y_, train_size = 0.5, random_state = 42, stratify = y_)


In [8]:
solvers = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky']
for solver in solvers:
    with mlflow.start_run():
        mlflow.set_tag("model", "logistic_reg")
        mlflow.log_param("solver", solver)
        lr = LogisticRegression(max_iter = 5000, class_weight = 'balanced', solver = solver)
        lr.fit(x_train, y_train)
        y_pred = lr.predict(x_val)
        f1 = f1_score(y_val, y_pred)
        recall = recall_score(y_val, y_pred)
        precision = precision_score(y_val, y_pred)
        mlflow.log_metric("f1", f1)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("precision", precision)

    
    


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=5000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Model Performance

The model demonstrates moderate fraud detection capability but suffers from a significant precision problem. With a recall of 89.3%, the model successfully identifies most fraudulent transactions, catching 67 out of 75 fraud cases. However, the precision of only 7% indicates that the model generates an excessive number of false positives—flagging 880 legitimate transactions as fraudulent.

This imbalance results in a low F1-score of 0.13, suggesting the model is currently not production-ready. While the high recall is desirable for minimizing financial losses from missed fraud, the poor precision would lead to poor customer experience, with many legitimate transactions incorrectly declined or flagged for review.

To improve model performance, we need to address the class imbalance more effectively through techniques such as SMOTE, adjusting classification thresholds, or exploring ensemble methods. We will first try to employ other baseline models to compare performances

In [9]:
space = {
    'n_estimators' : scope.int(hp.uniform('n_estimators', 100, 250)), 
    'max_depth' : scope.int(hp.uniform('max_depth', 5, 15)),
    'min_samples_split' : hp.choice('min_samples_split',[2,5]),
    'min_samples_leaf' : hp.choice('min_samples_leaf', [1,2]),
    'max_features': hp.choice('max_features', ['sqrt', 'log2'])
 }

def objective(params):
    with mlflow.start_run():
        mlflow.set_tag('model', 'rf')
        mlflow.log_params(params)
        rf = RandomForestClassifier(n_estimators = params['n_estimators'], 
        max_depth = params['max_depth'],
        class_weight = 'balanced', 
        min_samples_split = params['min_samples_split'],
        min_samples_leaf = params['min_samples_leaf'],
        max_features = params['max_features'],
        random_state = 42,
        )
        rf.fit(x_train, y_train)
        y_pred = rf.predict(x_val)
        f1 = f1_score(y_val, y_pred)
        mlflow.log_metric("f1", f1)

        return {'loss': -f1, 'status': STATUS_OK}
        

In [10]:
best_result = fmin(
    fn = objective, 
    space = space, 
    algo = tpe.suggest, 
    max_evals = 20
)

100%|██████████| 20/20 [25:05<00:00, 75.29s/trial, best loss: -0.8260869565217391]


In [45]:

xgb_model = XGBClassifier(
    n_estimators=500,
    max_depth=10,
    learning_rate=0.01,
    random_state=42,
    subsample=0.9,                
    colsample_bytree=0.9,        
    gamma=0,
    reg_lambda=1.5,                
    scale_pos_weight=len(y_train[y_train==0]) / len(y_train[y_train==1]) ,  
    eval_metric='aucpr',
)



In [46]:
train = xgb.DMatrix(data = x_train, label = y_train)
valid = xgb.DMatrix(data = x_val, label = y_val)

In [53]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", 'xgboost')
        mlflow.log_params(params)
        booster = xgb.train(
            params = params, 
            dtrain = train, 
            num_boost_round = 200, 
            evals=[(train, "train"), (valid, "validation")],
            early_stopping_rounds = 50,
            verbose_eval = False
        )
        y_pred_probs = booster.predict(valid)
        y_pred = (y_pred_probs > 0.5).astype(int)
        f1 = f1_score(y_val, y_pred)
        mlflow.log_metric('f1', f1)
        cm = confusion_matrix(y_test, y_pred)
        disp = ConfusionMatrixDisplay(confusion_matrix = cm)
        plt.savefig("conf_matrix.png")
        mlflow.log_artifact("conf_matrix.png")
        plt.close()
        mlflow.xgboost.log_model(booster, name = "xgb_mlflow")
        return {'loss': -f1, 'status': STATUS_OK}
        
        



In [66]:
from mlflow.tracking import MlflowClient

client = MlflowClient()
client.set_model_version_tag(
    name="fraud-detection-lgb",
    version=1,
    key="stage",
    value="staging"
)


In [None]:
client.search_runs(
    experiment_id = 1, 
    
)

In [54]:
space = {
    'n_estimators': scope.int(hp.uniform('n_estimators', 100, 250)),
    'max_depth': scope.int(hp.uniform('max_depth', 4, 100)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),  # 0.0067 to 1.0
    'subsample': hp.uniform('subsample', 0.7, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
    'min_child_weight': hp.choice('min_child_weight', [1, 3, 5, 7]),
    'gamma': hp.uniform('gamma', 0, 0.5),
    'reg_alpha': hp.loguniform('reg_alpha', -5, 2),  
    'reg_lambda': hp.loguniform('reg_lambda', -5, 2), 
    'seed' : 42, 
    'objective' : 'binary:hinge'
}

In [55]:
trials = Trials()
best_result = fmin(
    fn = objective, 
    space = space, 
    algo = tpe.suggest, 
    max_evals = 50
)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




  2%|▏         | 1/50 [00:03<03:10,  3.89s/trial, best loss: -0.8085106382978723]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




  4%|▍         | 2/50 [00:06<02:26,  3.04s/trial, best loss: -0.8172043010752689]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




  6%|▌         | 3/50 [00:09<02:16,  2.90s/trial, best loss: -0.8275862068965517]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




  8%|▊         | 4/50 [00:11<02:09,  2.82s/trial, best loss: -0.8275862068965517]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 10%|█         | 5/50 [00:14<01:59,  2.66s/trial, best loss: -0.8602150537634409]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 12%|█▏        | 6/50 [00:16<01:52,  2.57s/trial, best loss: -0.8602150537634409]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 14%|█▍        | 7/50 [00:19<01:55,  2.70s/trial, best loss: -0.8602150537634409]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 16%|█▌        | 8/50 [00:21<01:49,  2.60s/trial, best loss: -0.8602150537634409]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 18%|█▊        | 9/50 [00:24<01:45,  2.57s/trial, best loss: -0.8602150537634409]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 20%|██        | 10/50 [00:26<01:37,  2.45s/trial, best loss: -0.8602150537634409]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 22%|██▏       | 11/50 [00:28<01:32,  2.37s/trial, best loss: -0.8602150537634409]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 24%|██▍       | 12/50 [00:31<01:33,  2.46s/trial, best loss: -0.8602150537634409]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 26%|██▌       | 13/50 [00:33<01:29,  2.42s/trial, best loss: -0.8602150537634409]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 28%|██▊       | 14/50 [00:36<01:25,  2.38s/trial, best loss: -0.8602150537634409]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 30%|███       | 15/50 [00:38<01:21,  2.34s/trial, best loss: -0.8602150537634409]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 32%|███▏      | 16/50 [00:40<01:22,  2.43s/trial, best loss: -0.8602150537634409]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 34%|███▍      | 17/50 [00:43<01:18,  2.37s/trial, best loss: -0.8602150537634409]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 36%|███▌      | 18/50 [00:45<01:17,  2.43s/trial, best loss: -0.8602150537634409]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 38%|███▊      | 19/50 [00:47<01:13,  2.37s/trial, best loss: -0.8602150537634409]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 40%|████      | 20/50 [00:50<01:12,  2.43s/trial, best loss: -0.8602150537634409]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 42%|████▏     | 21/50 [00:53<01:13,  2.52s/trial, best loss: -0.8636363636363636]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 44%|████▍     | 22/50 [00:55<01:10,  2.51s/trial, best loss: -0.8636363636363636]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 46%|████▌     | 23/50 [00:58<01:08,  2.52s/trial, best loss: -0.8636363636363636]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 48%|████▊     | 24/50 [01:00<01:05,  2.52s/trial, best loss: -0.8764044943820225]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 50%|█████     | 25/50 [01:03<01:04,  2.59s/trial, best loss: -0.8764044943820225]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 52%|█████▏    | 26/50 [01:06<01:02,  2.60s/trial, best loss: -0.8764044943820225]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 54%|█████▍    | 27/50 [01:08<00:58,  2.55s/trial, best loss: -0.8764044943820225]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 56%|█████▌    | 28/50 [01:10<00:54,  2.49s/trial, best loss: -0.8764044943820225]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 58%|█████▊    | 29/50 [01:13<00:52,  2.50s/trial, best loss: -0.8764044943820225]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 60%|██████    | 30/50 [01:15<00:48,  2.41s/trial, best loss: -0.8764044943820225]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 62%|██████▏   | 31/50 [01:18<00:46,  2.43s/trial, best loss: -0.8764044943820225]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 64%|██████▍   | 32/50 [01:20<00:44,  2.47s/trial, best loss: -0.8764044943820225]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 66%|██████▌   | 33/50 [01:23<00:41,  2.41s/trial, best loss: -0.8764044943820225]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 68%|██████▊   | 34/50 [01:25<00:38,  2.43s/trial, best loss: -0.8764044943820225]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 70%|███████   | 35/50 [01:28<00:38,  2.57s/trial, best loss: -0.8764044943820225]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 72%|███████▏  | 36/50 [01:30<00:35,  2.50s/trial, best loss: -0.8764044943820225]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 74%|███████▍  | 37/50 [01:33<00:33,  2.60s/trial, best loss: -0.8764044943820225]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 76%|███████▌  | 38/50 [01:36<00:30,  2.55s/trial, best loss: -0.8764044943820225]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 78%|███████▊  | 39/50 [01:38<00:27,  2.50s/trial, best loss: -0.8791208791208791]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 80%|████████  | 40/50 [01:40<00:24,  2.48s/trial, best loss: -0.8791208791208791]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 82%|████████▏ | 41/50 [01:43<00:22,  2.46s/trial, best loss: -0.8791208791208791]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 84%|████████▍ | 42/50 [01:45<00:20,  2.53s/trial, best loss: -0.8791208791208791]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 86%|████████▌ | 43/50 [01:48<00:17,  2.45s/trial, best loss: -0.8791208791208791]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 88%|████████▊ | 44/50 [01:50<00:15,  2.53s/trial, best loss: -0.8791208791208791]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 90%|█████████ | 45/50 [01:53<00:12,  2.54s/trial, best loss: -0.8791208791208791]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 92%|█████████▏| 46/50 [01:55<00:09,  2.45s/trial, best loss: -0.8791208791208791]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 94%|█████████▍| 47/50 [01:58<00:07,  2.51s/trial, best loss: -0.8888888888888888]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 96%|█████████▌| 48/50 [02:01<00:05,  2.61s/trial, best loss: -0.8888888888888888]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




 98%|█████████▊| 49/50 [02:03<00:02,  2.56s/trial, best loss: -0.8888888888888888]

Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()

  xgb_model.save_model(model_data_path)




100%|██████████| 50/50 [02:06<00:00,  2.53s/trial, best loss: -0.8888888888888888]


In [16]:
params = {
    'reg_lambda' : 0.02946709949617557,
    'gamma' : 0.4035327095549419,
    'seed' : 42,
    'max_depth' : 76,
    'min_child_weight' : 7,
    'learning_rate' : 0.08284903106602773,
    'objective' : 'binary:hinge',
   'n_estimators' :  141,
   'colsample_bytree' : 0.7141674994079779,
   'reg_alpha' : 0.048323273989931845,
   'subsample' : 0.8550371265419039
}


booster = xgb.train(
    params = params, 
    dtrain = train, 
    num_boost_round = 500, 
    evals = [(train, "train"), (valid, "validation")],
    early_stopping_rounds = 50,
    verbose_eval = False
)
y_pred_probs = booster.predict(valid)
y_pred = (y_pred_probs > 0.5).astype(int)
f1 = f1_score(y_val, y_pred)
mlflow.log_metric("f1", f1)


Parameters: { "n_estimators" } are not used.

  self.starting_round = model.num_boosted_rounds()


## Results

Both Random Forest (F1: 0.83) and XGBoost (F1: 0.82) perform similarly and way better than logistic regression. They're both catching about 72-75% of fraud with high precision (92-98%), but still missing ~25% of fraudulent transactions.

**Next steps:**
- Try RandomUnderSampler and SMOTE to balance classes better  
- Try out neural networks on the normal dataset, as well as the resampled dataset

In [18]:
over = SMOTE(sampling_strategy=0.5)
under = RandomUnderSampler(sampling_strategy=0.1)

In [19]:
steps =  [('under', under), ('over', over), ('model', lr)]
pipeline = Pipeline(steps=steps)

#cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, x_train, y_train, scoring='roc_auc', cv= 3, n_jobs=-1)
print('Mean ROC AUC: %.3f' % np.mean(scores))

Mean ROC AUC: 0.980


In [20]:
y_pred = pipeline.predict(x_val)
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.97      0.99     28432
           1       0.06      0.88      0.11        49

    accuracy                           0.97     28481
   macro avg       0.53      0.93      0.55     28481
weighted avg       1.00      0.97      0.99     28481

[[27711   721]
 [    6    43]]


In [29]:
mlflow.end_run()

In [38]:
space = {
    'max_depth': scope.int(hp.uniform('max_depth', 10, 30)),
    'num_leaves': scope.int(hp.uniform('num_leaves', 20, 100)),
    'min_child_samples': scope.int(hp.uniform('min_child_samples', 10, 50)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'n_estimators': scope.int(hp.uniform('n_estimators', 100, 1000)),
    'reg_alpha': hp.loguniform('reg_alpha', -5, 0),
    'reg_lambda': hp.loguniform('reg_lambda', -5, 0),
    'subsample': hp.uniform('subsample', 0.7, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.7, 1.0),
}
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag('model', 'lightgbm')
        mlflow.log_params(params)

        lgb_model = LGBMClassifier(
            class_weight='balanced',
            random_state=42,
            n_jobs=-1,
            verbose=-1,
            max_depth = params['max_depth'],
            num_leaves = params['num_leaves'],
            min_child_samples = params['min_child_samples'],
            n_estimators = params['n_estimators'],
            reg_alpha = params['reg_alpha'],
            reg_lambda = params['reg_lambda'],
            subsample = params['subsample'],
            colsample_bytree = params['colsample_bytree']
    )
        lgb_model.fit(x_train, y_train)
        y_pred = lgb_model.predict(x_val)
        f1 = f1_score(y_val, y_pred)
        mlflow.log_metric("f1", f1)
        mlflow.lightgbm.log_model(lgb_model, name = 'lgb_model_mlflow')
    return {'loss': -f1, 'status': STATUS_OK}

In [39]:
best_result = fmin(
    fn = objective, 
    space = space, 
    algo = tpe.suggest, 
    max_evals = 30
)

  0%|          | 0/30 [00:00<?, ?trial/s, best loss=?]




  3%|▎         | 1/30 [00:06<03:19,  6.87s/trial, best loss: -0.8421052631578947]




  7%|▋         | 2/30 [00:11<02:42,  5.81s/trial, best loss: -0.8631578947368421]




 10%|█         | 3/30 [00:20<03:06,  6.92s/trial, best loss: -0.8723404255319149]




 13%|█▎        | 4/30 [00:24<02:32,  5.88s/trial, best loss: -0.8723404255319149]




 17%|█▋        | 5/30 [00:32<02:44,  6.57s/trial, best loss: -0.8723404255319149]




 20%|██        | 6/30 [00:37<02:28,  6.18s/trial, best loss: -0.8723404255319149]




 23%|██▎       | 7/30 [00:43<02:19,  6.08s/trial, best loss: -0.8723404255319149]




 27%|██▋       | 8/30 [00:53<02:41,  7.35s/trial, best loss: -0.8817204301075269]




 30%|███       | 9/30 [01:02<02:42,  7.72s/trial, best loss: -0.8817204301075269]




 33%|███▎      | 10/30 [01:08<02:26,  7.32s/trial, best loss: -0.8817204301075269]




 37%|███▋      | 11/30 [01:17<02:29,  7.86s/trial, best loss: -0.8817204301075269]




 40%|████      | 12/30 [01:27<02:30,  8.36s/trial, best loss: -0.8817204301075269]




 43%|████▎     | 13/30 [01:34<02:14,  7.93s/trial, best loss: -0.8817204301075269]




 47%|████▋     | 14/30 [01:42<02:09,  8.09s/trial, best loss: -0.8817204301075269]




 50%|█████     | 15/30 [01:46<01:44,  6.95s/trial, best loss: -0.8817204301075269]




 53%|█████▎    | 16/30 [01:53<01:35,  6.80s/trial, best loss: -0.8817204301075269]




 57%|█████▋    | 17/30 [01:58<01:23,  6.40s/trial, best loss: -0.8817204301075269]




 60%|██████    | 18/30 [02:07<01:25,  7.09s/trial, best loss: -0.8817204301075269]




 63%|██████▎   | 19/30 [02:15<01:21,  7.39s/trial, best loss: -0.8817204301075269]




 67%|██████▋   | 20/30 [02:21<01:08,  6.84s/trial, best loss: -0.8817204301075269]




 70%|███████   | 21/30 [02:28<01:03,  7.04s/trial, best loss: -0.8817204301075269]




 73%|███████▎  | 22/30 [02:34<00:53,  6.71s/trial, best loss: -0.8817204301075269]




 77%|███████▋  | 23/30 [02:43<00:52,  7.48s/trial, best loss: -0.8817204301075269]




 80%|████████  | 24/30 [02:52<00:47,  7.87s/trial, best loss: -0.8817204301075269]




 83%|████████▎ | 25/30 [02:58<00:36,  7.31s/trial, best loss: -0.8817204301075269]




 87%|████████▋ | 26/30 [03:04<00:27,  6.84s/trial, best loss: -0.8817204301075269]




 90%|█████████ | 27/30 [03:11<00:20,  6.93s/trial, best loss: -0.8817204301075269]




 93%|█████████▎| 28/30 [03:22<00:16,  8.05s/trial, best loss: -0.8817204301075269]




 97%|█████████▋| 29/30 [03:31<00:08,  8.28s/trial, best loss: -0.8817204301075269]




100%|██████████| 30/30 [03:38<00:00,  7.28s/trial, best loss: -0.8817204301075269]


In [41]:
run_id = 'm-2a41626d46d54e74aa328b3b1c0aa8be'

lgb_loaded_model = mlflow.lightgbm.load_model(f"models:/{run_id}")

In [42]:
lgb_loaded_model

0,1,2
,boosting_type,'gbdt'
,num_leaves,33
,max_depth,26
,learning_rate,0.1
,n_estimators,765
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [None]:
steps =  [('under', under), ('over', over), ('model', lgb_model)]
pipeline = Pipeline(steps=steps)

#cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, x_train, y_train, scoring='roc_auc', cv= 3, n_jobs=-1)
print('Mean ROC AUC: %.3f' % np.mean(scores))

Mean ROC AUC: 0.976


In [None]:
y_pred = xgb_model.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99     56887
           1       0.05      0.85      0.09        75

    accuracy                           0.98     56962
   macro avg       0.52      0.92      0.54     56962
weighted avg       1.00      0.98      0.99     56962



In [None]:
with open('../models/lgb_model.pkl', 'wb') as f_out:
    pickle.dump(lgb_model, f_out)