In [1]:
!python -V

Python 3.10.18


### Imports

In [4]:
import mlflow
import numpy as np
import pandas as pd
import warnings

from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import RandomUnderSampler
from sklearn.pipeline import make_pipeline
from category_encoders.one_hot import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score

warnings.filterwarnings('ignore')

### Constants

In [5]:
DATA = "data/PS_20174392719_1491204439457_log.csv"
EXPERIMENT_NAME = "fraud detection"

### Setup

In [3]:
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/102175048111124688', creation_time=1753969702375, experiment_id='102175048111124688', last_update_time=1753969702375, lifecycle_stage='active', name='fraud detection', tags={}>

### Read data

#### Data wrangle

In [6]:
def wrangle(filepath):
    df = pd.read_csv(filepath)

    # remove leak features
    cols = []
    cols.append("newbalanceOrig")
    cols.append("newbalanceDest")

    #dectection system result
    cols.append("isFlaggedFraud")

    #Select only transaction's type where there is a fraud
    trans_types = ["CASH_OUT", "TRANSFER"]
    df = df[df["type"].isin(trans_types)]
    
    #keep only type of customers M or C
    df["nameOrig"] = df["nameOrig"].str[0]
    df["nameDest"] = df["nameDest"].str[0]

    # Transaction's hour
    df["time"] = df["step"].apply(lambda step: (step - 1) % 24 + 1)
    cols.append("step")

    #filter amount between 10th and 90th percentile
    q10 = df.amount.quantile(0.1)
    q90 = df.amount.quantile(0.9)
    df = df[df["amount"].between(q10, q90)]

    #Filter oldbalanceOrg
    df = df[df.oldbalanceOrg > 0]
    q10 = df.oldbalanceOrg.quantile(0.1)
    q90 = df.oldbalanceOrg.quantile(0.9)
    df = df[df["oldbalanceOrg"].between(q10, q90)]

    #Filter oldbalanceDest
    q10 = df.oldbalanceDest.quantile(0.1)
    q90 = df.oldbalanceDest.quantile(0.9)
    df = df[df["oldbalanceDest"].between(q10, q90)]

    #Drop features with low dimensionality
    cols.append("nameOrig")
    cols.append("nameDest")
    
    # drop columns
    df.drop(columns=cols, inplace=True)
    return df

In [7]:
df = wrangle(DATA)
print("wrangled df", df.shape)

wrangled df (849078, 6)


In [19]:
df.to_csv("data/fraud-detection-mobile-money(short).csv", index=False)

#### Explore data

In [8]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 849078 entries, 15 to 6362611
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   type            849078 non-null  object 
 1   amount          849078 non-null  float64
 2   oldbalanceOrg   849078 non-null  float64
 3   oldbalanceDest  849078 non-null  float64
 4   isFraud         849078 non-null  int64  
 5   time            849078 non-null  int64  
dtypes: float64(3), int64(2), object(1)
memory usage: 45.3+ MB


Unnamed: 0,type,amount,oldbalanceOrg,oldbalanceDest,isFraud,time
15,CASH_OUT,229133.94,15325.0,5083.0,0,1
24,TRANSFER,311685.89,10835.0,6267.0,0,1
42,CASH_OUT,110414.71,26845.41,288800.0,0,1
47,CASH_OUT,56953.9,1942.02,70253.0,0,1
58,TRANSFER,62610.8,79114.0,517.0,0,1


In [8]:
# Imbalanced Data
df.isFraud.value_counts(normalize=True)

isFraud
0    0.997662
1    0.002338
Name: proportion, dtype: float64

### Build model

In [10]:
target = "isFraud"
X = df.drop(columns=[target])
y = df[target]

In [10]:
# Split data
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=2000)

# OverSampling Data using SMOTENC method
X_samp, y_samp = SMOTENC(categorical_features=[0], random_state=42).fit_resample(X_train, y_train)

#### Baseline model

In [11]:
print("Baseline model accuracy", y_train.value_counts(normalize=True).max())

Baseline model accuracy 0.9976842514375897


#### Iterate

In [31]:
best_f1 = 0.92
C = np.linspace(0.5, 5, 10)
raw_data = X_samp.copy()
raw_data["target"] = y_samp
solvers = ["sag", "saga", "newton-cholesky"]

dataset = mlflow.data.from_pandas(raw_data, source="https://www.kaggle.com/datasets/ealaxi/paysim1", name="Fraud_detection_training_data")

In [None]:
for c in C:
    for solver in solvers:
        with mlflow.start_run():
            #log dataset 
            mlflow.log_input(dataset, context="training")
            try:
                model = make_pipeline(
                        OneHotEncoder(use_cat_names=True),
                        LogisticRegression(
                            random_state=42,
                            solver=solver,
                            n_jobs=2,
                            C=c
                        ))
                
                model.fit(X_samp, y_samp)
                print("Training is finished")
                y_pred = model.predict(X_samp)
                f1 = round(f1_score(y_pred=y_pred, y_true=y_samp), 2)
                acc = round(accuracy_score(y_pred=y_pred, y_true=y_samp), 2)
                recall = round(recall_score(y_pred=y_pred, y_true=y_samp),2)
                precision = round(precision_score(y_pred=y_pred, y_true=y_samp),2)
                training_metrics = {
                    "accuracy_training": acc,
                    "recall_training": recall,
                    "f1_score_training": f1,
                    "precision_training": precision
                }
                print(training_metrics)
                
                y_pred_test = model.predict(X_test)
                class_report = classification_report_imbalanced(
                    y_pred=y_pred_test, 
                    y_true=y_test,
                    target_names=target_names, 
                    output_dict=True)

                test_metrics = {
                    "avg_precision_test": round(class_report["avg_pre"], 2),
                    "avg_recall_test": round(class_report["avg_rec"], 2),
                    "avg_f1_score_test": round(class_report["avg_f1"], 2)
                }
                print(test_metrics)
                mlflow.log_metrics(test_metrics)
                mlflow.log_metrics(training_metrics)
                mlflow.log_params(
                    {"C":c,"solver":solver})
                print("metrics and parameters have been logged")

                if f1 >= best_f1:
                    best_f1 = f1
                        # Log model
                    mlflow.sklearn.log_model(model, name="model", input_example=X_samp)
                    print("The best model has been saved!")
            except Exception as e :
                print(e)
                continue

Training is finished
{'accuracy_training': 0.92, 'recall_training': 0.99, 'f1_score_training': 0.93, 'precision_training': 0.88}
{'avg_precision_test': np.float64(1.0), 'avg_recall_test': np.float64(0.86), 'avg_f1_score_test': np.float64(0.92)}
metrics and parameters have been logged
The best model has been saved!
🏃 View run rare-sloth-871 at: http://localhost:5000/#/experiments/102175048111124688/runs/3171fd72e5004854a40c2abc52684ff6
🧪 View experiment at: http://localhost:5000/#/experiments/102175048111124688
Training is finished
{'accuracy_training': 0.92, 'recall_training': 0.99, 'f1_score_training': 0.93, 'precision_training': 0.88}
{'avg_precision_test': np.float64(1.0), 'avg_recall_test': np.float64(0.86), 'avg_f1_score_test': np.float64(0.92)}
metrics and parameters have been logged


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x72a62bd24520>>
Traceback (most recent call last):
  File "/home/codespace/anaconda3/envs/fraud/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 781, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


The best model has been saved!
🏃 View run delightful-crane-583 at: http://localhost:5000/#/experiments/102175048111124688/runs/bf02e01c3e8b4e1a902115975c8f6cb9
🧪 View experiment at: http://localhost:5000/#/experiments/102175048111124688


Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=7.68011e-18): result may not be accurate.
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.iteration += _check_optimize_result("lbfgs", opt_res)


Training is finished
{'accuracy_training': 0.9, 'recall_training': 0.92, 'f1_score_training': 0.9, 'precision_training': 0.89}
{'avg_precision_test': np.float64(1.0), 'avg_recall_test': np.float64(0.88), 'avg_f1_score_test': np.float64(0.94)}
metrics and parameters have been logged
🏃 View run likeable-stag-460 at: http://localhost:5000/#/experiments/102175048111124688/runs/eea8b7ad6461490fb01de2ba3bf2cd29
🧪 View experiment at: http://localhost:5000/#/experiments/102175048111124688
Training is finished
{'accuracy_training': 0.92, 'recall_training': 0.99, 'f1_score_training': 0.93, 'precision_training': 0.88}
{'avg_precision_test': np.float64(1.0), 'avg_recall_test': np.float64(0.86), 'avg_f1_score_test': np.float64(0.92)}
metrics and parameters have been logged


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x72a62bd24520>>
Traceback (most recent call last):
  File "/home/codespace/anaconda3/envs/fraud/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 781, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


The best model has been saved!
🏃 View run serious-swan-266 at: http://localhost:5000/#/experiments/102175048111124688/runs/ddb4092ccf764a70a48be2ed60e231fb
🧪 View experiment at: http://localhost:5000/#/experiments/102175048111124688
Training is finished
{'accuracy_training': 0.92, 'recall_training': 0.99, 'f1_score_training': 0.93, 'precision_training': 0.88}
{'avg_precision_test': np.float64(1.0), 'avg_recall_test': np.float64(0.86), 'avg_f1_score_test': np.float64(0.92)}
metrics and parameters have been logged
The best model has been saved!
🏃 View run secretive-hog-885 at: http://localhost:5000/#/experiments/102175048111124688/runs/69fa1f6400344ffba6773986917e36b4
🧪 View experiment at: http://localhost:5000/#/experiments/102175048111124688


Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=3.84017e-18): result may not be accurate.
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.iteration += _check_optimize_result("lbfgs", opt_res)


Training is finished
{'accuracy_training': 0.9, 'recall_training': 0.92, 'f1_score_training': 0.9, 'precision_training': 0.89}
{'avg_precision_test': np.float64(1.0), 'avg_recall_test': np.float64(0.88), 'avg_f1_score_test': np.float64(0.94)}
metrics and parameters have been logged
🏃 View run stylish-mule-650 at: http://localhost:5000/#/experiments/102175048111124688/runs/9f3fadb2ab4f4b97b99c67d3c05a860e
🧪 View experiment at: http://localhost:5000/#/experiments/102175048111124688
Training is finished
{'accuracy_training': 0.92, 'recall_training': 0.99, 'f1_score_training': 0.93, 'precision_training': 0.88}
{'avg_precision_test': np.float64(1.0), 'avg_recall_test': np.float64(0.86), 'avg_f1_score_test': np.float64(0.92)}
metrics and parameters have been logged
The best model has been saved!
🏃 View run spiffy-croc-99 at: http://localhost:5000/#/experiments/102175048111124688/runs/8ea6e336ea2b40c29a3330423a80002c
🧪 View experiment at: http://localhost:5000/#/experiments/102175048111124688

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=2.56013e-18): result may not be accurate.
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.iteration += _check_optimize_result("lbfgs", opt_res)


Training is finished
{'accuracy_training': 0.9, 'recall_training': 0.92, 'f1_score_training': 0.9, 'precision_training': 0.89}
{'avg_precision_test': np.float64(1.0), 'avg_recall_test': np.float64(0.88), 'avg_f1_score_test': np.float64(0.94)}
metrics and parameters have been logged
🏃 View run bright-cat-795 at: http://localhost:5000/#/experiments/102175048111124688/runs/3cf725ea69ff40b8864aa87531f3e390
🧪 View experiment at: http://localhost:5000/#/experiments/102175048111124688
Training is finished
{'accuracy_training': 0.92, 'recall_training': 0.99, 'f1_score_training': 0.93, 'precision_training': 0.88}
{'avg_precision_test': np.float64(1.0), 'avg_recall_test': np.float64(0.86), 'avg_f1_score_test': np.float64(0.92)}
metrics and parameters have been logged
The best model has been saved!
🏃 View run silent-cub-20 at: http://localhost:5000/#/experiments/102175048111124688/runs/13aea85d0b6b4ed2a3f5fb2752188097
🧪 View experiment at: http://localhost:5000/#/experiments/102175048111124688
Tr

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=1.92011e-18): result may not be accurate.
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.iteration += _check_optimize_result("lbfgs", opt_res)


Training is finished
{'accuracy_training': 0.9, 'recall_training': 0.92, 'f1_score_training': 0.9, 'precision_training': 0.89}
{'avg_precision_test': np.float64(1.0), 'avg_recall_test': np.float64(0.88), 'avg_f1_score_test': np.float64(0.94)}
metrics and parameters have been logged
🏃 View run honorable-swan-68 at: http://localhost:5000/#/experiments/102175048111124688/runs/fdd8f7ada7d640cbbc05d186f0ac9d18
🧪 View experiment at: http://localhost:5000/#/experiments/102175048111124688
Training is finished
{'accuracy_training': 0.92, 'recall_training': 0.99, 'f1_score_training': 0.93, 'precision_training': 0.88}
{'avg_precision_test': np.float64(1.0), 'avg_recall_test': np.float64(0.86), 'avg_f1_score_test': np.float64(0.92)}
metrics and parameters have been logged
The best model has been saved!
🏃 View run nimble-whale-422 at: http://localhost:5000/#/experiments/102175048111124688/runs/12376945345f4facaf02d9c8b8a4db40
🧪 View experiment at: http://localhost:5000/#/experiments/102175048111124

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=1.53609e-18): result may not be accurate.
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.iteration += _check_optimize_result("lbfgs", opt_res)


Training is finished
{'accuracy_training': 0.9, 'recall_training': 0.92, 'f1_score_training': 0.9, 'precision_training': 0.89}
{'avg_precision_test': np.float64(1.0), 'avg_recall_test': np.float64(0.88), 'avg_f1_score_test': np.float64(0.94)}
metrics and parameters have been logged
🏃 View run skittish-sloth-603 at: http://localhost:5000/#/experiments/102175048111124688/runs/dc5ccc24a59f41379af14ffb4d57de46
🧪 View experiment at: http://localhost:5000/#/experiments/102175048111124688
Training is finished
{'accuracy_training': 0.92, 'recall_training': 0.99, 'f1_score_training': 0.93, 'precision_training': 0.88}
{'avg_precision_test': np.float64(1.0), 'avg_recall_test': np.float64(0.86), 'avg_f1_score_test': np.float64(0.92)}
metrics and parameters have been logged
The best model has been saved!
🏃 View run traveling-duck-444 at: http://localhost:5000/#/experiments/102175048111124688/runs/d02ebe0eeecc44d694cfc2a32b6e956e
🧪 View experiment at: http://localhost:5000/#/experiments/102175048111

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=1.28008e-18): result may not be accurate.
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.iteration += _check_optimize_result("lbfgs", opt_res)


Training is finished
{'accuracy_training': 0.9, 'recall_training': 0.92, 'f1_score_training': 0.9, 'precision_training': 0.89}
{'avg_precision_test': np.float64(1.0), 'avg_recall_test': np.float64(0.88), 'avg_f1_score_test': np.float64(0.94)}
metrics and parameters have been logged
🏃 View run amusing-snipe-234 at: http://localhost:5000/#/experiments/102175048111124688/runs/fb27874c75bf40d082ae7c882735aa59
🧪 View experiment at: http://localhost:5000/#/experiments/102175048111124688
Training is finished
{'accuracy_training': 0.92, 'recall_training': 0.99, 'f1_score_training': 0.93, 'precision_training': 0.88}
{'avg_precision_test': np.float64(1.0), 'avg_recall_test': np.float64(0.86), 'avg_f1_score_test': np.float64(0.92)}
metrics and parameters have been logged
The best model has been saved!
🏃 View run polite-yak-280 at: http://localhost:5000/#/experiments/102175048111124688/runs/068ec67b396541f4b4b6795a1df6695c
🧪 View experiment at: http://localhost:5000/#/experiments/10217504811112468

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=1.09721e-18): result may not be accurate.
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.iteration += _check_optimize_result("lbfgs", opt_res)


Training is finished
{'accuracy_training': 0.9, 'recall_training': 0.92, 'f1_score_training': 0.9, 'precision_training': 0.89}
{'avg_precision_test': np.float64(1.0), 'avg_recall_test': np.float64(0.88), 'avg_f1_score_test': np.float64(0.94)}
metrics and parameters have been logged
🏃 View run burly-jay-926 at: http://localhost:5000/#/experiments/102175048111124688/runs/8a17aa391f3b4f829192a3522690bfd3
🧪 View experiment at: http://localhost:5000/#/experiments/102175048111124688
Training is finished
{'accuracy_training': 0.92, 'recall_training': 0.99, 'f1_score_training': 0.93, 'precision_training': 0.88}
{'avg_precision_test': np.float64(1.0), 'avg_recall_test': np.float64(0.86), 'avg_f1_score_test': np.float64(0.92)}
metrics and parameters have been logged
The best model has been saved!
🏃 View run nebulous-owl-300 at: http://localhost:5000/#/experiments/102175048111124688/runs/4182410d10e446629cbf63b27219f3ce
🧪 View experiment at: http://localhost:5000/#/experiments/102175048111124688


Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=9.60059e-19): result may not be accurate.
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.iteration += _check_optimize_result("lbfgs", opt_res)


Training is finished
{'accuracy_training': 0.9, 'recall_training': 0.92, 'f1_score_training': 0.9, 'precision_training': 0.89}
{'avg_precision_test': np.float64(1.0), 'avg_recall_test': np.float64(0.88), 'avg_f1_score_test': np.float64(0.94)}
metrics and parameters have been logged
🏃 View run adventurous-rook-862 at: http://localhost:5000/#/experiments/102175048111124688/runs/da211cd7a34940d7ba1abeffadf30282
🧪 View experiment at: http://localhost:5000/#/experiments/102175048111124688
Training is finished
{'accuracy_training': 0.92, 'recall_training': 0.99, 'f1_score_training': 0.93, 'precision_training': 0.88}
{'avg_precision_test': np.float64(1.0), 'avg_recall_test': np.float64(0.86), 'avg_f1_score_test': np.float64(0.92)}
metrics and parameters have been logged
The best model has been saved!
🏃 View run brawny-jay-169 at: http://localhost:5000/#/experiments/102175048111124688/runs/07e691f46f004a51bdcd5cf2c86aefc7
🧪 View experiment at: http://localhost:5000/#/experiments/10217504811112

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=8.53386e-19): result may not be accurate.
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.iteration += _check_optimize_result("lbfgs", opt_res)


Training is finished
{'accuracy_training': 0.9, 'recall_training': 0.92, 'f1_score_training': 0.9, 'precision_training': 0.89}
{'avg_precision_test': np.float64(1.0), 'avg_recall_test': np.float64(0.88), 'avg_f1_score_test': np.float64(0.94)}
metrics and parameters have been logged
🏃 View run dapper-whale-488 at: http://localhost:5000/#/experiments/102175048111124688/runs/f64cfb006af04ebcb8f618832e8c5fba
🧪 View experiment at: http://localhost:5000/#/experiments/102175048111124688


#### Model registry

best hyper : 
* c = 1 
* solver : sag
* model_id : m-dd9c830c43774d399b430923a66f30b0
* run_id : ddb4092ccf764a70a48be2ed60e231fb

In [10]:
run_id = "ddb4092ccf764a70a48be2ed60e231fb"
model_uri = f"runs:/{run_id}/model"
mlflow.register_model(model_uri, "FraudDetectionModel")

print("Model saved in the Registry model")

Successfully registered model 'FraudDetectionModel'.
2025/08/01 13:46:54 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: FraudDetectionModel, version 1


Model saved in the Registry model


Created version '1' of model 'FraudDetectionModel'.


In [None]:
data = pd.read_csv("data/PS_20174392719_1491204439457_log.csv")
data.head()