In [67]:
from pathlib import Path

import lightgbm as lgbm
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split

# `__file__` is not defined inside a notebook kernel, so the project root is
# anchored on the current working directory: it is taken to be two levels
# above the directory the notebook is run from.
PROJECT_ROOT = Path.cwd().resolve().parent.parent
# Shared input / output data roots used by the cells below.
INPUT_FOLDER = PROJECT_ROOT / "data/input"
OUTPUT_FOLDER = PROJECT_ROOT / "data/output"
# Create the output root up front; a no-op when it already exists.
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

In [68]:
# Dataset-specific sub-folders under the shared input / output roots.
ifolder = INPUT_FOLDER / "Kaggle_creditcardfraud"
ofolder = OUTPUT_FOLDER / "Kaggle_creditcardfraud"

# Load the five pickled frames from the output folder, in a fixed order:
# modelling data, two "balanced" synthetic sets, one "unbalanced" synthetic
# set, and the validation data.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# that were produced by this project.
mod_df, syn1_df, syn2_df, synu_df, val_df = (
    pd.read_pickle(ofolder / pickle_name)
    for pickle_name in (
        "real_df_model.pkl",
        "syn_df_balanced1.pkl",
        "syn_df_balanced2.pkl",
        "syn_df_unbalanced.pkl",
        "real_df_validation.pkl",
    )
)

# Validation data: features are every column except the target "Class" and
# the "rand10" column; the label is the "Class" column.
x_val = val_df.drop(columns=["Class", "rand10"])
y_val = val_df["Class"]

In [79]:
# Baseline: train gbm on the modelling data set (mod_df, loaded from
# real_df_model.pkl) - this cell does NOT use synthesized data.
x = mod_df.drop(["Class", "rand10"], axis=1)
y = mod_df.Class
# Hold out half of the rows as a test set; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.5, random_state=100
)
model1 = lgbm.LGBMClassifier(
    learning_rate=0.005, max_depth=-1, n_estimators=300, random_state=100
)
model1.fit(
    x_train,
    y_train,
    eval_set=[(x_test, y_test), (x_train, y_train)],
    eval_metric="logloss",
)
# Predict on the validation set once and reuse the result for both the
# classification report and the confusion matrix.
model1_val_pred = model1.predict(x_val)
# "test" scores the held-out half of this cell's data; "validation" and
# "validation_cm" score the separate validation frame.
model1_report = {
    "test": metrics.classification_report(
        y_test, model1.predict(x_test), output_dict=True
    ),
    "validation": metrics.classification_report(
        y_val, model1_val_pred, output_dict=True
    ),
    "validation_cm": metrics.confusion_matrix(y_val, model1_val_pred),
}

[LightGBM] [Info] Number of positive: 215, number of negative: 126342
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005640 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 126557, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.001699 -> initscore=-6.376110
[LightGBM] [Info] Start training from score -6.376110


In [80]:
# Train gbm using synthesized data set (syn1_df, loaded from
# syn_df_balanced1.pkl).
x = syn1_df.drop(["Class", "rand10"], axis=1)
y = syn1_df.Class
# Hold out half of the rows as a test set; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.5, random_state=100
)
model2 = lgbm.LGBMClassifier(
    learning_rate=0.005, max_depth=-1, n_estimators=300, random_state=100
)
model2.fit(
    x_train,
    y_train,
    eval_set=[(x_test, y_test), (x_train, y_train)],
    eval_metric="logloss",
)
# Predict on the validation set once and reuse the result for both the
# classification report and the confusion matrix.
model2_val_pred = model2.predict(x_val)
# "test" scores the held-out half of the synthetic data; "validation" and
# "validation_cm" score the separate validation frame.
model2_report = {
    "test": metrics.classification_report(
        y_test, model2.predict(x_test), output_dict=True
    ),
    "validation": metrics.classification_report(
        y_val, model2_val_pred, output_dict=True
    ),
    "validation_cm": metrics.confusion_matrix(y_val, model2_val_pred),
}

[LightGBM] [Info] Number of positive: 126158, number of negative: 126524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019245 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 252682, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499276 -> initscore=-0.002897
[LightGBM] [Info] Start training from score -0.002897


In [81]:
# Train gbm using synthesized data set (syn2_df, loaded from
# syn_df_balanced2.pkl).
x = syn2_df.drop(["Class", "rand10"], axis=1)
y = syn2_df.Class
# Hold out half of the rows as a test set; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.5, random_state=100
)
model3 = lgbm.LGBMClassifier(
    learning_rate=0.005, max_depth=-1, n_estimators=300, random_state=100
)
model3.fit(
    x_train,
    y_train,
    eval_set=[(x_test, y_test), (x_train, y_train)],
    eval_metric="logloss",
)
# Predict on the validation set once and reuse the result for both the
# classification report and the confusion matrix.
model3_val_pred = model3.predict(x_val)
# "test" scores the held-out half of the synthetic data; "validation" and
# "validation_cm" score the separate validation frame.
model3_report = {
    "test": metrics.classification_report(
        y_test, model3.predict(x_test), output_dict=True
    ),
    "validation": metrics.classification_report(
        y_val, model3_val_pred, output_dict=True
    ),
    "validation_cm": metrics.confusion_matrix(y_val, model3_val_pred),
}

[LightGBM] [Info] Number of positive: 5317, number of negative: 5285
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000790 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 10602, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501509 -> initscore=0.006037
[LightGBM] [Info] Start training from score 0.006037


In [82]:
# Train gbm using synthesized data set (synu_df, loaded from
# syn_df_unbalanced.pkl).
x = synu_df.drop(["Class", "rand10"], axis=1)
y = synu_df.Class
# Hold out half of the rows as a test set; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.5, random_state=100
)
model4 = lgbm.LGBMClassifier(
    learning_rate=0.005, max_depth=-1, n_estimators=300, random_state=100
)
model4.fit(
    x_train,
    y_train,
    eval_set=[(x_test, y_test), (x_train, y_train)],
    eval_metric="logloss",
)
# Predict on the validation set once and reuse the result for both the
# classification report and the confusion matrix.
model4_val_pred = model4.predict(x_val)
# "test" scores the held-out half of the synthetic data; "validation" and
# "validation_cm" score the separate validation frame.
model4_report = {
    "test": metrics.classification_report(
        y_test, model4.predict(x_test), output_dict=True
    ),
    "validation": metrics.classification_report(
        y_val, model4_val_pred, output_dict=True
    ),
    "validation_cm": metrics.confusion_matrix(y_val, model4_val_pred),
}

[LightGBM] [Info] Number of positive: 1758, number of negative: 5288
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000559 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 7046, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.249503 -> initscore=-1.101263
[LightGBM] [Info] Start training from score -1.101263


In [83]:
# all data model (model1): validation macro-averaged F1, validation accuracy
# (both rounded to 4 decimals) and the validation confusion matrix, one per line.
print(
    round(model1_report["validation"]["macro avg"]["f1-score"], 4),
    round(model1_report["validation"]["accuracy"], 4),
    model1_report["validation_cm"],
    sep="\n",
)

0.8959
0.9993
[[31631     2]
 [   19    40]]


In [84]:
# large balanced data (model2): validation macro-averaged F1, validation accuracy
# (both rounded to 4 decimals) and the validation confusion matrix, one per line.
print(
    round(model2_report["validation"]["macro avg"]["f1-score"], 4),
    round(model2_report["validation"]["accuracy"], 4),
    model2_report["validation_cm"],
    sep="\n",
)

0.8619
0.9989
[[31611    22]
 [   13    46]]


In [85]:
# smaller balanced data (model3): validation macro-averaged F1, validation accuracy
# (both rounded to 4 decimals) and the validation confusion matrix, one per line.
print(
    round(model3_report["validation"]["macro avg"]["f1-score"], 4),
    round(model3_report["validation"]["accuracy"], 4),
    model3_report["validation_cm"],
    sep="\n",
)

0.7894
0.9978
[[31572    61]
 [   10    49]]


In [86]:
# smaller unbalanced data (model4): validation macro-averaged F1, validation accuracy
# (both rounded to 4 decimals) and the validation confusion matrix, one per line.
print(
    round(model4_report["validation"]["macro avg"]["f1-score"], 4),
    round(model4_report["validation"]["accuracy"], 4),
    model4_report["validation_cm"],
    sep="\n",
)

0.8352
0.9985
[[31595    38]
 [   10    49]]
