In [2]:
from pathlib import Path

import lightgbm as lgbm
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split

# A notebook has no `__file__`, so the project root is located relative to the
# kernel's working directory: two levels above it. Anchoring on Path.cwd()
# states that explicitly instead of resolving the module name (`__name__`) as
# if it were a file path, which only produced the same directory by accident.
PROJECT_ROOT = Path.cwd().resolve().parent.parent
INPUT_FOLDER = PROJECT_ROOT / "data/input"
OUTPUT_FOLDER = PROJECT_ROOT / "data/output"
# Create the output tree up front (no error if it already exists).
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

In [3]:
# Dataset-specific sub-folders under the shared input/output roots.
dataset_dir = "Kaggle_creditcardfraud"
ifolder = INPUT_FOLDER / dataset_dir
ofolder = OUTPUT_FOLDER / dataset_dir

# Load the real modelling frame, the three synthetic frames and the real
# validation frame, in that order. All of them are read from the output
# folder (presumably written by an earlier pipeline step - confirm); only
# unpickle files from a trusted source.
mod_df, syn1_df, syn2_df, synu_df, val_df = (
    pd.read_pickle(ofolder / pickle_name)
    for pickle_name in (
        "real_df_model.pkl",
        "syn_df_balanced1.pkl",
        "syn_df_balanced2.pkl",
        "syn_df_unbalanced.pkl",
        "real_df_validation.pkl",
    )
)

# Validation data: features are every column except the target ("Class")
# and "rand10"; the target is the "Class" column.
x_val = val_df.drop(columns=["Class", "rand10"])
y_val = val_df["Class"]

In [4]:
# Baseline: train a LightGBM classifier on the real modelling data (mod_df).
# "Class" is the target; "rand10" is dropped with it and never used as a
# feature (presumably a helper column from an earlier step - confirm).
x = mod_df.drop(columns=["Class", "rand10"])
y = mod_df["Class"]
# NOTE(review): the split is not stratified although positives are rare in
# this frame; passing stratify=y would be safer but changes the reported
# numbers, so it is left as-is here.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=100
)
model1 = lgbm.LGBMClassifier(
    learning_rate=0.005, max_depth=-1, n_estimators=300, random_state=100
)
# Log loss is tracked on the hold-out split and on the training split.
model1.fit(
    x_train,
    y_train,
    eval_set=[(x_test, y_test), (x_train, y_train)],
    eval_metric="logloss",
)
# Score the validation set once and reuse the predictions for both the
# classification report and the confusion matrix.
model1_val_pred = model1.predict(x_val)
model1_report = {
    # Hold-out split of the training frame.
    "test": metrics.classification_report(
        y_test, model1.predict(x_test), output_dict=True
    ),
    # Real validation frame, shared by every model in this notebook.
    "validation": metrics.classification_report(
        y_val, model1_val_pred, output_dict=True
    ),
    # Rows are actual, columns are predicted
    "validation_cm": metrics.confusion_matrix(y_val, model1_val_pred),
}

[LightGBM] [Info] Number of positive: 326, number of negative: 189510
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011225 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 189836, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.001717 -> initscore=-6.365300
[LightGBM] [Info] Start training from score -6.365300


In [5]:
# Train a LightGBM classifier on the first balanced synthetic data set
# (syn1_df, the large balanced one), with the same hyper-parameters and split
# settings as the other models so the validation scores are comparable.
# "Class" is the target; "rand10" is dropped with it and never used as a
# feature (presumably a helper column from an earlier step - confirm).
x = syn1_df.drop(columns=["Class", "rand10"])
y = syn1_df["Class"]
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=100
)
model2 = lgbm.LGBMClassifier(
    learning_rate=0.005, max_depth=-1, n_estimators=300, random_state=100
)
# Log loss is tracked on the hold-out split and on the training split.
model2.fit(
    x_train,
    y_train,
    eval_set=[(x_test, y_test), (x_train, y_train)],
    eval_metric="logloss",
)
# Score the validation set once and reuse the predictions for both the
# classification report and the confusion matrix.
model2_val_pred = model2.predict(x_val)
model2_report = {
    # Hold-out split of the synthetic training frame.
    "test": metrics.classification_report(
        y_test, model2.predict(x_test), output_dict=True
    ),
    # Real validation frame, shared by every model in this notebook.
    "validation": metrics.classification_report(
        y_val, model2_val_pred, output_dict=True
    ),
    # Rows are actual, columns are predicted
    "validation_cm": metrics.confusion_matrix(y_val, model2_val_pred),
}

[LightGBM] [Info] Number of positive: 189401, number of negative: 189622
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016253 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 379023, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499708 -> initscore=-0.001166
[LightGBM] [Info] Start training from score -0.001166


In [6]:
# Train a LightGBM classifier on the second balanced synthetic data set
# (syn2_df, the smaller balanced one), with the same hyper-parameters and
# split settings as the other models so the validation scores are comparable.
# "Class" is the target; "rand10" is dropped with it and never used as a
# feature (presumably a helper column from an earlier step - confirm).
x = syn2_df.drop(columns=["Class", "rand10"])
y = syn2_df["Class"]
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=100
)
model3 = lgbm.LGBMClassifier(
    learning_rate=0.005, max_depth=-1, n_estimators=300, random_state=100
)
# Log loss is tracked on the hold-out split and on the training split.
model3.fit(
    x_train,
    y_train,
    eval_set=[(x_test, y_test), (x_train, y_train)],
    eval_metric="logloss",
)
# Score the validation set once and reuse the predictions for both the
# classification report and the confusion matrix.
model3_val_pred = model3.predict(x_val)
model3_report = {
    # Hold-out split of the synthetic training frame.
    "test": metrics.classification_report(
        y_test, model3.predict(x_test), output_dict=True
    ),
    # Real validation frame, shared by every model in this notebook.
    "validation": metrics.classification_report(
        y_val, model3_val_pred, output_dict=True
    ),
    # Rows are actual, columns are predicted
    "validation_cm": metrics.confusion_matrix(y_val, model3_val_pred),
}

[LightGBM] [Info] Number of positive: 7999, number of negative: 7904
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001204 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 15903, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502987 -> initscore=0.011948
[LightGBM] [Info] Start training from score 0.011948


In [None]:
# Train a LightGBM classifier on the unbalanced synthetic data set (synu_df),
# with the same hyper-parameters and split settings as the other models so
# the validation scores are comparable.
# "Class" is the target; "rand10" is dropped with it and never used as a
# feature (presumably a helper column from an earlier step - confirm).
x = synu_df.drop(columns=["Class", "rand10"])
y = synu_df["Class"]
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=100
)
model4 = lgbm.LGBMClassifier(
    learning_rate=0.005, max_depth=-1, n_estimators=300, random_state=100
)
# Log loss is tracked on the hold-out split and on the training split.
model4.fit(
    x_train,
    y_train,
    eval_set=[(x_test, y_test), (x_train, y_train)],
    eval_metric="logloss",
)
# Score the validation set once and reuse the predictions for both the
# classification report and the confusion matrix.
model4_val_pred = model4.predict(x_val)
model4_report = {
    # Hold-out split of the synthetic training frame.
    "test": metrics.classification_report(
        y_test, model4.predict(x_test), output_dict=True
    ),
    # Real validation frame, shared by every model in this notebook.
    "validation": metrics.classification_report(
        y_val, model4_val_pred, output_dict=True
    ),
    # Rows are actual, columns are predicted
    "validation_cm": metrics.confusion_matrix(y_val, model4_val_pred),
}

[LightGBM] [Info] Number of positive: 2630, number of negative: 7939
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000943 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 10569, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.248841 -> initscore=-1.104803
[LightGBM] [Info] Start training from score -1.104803


In [None]:
# Model trained on the real data: validation macro-averaged F1, validation
# accuracy, then the validation confusion matrix (rows actual, columns
# predicted).
model1_validation = model1_report.get("validation")
model1_macro_avg = model1_validation.get("macro avg")
print(round(model1_macro_avg.get("f1-score"), 4))
print(round(model1_validation.get("accuracy"), 4))
print(model1_report.get("validation_cm"))

0.9285
0.9995
[[31632     1]
 [   14    45]]


In [9]:
# Model trained on the large balanced synthetic data: validation
# macro-averaged F1, validation accuracy, then the validation confusion
# matrix (rows actual, columns predicted).
model2_validation = model2_report.get("validation")
model2_macro_avg = model2_validation.get("macro avg")
print(round(model2_macro_avg.get("f1-score"), 4))
print(round(model2_validation.get("accuracy"), 4))
print(model2_report.get("validation_cm"))

0.8799
0.9991
[[31617    16]
 [   13    46]]


In [10]:
# Model trained on the smaller balanced synthetic data: validation
# macro-averaged F1, validation accuracy, then the validation confusion
# matrix (rows actual, columns predicted).
model3_validation = model3_report.get("validation")
model3_macro_avg = model3_validation.get("macro avg")
print(round(model3_macro_avg.get("f1-score"), 4))
print(round(model3_validation.get("accuracy"), 4))
print(model3_report.get("validation_cm"))

0.8157
0.9982
[[31586    47]
 [   10    49]]


In [11]:
# Model trained on the smaller unbalanced synthetic data: validation
# macro-averaged F1, validation accuracy, then the validation confusion
# matrix (rows actual, columns predicted).
model4_validation = model4_report.get("validation")
model4_macro_avg = model4_validation.get("macro avg")
print(round(model4_macro_avg.get("f1-score"), 4))
print(round(model4_validation.get("accuracy"), 4))
print(model4_report.get("validation_cm"))

0.8285
0.9984
[[31592    41]
 [   10    49]]
