In [1]:
import pickle
import os
import numpy as np
import matplotlib.pyplot as plt

filepaths = os.listdir("data/creditcard/")
for fp in filepaths:
    with open("data/creditcard/" + fp, 'rb') as f:
        globals()[fp.replace(".pickle", "")] = pickle.load(f)

Zbiór ten zawiera 29 predyktorów (28 zakodowanych zmiennych bankowych przy pomocy PCA + znormalizowaną wielkość tranzakcji). Jest on ekstremalnie niezbalansowany.

In [2]:
print(creditcard_train_Y.sum() / len(creditcard_train_Y) * 100)
print(creditcard_test_Y.sum() / len(creditcard_test_Y) * 100)

0.17424126050604577
0.1667778519012675


Tak jak poprzednio zacznijmy od modelu referencyjnego - regresji logistycznej:

In [3]:
import statsmodels.api as sm

logit_model = sm.Logit(creditcard_train_Y, creditcard_train_X).fit()
logit_model.summary2()

Optimization terminated successfully.
         Current function value: 0.003871
         Iterations 14


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.698
Dependent Variable:,y,AIC:,1822.1019
Date:,2023-03-09 17:05,BIC:,2121.8581
No. Observations:,227845,Log-Likelihood:,-882.05
Df Model:,28,LL-Null:,-2918.6
Df Residuals:,227816,LLR p-value:,0.0
Converged:,1.0000,Scale:,1.0
No. Iterations:,14.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
x1,0.0843,0.0443,1.9049,0.0568,-0.0024,0.1711
x2,0.0033,0.0602,0.0552,0.9560,-0.1146,0.1213
x3,0.0748,0.0503,1.4888,0.1365,-0.0237,0.1733
x4,0.7432,0.0869,8.5474,0.0000,0.5727,0.9136
x5,0.0748,0.0718,1.0425,0.2972,-0.0658,0.2155
x6,-0.0964,0.0850,-1.1350,0.2564,-0.2629,0.0701
x7,-0.0912,0.0625,-1.4589,0.1446,-0.2138,0.0313
x8,-0.1664,0.0319,-5.2123,0.0000,-0.2289,-0.1038
x9,-0.2541,0.1325,-1.9179,0.0551,-0.5137,0.0056


Jak widać otrzymaliśmy duzą ilość niesistotnych mziennych. Naszym kolejnym krokiem powinno być odrzucenie nieistotnych zmiennych (pojedynczo!), sprawdzenie współliniowości miedzy zmiennymi, do czasu aż dojdziemy do modelu, który jest poprawnie skonstruowany i widać w nim objawów overfittingu. 

Zamiast robić to wszystko ręcznie i krok po kroku, spróbujmy czegoś innego.

# Regularyzacja LASSO, Ridge i Elastic Net w H2O

In [4]:
import h2o
import pandas as pd
h2o.init(ip = "localhost",
        port = 54321,
        nthreads = -1,
        min_mem_size = "20g")

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.18" 2023-01-17; OpenJDK Runtime Environment (build 11.0.18+10-post-Ubuntu-0ubuntu122.04); OpenJDK 64-Bit Server VM (build 11.0.18+10-post-Ubuntu-0ubuntu122.04, mixed mode, sharing)
  Starting server from /home/maju116/anaconda3/lib/python3.9/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpvnybflt3
  JVM stdout: /tmp/tmpvnybflt3/h2o_maju116_started_from_python.out
  JVM stderr: /tmp/tmpvnybflt3/h2o_maju116_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,06 secs
H2O_cluster_timezone:,Europe/Warsaw
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.1
H2O_cluster_version_age:,1 month
H2O_cluster_name:,H2O_from_python_maju116_nfp3ca
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,20 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [5]:
creditcard_train = pd.DataFrame(np.column_stack((creditcard_train_X, creditcard_train_Y)))
creditcard_train.columns = ["V" + str(i) for i in range(28)] + ["amount", "fraud"]
creditcard_train["fraud"] = ["F" if x == 1 else "NF" for x in creditcard_train["fraud"]]
creditcard_test = pd.DataFrame(np.column_stack((creditcard_test_X, creditcard_test_Y)))
creditcard_test.columns = ["V" + str(i) for i in range(28)] + ["amount", "fraud"]
creditcard_test["fraud"] = ["F" if x == 1 else "NF" for x in creditcard_test["fraud"]]

In [6]:
creditcard_train_h2o = h2o.H2OFrame(creditcard_train, destination_frame="creditcard_train")
creditcard_test_h2o = h2o.H2OFrame(creditcard_test, destination_frame="creditcard_test")
h2o.ls()

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,key
0,creditcard_test
1,creditcard_train


In [7]:
creditcard_train_h2o["fraud"] = creditcard_train_h2o["fraud"].asfactor()
creditcard_test_h2o["fraud"] = creditcard_test_h2o["fraud"].asfactor()

Następnie zbudujemy 3 modele:

In [None]:
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

# LASSO
card_lasso_balanced = H2OGeneralizedLinearEstimator(alpha=1,
                                                    family="binomial",
                                                    lambda_search=True,
                                                    nfolds=5,
                                                    balance_classes=True,
                                                    class_sampling_factors=[0.5, 0.5],
                                                    seed=1234,
                                                    score_each_iteration=True,
                                                    model_id="card_lasso_balanced")

card_lasso_balanced.train(x=list(range(0, 29)),
                          y="fraud",
                          training_frame=h2o.get_frame("creditcard_train"),
                          validation_frame=h2o.get_frame("creditcard_test"))

# Ridge
card_ridge_balanced = H2OGeneralizedLinearEstimator(alpha=0,
                                                    family="binomial",
                                                    lambda_search=True,
                                                    nfolds=5,
                                                    balance_classes=True,
                                                    class_sampling_factors=[0.5, 0.5],
                                                    seed=1234,
                                                    score_each_iteration=True,
                                                    model_id="card_ridge_balanced")

card_ridge_balanced.train(x=list(range(0, 29)),
                          y="fraud",
                          training_frame=h2o.get_frame("creditcard_train"),
                          validation_frame=h2o.get_frame("creditcard_test"))

# Elastic Net
card_elastic_net_balanced = H2OGeneralizedLinearEstimator(alpha=0.5,
                                                          family="binomial",
                                                          lambda_search=True,
                                                          nfolds=5,
                                                          balance_classes=True,
                                                          class_sampling_factors=[0.5, 0.5],
                                                          seed=1234,
                                                          score_each_iteration=True,
                                                          model_id="card_elastic_net_balanced")

card_elastic_net_balanced.train(x=list(range(0, 29)),
                                y="fraud",
                                training_frame=h2o.get_frame("creditcard_train"),
                                validation_frame=h2o.get_frame("creditcard_test"))

glm Model Build progress: |

Sprawdźmy teraz jak wyglądają współczynniki naszych modeli:

In [None]:
card_lasso_balanced.coef()

In [None]:
card_ridge_balanced.coef()

In [None]:
card_elastic_net_balanced.coef()

Oraz miary dopasowania:

In [None]:
pred_lasso_balanced = card_lasso_balanced.predict(creditcard_test_h2o)
perf_lasso_balanced = card_lasso_balanced.model_performance(creditcard_test_h2o)

print(perf_lasso_balanced.auc())
print(perf_lasso_balanced.gini())
print(perf_lasso_balanced.aic())

cm_lasso_balanced = card_lasso_balanced.confusion_matrix(valid = True, metrics='f2')

fpr = perf_lasso_balanced.fprs
tpr = perf_lasso_balanced.tprs
plt.plot(fpr, tpr)
plt.show()

print(card_lasso_balanced._model_json['output']['lambda_best'])

pred_ridge_balanced = card_ridge_balanced.predict(creditcard_test_h2o)
perf_ridge_balanced = card_ridge_balanced.model_performance(creditcard_test_h2o)

print(perf_ridge_balanced.auc())
print(perf_ridge_balanced.gini())
print(perf_ridge_balanced.aic())

cm_ridge_balanced = card_ridge_balanced.confusion_matrix(valid = True, metrics='f2')

fpr = perf_ridge_balanced.fprs
tpr = perf_ridge_balanced.tprs
plt.plot(fpr, tpr)
plt.show()

print(card_ridge_balanced._model_json['output']['lambda_best'])

pred_elastic_net_balanced = card_elastic_net_balanced.predict(creditcard_test_h2o)
perf_elastic_net_balanced = card_elastic_net_balanced.model_performance(creditcard_test_h2o)

print(perf_elastic_net_balanced.auc())
print(perf_elastic_net_balanced.gini())
print(perf_elastic_net_balanced.aic())

cm_elastic_net_balanced = card_elastic_net_balanced.confusion_matrix(valid=True, metrics='f2')

fpr = perf_elastic_net_balanced.fprs
tpr = perf_elastic_net_balanced.tprs
plt.plot(fpr, tpr)
plt.show()

print(card_elastic_net_balanced._model_json['output']['lambda_best'])

Na zakończenie zapiszmy modele:

In [None]:
card_lasso_balanced.save_mojo(os.getcwd())
card_ridge_balanced.save_mojo(os.getcwd())
card_elastic_net_balanced.save_mojo(os.getcwd())

I zamkniemy cluster:

In [None]:
h2o.shutdown()