In [None]:
import pickle
import os
import numpy as np
import matplotlib.pyplot as plt

# Load every pickled object from the data directory into a notebook-level
# variable named after its file (the file name without the ".pickle" suffix).
# The later cells rely on these injected names (e.g. creditcard_train_X).
data_dir = "data/creditcard/"
# Sorted so the load order does not depend on the file system.
filepaths = sorted(os.listdir(data_dir))
for fp in filepaths:
    variable_name, extension = os.path.splitext(fp)
    if extension != ".pickle":
        # Skip stray entries (hidden files, checkpoint folders) that
        # open()/pickle.load would fail on.
        continue
    # NOTE(review): pickle.load can execute arbitrary code; only run this on
    # data files from a trusted source.
    with open(os.path.join(data_dir, fp), 'rb') as f:
        globals()[variable_name] = pickle.load(f)

Zbiór ten zawiera 29 predyktorów (28 zmiennych bankowych zakodowanych przy pomocy PCA + znormalizowana wielkość transakcji). Jest on ekstremalnie niezbalansowany.

In [None]:
# Sum of the labels relative to their count, in percent, for the training and
# the test split (one value per line). Presumably the labels are 0/1, which
# makes this the share of positive cases - confirm against the data.
print(
    creditcard_train_Y.sum() / len(creditcard_train_Y) * 100,
    creditcard_test_Y.sum() / len(creditcard_test_Y) * 100,
    sep="\n",
)

Tak jak poprzednio zacznijmy od modelu referencyjnego - regresji logistycznej:

In [None]:
import statsmodels.api as sm

# Reference model: plain logistic regression on all predictors.
# statsmodels does not add an intercept on its own, so a constant column is
# added explicitly; without it the model is forced through the origin.
# add_constant leaves the matrix untouched if it already contains a constant.
logit_model = sm.Logit(creditcard_train_Y, sm.add_constant(creditcard_train_X)).fit()
# Last expression of the cell: renders the coefficient/p-value summary table.
logit_model.summary2()

Jak widać, otrzymaliśmy dużą liczbę nieistotnych zmiennych. Naszym kolejnym krokiem powinno być odrzucanie nieistotnych zmiennych (pojedynczo!) i sprawdzanie współliniowości między zmiennymi, do czasu aż dojdziemy do modelu, który jest poprawnie skonstruowany i nie widać w nim objawów overfittingu.

Zamiast robić to wszystko ręcznie i krok po kroku, spróbujmy czegoś innego.

# Regularyzacja LASSO, Ridge i Elastic Net w H2O

In [None]:
import h2o
import pandas as pd
# Connect to the H2O instance at localhost:54321 (per the H2O docs, h2o.init
# starts a local one if none is reachable).
h2o.init(
    ip="localhost",
    port=54321,
    nthreads=-1,         # -1: no explicit limit on the number of threads
    min_mem_size="20g",  # minimum memory requested for the H2O JVM
)

In [None]:
def _build_creditcard_frame(features, labels):
    """Combine a feature matrix and its label vector into one labelled DataFrame.

    Args:
        features: 2-D array with 29 predictor columns (named V0..V27 and
            "amount" in the result).
        labels: 1-D array of class labels aligned with the rows of `features`;
            the value 1 is mapped to "F", everything else to "NF".

    Returns:
        A pandas DataFrame with columns V0..V27, "amount" and a string
        "fraud" column holding "F" / "NF".

    Raises:
        ValueError: if the stacked data does not have exactly 30 columns.
    """
    column_names = ["V" + str(i) for i in range(28)] + ["amount", "fraud"]
    frame = pd.DataFrame(np.column_stack((features, labels)), columns=column_names)
    # Vectorised recoding of the numeric label into the two class names.
    frame["fraud"] = np.where(frame["fraud"] == 1, "F", "NF")
    return frame


creditcard_train = _build_creditcard_frame(creditcard_train_X, creditcard_train_Y)
creditcard_test = _build_creditcard_frame(creditcard_test_X, creditcard_test_Y)

In [None]:
# Upload both pandas frames to the H2O cluster under fixed keys, keeping a
# Python handle to each of them.
creditcard_train_h2o, creditcard_test_h2o = (
    h2o.H2OFrame(pandas_frame, destination_frame=frame_key)
    for pandas_frame, frame_key in (
        (creditcard_train, "creditcard_train"),
        (creditcard_test, "creditcard_test"),
    )
)
# Last expression of the cell: list the objects currently stored in the cluster.
h2o.ls()

In [None]:
# Make the response column categorical in both frames.
for h2o_frame in (creditcard_train_h2o, creditcard_test_h2o):
    h2o_frame["fraud"] = h2o_frame["fraud"].asfactor()

Następnie zbudujemy 3 modele:

In [None]:
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

def _train_balanced_glm(alpha, model_id):
    """Fit one binomial GLM on the credit-card data with the given penalty mix.

    All three models share every setting except `alpha` and `model_id`, so the
    configuration lives here once instead of being copied per model.

    Args:
        alpha: Elastic-net mixing parameter passed to H2O
            (1 = LASSO, 0 = Ridge, 0.5 = Elastic Net in this notebook).
        model_id: Key under which H2O stores the trained model.

    Returns:
        The trained H2OGeneralizedLinearEstimator.
    """
    model = H2OGeneralizedLinearEstimator(
        alpha=alpha,
        family="binomial",
        lambda_search=True,
        nfolds=5,
        balance_classes=True,
        # NOTE(review): per the H2O docs these are per-class sampling ratios,
        # so equal factors look like they subsample both classes alike rather
        # than rebalance them - confirm this is the intended setting.
        class_sampling_factors=[0.5, 0.5],
        seed=1234,
        score_each_iteration=True,
        model_id=model_id,
    )
    # Train on the Python handles whose "fraud" column was converted with
    # asfactor(), the same handles the evaluation cell uses, instead of
    # re-fetching the frames from the cluster by key.
    model.train(
        x=list(range(0, 29)),  # the 29 predictor columns, by position
        y="fraud",
        training_frame=creditcard_train_h2o,
        validation_frame=creditcard_test_h2o,
    )
    return model


# LASSO
card_lasso_balanced = _train_balanced_glm(alpha=1, model_id="card_lasso_balanced")

# Ridge
card_ridge_balanced = _train_balanced_glm(alpha=0, model_id="card_ridge_balanced")

# Elastic Net
card_elastic_net_balanced = _train_balanced_glm(alpha=0.5, model_id="card_elastic_net_balanced")

Sprawdźmy teraz jak wyglądają współczynniki naszych modeli:

In [None]:
# Show the coefficients of the LASSO (alpha=1) model as the cell output.
card_lasso_balanced.coef()

In [None]:
# Show the coefficients of the Ridge (alpha=0) model as the cell output.
card_ridge_balanced.coef()

In [None]:
# Show the coefficients of the Elastic Net (alpha=0.5) model as the cell output.
card_elastic_net_balanced.coef()

Oraz miary dopasowania:

In [None]:
def _report_test_performance(model, label):
    """Score a trained model on the test frame and report its fit measures.

    Prints AUC, Gini and AIC (in that order), draws the ROC curve and finally
    prints the lambda value stored as 'lambda_best' in the model output.

    Args:
        model: A trained H2O GLM.
        label: Short model name used in the ROC plot title.

    Returns:
        A tuple (predictions, performance, confusion_matrix) where
        `predictions` is the H2OFrame returned by predict, `performance` the
        metrics object for the test frame and `confusion_matrix` the matrix
        requested with metrics='f2' on the validation data.
    """
    predictions = model.predict(creditcard_test_h2o)
    performance = model.model_performance(creditcard_test_h2o)

    print(performance.auc())
    print(performance.gini())
    print(performance.aic())

    # valid=True: computed on the validation frame supplied at training time.
    confusion = model.confusion_matrix(valid=True, metrics='f2')

    # ROC curve, labelled so the figure can be read on its own.
    fig, ax = plt.subplots()
    ax.plot(performance.fprs, performance.tprs)
    ax.set(
        title="ROC curve - " + label + " (test set)",
        xlabel="False positive rate",
        ylabel="True positive rate",
    )
    plt.show()

    # No public accessor is used here; the value is read from the raw model JSON.
    print(model._model_json['output']['lambda_best'])

    return predictions, performance, confusion


pred_lasso_balanced, perf_lasso_balanced, cm_lasso_balanced = _report_test_performance(
    card_lasso_balanced, "LASSO")

pred_ridge_balanced, perf_ridge_balanced, cm_ridge_balanced = _report_test_performance(
    card_ridge_balanced, "Ridge")

pred_elastic_net_balanced, perf_elastic_net_balanced, cm_elastic_net_balanced = _report_test_performance(
    card_elastic_net_balanced, "Elastic Net")

Na zakończenie zapiszmy modele:

In [None]:
# Export all three models as MOJO files into the current working directory.
mojo_output_dir = os.getcwd()
card_lasso_balanced.save_mojo(mojo_output_dir)
card_ridge_balanced.save_mojo(mojo_output_dir)
# Kept as the last expression so its return value is shown as the cell output.
card_elastic_net_balanced.save_mojo(mojo_output_dir)

I zamknijmy klaster:

In [None]:
# Shut down the H2O cluster this notebook is connected to.
# h2o.shutdown() is deprecated; the cluster object is the supported entry point.
h2o.cluster().shutdown()