In [1]:
import time
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.model_selection import cross_val_score

d = pd.read_csv("https://raw.githubusercontent.com/maxleungtszchun/Statistical-Learning-with-customer-data/main/data/d.csv", na_values = "NA")
train_d = d.sample(frac = 0.8, random_state = 5)
test_d = d.drop(train_d.index)

train_X = train_d[["negative_r_zScore.x", "f_zScore.x", "m_zScore.x"]]
test_X = test_d[["negative_r_zScore.x", "f_zScore.x", "m_zScore.x"]]

train_y = train_d["return.y"]
test_y = test_d["return.y"]

def build_model(X, y, X_test, y_test,
                loss = "log", penalty = "l2", alpha = 0.0001, l1_ratio = 0,
                cv_score = False, cv_num = 3,
                SGD = True, solver = "lbfgs"):
    # l1_ratio is not used except penalty == "elasticnet"
    start = time.time()
    if penalty not in ["l1", "l2", "elasticnet"]:
        raise Exception() # exclude penalty == "none" case for LogisticRegression()
    if SGD == True:
        learning_rate = "optimal"
        eta0 = 0
        if alpha == 0:
            learning_rate = "constant"
            eta0 = 0.1
        clf = SGDClassifier(loss = loss, penalty = penalty, alpha = alpha, l1_ratio = l1_ratio,
                            learning_rate = learning_rate, eta0 = eta0)
    else:
        if alpha == 0:
            penalty = "none"
            C = float("inf")
        else:
            C = alpha ** -1
        clf = LogisticRegression(penalty = penalty, C = C, l1_ratio = l1_ratio, solver = solver, max_iter = 1000)
    clf.fit(X, y)
    y_predict = clf.predict(X)
    in_sample_accu = np.mean(y_predict == y)
    y_test_predict = clf.predict(X_test)
    out_sample_accu = np.mean(y_test_predict == y_test)
    cv_accu = np.nan
    if cv_score == True:
        cv_accu = cross_val_score(clf, X, y, cv = cv_num, scoring = "accuracy")
    sparsity = np.mean(clf.coef_ == 0)
    print("%.2f mins" % ((time.time() - start) / 60))
    return in_sample_accu, out_sample_accu, cv_accu, sparsity

# Unregularized Logistic Regression (SGD optimized)
# with shrinkage para. set to 0
print(build_model(train_X, train_y, test_X, test_y, alpha = 0, cv_score = True))

# L2 (Ridge-type) regularized Logistic Regression (SGD optimized)
# with shrinkage para. set to 0.0001
print(build_model(train_X, train_y, test_X, test_y, cv_score = True))

# L1 (Lasso-type) regularized Logistic Regression (SGD optimized)
# with shrinkage para. set to 0.0001
print(build_model(train_X, train_y, test_X, test_y, penalty = "l1", cv_score = True))

# Elastic-net Logistic Regression (SGD optimized)
# with shrinkage para. set to 0.0001
print(build_model(train_X, train_y, test_X, test_y, penalty = "elasticnet", l1_ratio = 0.5, cv_score = True))


# Unregularized Logistic Regression (Quasi-Newton (LBFGS) optimized)
# with shrinkage para. set to 0
print(build_model(train_X, train_y, test_X, test_y, SGD = False, alpha = 0))

# L2 (Ridge-type) regularized Logistic Regression (Quasi-Newton (LBFGS) optimized)
# with shrinkage para. set to 0.0001
print(build_model(train_X, train_y, test_X, test_y, SGD = False))

# L1 (Lasso-type) regularized Logistic Regression (Saga optimized)
# with shrinkage para. set to 0.0001
print(build_model(train_X, train_y, test_X, test_y, SGD = False, penalty = "l1", solver = "saga"))

# Elastic-net Logistic Regression (Saga optimized)
# with shrinkage para. set to 0.0001
print(build_model(train_X, train_y, test_X, test_y, SGD = False, penalty = "elasticnet", l1_ratio = 0.5, solver = "saga"))

0.00 mins
(0.6780128794848206, 0.7058823529411765, array([0.69103448, 0.71310345, 0.69198895]), 0.0)
0.00 mins
(0.6853725850965962, 0.6875, array([0.69241379, 0.70896552, 0.69198895]), 0.0)
0.00 mins
(0.6913523459061638, 0.6966911764705882, array([0.65517241, 0.7062069 , 0.69337017]), 0.0)
0.00 mins
(0.6931922723091076, 0.7150735294117647, array([0.65103448, 0.70758621, 0.69337017]), 0.3333333333333333)
0.00 mins
(0.6927322907083716, 0.7150735294117647, nan, 0.0)
0.00 mins
(0.6927322907083716, 0.7150735294117647, nan, 0.0)
0.00 mins
(0.6922723091076357, 0.7150735294117647, nan, 0.0)
0.00 mins
(0.6922723091076357, 0.7150735294117647, nan, 0.0)


  "(penalty={})".format(self.penalty)
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "(penalty={})".format(self.penalty)
  "(penalty={})".format(self.penalty)
