In [5]:
import time
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.model_selection import cross_val_score

# Customer data set; the literal string "NA" marks missing values.
DATA_URL = (
    "https://raw.githubusercontent.com/maxleungtszchun/"
    "Statistical-Learning-with-customer-data/main/data/d.csv"
)
# Predictor columns and the label column read from the CSV.
FEATURE_COLUMNS = ["negative_r_zScore.x", "f_zScore.x", "m_zScore.x"]
TARGET_COLUMN = "return.y"

t = pd.read_csv(DATA_URL, na_values = "NA")
X_train = t[FEATURE_COLUMNS]
y_train = t[TARGET_COLUMN]

def build_model(X, y, X_test, y_test,
                loss = "log", penalty = "l2", alpha = 0.0001, l1_ratio = 0,
                cv_score = False, cv_num = 3,
                SGD = True, solver = "lbfgs", random_state = None):
    """Fit a linear classifier and report its accuracy and coefficient sparsity.

    Depending on ``SGD`` the model is either an ``SGDClassifier`` or a
    ``LogisticRegression``.  ``alpha == 0`` requests an unregularized fit.

    Parameters
    ----------
    X, y : training features and labels.
    X_test, y_test : features and labels used for the out-of-sample accuracy.
    loss : loss name forwarded to ``SGDClassifier`` (ignored when ``SGD`` is false).
    penalty : one of ``"l1"``, ``"l2"``, ``"elasticnet"``.
    alpha : shrinkage parameter; ``0`` disables regularization.
    l1_ratio : elastic-net mixing parameter; only meaningful when
        ``penalty == "elasticnet"``.
    cv_score : when true, also compute cross-validated accuracy on ``X, y``.
    cv_num : ``cv`` argument forwarded to ``cross_val_score``.
    SGD : when true use ``SGDClassifier``, otherwise ``LogisticRegression``.
    solver : solver forwarded to ``LogisticRegression`` (ignored when ``SGD`` is true).
    random_state : forwarded to the estimator so stochastic fits can be made
        reproducible; ``None`` keeps the estimator's default behaviour.

    Returns
    -------
    tuple
        ``(in_sample_accu, out_sample_accu, cv_accu, sparsity)`` where
        ``cv_accu`` is ``np.nan`` unless ``cv_score`` is true (then it is the
        value returned by ``cross_val_score``) and ``sparsity`` is the share of
        fitted coefficients that are exactly zero.

    Raises
    ------
    ValueError
        If ``penalty`` is not one of the three supported names.

    Side effects
    ------------
    Prints the elapsed wall-clock time in minutes.
    """
    # The "none" penalty is deliberately not accepted from callers; an
    # unregularized fit is requested through alpha == 0 instead.
    if penalty not in ("l1", "l2", "elasticnet"):
        raise ValueError(
            "penalty must be 'l1', 'l2' or 'elasticnet', got %r "
            "(use alpha = 0 for an unregularized model)" % (penalty,)
        )

    start = time.time()

    if SGD:
        if alpha == 0:
            # The "optimal" schedule derives its step size from alpha, so it
            # cannot be used without regularization; use a fixed step instead.
            learning_rate, eta0 = "constant", 0.1
        else:
            learning_rate, eta0 = "optimal", 0
        clf = SGDClassifier(loss = loss, penalty = penalty, alpha = alpha, l1_ratio = l1_ratio,
                            learning_rate = learning_rate, eta0 = eta0,
                            random_state = random_state)
    else:
        # Only pass the arguments that apply to the chosen penalty: sklearn
        # warns when C / l1_ratio are supplied but ignored.
        lr_kwargs = {"solver": solver, "max_iter": 1000, "random_state": random_state}
        if alpha == 0:
            # NOTE(review): newer scikit-learn releases spell this penalty=None
            # instead of the string "none" -- confirm against the installed version.
            lr_kwargs["penalty"] = "none"
        else:
            lr_kwargs["penalty"] = penalty
            # NOTE(review): SGDClassifier averages the loss over samples while
            # LogisticRegression sums it, so C = 1 / (alpha * n_samples) is
            # presumably the equivalent strength -- confirm before comparing runs.
            lr_kwargs["C"] = alpha ** -1
            if penalty == "elasticnet":
                lr_kwargs["l1_ratio"] = l1_ratio
        clf = LogisticRegression(**lr_kwargs)

    clf.fit(X, y)

    in_sample_accu = np.mean(clf.predict(X) == y)
    out_sample_accu = np.mean(clf.predict(X_test) == y_test)

    cv_accu = np.nan
    if cv_score:
        cv_accu = cross_val_score(clf, X, y, cv = cv_num, scoring = "accuracy")

    # Share of coefficients that are exactly zero.
    sparsity = np.mean(clf.coef_ == 0)

    print("%.2f mins" % ((time.time() - start) / 60))
    return in_sample_accu, out_sample_accu, cv_accu, sparsity

# Logistic regression fitted with SGD.  Every run passes the training data as
# the test set as well, so the two reported accuracies coincide, and every run
# also requests cross-validated accuracy.
sgd_experiments = [
    # Unregularized: shrinkage parameter set to 0.
    dict(alpha = 0, cv_score = True),
    # L2 (ridge-type) penalty, shrinkage parameter left at the default 0.0001.
    dict(cv_score = True),
    # L1 (lasso-type) penalty, shrinkage parameter left at the default 0.0001.
    dict(penalty = "l1", cv_score = True),
    # Elastic net (equal L1/L2 mix), shrinkage parameter left at the default 0.0001.
    dict(penalty = "elasticnet", l1_ratio = 0.5, cv_score = True),
]
for sgd_options in sgd_experiments:
    print(build_model(X_train, y_train, X_train, y_train, **sgd_options))


# Logistic regression fitted with LogisticRegression (SGD = False) instead of
# SGD.  As above, the training data doubles as the test set; cross-validation
# is not requested, so the third returned value is nan.
logreg_experiments = [
    # Unregularized, quasi-Newton (LBFGS) solver: shrinkage parameter set to 0.
    dict(SGD = False, alpha = 0),
    # L2 (ridge-type) penalty, LBFGS solver, default shrinkage parameter 0.0001.
    dict(SGD = False),
    # L1 (lasso-type) penalty, Saga solver, default shrinkage parameter 0.0001.
    dict(SGD = False, penalty = "l1", solver = "saga"),
    # Elastic net (equal L1/L2 mix), Saga solver, default shrinkage parameter 0.0001.
    dict(SGD = False, penalty = "elasticnet", l1_ratio = 0.5, solver = "saga"),
]
for logreg_options in logreg_experiments:
    print(build_model(X_train, y_train, X_train, y_train, **logreg_options))

0.00 mins
(0.6924208977189109, 0.6924208977189109, array([0.69426049, 0.69757174, 0.69757174]), 0.0)
0.00 mins
(0.6972038263428992, 0.6972038263428992, array([0.68984547, 0.70860927, 0.66556291]), 0.0)
0.00 mins
(0.6968359087564385, 0.6968359087564385, array([0.67880795, 0.69646799, 0.69536424]), 0.3333333333333333)
0.00 mins
(0.6957321559970566, 0.6957321559970566, array([0.69536424, 0.70198675, 0.69646799]), 0.3333333333333333)
0.00 mins
(0.6986754966887417, 0.6986754966887417, nan, 0.0)
0.00 mins
(0.6986754966887417, 0.6986754966887417, nan, 0.0)
0.00 mins
(0.6986754966887417, 0.6986754966887417, nan, 0.0)
0.00 mins
(0.6986754966887417, 0.6986754966887417, nan, 0.0)


  "(penalty={})".format(self.penalty)
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
  "(penalty={})".format(self.penalty)
  "(penalty={})".format(self.penalty)
