In [5]:
import time
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier
import warnings
warnings.filterwarnings("ignore")

# Source data and the columns used for modelling, named once so the
# train and test selections cannot drift apart.
data_url = "https://raw.githubusercontent.com/maxleungtszchun/Statistical-Learning-with-customer-data/main/data/d.csv"
feature_cols = ["negative_r_zScore.x", "f_zScore.x", "m_zScore.x"]
target_col = "return.y"

# Read the data set, treating the string "NA" as a missing value.
d = pd.read_csv(data_url, na_values = "NA")

# 80/20 split: draw the training rows with a fixed seed, then use every
# remaining row as the test set.
train_d = d.sample(frac = 0.8, random_state = 5)
test_d = d.drop(index = train_d.index)

train_X, train_y = train_d[feature_cols], train_d[target_col]
test_X, test_y = test_d[feature_cols], test_d[target_col]

def get_accuracy(clf, X, y, X_test, y_test):
    """Fit ``clf`` on the training data and measure its accuracy.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X: Training features.
        y: Training labels.
        X_test: Held-out features.
        y_test: Held-out labels.

    Returns:
        A ``(in_sample_accuracy, out_of_sample_accuracy)`` tuple, each being
        the fraction of predictions that equal the corresponding labels.
    """
    clf.fit(X, y)
    # Element-wise hit masks; their mean is the share of correct predictions.
    train_hits = clf.predict(X) == y
    test_hits = clf.predict(X_test) == y_test
    return np.mean(train_hits), np.mean(test_hits)

def build_model(X, y, X_test, y_test,
                loss = "log", penalty = "l2", alpha = 0.0001, l1_ratio = 0,
                cv_score = False, cv_num = 3,
                SGD = True, solver = "lbfgs",
                timer = False, random_state = None):
    """Fit a (regularized) linear classifier and report its accuracy.

    Builds either an ``SGDClassifier`` or a ``LogisticRegression``, fits it
    on ``(X, y)`` via ``get_accuracy`` and optionally cross-validates it.

    Args:
        X, y: Training features and labels.
        X_test, y_test: Held-out features and labels.
        loss: Loss passed to ``SGDClassifier`` (ignored when ``SGD`` is false).
            NOTE(review): "log" is believed to have been renamed "log_loss"
            in scikit-learn 1.1 and removed in 1.3 -- pass ``loss="log_loss"``
            on newer versions; confirm against the installed release.
        penalty: One of "l1", "l2" or "elasticnet".
        alpha: Shrinkage parameter (>= 0). ``alpha == 0`` means no
            regularization; for ``LogisticRegression`` it is converted to
            ``C = 1 / alpha``.
        l1_ratio: Elastic-net mixing parameter; only used when
            ``penalty == "elasticnet"``.
        cv_score: If true, also compute cross-validated accuracy on ``(X, y)``.
        cv_num: ``cv`` argument forwarded to ``cross_val_score``.
        SGD: If true use ``SGDClassifier``, otherwise ``LogisticRegression``.
        solver: Solver for ``LogisticRegression`` (ignored when ``SGD`` is true).
        timer: If true, print the elapsed time in minutes.
        random_state: Seed forwarded to the estimator so stochastic fits can
            be reproduced; ``None`` (the default) leaves the fit unseeded.

    Returns:
        ``(in_sample_accu, out_sample_accu, cv_accu, sparsity, clf)`` where
        ``cv_accu`` is the array returned by ``cross_val_score`` (or ``nan``
        when ``cv_score`` is false), ``sparsity`` is the fraction of fitted
        coefficients that are exactly zero and ``clf`` is the fitted estimator.

    Raises:
        ValueError: If ``penalty`` is not supported or ``alpha`` is negative.
    """
    if penalty not in ("l1", "l2", "elasticnet"):
        # penalty == "none" is rejected on purpose: an unregularized
        # LogisticRegression is requested through alpha = 0 instead.
        raise ValueError(
            'penalty must be "l1", "l2" or "elasticnet" (use alpha = 0 for '
            "an unregularized model), got %r" % (penalty,))
    if alpha < 0:
        raise ValueError("alpha must be non-negative, got %r" % (alpha,))

    start = time.time()
    if SGD:
        if alpha == 0:
            # The "optimal" schedule derives its step size from alpha, so an
            # unregularized fit falls back to a constant learning rate.
            learning_rate, eta0 = "constant", 0.1
        else:
            learning_rate, eta0 = "optimal", 0
        clf = SGDClassifier(loss = loss, penalty = penalty, alpha = alpha, l1_ratio = l1_ratio,
                            learning_rate = learning_rate, eta0 = eta0,
                            random_state = random_state)
    else:
        if alpha == 0:
            # NOTE(review): the string "none" is believed to have been removed
            # in scikit-learn 1.4 in favour of penalty=None -- confirm against
            # the installed release.
            penalty = "none"
            C = float("inf")
        else:
            # LogisticRegression takes the inverse regularization strength.
            C = alpha ** -1
        clf = LogisticRegression(penalty = penalty, C = C, l1_ratio = l1_ratio, solver = solver,
                                 max_iter = 1000, random_state = random_state)

    in_sample_accu, out_sample_accu = get_accuracy(clf = clf, X = X, y = y, X_test = X_test, y_test = y_test)

    cv_accu = np.nan
    if cv_score:
        cv_accu = cross_val_score(clf, X, y, cv = cv_num, scoring = "accuracy")

    sparsity = np.mean(clf.coef_ == 0)
    if timer:
        print("%.2f mins" % ((time.time() - start) / 60))
    return in_sample_accu, out_sample_accu, cv_accu, sparsity, clf

# Every model is described once as (ensemble name, build_model keyword
# arguments) and fitted once; the fitted estimator is then reused for the
# voting ensemble instead of being rebuilt and re-cross-validated a second time.
# Unless alpha is given, the shrinkage para. is build_model's default (0.0001).
model_specs = [
    # Unregularized Logistic Regression (SGD optimized)
    # with shrinkage para. set to 0
    ("logistic_reg_sgd", dict(alpha = 0, cv_score = True)),
    # L2 (Ridge-type) regularized Logistic Regression (SGD optimized)
    # with shrinkage para. set to 0.0001
    ("l2_logistic_reg_sgd", dict(cv_score = True)),
    # L1 (Lasso-type) regularized Logistic Regression (SGD optimized)
    # with shrinkage para. set to 0.0001
    ("l1_logistic_reg_sgd", dict(penalty = "l1", cv_score = True)),
    # Elastic-net Logistic Regression (SGD optimized)
    # with shrinkage para. set to 0.0001
    ("elastic_net_logistic_reg_sgd", dict(penalty = "elasticnet", l1_ratio = 0.5, cv_score = True)),
    # Unregularized Logistic Regression (Quasi-Newton (LBFGS) optimized)
    # with shrinkage para. set to 0
    ("logistic_reg_lbfgs", dict(SGD = False, alpha = 0)),
    # L2 (Ridge-type) regularized Logistic Regression (Quasi-Newton (LBFGS) optimized)
    # with shrinkage para. set to 0.0001
    ("l2_logistic_reg_lbfgs", dict(SGD = False)),
    # L1 (Lasso-type) regularized Logistic Regression (Saga optimized)
    # with shrinkage para. set to 0.0001
    ("l1_logistic_reg_saga", dict(SGD = False, penalty = "l1", solver = "saga")),
    # Elastic-net Logistic Regression (Saga optimized)
    # with shrinkage para. set to 0.0001
    ("elastic_net_logistic_reg_saga", dict(SGD = False, penalty = "elasticnet", l1_ratio = 0.5, solver = "saga")),
]

# Fit each model, print (in-sample accuracy, out-of-sample accuracy,
# cross-validated accuracy, sparsity) and keep the estimator for voting.
voting_estimators = []
for model_name, model_kwargs in model_specs:
    *metrics, fitted_clf = build_model(train_X, train_y, test_X, test_y, **model_kwargs)
    print(tuple(metrics))
    voting_estimators.append((model_name, fitted_clf))

# Voting Classifier
# get_accuracy fits the ensemble itself, so the estimators above only serve
# as its configured members.
voting_clf = VotingClassifier(estimators = voting_estimators, voting = "hard")
print(get_accuracy(voting_clf, train_X, train_y, test_X, test_y))


(0.6918123275068997, 0.7224264705882353, array([0.67586207, 0.69241379, 0.69889503]), 0.0)
(0.6826126954921803, 0.7077205882352942, array([0.67310345, 0.71034483, 0.69475138]), 0.0)
(0.6922723091076357, 0.71875, array([0.65241379, 0.70896552, 0.69475138]), 0.3333333333333333)
(0.6844526218951242, 0.6838235294117647, array([0.65655172, 0.72413793, 0.6878453 ]), 0.3333333333333333)
(0.6927322907083716, 0.7150735294117647, nan, 0.0)
(0.6927322907083716, 0.7150735294117647, nan, 0.0)
(0.6922723091076357, 0.7150735294117647, nan, 0.0)
(0.6922723091076357, 0.7150735294117647, nan, 0.0)
(0.6922723091076357, 0.7150735294117647)
