In [3]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, VotingClassifier

d = pd.read_csv("https://raw.githubusercontent.com/maxleungtszchun/Statistical-Learning-with-customer-data/main/data/d.csv", na_values = "NA")
train_d = d.sample(frac = 0.8, random_state = 5)
test_d = d.drop(train_d.index)

train_X = train_d[["negative_r_zScore.x", "f_zScore.x", "m_zScore.x"]]
test_X = test_d[["negative_r_zScore.x", "f_zScore.x", "m_zScore.x"]]

train_y = train_d["return.y"]
test_y = test_d["return.y"]

num_trees = 500
max_depth = 3

def get_accuracy(clf, X, y, X_test, y_test):
    clf.fit(X, y)
    y_predict = clf.predict(X)
    in_sample_accu = np.mean(y_predict == y)
    y_test_predict = clf.predict(X_test)
    out_sample_accu = np.mean(y_test_predict == y_test)
    return in_sample_accu, out_sample_accu

# Decision Tree
tree_clf = DecisionTreeClassifier(max_depth = max_depth)
print(get_accuracy(tree_clf, train_X, train_y, test_X, test_y))

# Bagging
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(max_depth = max_depth),
    n_estimators = num_trees,   # 500 trees / bootstrapped samples
    max_samples = len(train_y), # num. of obs. in each bootstrapped sample = num. of obs. in training data
    bootstrap = True,
    n_jobs = -1)                # using all CPUs
print(get_accuracy(bag_clf, train_X, train_y, test_X, test_y))

# Bagging considers all features in each split
# Random Forest only considers a random subset of all features in each split (only using square root of all features)
bag_rnd_clf = BaggingClassifier(
    DecisionTreeClassifier(max_depth = max_depth, splitter = "random"),
    n_estimators = num_trees,
    max_samples = len(train_y),
    bootstrap = True,
    n_jobs = -1)
print(get_accuracy(bag_rnd_clf, train_X, train_y, test_X, test_y))

# or using RandomForestClassifier() directly
rnf_clf = RandomForestClassifier(
    n_estimators = num_trees,
    max_depth = max_depth,
    # max_leaf_nodes = 2 ** max_depth,
    n_jobs = -1)
print(get_accuracy(rnf_clf, train_X, train_y, test_X, test_y))

# AdaBoost
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth = 1),
    n_estimators = num_trees,
    algorithm = "SAMME.R",
    learning_rate = 0.5)
print(get_accuracy(ada_clf, train_X, train_y, test_X, test_y))

# Voting Classifier
voting_clf = VotingClassifier(
    estimators=[("tree_clf", tree_clf),
                ("bag_clf", bag_clf),
                ("bag_rnd_clf", bag_rnd_clf),
                ("rnf_clf", rnf_clf),
                ("ada_clf", ada_clf)],
    voting = "hard")
print(get_accuracy(voting_clf, train_X, train_y, test_X, test_y))

(0.7097516099356026, 0.7022058823529411)
(0.7203311867525299, 0.7205882352941176)
(0.6964121435142594, 0.7132352941176471)
(0.7115915363385464, 0.7205882352941176)
(0.7327506899724011, 0.6893382352941176)
(0.7161913523459061, 0.71875)
