## Import Libraries

In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier, XGBRFClassifier 
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate
from sklearn.neural_network import MLPClassifier

## Set up Train and Test Sets

In [26]:
# Load the training data and shuffle the rows once with a fixed seed.
# cross_validate(cv=10) splits on row order without reshuffling, so an unseeded
# shuffle would make every score in this notebook change from run to run.
RANDOM_STATE = 42
train_set = (
    pd.read_csv("train_set_ensembles.csv")
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)
print(train_set.head())
X = train_set.drop(columns=['CLASS'])  # every column except the label
y = train_set['CLASS'].values  # label column as a NumPy array

   CLASS  feature#1  feature#2  feature#3  feature#4  feature#5  feature#6  \
0      0     0.2482        0.0        0.0      1.337        0.0        0.0   
1      0     0.0000        0.0        0.0      0.000        0.0        0.0   
2      1     0.0000        0.0        0.0      0.000        0.0        0.0   
3      0     0.0000        0.0        0.0      2.314        0.0        0.0   
4      1     0.0000        0.0        0.0      0.000        0.0        0.0   

   feature#7  feature#8  feature#9  ...  feature#4087  feature#4088  \
0        0.0        0.0        0.0  ...         1.152           0.0   
1        0.0        0.0        0.0  ...         0.000           0.0   
2        0.0        0.0        0.0  ...         0.000           0.0   
3        0.0        0.0        0.0  ...         0.000           0.0   
4        0.0        0.0        0.0  ...         0.000           0.0   

   feature#4089  feature#4090  feature#4091  feature#4092  feature#4093  \
0         0.000           0.0

## Voting Classifiers

In [6]:
# Base learners combined by both voting ensembles.
cls1 = LogisticRegression(n_jobs = -1)
cls2 = KNeighborsClassifier(n_jobs = -1)
cls3 = DecisionTreeClassifier()

# Same three learners, two voting schemes: 'soft' and 'hard'.
soft_vcls = VotingClassifier(
    estimators=[('lr', cls1), ('knn', cls2), ('dt', cls3)],
    voting='soft',
    n_jobs = -1,
)
hard_vcls = VotingClassifier(
    estimators=[('lr', cls1), ('knn', cls2), ('dt', cls3)],
    voting='hard',
    n_jobs = -1,
)

# 10-fold cross-validation of the soft-voting ensemble, then per-metric fold means.
svlcs_scores = cross_validate(soft_vcls, X, y, cv=10, scoring = ('accuracy', 'f1_weighted'))
s_avg_fmeasure = np.mean(svlcs_scores['test_f1_weighted'])
s_avg_accuracy = np.mean(svlcs_scores['test_accuracy'])

# Same evaluation for the hard-voting ensemble.
hvlcs_scores = cross_validate(hard_vcls, X, y, cv=10, scoring = ('accuracy', 'f1_weighted'))
h_avg_fmeasure = np.mean(hvlcs_scores['test_f1_weighted'])
h_avg_accuracy = np.mean(hvlcs_scores['test_accuracy'])

In [7]:
# Report the cross-validated averages for both voting ensembles.
# The second metric is computed with scoring='accuracy' (plain accuracy, not
# sklearn's 'balanced_accuracy'), so it is labelled "Accuracy".
print("Classifier:")
print(soft_vcls)
print("F1 Weighted-Score: {} & Accuracy: {}".format(round(s_avg_fmeasure,4), round(s_avg_accuracy,4)))

print("\nClassifier:")
print(hard_vcls)
print("F1 Weighted-Score: {} & Accuracy: {}".format(round(h_avg_fmeasure,4), round(h_avg_accuracy,4)))

Classifier:
VotingClassifier(estimators=[('lr', LogisticRegression(n_jobs=-1)),
                             ('knn', KNeighborsClassifier(n_jobs=-1)),
                             ('dt', DecisionTreeClassifier())],
                 n_jobs=-1, voting='soft')
F1 Weighted-Score: 0.8239 & Balanced Accuracy: 0.8244

Classifier:
VotingClassifier(estimators=[('lr', LogisticRegression(n_jobs=-1)),
                             ('knn', KNeighborsClassifier(n_jobs=-1)),
                             ('dt', DecisionTreeClassifier())],
                 n_jobs=-1)
F1 Weighted-Score: 0.8234 & Balanced Accuracy: 0.8239


## Stacking Classifiers

In [None]:
# Candidate learners for stacking. Only cls4 and cls5 are used as base
# estimators below; cls3 is the final (meta) estimator. cls1 and cls2 are
# defined here but are not part of the stacked model.
cls1 = RandomForestClassifier(n_jobs = -1, n_estimators = 25)
cls2 = GradientBoostingClassifier(n_estimators = 25)
cls3 = LogisticRegression(n_jobs = -1)
cls4 = SVC()
cls5 = MLPClassifier(hidden_layer_sizes = (25,))

# SVC + MLP stacked under a logistic-regression final estimator (inner cv = 5).
scls = StackingClassifier(
    estimators=[('svc', cls4), ('mlp', cls5)],
    final_estimator = cls3,
    n_jobs=-1,
    cv = 5,
)

# Outer 10-fold cross-validation, then the per-metric fold means.
scores = cross_validate(scls, X, y, cv=10, scoring = ('accuracy', 'f1_weighted'))
avg_fmeasure = np.mean(scores['test_f1_weighted'])
avg_accuracy = np.mean(scores['test_accuracy'])

In [None]:
# Report the cross-validated averages for the stacking ensemble.
# The second metric is computed with scoring='accuracy' (plain accuracy, not
# sklearn's 'balanced_accuracy'), so it is labelled "Accuracy".
print("Classifier:")
print(scls)
print("F1 Weighted Score: {} & Accuracy: {}".format(round(avg_fmeasure,4), round(avg_accuracy,4)))

## Homogeneous Ensembles

In [None]:
# The estimators are left unfitted on purpose: cross_validate clones each one and
# fits the clone on every training fold, so fitting on the full data beforehand
# would only add running time without affecting the reported scores.

# Bagging: each of the 25 estimators is trained on 40% of the samples,
# drawn with replacement (bootstrap is the default).
ens1 = BaggingClassifier(n_estimators = 25, max_samples=0.4)
# Random subspace: each estimator is trained on 40% of the features. max_samples
# keeps its default and bootstrap=False means every estimator sees all of the
# samples once, so the effect of feature subsampling is observed on its own
# (no random patches).
ens2 = BaggingClassifier(n_estimators = 25, max_features=0.4, bootstrap=False)
# Random forest with the same number of estimators.
ens3 = RandomForestClassifier(n_estimators = 25, n_jobs = -1)
# Single decision tree as the baseline for comparison.
tree = DecisionTreeClassifier()

# Same 10-fold protocol and metrics for every model.
scoring = ('accuracy', 'f1_weighted')
scores1 = cross_validate(ens1, X, y, cv=10, scoring = scoring)
scores2 = cross_validate(ens2, X, y, cv=10, scoring = scoring)
scores3 = cross_validate(ens3, X, y, cv=10, scoring = scoring)
scoresTree = cross_validate(tree, X, y, cv=10, scoring = scoring)

# Average each metric over the folds, keyed by model name (the printing order
# relies on this insertion order).
cv_results = dict(Bagging = scores1, RandomSubspace = scores2, RandomForest = scores3, Tree = scoresTree)
f_measures = {name: np.average(result['test_f1_weighted']) for name, result in cv_results.items()}
accuracies = {name: np.average(result['test_accuracy']) for name, result in cv_results.items()}


The four models compared are:
1. A bagging classifier using 0.4 of the available samples on 25 decision tree estimators to get a final result
2. A random subspace classifier (bagging classifier) using 0.4 of the available features on 25 tree estimators to get a final result
3. A Random Forest Classifier using 25 tree estimators to get a result
4. A simple Decision Tree Estimator

Initially, when running tests with the default values, the random forest algorithm did the best, which is expected since it uses 100 estimators by default while the other methods use only 10. When using the same number of estimators across the different methods, however, the random subspace algorithm gives the best results, followed by the bagging algorithm, with random forest coming third. As expected, the simple decision tree falls short by a lot, as it effectively uses only one estimator while the other algorithms use 25, each of them applied on different subsets of the data/features.

In [13]:
# Print the fold-averaged scores of every model, first F1 (weighted), then accuracy.
# The accuracy values are computed with scoring='accuracy' (plain accuracy, not
# sklearn's 'balanced_accuracy'), so they are labelled "Accuracy".
for name,score in f_measures.items():
    print("Classifier: {} -  F1 Weighted: {}".format(name,round(score,4)))
for name,score in accuracies.items():
    print("Classifier: {} -  Accuracy: {}".format(name,round(score,4)))

Classifier: Bagging -  F1 Weighted: 0.7958
Classifier: RandomSubspace -  F1 Weighted: 0.8105
Classifier: RandomForest -  F1 Weighted: 0.7855
Classifier: Tree -  F1 Weighted: 0.7036
Classifier: Bagging -  Balanced Accuracy: 0.7984
Classifier: RandomSubspace -  Balanced Accuracy: 0.8129
Classifier: RandomForest -  Balanced Accuracy: 0.7886
Classifier: Tree -  Balanced Accuracy: 0.7033


## Testing Ensembles

In [15]:
# Candidate models tried in this section; their scores are listed under "Results".
# Only the best-performing combination is active. The alternatives are kept
# commented out to record the hyper-parameters that were used.
#cls1 = GradientBoostingClassifier(n_estimators = 100)
cls2 = XGBClassifier(n_estimators=100,tree_method='hist', subsample=0.5)
#cls3 = XGBRFClassifier()
#cls4 = AdaBoostClassifier(RandomForestClassifier(n_estimators=10), n_estimators=100)
#cls6 = MLPClassifier(hidden_layer_sizes = (25,))
cls7 = SVC()

# The meta-learner is defined in this cell instead of reusing whatever `cls3`
# happens to hold from an earlier cell, so the cell works on a fresh kernel.
# Logistic regression matches the stacking setup described under "Results".
meta_cls = LogisticRegression(n_jobs = -1)
best_cls = StackingClassifier(estimators=[('xgb', cls2), ('svc', cls7)], final_estimator = meta_cls, n_jobs=-1, cv = 3)
#cls8 = StackingClassifier(estimators=[('mlp', cls6), ('xgb', cls2)], final_estimator = meta_cls, n_jobs=-1, cv = 3)

# Evaluate the stacked model itself (previously the SVC base learner alone was
# cross-validated here, although the results are stored as best_*).
scores = cross_validate(best_cls, X, y, cv=10, scoring = ('accuracy', 'f1_weighted'))

best_fmeasure = np.average(scores['test_f1_weighted'])
best_accuracy = np.average(scores['test_accuracy'])

In [24]:
# Report the fold-averaged scores of the model evaluated in the previous cell.
# The second metric is computed with scoring='accuracy' (plain accuracy, not
# sklearn's 'balanced_accuracy'), so it is labelled "Accuracy".
print("Classifier:")
#print(best_cls)
print("F1 Weighted-Score:{} & Accuracy:{}".format(best_fmeasure, best_accuracy))

Classifier:
F1 Weighted-Score:0.8601178182662387 & Balanced Accuracy:0.8608424366405274


### Results
#### GradientBoosting
f_measure: 0.8174, accuracy: 0.8191
#### XGBClassifier (method = 'exact')
f_measure: 0.8393, accuracy: 0.8403
#### XGBClassifier (method = 'hist', also used subsample = 0.5 to try a bagging method on every tree)
f_measure: 0.8285, accuracy: 0.8296 (a bit worse than method='exact' but indeed much faster)
#### XGBRFClassifier (XGB Random Forest)
f_measure: 0.7657, accuracy: 0.7719 (not much better than a simple random forest)
#### AdaBoost + Random Forest
f_measure: 0.8015, accuracy: 0.8084
#### Stacking MLP + XGBClassifier(hist+subsample) combined with Logistic Regression:
f_measure: 0.8572, accuracy: 0.8578 (better result but definitely not worth it over just XGB as it took almost an hour to compute)
#### Stacking SVC + XGBClassifier(hist+subsample)
f_measure: 0.8601, accuracy: 0.8608 (by far the most efficient option - time is almost the same as with XGB-hist alone)