# Ensemble methods. Exercises


In this section we have only two exercise:

1. Find the best three classifier in the stacking method using the classifiers from scikit-learn package.

2. Build arcing arc-x4 method. 

In [74]:
%store -r data_set
%store -r labels
%store -r test_data_set
%store -r test_labels
%store -r unique_labels

## Exercise 1: Find the best three classifier in the stacking method

Please use the following classifiers:

* Linear regression,
* Nearest Neighbors,
* Linear SVM,
* Decision Tree,
* Naive Bayes,
* QDA.

In [135]:
import numpy as np
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import train_test_split

In [150]:
def classes_from_regression(predictions):
    return np.digitize(predictions, bins=[0.5, 1.5])

def build_classifiers():
    regression = LogisticRegression(max_iter=1000)
    knn = KNeighborsClassifier()
    svm = SVC(kernel='linear')
    tree = DecisionTreeClassifier()
    gauss = GaussianNB()
    qda = QuadraticDiscriminantAnalysis()
    
    all_classifiers = [regression, knn, svm, tree, gauss, qda]
    
    for _classifier in all_classifiers:
        _classifier.fit(data_set, labels)
    
    return all_classifiers

In [151]:
def build_stacked_classifier(classifiers):
    output = []
    for classifier in classifiers:
        output.append(classifier.predict(data_set))
    output = np.array(output).reshape((130,3))
    
    # stacked classifier part:
    stacked_classifier = DecisionTreeClassifier() # set here
    stacked_classifier.fit(output.reshape((130,3)), labels.reshape((130,)))
    test_set = []
    for classifier in classifiers:
        test_set.append(classifier.predict(test_data_set))
        
    test_set = np.array(test_set).reshape((len(test_set[0]),3))
    predicted = stacked_classifier.predict(test_set)
    return predicted

In [154]:
import itertools
classifiers = build_classifiers()
acc = []
combinations = itertools.combinations(classifiers, 3)
permutations = []
for comb in combinations:
    permutations.extend(itertools.permutations(comb))
print(f'Number of possible classifiers combinations: {len(permutations)}')

for permutation in permutations[:10]:
    predicted = build_stacked_classifier(permutation)
    acc.append(accuracy_score(test_labels, predicted))
    
best_combination = np.argmax(acc)
print(f'The best accuracy: {acc[best_combination]}')
print(f"The best classifiers: {permutations[best_combination]}")

Number of possible classifiers combinations: 120
The best accuracy: 0.85
The best classifiers: (LogisticRegression(max_iter=1000), KNeighborsClassifier(), DecisionTreeClassifier())


## Exercise 2: 

Use the boosting method and change the code to fullfilt the following requirements:

* the weights should be calculated as:
$w_{n}^{(t+1)}=\frac{1+ I(y_{n}\neq h_{t}(x_{n})}{\sum_{i=1}^{N}1+I(y_{n}\neq h_{t}(x_{n})}$,
* the prediction is done with a voting method.

In [240]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# prepare data set

def generate_data(sample_number, feature_number, label_number):
    data_set = np.random.random_sample((sample_number, feature_number))
    labels = np.random.choice(label_number, sample_number)
    return data_set, labels

labels = 2
dimension = 2
test_set_size = 1000
train_set_size = 5000
train_set, train_labels = generate_data(train_set_size, dimension, labels)
test_set, test_labels = generate_data(test_set_size, dimension, labels)

# init weights
number_of_iterations = 10
weights = np.ones((test_set_size,)) / test_set_size


def train_model(classifier, weights):
    return classifier.fit(X=test_set, y=test_labels, sample_weight=weights)

def calculate_error(model):
    predicted = model.predict(test_set)
    I=calculate_accuracy_vector(predicted, test_labels)
    Z=np.sum(I)
    return (1+Z)/1.0

Fill the two functions below:

In [229]:
def set_new_weights(model):
    predictions = model.predict(test_set)
    I = np.array([1 if predictions[i] != test_labels[i] else 0 for i in range(len(predictions))])
    new_weights = (1 + I) / (1 + I).sum()
    return new_weights

Train the classifier with the code below:

In [246]:
classifier = DecisionTreeClassifier(max_depth=1, random_state=1)
classifier.fit(X=train_set, y=train_labels)
alphas = []
classifiers = []
for iteration in range(number_of_iterations):
    model = train_model(classifier, weights)
    weights = set_new_weights(model)
    classifiers.append(model)
    classifier = DecisionTreeClassifier(max_depth=1)
    print(weights[:10], "\n")


validate_x, validate_label = generate_data(1, dimension, labels)

[0.00067613 0.00067613 0.00067613 0.00135227 0.00067613 0.00067613
 0.00135227 0.00067613 0.00135227 0.00135227] 

[0.00132275 0.00132275 0.00066138 0.00066138 0.00132275 0.00132275
 0.00066138 0.00132275 0.00066138 0.00066138] 

[0.00067613 0.00067613 0.00067613 0.00135227 0.00067613 0.00067613
 0.00135227 0.00067613 0.00135227 0.00135227] 

[0.00132275 0.00132275 0.00066138 0.00066138 0.00132275 0.00132275
 0.00066138 0.00132275 0.00066138 0.00066138] 

[0.00067613 0.00067613 0.00067613 0.00135227 0.00067613 0.00067613
 0.00135227 0.00067613 0.00135227 0.00135227] 

[0.00132275 0.00132275 0.00066138 0.00066138 0.00132275 0.00132275
 0.00066138 0.00132275 0.00066138 0.00066138] 

[0.00067613 0.00067613 0.00067613 0.00135227 0.00067613 0.00067613
 0.00135227 0.00067613 0.00135227 0.00135227] 

[0.00132275 0.00132275 0.00066138 0.00066138 0.00132275 0.00132275
 0.00066138 0.00132275 0.00066138 0.00066138] 

[0.00067613 0.00067613 0.00067613 0.00135227 0.00067613 0.00067613
 0.00135227 0

In [247]:
print(classifiers)
print(classifiers[0] == classifiers[1])

[DecisionTreeClassifier(max_depth=1, random_state=1), DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=1)]
False


Set the validation data set:

In [224]:
validate_x, validate_label = generate_data(100, dimension, labels)
print(validate_x, validate_label)

[[0.48320468 0.46403311]
 [0.82868386 0.6364117 ]
 [0.23805566 0.59796486]
 [0.45256974 0.12106635]
 [0.01533993 0.31799323]
 [0.57278834 0.45074462]
 [0.08977755 0.29569337]
 [0.27885456 0.63557724]
 [0.63928058 0.25555633]
 [0.87224998 0.81888687]
 [0.58199792 0.71965585]
 [0.79115184 0.22609615]
 [0.38571893 0.25229345]
 [0.95587698 0.21809795]
 [0.98321387 0.72063214]
 [0.71847402 0.81528615]
 [0.23984744 0.28261264]
 [0.874309   0.24762284]
 [0.84580293 0.93138012]
 [0.20390524 0.59520019]
 [0.2506776  0.76890301]
 [0.58293193 0.33757944]
 [0.3504995  0.73841293]
 [0.1244934  0.60348564]
 [0.32567993 0.28027165]
 [0.51617251 0.56275035]
 [0.21402777 0.31850351]
 [0.89857744 0.49074954]
 [0.37674889 0.43374012]
 [0.31013194 0.77228303]
 [0.87506375 0.93131107]
 [0.7667465  0.59134422]
 [0.2797702  0.39243817]
 [0.31831097 0.22859283]
 [0.30262612 0.91190095]
 [0.33537778 0.37308444]
 [0.01430595 0.5888628 ]
 [0.11741963 0.37292709]
 [0.95431559 0.95266976]
 [0.75890917 0.92041003]


Fill the prediction code:

In [248]:
def get_prediction(x):
    # fill the code here (5-6 lines)
    predictions = np.array([classifier.predict(x) for classifier in classifiers])
    print(predictions)
    predicted = []
    for i in range(len(x)):
        classified = predictions[:, i]
        predicted.append(np.argmax(np.bincount(classified)))
    return predicted

Test it:

In [249]:
predictions = get_prediction(validate_x)
print(predictions)

[[1]
 [0]
 [1]
 [0]
 [1]
 [0]
 [1]
 [0]
 [1]
 [0]]
[np.int64(0)]


In [250]:
print(accuracy_score(validate_label, predictions))

1.0


In [251]:
print(accuracy_score(test_labels, get_prediction(test_set)))

[[0 0 1 ... 0 1 0]
 [1 1 1 ... 1 0 1]
 [0 0 1 ... 0 1 0]
 ...
 [1 1 1 ... 1 0 1]
 [0 0 1 ... 0 1 0]
 [1 1 1 ... 1 0 1]]
0.489


In [252]:
print(sum(test_labels==1) / len(test_labels))

0.52
