# Ensemble methods. Exercises


In this section we have only two exercise:

1. Find the best three classifier in the stacking method using the classifiers from scikit-learn package.

2. Build arcing arc-x4 method. 

In [1]:
%store -r data_set
%store -r labels
%store -r test_data_set
%store -r test_labels
%store -r unique_labels

## Exercise 1: Find the best three classifier in the stacking method

Please use the following classifiers:

* Linear regression,
* Nearest Neighbors,
* Linear SVM,
* Decision Tree,
* Naive Bayes,
* QDA.

In [2]:
import numpy as np
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import train_test_split

In [7]:
def classes_from_regression(predictions):
    return np.digitize(predictions, bins=[0.5, 1.5])

def build_classifiers():
    regression = LogisticRegression(max_iter=1000)
    knn = KNeighborsClassifier()
    svm = SVC(kernel='linear')
    tree = DecisionTreeClassifier()
    gauss = GaussianNB()
    qda = QuadraticDiscriminantAnalysis()
    
    all_classifiers = [regression, knn, svm, tree, gauss, qda]
    
    for _classifier in all_classifiers:
        _classifier.fit(data_set, labels)
    
    return all_classifiers

In [8]:
def build_stacked_classifier(classifiers):
    output = []
    for classifier in classifiers:
        output.append(classifier.predict(data_set))
    output = np.array(output).reshape((130,3))
    
    # stacked classifier part:
    stacked_classifier = DecisionTreeClassifier() # set here
    stacked_classifier.fit(output.reshape((130,3)), labels.reshape((130,)))
    test_set = []
    for classifier in classifiers:
        test_set.append(classifier.predict(test_data_set))
        
    test_set = np.array(test_set).reshape((len(test_set[0]),3))
    predicted = stacked_classifier.predict(test_set)
    return predicted

In [11]:
import itertools
classifiers = build_classifiers()
acc = []
combinations = itertools.combinations(classifiers, 3)
permutations = []
for comb in combinations:
    permutations.extend(itertools.permutations(comb))
print(f'Number of possible classifiers combinations: {len(permutations)}')

for permutation in permutations[:10]:
    predicted = build_stacked_classifier(permutation)
    acc.append(accuracy_score(test_labels, predicted))
    
best_combination = np.argmax(acc)
print(f'The best accuracy: {acc[best_combination]}')
print(f"The best classifiers: {permutations[best_combination]}")

Number of possible classifiers combinations: 120
The best accuracy: 0.85
The best classifiers: (LogisticRegression(max_iter=1000), KNeighborsClassifier(), DecisionTreeClassifier())


## Exercise 2: 

Use the boosting method and change the code to fullfilt the following requirements:

* the weights should be calculated as:
$\Large w_{n}^{(t+1)}=\frac{1+ I(y_{n}\neq h_{t}(x_{n})}{\sum_{i=1}^{N}1+I(y_{n}\neq h_{t}(x_{n})}$,
* the prediction is done with a voting method.

In [28]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# prepare data set

def generate_data(sample_number, feature_number, label_number):
    data_set = np.random.random_sample((sample_number, feature_number))
    labels = np.random.choice(label_number, sample_number)
    return data_set, labels

labels = 2
dimension = 2
test_set_size = 1000
train_set_size = 5000
train_set, train_labels = generate_data(train_set_size, dimension, labels)
test_set, test_labels = generate_data(test_set_size, dimension, labels)

# init weights
number_of_iterations = 10
weights = np.ones((test_set_size,)) / test_set_size


def train_model(classifier, weights):
    return classifier.fit(X=test_set, y=test_labels, sample_weight=weights)

def calculate_error(model):
    predicted = model.predict(test_set)
    I=calculate_accuracy_vector(predicted, test_labels)
    Z=np.sum(I)
    return (1+Z)/1.0

Fill the two functions below:

In [29]:
def set_new_weights(model):
    predictions = model.predict(test_set)
    I = np.array([1 if predictions[i] != test_labels[i] else 0 for i in range(len(predictions))])
    new_weights = (1 + I) / (1 + I).sum()
    return new_weights

Train the classifier with the code below:

In [43]:
classifier = DecisionTreeClassifier(max_depth=1, random_state=1)
classifier.fit(X=train_set, y=train_labels)
alphas = []
classifiers = []
for iteration in range(number_of_iterations):
    model = train_model(classifier, weights)
    weights = set_new_weights(model)
    classifiers.append(model)
    classifier = DecisionTreeClassifier(max_depth=1)
    print(weights[:5])


validate_x, validate_label = generate_data(1, dimension, labels)

[0.00068306 0.00136612 0.00068306 0.00068306 0.00068306]
[0.00131579 0.00065789 0.00131579 0.00131579 0.00131579]
[0.00068306 0.00136612 0.00068306 0.00068306 0.00068306]
[0.00131579 0.00065789 0.00131579 0.00131579 0.00131579]
[0.00068306 0.00136612 0.00068306 0.00068306 0.00068306]
[0.00131579 0.00065789 0.00131579 0.00131579 0.00131579]
[0.00068306 0.00136612 0.00068306 0.00068306 0.00068306]
[0.00131579 0.00065789 0.00131579 0.00131579 0.00131579]
[0.00068306 0.00136612 0.00068306 0.00068306 0.00068306]
[0.00131579 0.00065789 0.00131579 0.00131579 0.00131579]


Set the validation data set:

In [44]:
validate_x, validate_label = generate_data(100, dimension, labels)
print(validate_x[:10], validate_label[:10])

[[0.21992157 0.47588905]
 [0.10281373 0.74239744]
 [0.28797198 0.69950756]
 [0.34147098 0.6150149 ]
 [0.91923419 0.68838876]
 [0.68165879 0.33532587]
 [0.11488571 0.28540851]
 [0.22572872 0.71318189]
 [0.97552082 0.80124945]
 [0.52903212 0.5957221 ]] [1 0 0 1 1 0 0 0 0 1]


Fill the prediction code:

In [55]:
from collections import Counter
def get_prediction(x, print_votes=False):
    # fill the code here (5-6 lines)
    predictions = np.array([classifier.predict(x) for classifier in classifiers])
    final_predictions = []
    for i in range(len(x)):
        votes  = predictions[:, i]
        vote_counts = Counter(votes)
        if print_votes:
            print(vote_counts)
        final_predictions.append(vote_counts.most_common(1)[0][0])
    return final_predictions

Test it:

In [56]:
predictions = get_prediction(validate_x, True)
print(predictions)

Counter({np.int64(1): 5, np.int64(0): 5})
Counter({np.int64(0): 5, np.int64(1): 5})
Counter({np.int64(0): 5, np.int64(1): 5})
Counter({np.int64(1): 5, np.int64(0): 5})
Counter({np.int64(1): 10})
Counter({np.int64(1): 5, np.int64(0): 5})
Counter({np.int64(1): 5, np.int64(0): 5})
Counter({np.int64(0): 5, np.int64(1): 5})
Counter({np.int64(0): 5, np.int64(1): 5})
Counter({np.int64(1): 5, np.int64(0): 5})
Counter({np.int64(1): 5, np.int64(0): 5})
Counter({np.int64(1): 5, np.int64(0): 5})
Counter({np.int64(1): 10})
Counter({np.int64(1): 5, np.int64(0): 5})
Counter({np.int64(0): 5, np.int64(1): 5})
Counter({np.int64(0): 5, np.int64(1): 5})
Counter({np.int64(1): 5, np.int64(0): 5})
Counter({np.int64(1): 5, np.int64(0): 5})
Counter({np.int64(1): 5, np.int64(0): 5})
Counter({np.int64(0): 5, np.int64(1): 5})
Counter({np.int64(1): 5, np.int64(0): 5})
Counter({np.int64(1): 5, np.int64(0): 5})
Counter({np.int64(0): 5, np.int64(1): 5})
Counter({np.int64(1): 5, np.int64(0): 5})
Counter({np.int64(1): 

In [57]:
print(accuracy_score(validate_label, predictions))

0.51


In [58]:
print(accuracy_score(test_labels, get_prediction(test_set)))

0.536


In [59]:
print(sum(test_labels==1) / len(test_labels))

0.488
