# Ensemble methods. Exercises


In this section we have only two exercise:

1. Find the best three classifier in the stacking method using the classifiers from scikit-learn package.

2. Build arcing arc-x4 method. 

In [81]:
%store -r data_set
%store -r labels
%store -r test_data_set
%store -r test_labels
%store -r unique_labels

## Exercise 1: Find the best three classifier in the stacking method

Please use the following classifiers:

* Linear regression,
* Nearest Neighbors,
* Linear SVM,
* Decision Tree,
* Naive Bayes,
* QDA.

In [82]:
import numpy as np
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from itertools import permutations

In [83]:
def build_classifiers():
    classifiers = [LinearRegression, KNeighborsClassifier, SVC, DecisionTreeClassifier, GaussianNB, QuadraticDiscriminantAnalysis]
    built_classifiers = [classifier().fit(data_set, labels) for classifier in classifiers]
        
    return built_classifiers

In [84]:
def build_stacked_classifier(classifiers):
    output = []
    for classifier in classifiers:
        output.append(classifier.predict(data_set))
    output = np.array(output).reshape((130,3))
    
    # stacked classifier part:
    stacked_classifier = DecisionTreeClassifier()
    stacked_classifier.fit(output.reshape((130,3)), labels.reshape((130,)))
    test_set = []
    for classifier in classifiers:
        test_set.append(classifier.predict(test_data_set))
    test_set = np.array(test_set).reshape((len(test_set[0]),3))
    predicted = stacked_classifier.predict(test_set)
    return predicted

In [85]:
def get_type_name(obj):
    return type(obj).__name__

classifiers = build_classifiers()
results = []

for permutation in permutations(classifiers, r=3):
    predicted = build_stacked_classifier(permutation)
    accuracy = accuracy_score(test_labels, predicted)
    results.append((accuracy, get_type_name(permutation[0]), get_type_name(permutation[1]), get_type_name(permutation[2])))

results = sorted(results, key=lambda x: x[0], reverse=True)

print('\t\t\tAccuracy\t\t\tClassifier 1\t\t\tClassifier 2\t\t\tClassifier 3')
for i, clas1 in enumerate(results[:20]):
    line = str(i)
    for i in range(4):
        line += '\t\t\t' + str(clas1[i])
    print(line)

			Accuracy			Classifier 1			Classifier 2			Classifier 3
0			0.9			DecisionTreeClassifier			KNeighborsClassifier			QuadraticDiscriminantAnalysis
1			0.9			GaussianNB			KNeighborsClassifier			QuadraticDiscriminantAnalysis
2			0.85			KNeighborsClassifier			SVC			GaussianNB
3			0.85			KNeighborsClassifier			SVC			QuadraticDiscriminantAnalysis
4			0.85			KNeighborsClassifier			GaussianNB			QuadraticDiscriminantAnalysis
5			0.85			KNeighborsClassifier			QuadraticDiscriminantAnalysis			DecisionTreeClassifier
6			0.85			SVC			KNeighborsClassifier			GaussianNB
7			0.85			SVC			KNeighborsClassifier			QuadraticDiscriminantAnalysis
8			0.85			GaussianNB			KNeighborsClassifier			DecisionTreeClassifier
9			0.85			GaussianNB			SVC			KNeighborsClassifier
10			0.85			QuadraticDiscriminantAnalysis			KNeighborsClassifier			DecisionTreeClassifier
11			0.8			KNeighborsClassifier			SVC			DecisionTreeClassifier
12			0.8			KNeighborsClassifier			DecisionTreeClassifier			QuadraticDiscriminantAnalysis
13			0.8

## Exercise 2: 

Use the boosting method and change the code to fullfilt the following requirements:

* the weights should be calculated as:
$w_{n}^{(t+1)}=\frac{1+ I(y_{n}\neq h_{t}(x_{n})}{\sum_{i=1}^{N}1+I(y_{n}\neq h_{t}(x_{n})}$,
* the prediction is done with a voting method.

In [86]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# prepare data set

def generate_data(sample_number, feature_number, label_number):
    data_set = np.random.random_sample((sample_number, feature_number))
    labels = np.random.choice(label_number, sample_number)
    return data_set, labels

labels = 2
dimension = 2
test_set_size = 1000
train_set_size = 5000
train_set, train_labels = generate_data(train_set_size, dimension, labels)
test_set, test_labels = generate_data(test_set_size, dimension, labels)

# init weights
number_of_iterations = 10
weights = np.ones((test_set_size,)) / test_set_size


def train_model(classifier, weights):
    return classifier.fit(X=test_set, y=test_labels, sample_weight=weights)

def calculate_error(model):
    predicted = model.predict(test_set)
    I=calculate_accuracy_vector(predicted, test_labels)
    Z=np.sum(I)
    return (1+Z)/1.0

Fill the two functions below:

In [87]:
def set_new_weights(model):
    # fill the code here (two lines)
    pass 

Train the classifier with the code below:

In [88]:
classifier = DecisionTreeClassifier(max_depth=1, random_state=1)
classifier.fit(X=train_set, y=train_labels)
alphas = []
classifiers = []
for iteration in range(number_of_iterations):
    model = train_model(classifier, weights)
    weights = set_new_weights(model)
    classifiers.append(model)

print(weights)


validate_x, validate_label = generate_data(1, dimension, labels)

None


Set the validation data set:

In [89]:
validate_x, validate_label = generate_data(1, dimension, labels)

Fill the prediction code:

In [90]:
def get_prediction(x):
    # fill the code here (5-6 lines)
    pass

Test it:

In [91]:
prediction = get_prediction(validate_x)[0]

print(prediction)

TypeError: 'NoneType' object is not subscriptable