# AdaBoost Classifier

### 1 - Define the base classifier
- Potential Bug:  

After splitting the dataset, sometimes the left and right label may have the same result because of the same frequency of labels on each side, this will lead to a wrong choice of splitted feature and value.  

Finally, the iterations of weight will go into the same loop, and the error of model will not be convergent. That is what we dont hope.

In [320]:
import numpy as np

# Use decision tree stump as base learner of AdaBoost model
class BaseClassifier():
    
    def __init__(self,X_train, Y_train, weights):
        """
        Initialize the parameters of decision tree stump, and create the decision tree stump as basic clasfier
        
        Arguments:
        e -- the error distribution of this model
        X_train -- training dataset with the shape of [n_samples, n_features]
        Y_train -- corresponding labels of training dataset, with the length of n_samples
        weights -- the weights of importance between each data point
        left -- label corresponding to the left leaf
        right -- label corresponding to the right leaf
        signs -- the signs of y_hat*y_true
       
        """
        # Initialize the parameters
        self.e = None
        self.left = None
        self.right = None
        self.split_dim = None
        self.split_value = None
        
        # Fit  model with training data under the given weights
        self.fit(X_train, Y_train, weights)
        # Get the label of left leaf and right leaf
        index_left = np.where(X_train[:,self.split_dim]<=self.split_value)
        self.left = self.compute_label(Y_train[index_left])
        index_right = np.where(X_train[:,self.split_dim]>self.split_value)
        self.right = self.compute_label(Y_train[index_right])
        # Get the signs of y_hat and y_true     
        self.signs = self.pred_sign(X_train, Y_train)
        
        return None
    
    
    def fit(self, X_train, Y_train, weights):
        """
        Fit the decision tree stump under given weights
        
        Arguments:
        X_train -- training dataset with the shape of [n_samples, n_features]
        Y_train -- corresponding labels of training dataset, with the length of n_samples
        weights -- the weights of importance between each data point
        
        Returns:
        split_dim -- the dimension which dataset will be splitted along
        split_value -- the value which dataset will be splitted dependent on
        
        """
        # Get the shape of feature space
        num_x = X_train.shape[0]
        dim_x = X_train.shape[1]
        
        # Find the optimal split-dimension and split-value
        self.e = np.inf
        for dim in range(dim_x):
            temp_slice = X_train[:,dim]
            for value in np.unique(temp_slice):
                # Compute current y_hats and error under this status 
                y_hat = self.compute_y_hat(X_train, Y_train, dim, value)
                error = self.compute_error(y_hat, Y_train, weights)
                if error < self.e:
                    self.e = error
                    self.split_dim = dim
                    self.split_value = value
        
        return  None
    
    
    def compute_y_hat(self, X_train, Y_train, split_dim, split_value):
        """
        Compute the prediction of training data under splitting
        
        Arguments:
        X_train -- training dataset with the shape of [n_samples, n_features]
        Y_train -- corresponding labels of training dataset, with the length of n_samples
        split_dim -- the dimension which dataset will be splitted along
        split_value -- the value which dataset will be splitted dependent on
        
        Return:
        y_hats -- the predictions of training data under current model
        """
        # Get the corresponding label of each leaf
        index_left = np.where(X_train[:,split_dim]<=split_value)
        left_label = self.compute_label(Y_train[index_left])
        index_right = np.where(X_train[:,split_dim]>split_value)
        right_label = self.compute_label(Y_train[index_right])
        # Calculate the final y_hats of training data
        y_hats = []
        for data in X_train:
            if data[split_dim] <= split_value:
                y_hats.append(left_label)
            else:
                y_hats.append(right_label)
        
        return y_hats
    
    
    def compute_error(self, y_hat, y_true, weights):
        """
        Compute the error between y_hat and y_true under given weights
        
        Arguments:
        y_hat -- the predictions of training data 
        y_true -- the true values of training data
        weights -- the weights of importance between each data point
        
        Return:
        error -- the total error under this classification model
        
        """
        error = 0
        for i in range(len(y_hat)):
            if y_hat[i] != y_true[i]:
                error += weights[i]
                
        return error
    

    def compute_label(self, data_list):
        """
        Compute the prediction of training data under splitting
        
        Argument:
        data_list -- a list of labels 
        
        Return:
        the label of this group
        
        """
        d= {}
        # Calculate the frequency of different labels
        for n in data_list:
            if n in d:
                d[n] += 1
            else:
                d[n] = 1
                
        # Return the label which has max frequency
        if len(d) == 0:
            return None
        else:
            return max(d, key=d.get)
        
        
    def predict(self, x):
        """
        Predict the result of given x
        
        Argument:
        x -- input feature vector with the shape of [1, n_dimensions]
        
        Return:
        the prediction of given x
        
        """
        if x[self.split_dim] <= self.split_value:
            return self.left
        else:
            return self.right
    
    
    def pred_sign(self, X_train, Y_train):
        """
        Compute the signs of y_hat*y_true
        
        Arguments:
        X_train -- training dataset with the shape of [n_samples, n_features]
        Y_train -- corresponding labels of training dataset, with the length of n_samples
        
        Return:
        the list consist of signs of y_hat*y_true
        
        """
        signs = []
        for i in range(len(X_train)):
            pred = self.predict(X_train[i])
            if pred == Y_train[i]:
                signs.append(1)
            else:
                signs.append(-1)
        
        return np.array(signs)
    

### 2 - Define the AdaBoost Classifier

In [321]:
# Boosting Algorithm based on Decision Tree Stump
class AdaBoost():
    
    def __init__(self, n_estimators=3, min_error=0.01):
        """
        Initialize the parameters
        
        Arguments:
        alpha -- a vector consist of each coefficient of each child-model
        estimators -- a list of child estimators in ensemble leaning model
        n_estimators -- the number of estimators in model
        min_error -- stop condition for the training
        
        """     
        self.alpha = np.array([])
        self.estimators = []
        self.min_error = min_error
        self.n_estimators = n_estimators
    
        return None
    
    
    def fit(self, X_train, Y_train, verbose=False):
        """
        Fit the model with training data
        
        Arguments:
        X_train -- training dataset with the shape of [n_samples, n_features]
        Y_train -- corresponding labels of training dataset, with the length of n_samples
        
        """  
        # Get the shape of feature space
        num_x = X_train.shape[0]
        dim_x = X_train.shape[1]
        
        # Set the initial weight as uniform distribution
        weight = np.ones(num_x) / num_x
        # Generate model step by step
        for i in range(self.n_estimators):
            # Create current optimal model under given weight
            model = BaseClassifier(X_train, Y_train, weight)
            # Calculate the error of current model
            e = model.e
             
            # Whether need to stop training
            if e <=self.min_error:
                self.estimators = [model]
                self.alpha = np.array([1])
                break
            
            # Calculate the alpha-coefficient of current model
            a = 0.5 * np.log((1-e)/e)
            # Get the sign of each prediction of current model
            s = model.signs
            
            # Update weight and normalize
            weight = weight * np.exp(-1*a*s)
            weight = weight / np.sum(weight)
            
            # Add alpha and estimators into model sequence
            self.alpha = np.append(self.alpha, a)
            self.estimators.append(model)
            
            # Whether end training
            if self.accuracy(X_train, Y_train) <=self.min_error:
                break
            
            if verbose == True:
                print('Step',i)
                print('---------------------')
                print('error:', e)
                print('alpha:', a)
                print('weight:', weight)
                print('\n')
                
        return None
    
    
    def predict(self, x):
        """
        Get the prediciton of given x
        
        Argument:
        x -- input feature vector 
        
        Return:
        the prediction of AdaBoost model
        """  
        # Set the initial frequency dict 
        d = {}
        # Calculate the predction of each model and summarize them
        for i in range(len(self.estimators)):
            model = self.estimators[i]
            pred = model.predict(x)
            if pred in d:
                d[pred] += self.alpha[i]
            else:
                d[pred] = self.alpha[i]
                
        # Find the key which has the max value
        return max(d, key=d.get)
    
    
    def accuracy(self, X_train, Y_train):
        """
        Get the accuracy of predictions
        
        Arguments:
        X_train -- training dataset with the shape of [n_samples, n_features]
        Y_train -- corresponding labels of training dataset, with the length of n_samples
        
        Return:
        accuracy -- the accuracy of classification
        
        """  
        # Get the predictions
        preds = []
        for i in X_train:
            pred = self.predict(i)
            preds.append(pred)
        
        # Calculate the accuracy
        accuracy = 0
        num = len(preds)
        for i in range(num):
            if preds[i] == Y_train[i]:
                accuracy += 1 / num
        
        return accuracy
        

### Test

*P40 例 8.1*  
给定如下表所示训练数据集。假设弱分类器由x<v或x>v产生，其阈值v使该分类器在训练数据集上分类误差率最低。试用AdaBoost算法学习一个强分类器。

| 序号|  1 |  2 |  3 |  4 |  5 |  6 |  7 |  8 |  9 | 10 |
| :--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
|  x  |  0 |  1 |  2 |  3 |  4 |  5 |  6 |  7 |  8 |  9 |
|  y  | -1 |  1 |  1 | -1 | -1 | -1 |  1 |  1 |  1 | -1 |

#### generate artificial samples

In [322]:
x = np.array([[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]])
y = np.array([-1, 1, 1,-1,-1,-1, 1, 1, 1,-1])

#### creating and training model

In [323]:
model = AdaBoost()
model.fit(x, y, verbose=True)

Step 0
---------------------
error: 0.3
alpha: 0.423648930194
weight: [ 0.07142857  0.16666667  0.16666667  0.07142857  0.07142857  0.07142857
  0.07142857  0.07142857  0.07142857  0.16666667]


Step 1
---------------------
error: 0.285714285714
alpha: 0.458145365937
weight: [ 0.125       0.11666667  0.11666667  0.05        0.05        0.05        0.125
  0.125       0.125       0.11666667]


Step 2
---------------------
error: 0.266666666667
alpha: 0.505800455839
weight: [ 0.08522727  0.07954545  0.07954545  0.09375     0.09375     0.09375
  0.08522727  0.08522727  0.08522727  0.21875   ]




#### predicting

In [324]:
preds = []
for i in range(len(x)):
    pred = model.predict(x[i])
    preds.append(pred)

print('Predictions:', preds)
print('Accuracy:', model.accuracy(x,y))

Predictions: [-1, 1, 1, -1, -1, -1, 1, 1, 1, 1]
Accuracy: 0.8999999999999999


#### improvement
- increasing the number of estimators can enhance the performance of AdaBoost obviously

In [325]:
improved_model = AdaBoost(n_estimators=100)
improved_model.fit(x, y)

print('Accuracy:', improved_model.accuracy(x, y))

Accuracy: 0.9999999999999999
