In [13]:
"""
Create a Decision Stump
"""
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.model_selection import cross_validate
import matplotlib.pyplot as plt
from matplotlib import style
style.use('fivethirtyeight')
import scipy.stats as sps


In [30]:
# Read the combined data set from disk and give every column a readable name.
COLUMN_LABELS = [
    'Smoking', 'Gender', 'Age', 'Height', 'Weight', 'DiabeticStatus',
    'HR', 'HRRange', 'pNN50', 'RMSSD', 'stdNN',
    'MeanSE', 'VarSE', 'IQRSE', 'SkewSE',
    'VarLogE', 'IQRLogE',
    'stdKTE', 'MeanKTE', 'IQRKTE', 'SkewKTE',
    'BMI', 'Class',
]

dataset = pd.read_csv('combinedDataSet.csv')

# Row shuffling is currently switched off:
#dataset = dataset.sample(frac=1)

# Overwrite whatever column names read_csv produced with the labels above.
dataset.columns = COLUMN_LABELS


In [32]:
# Replace the values of every column (features and 'Class' alike) by integer
# codes, because the sklearn DecisionTreeClassifier only takes numerical values.
for label in dataset.columns:
    dataset[label] = LabelEncoder().fit_transform(dataset[label])

# A tree of depth one is a decision stump; splits are chosen by entropy.
Tree_model = DecisionTreeClassifier(max_depth=1, criterion="entropy")

# Target: +1 where the encoded 'Class' equals 1, -1 everywhere else.
# Features: every remaining column.
Y = dataset['Class'].where(dataset['Class'] == 1, -1)
X = dataset.drop('Class', axis=1)

# Mean test score over a 100-fold cross-validation of the single stump.
cv_results = cross_validate(Tree_model, X, Y, cv=100)
predictions = np.mean(cv_results['test_score'])

print('The accuracy is: ',predictions*100,'%')

The accuracy is:  80.57810457516342 %


0      -1
1      -1
2      -1
3      -1
4      -1
5      -1
6      -1
7      -1
8      -1
9      -1
10     -1
11     -1
12     -1
13     -1
14     -1
15     -1
16     -1
17     -1
18     -1
19     -1
20     -1
21     -1
22     -1
23     -1
24     -1
25     -1
26     -1
27     -1
28     -1
29     -1
       ..
3506    1
3507    1
3508    1
3509    1
3510    1
3511    1
3512    1
3513    1
3514    1
3515    1
3516    1
3517    1
3518    1
3519    1
3520    1
3521    1
3522    1
3523    1
3524    1
3525    1
3526    1
3527    1
3528    1
3529    1
3530    1
3531    1
3532    1
3533    1
3534    1
3535    1
Name: Class, Length: 3536, dtype: int64

In [None]:
class Boosting:
    """AdaBoost-style ensemble of decision stumps for a binary target.

    The label column is binarised to ``+1`` where it equals 1 and ``-1``
    everywhere else. ``fit`` trains ``T`` depth-one decision trees on
    ``dataset``, re-weighting the samples after every round so that the next
    stump concentrates on the rows the previous one got wrong. ``predict``
    evaluates the alpha-weighted vote of those stumps on ``test_dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Training data; must contain the ``target`` column.
    T : int
        Number of stumps (boosting rounds) to train.
    test_dataset : pandas.DataFrame
        Evaluation data; must contain the ``target`` column and the same
        feature columns as ``dataset``.
    target : str, default 'Class'
        Name of the label column in both frames.

    Attributes
    ----------
    alphas : list of float or None
        Vote weight of every stump; ``None`` until ``fit`` has run.
    models : list or None
        The fitted stumps, in training order; ``None`` until ``fit`` has run.
    accuracy : list of float
        Filled by ``predict``: entry ``k`` is the accuracy on ``test_dataset``
        of the ensemble made of the first ``k + 1`` stumps.
    predictions : numpy.ndarray or None
        Filled by ``predict``: sign of the weighted vote of all stumps, one
        value per row of ``test_dataset`` (in that frame's row order).
    """

    # The weighted error is clipped into [_EPS, 1 - _EPS] so that a stump that
    # is perfectly right (or perfectly wrong) yields a large but finite alpha
    # instead of +/-inf, which would turn the sample weights into NaN.
    _EPS = 1e-10

    def __init__(self,dataset,T,test_dataset,target='Class'):
        self.dataset = dataset
        self.T = T
        self.test_dataset = test_dataset
        self.target = target
        self.alphas = None
        self.models = None
        self.accuracy = []
        self.predictions = None

    def fit(self):
        """Train ``self.T`` weighted stumps and store them with their alphas.

        Results are written to ``self.models`` and ``self.alphas``; nothing is
        returned.
        """
        # Set the descriptive features and the target feature
        X = self.dataset.drop([self.target],axis=1)
        Y = self.dataset[self.target].where(self.dataset[self.target]==1,-1)

        # Initialize the weights of each sample with wi = 1/N
        weights = np.full(len(self.dataset), 1/len(self.dataset))

        alphas = []
        models = []

        for _ in range(self.T):
            # Depth one --> Decision Stump. It is trained on the weighted
            # dataset: the weights depend on the results of the previous
            # stumps and are passed through the sample_weight parameter.
            stump = DecisionTreeClassifier(criterion="entropy",max_depth=1)
            model = stump.fit(X,Y,sample_weight=weights)

            # Keep the weak classifier; it is used later for the weighted decision.
            models.append(model)

            # 1.0 where this stump is wrong on the training data, 0.0 where it is right
            misclassified = (model.predict(X) != Y.values).astype(float)

            # Weighted error of this stump, kept strictly inside (0, 1)
            err = np.sum(weights*misclassified)/np.sum(weights)
            err = np.clip(err, self._EPS, 1-self._EPS)

            # Vote weight of this stump
            alpha = np.log((1-err)/err)
            alphas.append(alpha)

            # Scale the weights of the misclassified samples by exp(alpha) for
            # the training of the next decision stump, then renormalise so the
            # weights keep summing to one and cannot grow without bound.
            weights = weights*np.exp(alpha*misclassified)
            weights = weights/np.sum(weights)

        self.alphas = alphas
        self.models = models

    def predict(self):
        """Evaluate the fitted ensemble on ``self.test_dataset``.

        Fills ``self.accuracy`` with the accuracy of the ensemble after each
        added stump (so it can be plotted against the number of base learners)
        and ``self.predictions`` with the final sign of the weighted vote.
        ``fit`` must have been called first. Nothing is returned.
        """
        # Features and {-1, +1} labels are taken from the *test* frame and are
        # handled positionally, so the frame's index does not matter.
        X_test = self.test_dataset.drop([self.target],axis=1)
        Y_test = np.where(self.test_dataset[self.target].values==1,1,-1)

        # Start from a clean list so that calling predict() twice does not
        # append a second accuracy curve to the first one.
        self.accuracy = []

        # Running sum over the stumps of alpha * prediction, one entry per test row.
        weighted_votes = np.zeros(len(Y_test))

        for alpha,model in zip(self.alphas,self.models):
            # Each stump predicts -1/+1 per row; its vote counts alpha times.
            weighted_votes = weighted_votes + alpha*model.predict(X_test)

            # The ensemble prediction is the sign of the summed votes. The
            # share of rows where that sign equals the label is the accuracy
            # of the ensemble made of all stumps up to and including this one.
            self.accuracy.append(np.sum(np.sign(weighted_votes)==Y_test)/len(Y_test))

        self.predictions = np.sign(weighted_votes)
   
        
        
# Plot the accuracy of the model against the number of stumps (weak classifiers) used

number_of_base_learners = 50

# One fit with T = number_of_base_learners is enough: predict() appends the
# accuracy of the ensemble after every added stump to model.accuracy, so a
# single run already yields the whole curve. Re-training a separate ensemble
# for every T in 0..49 repeated that work and left the plotted model one stump
# short of the number reported in the message below.
# Note: the training frame is also passed as the test frame, so the curve
# shows accuracy on the data the stumps were fitted on.
model = Boosting(dataset,number_of_base_learners,dataset)
model.fit()
model.predict()

fig = plt.figure(figsize=(10,10))
ax0 = fig.add_subplot(111)
# x starts at 1: entry k of model.accuracy belongs to an ensemble of k+1 stumps
ax0.plot(range(1,len(model.accuracy)+1),model.accuracy,'-b')
ax0.set_xlabel('# models used for Boosting ')
ax0.set_ylabel('accuracy')
print('With a number of ',number_of_base_learners,'base models we receive an accuracy of ',model.accuracy[-1]*100,'%')

plt.show()