# Mustererkennung/Machine Learning - Assignment 6



In [143]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

###Load the spam dataset:

In [144]:
data = np.array(pd.read_csv('../data/spambase.data', header=None))

X = data[:,:-1] # features
y = data[:,-1] # Last column is label

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, shuffle=True, stratify=y)

#print(y_test[0:200])

In order to test the prediction accuracy of the classifier, one needs to split the dataset into a training and test set. 

Exercise 1.Decision Trees
Implement a decision tree (classification tree to be precise) using Python (incl. Numpy etc.)and use it on the SPAM-Dataset1. 
Use a metric of your choice as a loss function.

In [327]:
# Loss functions
# Gini index and entropy are the criteria for calculating information gain. 

def calculate_gini_index(labels):
    # We first need to calculate the proportion of classes in each group.
    proportion_ones = np.count_nonzero(labels) / len(labels)

    proportion_zeros = np.count_nonzero(labels == 0) / len(labels)

    gini = 2 * proportion_zeros * proportion_ones
    print("Gini Index is {}".format(gini))
    
    return gini

def cross_entropy(p):
        if p == 1 or p == 0: 
            # The entropy is zero if one event is certain
            return 0
        return - (p * np.log(p) + (1-p) * np.log((1-p)))

# Weight of a child node is number of samples in the node/total samples of all child nodes. 
# Similarly information gain is calculated with gini score. 
def children_entropy(feature, y):
    right = (feature == True).sum()/len(feature)
    left = 1 - right
    
    p = np.sum(y[feature])/len(y[feature]) 
    q = np.sum(y[np.invert(feature)])/len(y[np.invert(feature)])
    
    entropy_right = right * cross_entropy(p)
    entropy_left = left * cross_entropy(q)
    total_entropy = entropy_right + entropy_left
    return total_entropy, q, p

In [510]:
# because an E-Mail can be either SPAM or not SPAM so we will use Gini Index for a 2 classes classifcation tree 
class DecisionTree():
    
    def __init__(self, height=7):
        self.min_size = 4
        self.height = 7
    
    # fit a basic binary tree for 2 classes classificaton 
    def fit(self, X, y):
        self.tree_size = 2**self.height - 1
        #print(self.tree_size)
        self.tmp_size = 2**(self.height + 1) - 1
        self.features = X.shape[1]
        self.tree = np.full(self.tmp_size, -1)
        self.tree_tmp = np.full(self.tmp_size + 1, -1)
        self.split_tree(X, y, 0)
    
    # binary tree
    def left_tree(self, leaf):
        return 2 * leaf + 1
    
    def right_tree(self, leaf):
        return 2 * leaf + 2
    
    
    def predict(self, X):
        predictions = []
        for x in X:
            idx = 0
            leaf = self.tree[idx]
            while self.tree[self.left_tree(idx)] != -1 or self.tree[self.right_tree(idx)] != -1:
                #print("idx:", idx)
                #print("leaf:", idx)

                if leaf >= self.tree_size:
                    return
                
                if x[leaf]:
                    idx = self.right_tree(idx)
                    #print("--------> right")
                else:
                    idx = self.left_tree(idx)
                    #print("left <--------")
                prediction = self.tree_tmp[idx]
                leaf = self.tree[idx]
            predictions += [prediction]
        return predictions
    
    
    def split_data(self, index, value, X):
        left, right = list(), list()
        for row in X:
            if row[index] < value:
                left.append(row)
            else:
                right.append(row)
        return left, right
        
    
    def split_tree(self, X, y, leaf):
  
        # parent node is a leaf
        #print("leaf", leaf)
        if leaf >= self.tree_size:
            return
        
        entropies = np.full(self.features, np.inf) 
        left = np.empty(self.features)
        right = np.empty(self.features)
        
        # for every feature variable
        for i, feature in enumerate(X.T):
            if np.sum(feature) == 0 or np.sum(np.invert(feature)) == 0:
                continue 
            entropies[i], left[i], right[i] = children_entropy(feature, y)
        
        min_entropy = np.argmin(entropies)
        
        right = X[:,min_entropy]
        left = np.invert(right)
        #print(left)
        
        #print("min_entropy", min_entropy)
        self.tree[leaf] = min_entropy
        if min_entropy < len(self.tree_tmp):
            if (min_entropy < len(left)) and (min_entropy < len(right)):
                self.tree_tmp[self.left_tree(leaf)] = left[min_entropy]
                self.tree_tmp[self.right_tree(leaf)] = right[min_entropy]
        
        if len(y[right]) == 0 or len(y[left]) == 0:
            return
        # grow tree 
        if leaf >= self.min_size:
            return
        self.split_tree(X[left], y[left], self.left_tree(leaf))
        self.split_tree(X[right], y[right], self.right_tree(leaf))
            
        '''
        # calculate split variable
        z = (two_rows[0,feature] + two_rows[1,feature]) / 2
        print(z)

        c1_0 = list((X_tmp[:,feature] <= z) & (y == 0))
        c1_1 = list((X_tmp[:,feature] <= z) & (y == 1))
        #print(len(c1_1) == len(c1_0))

        c2_0 = list((X_tmp[:,feature] > z) & (y == 0))
        c2_1 = list((X_tmp[:,feature] > z) & (y == 1))

        if len(c1_0) == len(c1_1):
            c1 = len(c1_0)
        else:
            c1 = np.argmax(len(c1_0), len(c1_1))

        if len(c2_0) == len(c2_1):
            c2 = len(c2_0)
        else:
            c2 = np.argmax(len(c2_0), len(c2_1))
        '''

In [511]:
#gini_train = calculate_gini_index(y_train)
#gini_test = calculate_gini_index(y_test)

In [512]:
# mean of trues and falses 

means = (np.mean(X_train[y_train==1], axis=0) + np.mean(X_train[y_train==0])) / 2 
                  
X_train_means = (X_train > means)
X_test_means = X_test > means

tree = DecisionTree(height = 7)
tree.fit(X_train_means, y_train)
predictions = tree.predict(X_test_means)

print(predictions)



[1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 

In [513]:
from sklearn.metrics import confusion_matrix
estimates = (np.array(predictions) > 0.5)
#print(predictions)
print(confusion_matrix(predictions, estimates))


[[204   0]
 [  0 947]]


Use your tree to analyze feature importance. 
Plot the difference between the top 5 features (check spambase.names to check what features those belong to)

In [610]:
# mean of trues and falses 

# word_freq_address 1
# word_freq_free 15
# word_freq_money 23
# word_freq_direct 39
# word_freq_re 44
#idx = [1, 15, 23, 39, 44]
X_5 = data[:,idx]

X_train_5, X_test_5, y_train_5, y_test_5 = train_test_split(X_5, y, random_state=0, shuffle=True, stratify=y)

means_5 = (np.mean(X_train_5[y_train_5==1], axis=0) + np.mean(X_train_5[y_train_5==0])) / 2 
                  
X_train_means_5 = (X_train_5 > means_5)
X_test_means_5 = X_test_5 > means_5

tree.fit(X_train_means_5, y_train_5)
predictions_5 = tree.predict(X_test_means_5)

#print(predictions_5)

estimates_5 = (np.array(predictions_5) > 0.5)
#print(predictions)
print(confusion_matrix(predictions_5, estimates_5))



[[274   0]
 [  0 877]]


We used: 
word_freq_address 1 
word_freq_free 15
word_freq_money 23
word_freq_direct 39
word_freq_re 44

**There are more true positive results than on the whole set of features**

Implement a Random Forest and use it on the SPAM-Dataset.

a) Print a confusion matrix (you can use package implementations here).

b) What is a good number of trees in the forest?

In [571]:
class RandomForest:
    
    def __init__(self, height=7, n_trees = 100):
        self.n_trees = n_trees
        self.height = height
        self.trees = [DecisionTree(height = height)
                      for _ in range(n_trees)]
    
    def fit(self, X, y, n_samples = 500):        
        for tree in self.trees:
            # pick randomly the datapoints
            #X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, shuffle=True, stratify=y)
            random_samples = np.random.randint(0, high=len(X), size=n_samples)
            X_train = X[random_samples]
            y_train = y[random_samples]
            random_features = np.random.randint(0, high=len(X.T), size=self.height*2)
            X_train = X_train[:,random_features]          
            
            means = (np.mean(X_train[y_train==1], axis=0) + np.mean(X_train[y_train==0])) / 2       
            X_train_means = (X_train > means)
            tree.fit(X_train_means, y_train)
        
    def predict(self,X):
        # Version 1
        #preds = np.array([tree.predict(X) for tree in self.trees])
        #preds = np.mean(preds, axis=0)
        #return preds
        
        # Version 2 
        forest_predictions = np.array(self.trees[0].predict(X))
        #print(forest_predictions.shape)

        # make it as row vector by inserting an axis along first dimension
        forest_predictions = forest_predictions[:, np.newaxis]
        #print(forest_predictions.shape)
        
        for i in range(1, self.n_trees):
            pred = np.array(self.trees[i].predict(X))
            # When axis is specified, values must have the correct shape.
            forest_predictions = np.append(forest_predictions, pred[:, np.newaxis], axis=1)
        

        forest_average = np.array(np.mean(forest_predictions, axis=0))
        return forest_average

In [579]:
for t in range(25, 201, 25):
    random_forest = RandomForest(height=7, n_trees=t)
    random_forest.fit(X, y, n_samples = 1000)
    predictions_rf = random_forest.predict(X_test_means)
    #print(predictions_rf)

    estimates_rf = (np.array(predictions_rf) > 0.5)
    print("-----")
    print("trees: ", t)
    print(confusion_matrix(predictions_rf.round(), estimates_rf.round()))
    print("-----")
    

-----
trees:  25
[[ 2  0]
 [ 0 23]]
-----
-----
trees:  50
[[ 3  0]
 [ 0 47]]
-----
-----
trees:  75
[[12  0]
 [ 0 63]]
-----
-----
trees:  100
[[18  0]
 [ 0 82]]
-----
-----
trees:  125
[[ 11   0]
 [  0 114]]
-----
-----
trees:  150
[[ 22   0]
 [  0 128]]
-----
-----
trees:  175
[[ 16   0]
 [  0 159]]
-----
-----
trees:  200
[[ 25   0]
 [  0 175]]
-----
