In [1]:
import random
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import math

# Boosting

On travaille uniquement sur les données Wisconsin Diagnostic Breast Cancer.

## Fonctions

In [16]:
def bootstrap(x):
    return np.sort(np.random.choice(x, len(x)))
                   
def out_of_bag(bootstrap_sample, x):
    return x[np.where(not bootstrap_sample in x)]

def erreur_prediction(weights, pred, true_pred):
        return np.dot(weights, pred == true_pred)
                   
class boosting:

    def __init__(self, nb_iterations=100, model = "decision_tree"):
        self.nb_iterations = nb_iterations
        self.model = model
        self.pred = []
        self.trees = []
        self.alpha = []
        self.weights = [] #D_i
    
    #epsilon_t
    def erreur(self, y, y_pred):
        err = 0
        for i in range(len(y)):
            if y[i] != y_pred[i]:
                err += self.weights[i]
        return err/len(y)
        
    def one_adaboost_iteration(self, train_data, y, n, max_depth):
        if self.model == "decision_tree" :
            one_tree = DecisionTreeClassifier(max_depth=max_depth)
        self.trees.append(one_tree.fit(X = train_data, y=y, sample_weight=self.weights))
        predictions = one_tree.predict(train_data)
        
        error_rate = self.erreur(y, predictions)
        new_alpha = np.log((1-error_rate)/error_rate)/2
        self.alpha.append(new_alpha)
        for i in range(n):
            if predictions[i] == y[i]:
                self.weights[i] = self.weights[i] * np.exp(-new_alpha)
            else:
                self.weights[i] = self.weights[i] * np.exp(new_alpha)
        self.weights = self.weights/sum(self.weights) #Normalisation
            
        
    def fit(self, train_data, y, max_depth=1):
        if not isinstance(y, np.ndarray):
            y = np.array(y)
        n = np.shape(train_data)[0]
        self.weights = np.ones(n)/n
        for i in range(self.nb_iterations):
            self.one_adaboost_iteration(train_data, y, n, max_depth)
            
    def votes(self, predictions, categories, seuil=0.5):
        nb_individus = np.shape(predictions)[1]
        nb_classifieurs = np.shape(predictions)[0]
        res_votes = []

        for j in range(nb_individus):
            res_votes.append(self.vote(predictions[j], categories, nb_classifieurs, seuil))

        return res_votes

    def vote(self, predictions, categories, nb_classifieurs, seuil=0.5):
        nb_votes = np.zeros(len(categories))
        for i in range(nb_classifieurs):
            for c in range(len(categories)):
                if predictions[i] == categories[c]:
                    nb_votes[c] += self.alpha[i]
        nb_votes = nb_votes/sum(self.alpha)

        temp = []
        for c in range(len(categories)):
            if (nb_votes[c] >= seuil):
                temp.append(categories[c])
        if temp == []:
            return "NA"
        else:
            return temp

    def predict(self, new_data, categories, seuil=0.5):
        self.pred = []
        for t in self.trees:
            self.pred.append(t.predict(new_data))
        self.pred = pd.DataFrame(self.pred)
        return self.votes(self.pred, categories= categories, seuil=seuil)

# Chargement des données

In [3]:
cancer_data = pd.read_csv("phpAmSP4g.csv")
cancer_categories = np.unique(cancer_data["Class"])
y = cancer_data["Class"]
X = cancer_data.drop(labels=["Class"], axis=1)
cancer_data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,V29,V30,Class
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,2
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,2
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,2
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,2
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,2


In [4]:
np.round(100*cancer_data["Class"].value_counts()/np.shape(cancer_data)[0], 1)

1    62.7
2    37.3
Name: Class, dtype: float64

In [5]:
cancer_data.dtypes

V1       float64
V2       float64
V3       float64
V4       float64
V5       float64
V6       float64
V7       float64
V8       float64
V9       float64
V10      float64
V11      float64
V12      float64
V13      float64
V14      float64
V15      float64
V16      float64
V17      float64
V18      float64
V19      float64
V20      float64
V21      float64
V22      float64
V23      float64
V24      float64
V25      float64
V26      float64
V27      float64
V28      float64
V29      float64
V30      float64
Class      int64
dtype: object

C'est un problème de classification a deux classes déséquilibré. Nous avons 30 variables numériques.

## Application

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
boost_model = boosting(nb_iterations=100, model = "decision_tree")
boost_model.fit(train_data = X_train, y = y_train)

In [7]:
pred = boost_model.predict(new_data= X_train, categories = cancer_categories, seuil = 0.5)
y_chap = np.array(pred)
acc_train = accuracy_score(y_train, y_chap)

In [8]:
pred = boost_model.predict(new_data= X_test, categories = cancer_categories, seuil = 0.5)
print(pred[1:10])
y_chap = np.array(pred)
acc_test = accuracy_score(y_test, y_chap)

[[2], [2], [1], [1], [2], [2], [2], [1], [1]]


In [9]:
print("Accuracy on train data = %0.3f"%acc_train)
print("Accuracy on test data = %0.3f"%acc_test)

Accuracy on train data = 1.000
Accuracy on test data = 0.959


Les accuracy sont satisfaisantes. On s'attendait à une plus grande erreur de généralisation car les modèles de boosting ont tendances à apprendre "par coeur" les données d'apprentissage ce qui peut provoquer l'apparition d'un grand biais à cause d'un phénomène de sur-apprentissage. 
Ce n'est pas le cas pour cette instance du modèle de boosting qui semble avoir formulé des hypothèses cohérentes. Cela peut être dû a un jeu de données facile à apprendre ou grâce a de la chance.

In [13]:
acc_train = []
acc_test = []
for i in range(5):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    boost_model = boosting(nb_iterations=100, model = "decision_tree")
    boost_model.fit(train_data = X_train, y = y_train)
    
    pred = boost_model.predict(new_data= X_train, categories = cancer_categories, seuil = 0.5)
    y_chap = np.array(pred)
    acc_train.append(accuracy_score(y_train, y_chap))
    
    pred = boost_model.predict(new_data= X_test, categories = cancer_categories, seuil = 0.5)
    y_chap = np.array(pred)
    acc_test.append(accuracy_score(y_test, y_chap))
    
print("Accuracy on train data : ")
print("\t -mean = %0.3f"%np.mean(acc_train))
print("\t -standard deviation = %0.3f"%np.std(acc_train))

print("Accuracy on test data : ")
print("\t -mean = %0.3f"%np.mean(acc_test))
print("\t -standard deviation = %0.3f"%np.std(acc_test))

Accuracy on train data : 
	 -mean = 0.590
	 -standard deviation = 0.404
Accuracy on test data : 
	 -mean = 0.572
	 -standard deviation = 0.388


In [14]:
acc_train

[0.09045226130653267, 0.1306532663316583, 1.0, 1.0, 0.7311557788944724]

In [15]:
acc_test

[0.0935672514619883,
 0.13450292397660818,
 0.9649122807017544,
 0.9766081871345029,
 0.6900584795321637]

Il semblerait que la chance soit intervenu. On constate que deux instances possèdent d'excellents indicateurs de performances, tandis que les autres sont loin d'être satisfaisants. Il est possible que notre modèle n'a pas assez de liberté pour pouvoir explorer et identifier les informations pertinentes. Nous utilisions jusqu'à présent des arbres de profondeur égale a 1. Voyons ce qu'il se produit si nous utilisons des arbres de profondeur 2.

In [17]:
acc_train = []
acc_test = []
for i in range(5):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    boost_model = boosting(nb_iterations=100, model = "decision_tree")
    boost_model.fit(train_data = X_train, y = y_train, max_depth=2)
        
    pred = boost_model.predict(new_data= X_train, categories = cancer_categories, seuil = 0.5)
    y_chap = np.array(pred)
    acc_train.append(accuracy_score(y_train, y_chap))
    
    pred = boost_model.predict(new_data= X_test, categories = cancer_categories, seuil = 0.5)
    y_chap = np.array(pred)
    acc_test.append(accuracy_score(y_test, y_chap))
    
print("Accuracy on train data : ")
print("\t -mean = %0.3f"%np.mean(acc_train))
print("\t -standard deviation = %0.3f"%np.std(acc_train))

print("Accuracy on test data : ")
print("\t -mean = %0.3f"%np.mean(acc_test))
print("\t -standard deviation = %0.3f"%np.std(acc_test))

Accuracy on train data : 
	 -mean = 1.000
	 -standard deviation = 0.000
Accuracy on test data : 
	 -mean = 0.972
	 -standard deviation = 0.011


Nous constatons que notre hypothèse semble vérifiée : le jeu de données est facile à apprendre à condition que le modèle possède suffisamment de capacité d'exploration.