In [1]:
import random
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
from sklearn.model_selection import StratifiedKFold

# Fonctions utiles

In [2]:
# predictions[i, j] = category predicted by the i-th classifier for individual j
def votes(predictions, categories, seuil=0.5):
    """Aggregate the votes of several classifiers for every individual.

    Parameters
    ----------
    predictions : 2-D array-like, shape (nb_classifieurs, nb_individus)
        predictions[i, j] is the category predicted by classifier i for
        individual j.
    categories : sequence
        Candidate categories.
    seuil : float, default 0.5
        Minimum share of the votes a category needs to be retained.

    Returns
    -------
    list
        One entry per individual, as returned by `vote`: the list of retained
        categories, or the string "NA" when no category reaches `seuil`.
    """
    predictions = np.asarray(predictions)
    nb_classifieurs, nb_individus = np.shape(predictions)

    # Column j (not row j) holds the votes cast by all classifiers for individual j.
    return [
        vote(predictions[:, j], categories, nb_classifieurs, seuil)
        for j in range(nb_individus)
    ]

# Vote for a single individual
def vote(predictions, categories, nb_classifieurs, seuil = 0.5):
    """Return the categories that collect at least `seuil` of the votes.

    Parameters
    ----------
    predictions : sequence
        Category predicted by each classifier for one individual; only the
        first `nb_classifieurs` entries are read.
    categories : sequence
        Candidate categories.
    nb_classifieurs : int
        Number of classifiers that voted.
    seuil : float, default 0.5
        Minimum share of the votes a category needs to be retained.

    Returns
    -------
    list or str
        The retained categories, in the order of `categories`, or "NA" when
        no category reaches the threshold.
    """
    # Without any voter no category can reach the threshold.
    if nb_classifieurs == 0:
        return "NA"

    ballots = [predictions[i] for i in range(nb_classifieurs)]
    retained = [
        categorie
        for categorie in categories
        if sum(1 for ballot in ballots if ballot == categorie) / nb_classifieurs >= seuil
    ]
    return retained if retained else "NA"

# Chargement des données

**Remarque :** la méthode ECOC fonctionne bien sur des problèmes de classification avec un grand nombre de classes. Elle repose sur le principe qui consiste à prédire un groupe de classes contre les classes restantes. Elle n'a donc pas d'intérêt sur les données _cancer_ car ce problème ne comporte que deux classes à prédire.

In [3]:
# Load the satimage data set, then separate the target column from the predictors
satimage_data = pd.read_csv("dataset_186_satimage.csv")
y = satimage_data["class"]
X = satimage_data.drop(columns=["class"])
satimage_categories = np.unique(y)  # sorted distinct class labels
satimage_data.head()

Unnamed: 0,Aattr,Battr,Cattr,Dattr,Eattr,Fattr,A1attr,B2attr,C3attr,D4attr,...,D22attr,E23attr,F24attr,A25attr,B26attr,C27attr,D28attr,E29attr,F30attr,class
0,0.117596,1.241362,1.184036,0.815302,-0.158561,1.256483,1.193546,0.818486,-0.141965,0.879481,...,0.807707,-0.069968,1.21916,1.250463,0.597678,-0.054291,1.233342,1.262255,0.603258,1.0
1,-1.205362,-1.249654,-0.077532,0.444886,-0.895959,-0.447579,-0.78676,-0.554203,-0.364672,0.092157,...,-0.192752,-0.736996,-0.969292,-0.844805,-0.40003,-0.725852,-0.344432,-0.594534,-0.183967,5.0
2,0.779075,0.148811,0.042617,-0.24303,0.800057,0.164136,0.05337,-0.448612,0.154978,-0.345245,...,-0.877277,0.671174,-0.006373,-0.425752,-0.662584,0.691889,0.356801,-0.175259,-0.236449,7.0
3,1.146564,0.585831,0.342991,0.021553,0.947536,0.601074,0.353416,0.02655,1.788164,1.010702,...,0.28115,1.412317,1.044084,0.532085,0.282612,1.438068,1.058033,0.842981,0.130923,3.0
4,-0.764376,-1.16225,-0.137607,0.180303,-0.969698,-1.146681,-0.126658,0.184937,-0.735851,-1.132569,...,-0.192752,-0.885225,-1.231906,-0.784941,-0.347519,-0.875088,-1.220973,-0.774223,-0.551339,5.0


In [4]:
# Share of each class, in percent of all rows, rounded to one decimal
class_counts = satimage_data["class"].value_counts()
(100 * class_counts / len(satimage_data)).round(1)

1.0    23.8
7.0    23.5
3.0    21.1
5.0    11.0
2.0    10.9
4.0     9.7
Name: class, dtype: float64

On a seulement 6 classes, donc on peut utiliser les codes exhaustifs, c'est-à-dire utiliser $2^{6-1}-1 = 31$ classifieurs.

# Error-correcting Output Codes

## Fonctions

In [5]:
class ECOC:
    """Error-Correcting Output Codes classifier based on exhaustive binary codes.

    Every column of `codes` splits the classes into two groups (0 / 1) and one
    binary classifier is trained per column. An individual is assigned the
    class whose code word (row of `codes`) is the closest, in Hamming
    distance, to the vector of binary predictions.

    Attributes
    ----------
    model : str
        Name of the base learner; only "decision_tree" is supported by `fit`.
    codes : ndarray, shape (nb_categories, 2**(nb_categories-1)-1)
        Full exhaustive code matrix.
    nb_classifieurs : int
        Number of code columns (hence of binary classifiers) actually used.
    trees : list
        Fitted binary classifiers, one per used code column.
    codes_pred : pandas.DataFrame
        Set by `predict_codes`: one row per classifier, one column per individual.
    pred : ndarray
        Set by `predict`: last predicted categories.
    subspaces : list
        Not used by the methods of this class.
    """

    def __init__(self, nb_categories, model = "decision_tree", nb_classifieurs=0):
        """Build the code matrix and select how many of its columns are used.

        Parameters
        ----------
        nb_categories : int
            Number of classes of the problem.
        model : str, default "decision_tree"
            Base learner name.
        nb_classifieurs : int, default 0
            Number of code columns to use; 0 means all of them.

        Raises
        ------
        ValueError
            If `nb_classifieurs` is negative or exceeds the number of
            available code columns.
        """
        self.model = model
        self.pred = []
        self.trees = []
        self.subspaces = []
        self.codes = self.generer_codes(nb_classes=nb_categories)

        nb_max = self.codes.shape[1]
        if nb_classifieurs == 0:
            self.nb_classifieurs = nb_max
        elif 0 < nb_classifieurs <= nb_max:
            self.nb_classifieurs = nb_classifieurs
        else:
            raise ValueError(
                "nb_classifieurs must be between 0 and %d, got %r"
                % (nb_max, nb_classifieurs)
            )

    # Generate the exhaustive codes
    def generer_codes(self, nb_classes):
        """Return the exhaustive code matrix for `nb_classes` classes.

        Column i is the binary writing of i+1 on `nb_classes` bits, most
        significant bit on the first row. The result is a float array of
        shape (nb_classes, 2**(nb_classes-1)-1).

        Raises
        ------
        ValueError
            If `nb_classes` is smaller than 1.
        """
        if nb_classes < 1:
            raise ValueError("nb_classes must be at least 1, got %r" % (nb_classes,))

        nb_classifieurs = 2**(nb_classes-1)-1
        codes = np.zeros((nb_classes, nb_classifieurs))
        bit_format = "0%db" % nb_classes
        for i in range(nb_classifieurs):
            # i+1 < 2**(nb_classes-1), so the zero-padded string has exactly nb_classes bits
            codes[:, i] = [int(bit) for bit in format(i + 1, bit_format)]

        return codes

    def _codes_actifs(self):
        """Return the code columns actually used (the first `nb_classifieurs`)."""
        return self.codes[:, :self.nb_classifieurs]

    def regroupement_classes(self, y, code, categories):
        """Recode `y` as 0/1 according to one code column.

        An observation gets 1 when its category has a 1 in `code`, else 0.
        `categories` must be ordered like the entries of `code`.
        """
        categories = np.asarray(categories)
        categories_i = categories[np.asarray(code) == 1]
        return np.isin(y, categories_i).astype(int)

    def fit(self, train_data, y, categories, max_depth=1):
        """Train one binary classifier per used code column.

        Previously fitted classifiers are discarded, so calling `fit` again
        retrains the model from scratch.

        Parameters
        ----------
        train_data : array-like or DataFrame
            Predictors.
        y : array-like
            Class label of each row of `train_data`.
        categories : array-like
            Class labels, ordered like the rows of `codes`.
        max_depth : int, default 1
            Maximum depth of every decision tree.

        Raises
        ------
        ValueError
            If `self.model` is not a supported base learner.
        """
        if self.model != "decision_tree":
            raise ValueError("Unsupported model: %r" % (self.model,))

        y = np.asarray(y)
        categories = np.asarray(categories)
        codes = self._codes_actifs()

        trees = []
        for i in range(self.nb_classifieurs):
            new_y = self.regroupement_classes(y=y, code=codes[:, i], categories=categories)
            new_model = DecisionTreeClassifier(max_depth=max_depth)
            trees.append(new_model.fit(X = train_data, y=new_y))
        # Assigned once at the end so that a failed fit keeps the previous model
        self.trees = trees

    def predict_codes(self, new_data, categories):
        """Store in `codes_pred` the binary prediction of every classifier.

        `codes_pred` has one row per classifier and one column per individual.
        `categories` is accepted for interface symmetry and is not used.
        """
        self.codes_pred = pd.DataFrame([t.predict(new_data) for t in self.trees])

    def hamming_distances(self, code1, code2):
        """Return the number of positions at which the two codes differ.

        Raises
        ------
        ValueError
            If the two codes do not have the same shape.
        """
        code1 = np.asarray(code1)
        code2 = np.asarray(code2)
        if code1.shape != code2.shape:
            raise ValueError(
                "codes must have the same shape, got %r and %r"
                % (code1.shape, code2.shape)
            )
        return int(np.count_nonzero(code1 != code2))

    def predict(self, new_data, categories):
        """Predict the category of every row of `new_data`.

        Each individual receives the category whose code word is nearest, in
        Hamming distance, to its predicted binary code; ties go to the first
        such category in `categories`.

        Raises
        ------
        RuntimeError
            If the number of fitted classifiers does not match
            `nb_classifieurs` (typically when `fit` has not been called).
        """
        if len(self.trees) != self.nb_classifieurs:
            raise RuntimeError(
                "expected %d fitted classifiers, found %d; call fit() first"
                % (self.nb_classifieurs, len(self.trees))
            )

        categories = np.asarray(categories)
        self.predict_codes(new_data, categories)

        nb_individus = np.shape(new_data)[0]
        # One row per individual, one column per classifier. The explicit
        # reshape keeps the row count right even when there is no classifier.
        bits = self.codes_pred.to_numpy().reshape(len(self.trees), nb_individus).T
        codes = self._codes_actifs()

        # distances[n, c] = Hamming distance between individual n and code word c
        distances = (bits[:, None, :] != codes[None, :, :]).sum(axis=2)
        # argmin returns the first minimum, i.e. the first nearest code word
        self.pred = categories[np.argmin(distances, axis=1)]
        return self.pred

# Application de l'ECOC sur les données satimage 

In [6]:
# Fixed seed so that the train/test split is identical on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
p = len(satimage_data.columns)  # number of columns, target "class" included
p

37

In [7]:
# Build the ECOC model and display its code matrix (one row per class)
nb_categories = satimage_categories.shape[0]
ecoc_model = ECOC(nb_categories, "decision_tree")
pd.DataFrame(data=ecoc_model.codes)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
4,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
5,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0


Chaque colonne est un code binaire. Par exemple, le premier code regroupe les 5 premières classes en un groupe et la dernière classe constitue le deuxième groupe.

Les codes sont cohérents car on a bien 31 codes et les colonnes sont bien distinctes deux à deux (aucune n'est non plus le complément d'une autre).

In [8]:
# Train one depth-1 tree per code column, then score the model on the training set
ecoc_model.fit(X_train, y_train, satimage_categories, max_depth=1)
pred = ecoc_model.predict(new_data=X_train, categories=satimage_categories)
print(pred)
y_chap = np.array(pred)
acc_train = accuracy_score(y_true=y_train, y_pred=y_chap)

[3. 1. 3. ... 2. 1. 7.]


In [9]:
# Score the same model on the held-out test set
pred = ecoc_model.predict(new_data=X_test, categories=satimage_categories)
y_chap = np.array(pred)
acc_test = accuracy_score(y_true=y_test, y_pred=y_chap)

In [10]:
# Report both accuracies with three decimals
for message, score in (
    ("Accuracy on train data = %0.3f", acc_train),
    ("Accuracy on test data = %0.3f", acc_test),
):
    print(message % score)

Accuracy on train data = 0.738
Accuracy on test data = 0.734


L'accuracy est satisfaisante sachant que l'on a utilisé seulement 31 arbres de profondeur égale à 1. On sent que cette méthode a du potentiel. Par exemple, on pourrait prendre plus d'arbres pour prédire un même code. Bien entendu, il ne faut pas les entrainer sur les mêmes données sinon nous aurions simplement plusieurs arbres identiques. Plus simplement, nous pourrions aussi augmenter la profondeur des arbres.

In [11]:
# Same experiment with deeper trees (max_depth=2 instead of 1).
# A fresh model is created so that no previously fitted tree is reused.
ecoc_model = ECOC(nb_categories=nb_categories, model="decision_tree")
ecoc_model.fit(train_data = X_train, y = y_train, max_depth=2, categories=satimage_categories)

# Accuracy on the training set
pred = ecoc_model.predict(X_train, categories = satimage_categories)
y_chap = np.array(pred)
acc_train = accuracy_score(y_train, y_chap)

# Accuracy on the held-out test set
pred = ecoc_model.predict(X_test, categories = satimage_categories)
y_chap = np.array(pred)
acc_test = accuracy_score(y_test, y_chap)

print("Accuracy on train data = %0.3f"%acc_train)
print("Accuracy on test data = %0.3f"%acc_test)

Accuracy on train data = 0.804
Accuracy on test data = 0.793


Nous obtenons en effet de meilleurs résultats en augmentant la profondeur des arbres.