In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Seed NumPy's global random number generator so a fresh top-to-bottom run is repeatable.
np.random.seed(42)

# <font color='green'>Quatrième partie : Bagging versus Boosting</font>
    
## <font color='blue'>Etape 1. Analyser la base de données "breast_cancer"</font>

In [None]:
# Load the breast-cancer dataset bundled with scikit-learn.
cancer = load_breast_cancer()

# Names of the input features
print("Features: ", cancer["feature_names"])

# Names of the target classes ('malignant' / 'benign')
print("Labels: ", cancer["target_names"])

# Shape of the feature matrix, shown as the cell output
cancer["data"].shape

In [None]:
# Preview the first 5 rows of the feature matrix (the slice previously
# showed only 2 rows although the comment announced 5).
print(cancer.data[:5])

# Print the encoded class labels (0: malignant, 1: benign)
print(cancer.target)

In [None]:
# Feature matrix and target vector taken from the loaded dataset.
x, y = cancer.data, cancer.target

# Hold out 30% of the samples for testing (70% for training);
# the fixed seed keeps the split identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)


## Travail d'analyse
- Décrire la base de données de l'étude.

## <font color='blue'>Etape 2. Comparer : DT, RF, Bagging, Boosting</font>

In [None]:
from sklearn import tree

# Baseline: a single decision tree (no ensemble), seeded for repeatability.
clf = tree.DecisionTreeClassifier(random_state=42).fit(x_train, y_train)

# Predictions on the held-out test set, and the accuracy reported by score().
y_pred = clf.predict(x_test)
accuracy = clf.score(x_test, y_test)
print("Accuracy value:", "%.3f" % (100 * accuracy), "%")


In [None]:
from sklearn.ensemble import BaggingClassifier
# Bagging ensemble of 300 decision trees, seeded for repeatability.
clf = BaggingClassifier(
    tree.DecisionTreeClassifier(),
    n_estimators=300,
    random_state=42,
).fit(x_train, y_train)

# Predictions on the held-out test set, and the accuracy reported by score().
y_pred = clf.predict(x_test)
accuracy = clf.score(x_test, y_test)
print("Accuracy value:", "%.3f" % (100 * accuracy), "%")


In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 300 trees, seeded for repeatability.
clf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
).fit(x_train, y_train)

# Predictions on the held-out test set, and the accuracy reported by score().
y_pred = clf.predict(x_test)
accuracy = clf.score(x_test, y_test)
print("Accuracy value:", "%.3f" % (100 * accuracy), "%")

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost built on decision stumps (one-level decision trees) as weak learner.
weak_learner = DecisionTreeClassifier(max_depth=1)
model_ada_tree = AdaBoostClassifier(
    estimator=weak_learner,
    random_state=42,
)
model_ada_tree.fit(x_train, y_train)

# Accuracy computed by hand: fraction of test labels predicted correctly.
y_pred = model_ada_tree.predict(x_test)
print("Accuracy value:", "%.3f" % (100 * (y_test == y_pred).mean()), "%")


## Travail d'analyse
- Comparer les résultats de ces quatre classifieurs, tous exploitant des arbres. Expliquer.

## <font color='blue'>Etape 3. Etude des hyperparamètres de Adaboost</font>

### Hyperparamètre 1 : estimator
- Il définit le type d'algorithme utilisé comme apprenant faible (weak learner).

In [None]:
# Estimator: decision stump (one-level decision tree).
weak_learner = DecisionTreeClassifier(max_depth=1)
# random_state is passed explicitly, like the other seeded AdaBoost cells in this
# notebook, so the result does not depend on how much of the global RNG was
# consumed by previously executed cells.
model_ada_tree = AdaBoostClassifier(estimator=weak_learner, random_state=42)

model_ada_tree.fit(x_train, y_train)
y_pred = model_ada_tree.predict(x_test)

# Accuracy = fraction of test labels predicted correctly.
print("Accuracy value:","%.3f" %(100*np.mean(y_test==y_pred)),"%")

In [None]:
# Estimator: deeper decision tree (up to 10 levels) instead of a stump.
weak_learner = DecisionTreeClassifier(max_depth=10)
# random_state is passed explicitly, like the other seeded AdaBoost cells in this
# notebook, so the result does not depend on how much of the global RNG was
# consumed by previously executed cells.
model_ada_tree = AdaBoostClassifier(estimator=weak_learner, random_state=42)

model_ada_tree.fit(x_train, y_train)
y_pred = model_ada_tree.predict(x_test)

# Accuracy = fraction of test labels predicted correctly.
print("Accuracy value:","%.3f" %(100*np.mean(y_test==y_pred)),"%")

In [None]:
# Estimator: linear-kernel SVM with probability estimates enabled.
weak_learner = SVC(probability=True, kernel='linear')
# The model gets its own name (it was previously called `model_ada_rl`, a name
# copied from the logistic-regression cell) and an explicit random_state, like
# the other seeded AdaBoost cells in this notebook.
model_ada_svm = AdaBoostClassifier(estimator=weak_learner, random_state=42)

# Fit the scaler on the training data only, then apply the same transform to
# both splits so no test-set statistics are used during training.
scaler = StandardScaler().fit(x_train)

x_train_scale = scaler.transform(x_train)
model_ada_svm.fit(x_train_scale, y_train)

x_test_scale = scaler.transform(x_test)
y_pred = model_ada_svm.predict(x_test_scale)

# Accuracy = fraction of test labels predicted correctly.
print("Accuracy value:","%.3f" %(100*np.mean(y_test==y_pred)),"%")

In [None]:
# Estimator: logistic regression.
weak_learner = LogisticRegression()
# random_state is passed explicitly, like the other seeded AdaBoost cells in
# this notebook.
model_ada_rl = AdaBoostClassifier(estimator=weak_learner, random_state=42)

# Fit the scaler on the training data only, then apply the same transform to
# both splits so no test-set statistics are used during training.
scaler = StandardScaler().fit(x_train)

x_train_scale = scaler.transform(x_train)
model_ada_rl.fit(x_train_scale, y_train)

x_test_scale = scaler.transform(x_test)
y_pred = model_ada_rl.predict(x_test_scale)

# Accuracy = fraction of test labels predicted correctly.
print("Accuracy value:","%.3f" %(100*np.mean(y_test==y_pred)),"%")

## Travail d'analyse
- Comparer les résultats de Adaboost avec ces quatre types d'estimateurs. Expliquer.

### Hyperparamètre 2 : n_estimators
- Il définit le nombre d'estimateurs utilisés pour la construction de l'ensemble.

In [None]:
# AdaBoost on decision stumps with a small ensemble: 5 estimators.
weak_learner = DecisionTreeClassifier(max_depth=1)
# random_state is passed explicitly, like the other seeded AdaBoost cells in
# this notebook, so that the comparison with other n_estimators values does not
# depend on cell execution order.
model_ada_tree = AdaBoostClassifier(estimator=weak_learner, n_estimators=5, random_state=42)

model_ada_tree.fit(x_train, y_train)
y_pred = model_ada_tree.predict(x_test)

# Accuracy = fraction of test labels predicted correctly.
print("Accuracy value:","%.3f" %(100*np.mean(y_test==y_pred)),"%")

In [None]:
# AdaBoost on decision stumps with a larger ensemble: 100 estimators.
weak_learner = DecisionTreeClassifier(max_depth=1)
# random_state is passed explicitly, like the other seeded AdaBoost cells in
# this notebook, so that the comparison with other n_estimators values does not
# depend on cell execution order.
model_ada_tree = AdaBoostClassifier(estimator=weak_learner, n_estimators=100, random_state=42)

model_ada_tree.fit(x_train, y_train)
y_pred = model_ada_tree.predict(x_test)

# Accuracy = fraction of test labels predicted correctly.
print("Accuracy value:","%.3f" %(100*np.mean(y_test==y_pred)),"%")

## Travail d'analyse
- Etudier l'impact du nombre d'estimateurs.

### Hyperparamètre 3 : learning_rate
- Il contrôle la vitesse à laquelle les poids changent à chaque itération lors de l'apprentissage.

Evaluer les performances de Adaboost pour les cas de figure suivants : 

    (i) n_estimators=100 et learning_rate=1

    (ii) n_estimators=100 et learning_rate=2

    (iii) n_estimators=100 et learning_rate=0.5

    (iv) n_estimators=500 et learning_rate=0.5

In [None]:
# Case (i): 100 decision stumps boosted with learning_rate=1, seeded for repeatability.
weak_learner = DecisionTreeClassifier(max_depth=1)
model_ada_tree = AdaBoostClassifier(
    estimator=weak_learner,
    n_estimators=100,
    learning_rate=1,
    random_state=42,
)
model_ada_tree.fit(x_train, y_train)

# Accuracy computed by hand: fraction of test labels predicted correctly.
y_pred = model_ada_tree.predict(x_test)
print("Accuracy value:", "%.3f" % (100 * (y_test == y_pred).mean()), "%")

## Travail d'analyse
- Analyser l'impact du learning rate et étudier sa relation avec le nombre d'estimateurs.

## <font color='blue'>Etape 4. Visualiser la contribution (le poids) et les erreurs de chaque estimateur</font>
    
- "estimator_weights_" indique la contribution de chaque estimateur 
- "estimator_errors_" indique l'erreur de classification de chaque estimateur

In [None]:
# Refit AdaBoost on decision stumps with the discrete SAMME algorithm, then plot
# the weight assigned to each successive estimator.
# NOTE(review): which algorithm is the library default (and whether the
# `algorithm` argument is still accepted) depends on the installed scikit-learn
# version — confirm against the version in use.
weak_learner = DecisionTreeClassifier(max_depth=1)
model_ada_tree = AdaBoostClassifier(
    estimator=weak_learner,
    n_estimators=100,
    algorithm="SAMME",
    learning_rate=1,
    random_state=42,
)

model_ada_tree.fit(x_train, y_train)
y_pred = model_ada_tree.predict(x_test)

# Accuracy = fraction of test labels predicted correctly.
print("Accuracy value:","%.3f" %(100*np.mean(y_test==y_pred)),"%")

# One weight per fitted estimator, in boosting order.
wj = model_ada_tree.estimator_weights_
plt.plot(wj, 'b-')
plt.xlabel('Number of estimators')
plt.ylabel('Estimator weights')
plt.show()

In [None]:
# Re-print the test accuracy of the model fitted in the previous cell
# (relies on y_pred and model_ada_tree still being in the kernel namespace).
print('Accuracy =', round(np.mean(y_test==y_pred)*100,3),'%')

# One error value per fitted estimator, in boosting order.
e = model_ada_tree.estimator_errors_
plt.plot(e, 'r--')
plt.xlabel('Number of estimators')
plt.ylabel('Estimator errors')
plt.show()

## Travail d'analyse
- Interpréter les deux courbes obtenues.

## <font color='blue'>Etape 5. Sélection de variables</font>

In [None]:
# Per-feature importance scores of the fitted AdaBoost model (displayed as the cell output).
model_ada_tree.feature_importances_

In [None]:
# Single-row table pairing each importance score with its feature name.
importances_row = model_ada_tree.feature_importances_.reshape(1, -1)
pd.DataFrame(importances_row, columns=cancer.feature_names)

## Travail d'analyse
- Quelles sont les variables qui semblent pertinentes?

## Travail supplémentaire
- Appliquer Adaboost pour la classification sur la base "iris de Fisher" avec un SVM linéaire comme weak learner.