# Déséquilibre en classification et Challenge

## 1 - Classification à partir d'un jeu de données déséqilibré 

### 1.1 Jeu de données artificiellement généré 

In [1]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix
import numpy as np 

In [2]:
X,Y = make_classification(n_samples=150, n_features=20, n_informative=10,
                                n_redundant=5, n_repeated=5, n_classes=2,
                                n_clusters_per_class=2, weights=[0.9, 0.1],
                                flip_y=0.01, class_sep=1.5, hypercube=True)
cnames=["M","m"]

X_app,X_test,Y_app,Y_test=train_test_split(X,Y,test_size=0.30,random_state=12)

print("*********** Rapport avec déséquilibre ************") 

*********** Rapport avec déséquilibre ************


In [3]:
clf_nb = DummyClassifier(strategy="stratified") 
clf_nb.fit(X_app,Y_app)
y_pred_nb = clf_nb.predict(X_test)
print("Maj:",classification_report(Y_test, y_pred_nb, target_names=cnames))

Maj:               precision    recall  f1-score   support

           M       0.89      0.83      0.86        41
           m       0.00      0.00      0.00         4

    accuracy                           0.76        45
   macro avg       0.45      0.41      0.43        45
weighted avg       0.82      0.76      0.78        45



In [4]:
clf_nb = GaussianNB()
clf_nb.fit(X_app,Y_app)
y_pred_nb = clf_nb.predict(X_test)
print("NB: ",classification_report(Y_test, y_pred_nb, target_names=cnames))

NB:                precision    recall  f1-score   support

           M       0.98      1.00      0.99        41
           m       1.00      0.75      0.86         4

    accuracy                           0.98        45
   macro avg       0.99      0.88      0.92        45
weighted avg       0.98      0.98      0.98        45



In [5]:
clf_dt = DecisionTreeClassifier()
clf_dt.fit(X_app,Y_app)
y_pred_nb = clf_dt.predict(X_test)
print("DT: ",classification_report(Y_test, y_pred_nb, target_names=cnames))


DT:                precision    recall  f1-score   support

           M       0.98      1.00      0.99        41
           m       1.00      0.75      0.86         4

    accuracy                           0.98        45
   macro avg       0.99      0.88      0.92        45
weighted avg       0.98      0.98      0.98        45



In [5]:
clf_kppv = KNeighborsClassifier()
clf_kppv.fit(X_app,Y_app)
y_pred_nb = clf_kppv.predict(X_test)
print("KP: ",classification_report(Y_test, y_pred_nb, target_names=cnames))

KP:                precision    recall  f1-score   support

           M       0.91      1.00      0.95        41
           m       0.00      0.00      0.00         4

    accuracy                           0.91        45
   macro avg       0.46      0.50      0.48        45
weighted avg       0.83      0.91      0.87        45

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Question 1**

Le code ci-dessus renvoie le rapport de classification de plusieurs classifieur sur un jeu de données déséquililbré. 

Le rapport renvoit comme information : 

- Precision : le rapport entre les vrais positifs et la somme des vrais et faux positifs. C'est l'exactitude du classifieur pour une classe. 
    - *Accuracy of positive predictions.*
    - *TP/(TP + FP)*
- Recall : mesure de la complétude du classifieur, c'est-à-dire la capacité d'un classifieur à trouver correctement toutes les instances positives.
    - *Fraction of positives that were correctly identified.*
    - *TP/(TP+FN)*
- f1-score : moyenne harmonique pondérée de la précision et du rappel
    - *En règle générale, la moyenne pondérée de F1 devrait être utilisée pour comparer les modèles de classificateurs, et non la précision globale.*
    - *2x(Recall x Precision) / (Recall + Precision)*
- Support : le nombre d'occurrences réelles de la classe dans l'ensemble de données spécifié


- macro avg : moyenne non pondérée 
- weighted avg : moyenne pondérée par le support (le nombre d'instances vraies pour chaque étiquette). Cela modifie " macro " pour prendre en compte le déséquilibre des étiquettes ; cela peut donner un score F qui ne se situe pas entre la précision et le rappel.

---
[Reference 1](https://www.scikit-yb.org/en/latest/api/classifier/classification_report.html)
[Reference 2](https://muthu.co/understanding-the-classification-report-in-sklearn/)
[Reference 3](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html#sklearn.metrics.precision_recall_fscore_support)

---

**Question 2** 

In [6]:
l_weights=[[0.1, 0.9],[0.2,0.8],[0.3,0.7],[0.5,0.5]]

for i in l_weights : 
    X,Y = make_classification(n_samples=150, n_features=20, n_informative=10,
                                n_redundant=5, n_repeated=5, n_classes=2,
                                n_clusters_per_class=2, weights=[0.9, 0.1],
                                flip_y=0.01, class_sep=1.5, hypercube=True)
    cnames=["M","m"]

    X_app,X_test,Y_app,Y_test=train_test_split(X,Y,test_size=0.30,random_state=12)

    print("*********** Rapport avec déséquilibre / weights = {0}************".format(i)) 
    
    clf_nb = DummyClassifier(strategy="stratified") 
    clf_nb.fit(X_app,Y_app)
    y_pred_nb = clf_nb.predict(X_test)
    print("Maj:",classification_report(Y_test, y_pred_nb, target_names=cnames))
    
    clf_nb = GaussianNB()
    clf_nb.fit(X_app,Y_app)
    y_pred_nb = clf_nb.predict(X_test)
    print("NB: ",classification_report(Y_test, y_pred_nb, target_names=cnames))

    clf_dt = DecisionTreeClassifier()
    clf_dt.fit(X_app,Y_app)
    y_pred_nb = clf_dt.predict(X_test)
    print("DT: ",classification_report(Y_test, y_pred_nb, target_names=cnames))

    clf_kppv = KNeighborsClassifier()
    clf_kppv.fit(X_app,Y_app)
    y_pred_nb = clf_kppv.predict(X_test)
    print("KP: ",classification_report(Y_test, y_pred_nb, target_names=cnames))

*********** Rapport avec déséquilibre / weights = [0.1, 0.9]************
Maj:               precision    recall  f1-score   support

           M       0.86      0.80      0.83        40
           m       0.00      0.00      0.00         5

    accuracy                           0.71        45
   macro avg       0.43      0.40      0.42        45
weighted avg       0.77      0.71      0.74        45

NB:                precision    recall  f1-score   support

           M       0.97      0.93      0.95        40
           m       0.57      0.80      0.67         5

    accuracy                           0.91        45
   macro avg       0.77      0.86      0.81        45
weighted avg       0.93      0.91      0.92        45

DT:                precision    recall  f1-score   support

           M       0.97      0.95      0.96        40
           m       0.67      0.80      0.73         5

    accuracy                           0.93        45
   macro avg       0.82      0.88      0

La diminution du déséquilibre permet d'observer une amélioration des résultats pour certains modèles, tandis que d'autres parraissent meilleurs lorsque le weight est déséquilibré. 

### 1.2 Sur de vraies données  

**Question 1**

In [7]:
from sklearn.datasets import load_breast_cancer

In [8]:
data = load_breast_cancer()

In [9]:
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.30,random_state=12)

In [10]:
cnames=list(data.target_names)

print("*********** Rapport avec déséquilibre ************") 

clf_nb = DummyClassifier(strategy="stratified") 
clf_nb.fit(X_train,Y_train)
y_pred_nb = clf_nb.predict(X_test)
print("Maj:",classification_report(Y_test, y_pred_nb, target_names=cnames))

clf_nb = GaussianNB()
clf_nb.fit(X_train,Y_train)
y_pred_nb = clf_nb.predict(X_test)
print("NB: ",classification_report(Y_test, y_pred_nb, target_names=cnames))

clf_dt = DecisionTreeClassifier()
clf_dt.fit(X_train,Y_train)
y_pred_nb = clf_dt.predict(X_test)
print("DT: ",classification_report(Y_test, y_pred_nb, target_names=cnames))

clf_kppv = KNeighborsClassifier()
clf_kppv.fit(X_train,Y_train)
y_pred_nb = clf_kppv.predict(X_test)
print("KP: ",classification_report(Y_test, y_pred_nb, target_names=cnames))

*********** Rapport avec déséquilibre ************
Maj:               precision    recall  f1-score   support

   malignant       0.32      0.38      0.35        64
      benign       0.59      0.53      0.56       107

    accuracy                           0.47       171
   macro avg       0.46      0.45      0.45       171
weighted avg       0.49      0.47      0.48       171

NB:                precision    recall  f1-score   support

   malignant       0.95      0.84      0.89        64
      benign       0.91      0.97      0.94       107

    accuracy                           0.92       171
   macro avg       0.93      0.91      0.92       171
weighted avg       0.93      0.92      0.92       171

DT:                precision    recall  f1-score   support

   malignant       0.89      0.84      0.86        64
      benign       0.91      0.93      0.92       107

    accuracy                           0.90       171
   macro avg       0.90      0.89      0.89       171
weighted

**Question 2**

In [11]:
from collections import Counter
from random import randint

In [12]:
while max(Counter(y).values()) - min(Counter(y).values()) != 0:
    i = randint(0, len(y))
    if y[i] == 1 : 
        X = np.delete(X,(i),axis=0)
        y = np.delete(y,i)
print(Counter(y).values())

dict_values([212, 212])


In [13]:
print(X.shape)
print(y.shape)

(424, 30)
(424,)


In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.30,random_state=12)

In [15]:
print("*********** Rapport avec déséquilibre ************") 

clf_nb = DummyClassifier(strategy="stratified") 
clf_nb.fit(X_train,Y_train)
y_pred_nb = clf_nb.predict(X_test)
print("Maj:",classification_report(Y_test, y_pred_nb, target_names=cnames))

clf_nb = GaussianNB()
clf_nb.fit(X_train,Y_train)
y_pred_nb = clf_nb.predict(X_test)
print("NB: ",classification_report(Y_test, y_pred_nb, target_names=cnames))

clf_dt = DecisionTreeClassifier()
clf_dt.fit(X_train,Y_train)
y_pred_nb = clf_dt.predict(X_test)
print("DT: ",classification_report(Y_test, y_pred_nb, target_names=cnames))

clf_kppv = KNeighborsClassifier()
clf_kppv.fit(X_train,Y_train)
y_pred_nb = clf_kppv.predict(X_test)
print("KP: ",classification_report(Y_test, y_pred_nb, target_names=cnames))

*********** Rapport avec déséquilibre ************
Maj:               precision    recall  f1-score   support

   malignant       0.44      0.49      0.47        61
      benign       0.48      0.43      0.46        67

    accuracy                           0.46       128
   macro avg       0.46      0.46      0.46       128
weighted avg       0.46      0.46      0.46       128

NB:                precision    recall  f1-score   support

   malignant       0.93      0.90      0.92        61
      benign       0.91      0.94      0.93        67

    accuracy                           0.92       128
   macro avg       0.92      0.92      0.92       128
weighted avg       0.92      0.92      0.92       128

DT:                precision    recall  f1-score   support

   malignant       0.90      0.98      0.94        61
      benign       0.98      0.90      0.94        67

    accuracy                           0.94       128
   macro avg       0.94      0.94      0.94       128
weighted

**Question 3**

1ère technique

In [11]:
X, y = load_breast_cancer(return_X_y=True)

In [17]:
while max(Counter(y).values()) != min(Counter(y).values()):
    i = randint(0, len(y)-1)
    if y[i] == 0 :
        X = np.append(X,[X[i]], axis = 0)
        y = np.append(y,[y[i]], axis = 0)

print(Counter(y).values())

dict_values([357, 357])


In [18]:
print(X.shape)
print(y.shape)

(714, 30)
(714,)


In [12]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.30,random_state=12)

In [20]:
print("*********** Rapport avec déséquilibre ************") 

clf_nb = DummyClassifier(strategy="stratified") 
clf_nb.fit(X_train,Y_train)
y_pred_nb = clf_nb.predict(X_test)
print("Maj:",classification_report(Y_test, y_pred_nb, target_names=cnames))

clf_nb = GaussianNB()
clf_nb.fit(X_train,Y_train)
y_pred_nb = clf_nb.predict(X_test)
print("NB: ",classification_report(Y_test, y_pred_nb, target_names=cnames))

clf_dt = DecisionTreeClassifier()
clf_dt.fit(X_train,Y_train)
y_pred_nb = clf_dt.predict(X_test)
print("DT: ",classification_report(Y_test, y_pred_nb, target_names=cnames))

clf_kppv = KNeighborsClassifier()
clf_kppv.fit(X_train,Y_train)
y_pred_nb = clf_kppv.predict(X_test)
print("KP: ",classification_report(Y_test, y_pred_nb, target_names=cnames))

*********** Rapport avec déséquilibre ************
Maj:               precision    recall  f1-score   support

   malignant       0.62      0.56      0.59       117
      benign       0.53      0.59      0.56        98

    accuracy                           0.58       215
   macro avg       0.58      0.58      0.58       215
weighted avg       0.58      0.58      0.58       215

NB:                precision    recall  f1-score   support

   malignant       0.97      0.91      0.94       117
      benign       0.90      0.97      0.94        98

    accuracy                           0.94       215
   macro avg       0.94      0.94      0.94       215
weighted avg       0.94      0.94      0.94       215

DT:                precision    recall  f1-score   support

   malignant       0.96      0.91      0.93       117
      benign       0.90      0.96      0.93        98

    accuracy                           0.93       215
   macro avg       0.93      0.93      0.93       215
weighted

2ème technique

In [22]:
X, y = load_breast_cancer(return_X_y=True)

In [23]:
while max(Counter(y).values()) != min(Counter(y).values()):
    i = randint(0, len(y)-1)
    if y[i] == 0 :
        mu = abs(np.mean(y[i]))
        sigma = np.std(y[i])
        noise = np.random.normal(mu, sigma, 30)
        noisy = X[i].copy()
        noisy += noise
        X = np.append(X,[noisy], axis = 0)
        y = np.append(y,[y[i]], axis = 0)

print(Counter(y).values())

dict_values([357, 357])


In [24]:
print(X.shape)
print(y.shape)

(714, 30)
(714,)


In [25]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.30,random_state=12)

In [26]:
print("*********** Rapport avec déséquilibre ************") 

clf_nb = DummyClassifier(strategy="stratified") 
clf_nb.fit(X_train,Y_train)
y_pred_nb = clf_nb.predict(X_test)
print("Maj:",classification_report(Y_test, y_pred_nb, target_names=cnames))

clf_nb = GaussianNB()
clf_nb.fit(X_train,Y_train)
y_pred_nb = clf_nb.predict(X_test)
print("NB: ",classification_report(Y_test, y_pred_nb, target_names=cnames))

clf_dt = DecisionTreeClassifier()
clf_dt.fit(X_train,Y_train)
y_pred_nb = clf_dt.predict(X_test)
print("DT: ",classification_report(Y_test, y_pred_nb, target_names=cnames))

clf_kppv = KNeighborsClassifier()
clf_kppv.fit(X_train,Y_train)
y_pred_nb = clf_kppv.predict(X_test)
print("KP: ",classification_report(Y_test, y_pred_nb, target_names=cnames))

*********** Rapport avec déséquilibre ************
Maj:               precision    recall  f1-score   support

   malignant       0.49      0.41      0.45       117
      benign       0.41      0.49      0.45        98

    accuracy                           0.45       215
   macro avg       0.45      0.45      0.45       215
weighted avg       0.45      0.45      0.45       215

NB:                precision    recall  f1-score   support

   malignant       0.98      0.86      0.92       117
      benign       0.86      0.98      0.91        98

    accuracy                           0.92       215
   macro avg       0.92      0.92      0.92       215
weighted avg       0.92      0.92      0.92       215

DT:                precision    recall  f1-score   support

   malignant       0.92      0.93      0.93       117
      benign       0.92      0.91      0.91        98

    accuracy                           0.92       215
   macro avg       0.92      0.92      0.92       215
weighted

## 1 - Challenge sur donnees réelles (Kaggle), pour pratiquer

In [50]:
import pandas as pd
from sklearn import preprocessing

df = pd.read_csv('titanic.csv', sep = ';')
le = preprocessing.LabelEncoder()

In [51]:
df['Sex'] = le.fit_transform(df['Sex'])

In [52]:
nb_manquantes = len(df['Embarked'][df.Embarked.isnull()])
val_remplacement = df['Embarked'].dropna().mode().values
df['Embarked'][df['Embarked'].isnull()] = val_remplacement

print(df.iloc[96,11])
print(df.iloc[99,11])

S
S
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Embarked'][df['Embarked'].isnull()] = val_remplacement


Les deux valeurs manquantes ont été remplacées par la valeur S, qui est présente en majorité dans la colonne Embarked.

In [53]:
df['Embarked'] = le.fit_transform(df['Embarked'])

In [54]:
X_train = df['Survived']

In [55]:
df = df.drop(['Survived', 'Cabin', 'Name', 'Ticket', 'PassengerId','Fare'], axis=1)