In [119]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Classificateur par vote

In [42]:
# Generate a noisy two-moons toy dataset, then hold out part of it for testing.
# Both steps are seeded so the notebook reproduces the same split on every run.
X, y = make_moons(
    n_samples=500,
    noise=0.3,
    random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [60]:
# Three different model families, each seeded so the run is reproducible.
base_estimators = [
    ("lr", LogisticRegression(random_state=42)),
    ("rf", RandomForestClassifier(random_state=42)),
    ("svc", SVC(random_state=42)),
]

# No `voting` argument is passed, so the ensemble uses the library default.
voting_clf = VotingClassifier(estimators=base_estimators)

In [78]:
# Train the voting ensemble on the training split.
voting_clf.fit(X_train, y_train)

In [82]:
# Report the test accuracy of every sub-estimator of the ensemble.
#
# `named_estimators_` (with the trailing underscore) exposes the fitted clones
# that `voting_clf.fit` trained, so they can be scored directly. The attribute
# without the underscore holds the unfitted originals, which would have to be
# trained a second time before they could be scored.
# NOTE(review): the fitted clones are trained on label-encoded targets; this
# matches y_test as long as the classes are already 0..n-1 (the predictions
# shown in this notebook are 0/1) — confirm if the labels change.
for name, clf in voting_clf.named_estimators_.items():
    print(name, "=", clf.score(X_test, y_test))

lr = 0.864
rf = 0.896
svc = 0.896


In [84]:
# Ensemble prediction for the first test sample (the slice keeps the input 2-D).
voting_clf.predict(X_test[:1])

array([1], dtype=int64)

La prédiction "1" est le résultat d'un vote rigide : 2 classificateurs sur 3 ont prédit la classe "1".

In [90]:
# Ask each fitted sub-estimator for its own prediction on the first test sample.
[fitted.predict(X_test[:1]) for fitted in voting_clf.estimators_]

[array([1], dtype=int64), array([1], dtype=int64), array([0], dtype=int64)]

Ci-dessus, les prédictions individuelles des 3 classificateurs.

In [92]:
# Accuracy of the whole ensemble on the held-out test split.
voting_clf.score(X_test, y_test)

0.912

Le classificateur par vote obtient une performance plus élevée (0.912) que chacun des classificateurs pris individuellement (0.864 à 0.896).

### Vote souple

In [103]:
# Switch the ensemble to soft voting: it predicts the class with the highest
# probability averaged over all classifiers, which gives more weight to votes
# cast with high confidence.
# The SVC must have `probability=True` so that it can supply class probabilities.
voting_clf.set_params(voting="soft", svc__probability=True)

# Both settings only take effect on the next fit, so retrain before scoring.
voting_clf.fit(X_train, y_train)
voting_clf.score(X_test, y_test)

0.92

### Vote rigide

In [109]:
# Switch the ensemble back to hard (majority) voting.
voting_clf.voting = "hard"

# Hard voting only uses each estimator's predicted class, never its class
# probabilities, so the SVC's probability estimation is switched off: leaving
# it on only adds training cost without affecting the vote.
voting_clf.named_estimators["svc"].probability = False

# The settings only take effect on the next fit, so retrain before scoring.
voting_clf.fit(X_train, y_train)
voting_clf.score(X_test, y_test)

0.912

# MNIST

In [113]:
from scipy.io import arff

In [131]:
# Location of the MNIST ARFF file on the author's machine.
# NOTE(review): this is an absolute local path — adjust it for your environment.
mnist_arff_path = r"C:\Users\nicolas.sales\Desktop\Projet csv\MNIST\mnist_784.arff"

# loadarff returns the records together with the ARFF metadata.
data, meta = arff.loadarff(mnist_arff_path)

# Flatten the records into one 2-D array, then separate the last column
# (used as the target) from the remaining columns (used as features).
records = np.array(data.tolist())
X, y = records[:, :-1], records[:, -1]

In [133]:
# Convert the features to floats and the targets to integers.
X, y = X.astype(float), y.astype(int)

In [141]:
# Sanity check: (number of samples, number of features).
X.shape

(70000, 784)

In [143]:
# Positional split: the first 60,000 rows are used for training, the rest for testing.
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

In [180]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# The logistic regression and the SVC are each wrapped in a pipeline that
# standardizes the features first; the random forest is used on the raw features.
# The SVC is built with probability=True so it can take part in soft voting.
mnist_estimators = [
    ("lr", make_pipeline(StandardScaler(), LogisticRegression(random_state=42, max_iter=1000))),
    ("rf", RandomForestClassifier(random_state=42)),
    ("svc", make_pipeline(StandardScaler(), SVC(random_state=42, probability=True))),
]
voting_clf = VotingClassifier(estimators=mnist_estimators, voting="soft")

# Train the soft-voting ensemble, then report its accuracy on the test set.
voting_clf.fit(X_train, y_train)
soft_score = voting_clf.score(X_test, y_test)
print("Score final (voting='soft'):", soft_score)

Score final (voting='soft'): 0.9635


In [182]:
# Report the test accuracy of every sub-estimator of the ensemble.
#
# `named_estimators_` (with the trailing underscore) exposes the fitted clones
# that `voting_clf.fit` already trained, so they are scored directly. Iterating
# the attribute without the underscore yields the unfitted originals and forces
# a second, expensive training of every model on the 60,000 training rows.
# NOTE(review): the fitted clones are trained on label-encoded targets; this
# matches y_test as long as the classes are already 0..n-1 (presumably the
# digits 0-9 here) — confirm.
for name, clf in voting_clf.named_estimators_.items():
    print(name, "=", clf.score(X_test, y_test))

lr = 0.9217
rf = 0.9705
svc = 0.966


In [183]:
# Use soft voting and make sure the SVC can supply class probabilities.
#
# In this ensemble the "svc" entry is a Pipeline (scaler + SVC), so assigning
# `.probability` on it would only create an unused attribute on the Pipeline.
# The nested parameter path reaches the actual SVC step, which make_pipeline
# names "svc" after its class: <ensemble entry>__<pipeline step>__<parameter>.
voting_clf.set_params(voting="soft", svc__svc__probability=True)

# The settings only take effect on the next fit, so retrain before scoring.
voting_clf.fit(X_train, y_train)
voting_clf.score(X_test, y_test)

0.9635

In [184]:
# Switch the ensemble to hard (majority) voting.
#
# Nothing is set on the "svc" entry: it is a Pipeline here, so assigning
# `.probability` on it never reached the SVC, and hard voting does not use
# class probabilities anyway. The SVC keeps the probability=True it was built
# with; to speed up this fit it can be disabled with
# voting_clf.set_params(svc__svc__probability=False), but it must be
# re-enabled before going back to soft voting.
voting_clf.voting = "hard"

# The setting only takes effect on the next fit, so retrain before scoring.
voting_clf.fit(X_train, y_train)
voting_clf.score(X_test, y_test)

0.9668