In [None]:
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import seaborn as sn
from pprint import pprint
import numpy as np

from sklearn.datasets import fetch_20newsgroups
# use annual index instead?
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
import pandas as pd
import re

# Chargement du dataset 20 newsgroups
Pour plus d'information : https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html

In [None]:
# Fetch the 20 newsgroups corpus; subset='all' requests the training and test portions together.
news = fetch_20newsgroups(subset='all')

In [None]:
# Total number of documents in the corpus
print(f"Number of articles: {len(news.data)}")

In [None]:
# Number of distinct newsgroup categories
print(f"Number of categories: {len(news.target_names)}")

# Exploration du dataset

In [None]:
# Human-readable category names; news.target values are used as indices into this list in later cells
labels = news.target_names
pprint(labels)

In [None]:
# Sample articles, each preceded by its category name
for text, target_id in zip(news.data[:10], news.target[:10]):
    header = f'===== {labels[target_id]} ====='
    # Collapse the article onto a single line so each sample stays compact
    flattened = text.replace('\n', ' ')
    print(header)
    print(flattened, '\n')

# Création d'un modèle de machine learning avec Scikit-Learn
Pour plus d'information :
- Pipeline : https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
- TfidfVectorizer : https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
- MultinomialNB : https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html

Un article de blog qui explique le TF-IDF :
- https://medium.com/analytics-vidhya/tf-idf-term-frequency-technique-easiest-explanation-for-text-classification-in-nlp-with-code-8ca3912e58c3

Un article de blog qui explique le classifieur naïf bayésien (naive Bayes) :
- https://towardsdatascience.com/naive-bayes-classifier-explained-54593abe6e18

In [None]:
# Two-step text classifier: TF-IDF features feeding a multinomial naive Bayes model.
# min_df=50 / max_df=0.5 are passed to the vectorizer to bound the document frequency
# of the terms kept in the vocabulary; NLTK's English stop words are excluded as well.
tfidf_step = TfidfVectorizer(
    stop_words=stopwords.words('english'),
    min_df=50,
    max_df=0.5,
)
classifier = Pipeline(steps=[
    ('vectorizer', tfidf_step),
    ('classifier', MultinomialNB()),
])

# Séparation du dataset en features et target (X, y) et en train et test
Plus d'information : https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [None]:
# Map every integer target to its category name in one vectorized step:
# indexing the array of names with the integer target array replaces a
# Python-level loop over every document.
labelled_target = np.asarray(labels)[news.target]

In [None]:
# Normalise each document: lowercase it, replace every run of characters other
# than a-z with a single space, then trim leading/trailing whitespace.
non_letters = re.compile('[^a-z]+')
texts = [non_letters.sub(' ', document.lower()).strip() for document in news.data]

In [None]:
# Hold out 20% of the documents for evaluation; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labelled_target,
    test_size=0.2,
    random_state=11,
)

# Entraînement du modèle de machine learning sur les données d'entraînement

In [None]:
# Fit every pipeline step (TF-IDF vectorizer, then naive Bayes) on the training split only
classifier.fit(X_train, y_train)

# Qu'est-ce qui s'est passé ?

In [None]:
# Fitting the TF-IDF step computed an IDF weight for every word kept in the vocabulary
vectorizer = classifier.named_steps['vectorizer']
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
idf_ = vectorizer.idf_

In [None]:
# Number of terms kept in the vectorizer's vocabulary
len(feature_names)

In [None]:
# Show an arbitrary window of vocabulary terms alongside their IDF weight
start, stop = 1000, 1042
for position in range(start, stop):
    term = feature_names[position]
    weight = round(idf_[position], 2)
    print(term, ':', weight)

In [None]:
# The fitted vectorizer then turns each document into a vocabulary-sized vector
# whose entries are the TF-IDF scores of the corresponding words
vectorizer = classifier.named_steps['vectorizer']
tmp = vectorizer.transform(X_train[:10])
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# (available since 1.0) and only fall back to the old accessor on older releases.
get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
pd.DataFrame(tmp.toarray(), columns=get_names())

In [None]:
# The naive Bayes step learns, for every category, the log-probability of each word given that category
nb_model = classifier.named_steps['classifier']
# MultinomialNB.coef_ was deprecated in scikit-learn 0.24 and removed in 1.1;
# feature_log_prob_ is the same (n_classes, n_features) matrix for a multiclass model.
# Rows follow nb_model.classes_, so use it as the index rather than a separately built list.
pd.DataFrame(nb_model.feature_log_prob_, index=nb_model.classes_, columns=feature_names).T

In [None]:
# Top 20 words with the highest learned log-probability for the 'alt.atheism' category
nb_model = classifier.named_steps['classifier']
# MultinomialNB.coef_ was deprecated in scikit-learn 0.24 and removed in 1.1;
# feature_log_prob_ is the same (n_classes, n_features) matrix for a multiclass model,
# with rows ordered as nb_model.classes_.
word_log_probs = pd.DataFrame(nb_model.feature_log_prob_, index=nb_model.classes_, columns=feature_names).T
word_log_probs.sort_values(by='alt.atheism', ascending=False).head(20)

# Prédiction des targets des données de test

In [None]:
# Predict a category for every held-out test document
y_pred = classifier.predict(X_test)

In [None]:
# Preview of the predicted targets
y_pred

In [None]:
# Preview of the true targets
y_test

# Construction du rapport de classification
Pour plus d'information sur la précision, le recall et le f1-score : https://fr.wikipedia.org/wiki/Pr%C3%A9cision_et_rappel

In [None]:
# Text report of the classification metrics, comparing true and predicted test labels
print(classification_report(y_test, y_pred))

# Création d'une matrice de confusion

In [None]:
from scikitplot.metrics import plot_confusion_matrix

In [None]:
# Confusion matrix of true vs. predicted categories; x tick labels are rotated
# so the long newsgroup names stay readable.
plot_confusion_matrix(
    y_test,
    y_pred,
    labels=labels,
    figsize=(10, 10),
    x_tick_rotation=90,
)