In [1]:
import pandas as pd
import sklearn
import numpy as np
from sklearn.model_selection import train_test_split
import re
import datetime
import time
import pickle
import mlflow
import mlflow.sklearn

In [2]:
df = pd.read_csv('data.csv')
#df = df[df.journal!='Libération']
#df = df[df.journal!='Le Parisien']
#df = df[df.journal!="L'Express"]
df

Unnamed: 0,titre,jour_publication,auteur,journal
0,"Pour les galeries d’art, une situation moins s...",01-02-2021,['Roxana Azimi'],Le Monde
1,Quand la masturbation provoque une hémorragie ...,01-02-2021,['Marc Gozlan'],Le Monde
2,Les pistes de la France pour améliorer les ter...,01-02-2021,['Julien Bouissou'],Le Monde
3,Facebook et Apple se livrent une guerre ouverte,01-02-2021,['Alexandre Piquard'],Le Monde
4,"Kent Walker : « Google agit selon les lois, et...",01-02-2021,['Kent Walker'],Le Monde
...,...,...,...,...
39311,Christina Milian : cet achat complètement impu...,2020-04-01,,Closer
39312,Matt Pokora confiné : il partage une photo tro...,2020-04-01,,Closer
39313,"Meghan, Harry et Archie : que font-ils pendant...",2020-04-01,,Closer
39314,Megxit : quels membres de la famille royale su...,2020-04-01,,Closer


# Preprocessing Nettoyage

In [3]:
# récupération dans le package des stop-mots
import nltk
from nltk.corpus import stopwords

stop_words = stopwords.words('french')
stop_words.extend(["c'est","j'ai","a","plus","contre","après", "d'un","d'une","entre","ans","deux","veut","comme",
"va","trois","sous","faut","n'est","cinq","leurs","doit","qu'il","peut","n'a","mis","six","cette","j'ai","-","s'est","dit","dont"])

In [4]:
from nltk.stem.snowball import FrenchStemmer #import utilisé pour raciniser les mots , finalement pas utilisé

# fonction qui enlève les caractères spéciaux
def nettoyage(text):
    stemmer = FrenchStemmer()
    text = str(text).lower() # mettre les mots en minuscule
    text = re.sub(r"[.,\!\?\%\(\)\/\"]", "", text)  # Retrait les caractères spéciaux :
    text = re.sub(r"\&\S*\s", "", text)
    text = re.sub(r"\d", "", text) 
    text = re.sub(r"\-", "", text) 
    text = re.sub(r"\:", "", text)
    text = re.sub(r"\»", "", text) 
    text = re.sub(r"\«", "", text)
    text = re.sub(r"\’", " ", text)
    text = text.split()
    les_mots = ""
    for mot in text:
        a_ajouter = stemmer.stem(mot)
        if a_ajouter not in stop_words:
            les_mots = les_mots + " "+ a_ajouter
    return les_mots

In [5]:
def encode_cat(x):
        if x =="Le Monde":
            val = 1
        elif x== "Libération":
            val = 2
        elif x =="Le Parisien":
            val = 3
        elif x == "L'Express":
            val = 4
        elif x == "Closer":
            val = 5
  
        return val

In [6]:
df.titre = df.titre.map(lambda x : nettoyage(x))

In [7]:
df.journal = df.journal.map(lambda x : encode_cat(x))
df

Unnamed: 0,titre,jour_publication,auteur,journal
0,galer art situat moin sombr redout,01-02-2021,['Roxana Azimi'],1
1,quand masturb provoqu hémorrag méning,01-02-2021,['Marc Gozlan'],1
2,pist franc amélior term accord libreéchang me...,01-02-2021,['Julien Bouissou'],1
3,facebook apple livrent guerr ouvert,01-02-2021,['Alexandre Piquard'],1
4,kent walk googl agit selon lois non selon pol...,01-02-2021,['Kent Walker'],1
...,...,...,...,...
39311,christin milian cet achat complet impuls qu'e...,2020-04-01,,5
39312,matt pokor confin partag photo trop mignon en...,2020-04-01,,5
39313,meghan harry archi fontil pend confin,2020-04-01,,5
39314,megx quel membr famill royal succèdent harry ...,2020-04-01,,5


## Split

In [8]:
X = df.titre
y = df['journal']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print(np.shape(X_train), np.shape(X_test), np.shape(y_train), np.shape(y_test))

(27521,) (11795,) (27521,) (11795,)


In [9]:
X_train.shape

(27521,)

In [10]:
y_train.value_counts()

5    5992
1    5614
2    5547
4    5217
3    5151
Name: journal, dtype: int64

In [11]:
y_test.value_counts()

5    2534
2    2417
1    2386
3    2240
4    2218
Name: journal, dtype: int64

In [12]:
print(y_train.value_counts().apply(lambda x: x/len(y_train)))
print(y_test.value_counts().apply(lambda x: x/len(y_test)))

5    0.217725
1    0.203990
2    0.201555
4    0.189564
3    0.187166
Name: journal, dtype: float64
5    0.214837
2    0.204917
1    0.202289
3    0.189911
4    0.188046
Name: journal, dtype: float64


## PREPROCESSING Vectorisation et pondération

from sklearn.feature_extraction.text import CountVectorizer , TfidfTransformer
from sklearn.pipeline import Pipeline

vectorizer = CountVectorizer(analyzer='word')
corpus = X_train.to_list()
vectorizer.fit(corpus)
pipe = Pipeline([('count', CountVectorizer(vocabulary= vectorizer.get_feature_names())),
                 ('tfid', TfidfTransformer())]).fit(corpus)

df_tfidf = pipe.transform(corpus).toarray()
print(np.shape(df_tfidf))

In [13]:
## commentaires
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(analyzer='word')
tfidf.fit(X_train)
X_train_word_features = tfidf.transform(X_train)

In [19]:
#print(tfidf.get_feature_names())

In [20]:
test_features = tfidf.transform(X_test)

In [21]:
file = 'Flask_journaux/src/models_pickle/file_tfidf.pkl' 
pickle.dump(tfidf, open(file, 'wb'))

In [24]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

## 1. Modèle KNN

In [25]:
from sklearn.neighbors import KNeighborsClassifier

with mlflow.start_run():
    print("Tuning KMM...")
    knn = KNeighborsClassifier()    
    knn.fit(X_train_word_features, y_train)
    pred_knn = knn.predict(test_features)
    print('Accurancy', accuracy_score(y_test, pred_knn))
    print('Precision', precision_score(y_test, pred_knn ,average='weighted'))
    print('Recall', recall_score(y_test, pred_knn,average='weighted'))
    print('f1-score', f1_score(y_test, pred_knn,average='weighted'))
    print("Fin de traitement")

Tuning KMM...
Accurancy 0.5190334887664264
Precision 0.534041249404564
Recall 0.5190334887664264
f1-score 0.5217686644967492
Fin de traitement


In [26]:
confusion_matrix(y_test, pred_knn)

array([[1054,  720,  204,  330,   78],
       [ 611, 1184,  188,  326,  108],
       [ 409,  502,  999,  258,   72],
       [ 583,  565,  248,  759,   63],
       [  99,  165,   73,   71, 2126]], dtype=int64)

In [27]:
print(classification_report(y_test, pred_knn))

              precision    recall  f1-score   support

           1       0.38      0.44      0.41      2386
           2       0.38      0.49      0.43      2417
           3       0.58      0.45      0.51      2240
           4       0.44      0.34      0.38      2218
           5       0.87      0.84      0.85      2534

    accuracy                           0.52     11795
   macro avg       0.53      0.51      0.52     11795
weighted avg       0.53      0.52      0.52     11795



In [28]:
file = 'Flask_journaux/src/models_pickle/file_model_fitted_Knn.pkl' 
pickle.dump(knn, open(file, 'wb'))

## 2. Modèle Random Forest

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, mean_squared_error

with mlflow.start_run():
    print("Tuning Randomforest...")
    rf = RandomForestClassifier()
    rf.fit(X_train_word_features, y_train)
    pred_rf = rf.predict(test_features)
    print('Accurancy', accuracy_score(y_test, pred_rf))
    print('Precision', precision_score(y_test, pred_rf ,average='weighted'))
    print('Recall', recall_score(y_test, pred_rf,average='weighted'))
    print('f1-score', f1_score(y_test, pred_rf,average='weighted'))
    print("Fin de traitement")

Tuning Randomforest...
Accurancy 0.5676133955065705
Precision 0.5645434329134511
Recall 0.5676133955065705
f1-score 0.55915013905046
Fin de traitement


In [30]:
# calcul de la matrise de confusion 
confusion_matrix(y_test, pred_rf)

array([[ 935,  741,  202,  309,  199],
       [ 346, 1498,  160,  265,  148],
       [ 340,  382, 1073,  252,  193],
       [ 401,  523,  241,  897,  156],
       [  61,  102,   47,   32, 2292]], dtype=int64)

In [31]:
print(classification_report(y_test, pred_rf))

              precision    recall  f1-score   support

           1       0.45      0.39      0.42      2386
           2       0.46      0.62      0.53      2417
           3       0.62      0.48      0.54      2240
           4       0.51      0.40      0.45      2218
           5       0.77      0.90      0.83      2534

    accuracy                           0.57     11795
   macro avg       0.56      0.56      0.55     11795
weighted avg       0.56      0.57      0.56     11795



## 3. MODELE SVM

In [32]:
from sklearn.svm import SVC

with mlflow.start_run():
    print("Tuning SVM...")
    svm = SVC()
    svm.fit(X_train_word_features, y_train)
    pred_svm = svm.predict(test_features)
    print('Accurancy', accuracy_score(y_test, pred_svm))
    print('Precision', precision_score(y_test, pred_svm ,average='weighted'))
    print('Recall', recall_score(y_test, pred_svm,average='weighted'))
    print('f1-score', f1_score(y_test, pred_svm,average='weighted'))
    print("Fin de traitement")

Tuning SVM...
Accurancy 0.5929631199660873
Precision 0.5989666598775786
Recall 0.5929631199660873
f1-score 0.5938089391381794
Fin de traitement


In [33]:
confusion_matrix(y_test, pred_svm)

array([[1217,  548,  172,  377,   72],
       [ 613, 1249,  169,  310,   76],
       [ 369,  311, 1211,  268,   81],
       [ 518,  408,  217, 1010,   65],
       [  60,   80,   56,   31, 2307]], dtype=int64)

In [34]:
print(classification_report(y_test, pred_svm))

              precision    recall  f1-score   support

           1       0.44      0.51      0.47      2386
           2       0.48      0.52      0.50      2417
           3       0.66      0.54      0.60      2240
           4       0.51      0.46      0.48      2218
           5       0.89      0.91      0.90      2534

    accuracy                           0.59     11795
   macro avg       0.60      0.59      0.59     11795
weighted avg       0.60      0.59      0.59     11795



## 2.1. Hyperparamètres & Modèle KNN

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [38]:
with mlflow.start_run():
    print("Tuning KMM...")
    knn = KNeighborsClassifier()
    n_neighbors = range(1, 21, 2)
    weights = ['uniform', 'distance']
    metric = ['euclidean', 'manhattan', 'minkowski']

    # define grid search
    params_knn = dict(n_neighbors = n_neighbors , weights = weights, metric = metric)
    cv_knn = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    random_search_knn = RandomizedSearchCV(estimator=knn, param_distributions = params_knn, n_jobs=-1, cv=cv_knn, scoring='accuracy',error_score=0)
    random_result_knn = random_search_knn.fit(X_train_word_features, y_train)

    # summarize results
    print("Best: %f using %s" % (random_result_knn.best_score_, random_result_knn.best_params_))
    mlflow.log_param('best_params',random_result_knn.best_params_)
    mlflow.log_metric("score", random_result_knn.best_score_)
    print("Fin de traitement")

Best: 0.536184 using {'weights': 'distance', 'n_neighbors': 9, 'metric': 'euclidean'}


In [None]:
with mlflow.start_run():
    print("Traning best model Knn...")
    knn_opti = KNeighborsClassifier(n_neighbors = 9 , weights = 'distance', metric ='euclidean')
    knn_opti.fit(X_train_word_features, y_train)
    pred_rf_opti = knn_opti.predict(test_features)
    print('Accurancy', accuracy_score(y_test, pred_rf_opti))
    print('Precision', precision_score(y_test, pred_rf_opti ,average='weighted'))
    print('Recall', recall_score(y_test, pred_rf_opti,average='weighted'))
    print('f1-score', f1_score(y_test, pred_rf_opti,average='weighted'))
    print("Fin de traitement")

In [None]:
file_knn = 'Flask_journaux/src/models_pickle/file_model_fitted_knn_opti.pkl' 
pickle.dump(knn_opti, open(file_rf, 'wb'))

## 2.2 Hyperparamètres & Random forest

In [43]:
with mlflow.start_run():
    print("Tuning Randomforest...")
    model_forest = RandomForestClassifier()
    n_estimators = [10,100,200]
    max_features = ['sqrt', 'log2']

    params_forest = dict(n_estimators = n_estimators, max_features = max_features)
    cv_forest = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    random_search_forest = RandomizedSearchCV(estimator = model_forest, param_distributions = params_forest, n_jobs=-1, cv=cv_forest, scoring='accuracy',error_score=0)
    random_result_forest = random_search_forest.fit(X_train_word_features, y_train)

    print("Best: %f using %s" % (random_result_forest.best_score_, random_result_forest.best_params_))
    mlflow.log_param('best_params',random_result_forest.best_params_)
    mlflow.log_metric("best_score", random_result_forest.best_score_)
    mlflow.sklearn.log_model(RandomForestClassifier(), "journaux")
    print("Fin de traitement")

Best: 0.593402 using {'n_estimators': 200, 'max_features': 'log2'}
Fin de traitement


In [None]:
with mlflow.start_run():
    print("Tuning Randomforest...")
    rf_opti = RandomForestClassifier(n_estimators=200,max_features=log2)
    rf_opti.fit(X_train_word_features, y_train)
    pred_rf_opti = rf_opti.predict(test_features)
    print('Accurancy', accuracy_score(y_test, pred_rf_opti))
    print('Precision', precision_score(y_test, pred_rf_opti ,average='weighted'))
    print('Recall', recall_score(y_test, pred_rf_opti,average='weighted'))
    print('f1-score', f1_score(y_test, pred_rf_opti,average='weighted'))
    print("Fin de traitement")

In [None]:
file_rf = 'Flask_journaux/src/models_pickle/file_model_fitted_RandonForest.pkl' 
pickle.dump(rf_opti, open(file_rf, 'wb'))

##  2.3. Hyperparamètres & MODELE SVM

In [None]:
with mlflow.start_run(nested=True):
    print("Tuning SVM...")
    svm = SVC()
    kernel = ['poly', 'rbf', 'sigmoid']
    C = [50, 10, 1.0, 0.1, 0.01]
    gamma = ['scale']

    # define grid search
    params_svm = dict(kernel=kernel,C=C,gamma=gamma)
    cv_svm = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    random_search_SVM = RandomizedSearchCV(estimator=svm, param_distributions=params_svm, n_jobs=-1, cv=cv_svm, scoring='accuracy',error_score=0)
    random_result_SVM = random_search_SVM.fit(X_train_word_features, y_train)

    # summarize results
    print("Best: %f using %s" % (random_result_SVM.best_score_, random_result_SVM.best_params_))
    mlflow.log_param('best_params',random_result_SVM.best_params_)
    mlflow.log_metric("best_score", random_result_SVM.best_score_)
    mlflow.sklearn.log_model(SVC(), "journaux")
    print("Fin de traitement")

Tuning SVM...


In [None]:
with mlflow.start_run():
    print("Training best model SVM...")
    svm_opti = SVC(kernel = ['poly', 'rbf', 'sigmoid'], C = 50, gamma = 'scale')
    svm_opti.fit(X_train_word_features, y_train)
    pred_svm_opti = svm_opti.predict(test_features)
    print('Accurancy', accuracy_score(y_test, pred_svm_opti))
    print('Precision', precision_score(y_test, pred_svm_opti ,average='weighted'))
    print('Recall', recall_score(y_test, pred_svm_opti,average='weighted'))
    print('f1-score', f1_score(y_test, pred_svm_opti,average='weighted'))
    print("Fin de traitement")

In [69]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
#create a new KNN model
knn6 = KNeighborsClassifier(n_neighbors=6)
knn19 = KNeighborsClassifier(n_neighbors=19)
cv_scores6 = cross_val_score(knn6, X_train_word_features, y_train, cv=5)
cv_scores19 = cross_val_score(knn19, X_train_word_features, y_train, cv=5)

print(cv_scores6)
print("cv_scores6 mean:{}".format(np.mean(cv_scores6)))
print(cv_scores19)
print("cv_scores19 mean:{}".format(np.mean(cv_scores19)))

[0.51752952 0.51889535 0.50999273 0.51580669 0.515625  ]
cv_scores6 mean:0.5155698572122595
[0.53605813 0.52925145 0.52579942 0.53143169 0.52525436]
cv_scores19 mean:0.5295590095156623


In [73]:
results6 = knn6.fit(X_train_word_features, y_train)

file6 = 'Flask_journaux/src/models_pickle/file_model_fitted_Knn6.pkl' 
pickle.dump(results6, open(file6, 'wb'))

In [74]:
results19 = knn19.fit(X_train_word_features, y_train)

file19 = 'Flask_journaux/src/models_pickle/file_model_fitted_Knn19.pkl' 
pickle.dump(results19, open(file19, 'wb'))

In [75]:
y_hat6 = results6.predict(test_features)
y_hat_proba6 = results6.predict_proba(test_features)

In [76]:
y_hat19 = results19.predict(test_features)
y_hat_proba19 = results19.predict_proba(test_features)

In [77]:
confusion_matrix(y_test, y_hat6)

array([[1082,  714,  185,  332,   73],
       [ 624, 1194,  182,  310,  107],
       [ 420,  504, 1005,  240,   71],
       [ 599,  560,  235,  760,   64],
       [  90,  171,   69,   60, 2144]], dtype=int64)

In [78]:
confusion_matrix(y_test, y_hat19)

array([[ 962,  699,  214,  394,  117],
       [ 520, 1161,  206,  356,  174],
       [ 325,  498, 1034,  273,  110],
       [ 447,  571,  225,  878,   97],
       [  54,  134,   49,   50, 2247]], dtype=int64)

In [79]:
print('Accurancy6', accuracy_score(y_test, y_hat6))
print('Precision', precision_score(y_test, y_hat6 ,average='weighted'))
print('Recall', recall_score(y_test, y_hat6,average='weighted'))
print('f1-score', f1_score(y_test, y_hat6,average='weighted'))

Accurancy6 0.5243747350572276
Precision 0.5407635311686833
Recall 0.5243747350572276
f1-score 0.5270738032755025


In [80]:
print('Accurancy19', accuracy_score(y_test, y_hat19))
print('Precision', precision_score(y_test, y_hat19,average='weighted' ))
print('Recall', recall_score(y_test, y_hat19,average='weighted'))
print('f1-score', f1_score(y_test, y_hat19,average='weighted'))

Accurancy19 0.5325985587113183
Precision 0.5361134648890626
Recall 0.5325985587113183
f1-score 0.5308148634241694


In [34]:
print(classification_report(y_test, y_hat6))

              precision    recall  f1-score   support

           1       0.24      0.20      0.22      2406
           2       0.22      0.40      0.29      2392
           3       0.22      0.62      0.32      2302
           4       0.70      0.09      0.16      2185
           5       0.53      0.07      0.12      2192
           6       0.97      0.18      0.30      2604

    accuracy                           0.26     14081
   macro avg       0.48      0.26      0.23     14081
weighted avg       0.48      0.26      0.24     14081



In [35]:
print(classification_report(y_test, y_hat19))

              precision    recall  f1-score   support

           1       0.47      0.03      0.05      2406
           2       0.19      0.77      0.30      2392
           3       0.19      0.29      0.23      2302
           4       0.67      0.03      0.06      2185
           5       0.58      0.04      0.07      2192
           6       0.96      0.08      0.15      2604

    accuracy                           0.21     14081
   macro avg       0.51      0.21      0.14     14081
weighted avg       0.51      0.21      0.15     14081



## Réglage hyperparametres

In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [37]:
model = sklearn.neighbors.KNeighborsClassifier(n_neighbors=19, weights = 'uniform', metric = 'euclidean') 
#model = LogisticRegression(solver='liblinear')
results = model.fit(X_train, y_train)

file = 'Flask_journaux/models_pickle/file_model_fitted_Knn.pkl' 
pickle.dump(results, open(file, 'wb'))

In [43]:
y_hat = results.predict(X_test)
y_hat_proba = results.predict_proba(X_test)

In [44]:
confusion_matrix(y_test, y_hat)

array([[  66, 1601,  697,    6,   34,    2],
       [  13, 1850,  510,    8,   10,    1],
       [   1, 1625,  676,    0,    0,    0],
       [  16, 1484,  596,   70,   16,    3],
       [  32, 1508,  543,   19,   87,    3],
       [  12, 1790,  584,    2,    4,  212]], dtype=int64)

In [46]:
print('Accurancy', accuracy_score(y_test, y_hat))
print('Precision', precision_score(y_test, y_hat,average='weighted' ))
print('Recall', recall_score(y_test, y_hat,average='weighted'))
print('f1-score', f1_score(y_test, y_hat,average='weighted'))

Accurancy 0.21028336055677863


## TEST DU MODELE

In [43]:
with open("Flask_journaux/models_pickle/file_model_fitted_Knn.pkl", 'rb') as file_model:
    lr = pickle.load(file_model)  
    print ('Model loaded')

with open("Flask_journaux/models_pickle/file_columns_model.pkl", 'rb') as file_columns:
    model_columns = pickle.load(file_columns)
    print ('Model columns loaded')

with open("Flask_journaux/models_pickle/file_vectorization.pkl", 'rb') as vector:
    vectorization = pickle.load(vector)
    print ('vector transfo loaded')

Model loaded
Model columns loaded
vector transfo loaded


In [44]:
from sklearn.preprocessing import StandardScaler
titre ='Bob Mould, trente ans de distorsion'
titre_nettoyé = nettoyage(titre)
print(titre_nettoyé)
query = vectorization.transform([titre_nettoyé])
query 

 bob mould trent an distors


<1x18614 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [45]:
query.shape

(1, 18614)

In [46]:
lr.predict(query)

array([3], dtype=int64)