# SUMMARY

## [Import libraries and files](#ch0)

# [1 - LDA ](#ch1)

# [2 - RNN Models ](#ch2)


## [Export](#ch99)

<a id="ch0"></a>
## Import libraries and files

In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import seaborn as sns
import time

# Tokenize
import nltk
from collections import Counter
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import re

# Features Extract
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from wordcloud import WordCloud

# RNN models

import tensorflow as tf
import tensorflow.keras
from tensorflow.keras import backend as K

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import metrics as kmetrics
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
import tensorflow_hub as hub

import os
import pickle
from joblib import load
import transformers
from transformers import AutoTokenizer

import gensim

#LDA

import gensim.corpora as corpora
import pyLDAvis
import pyLDAvis.gensim_models
from gensim.models import CoherenceModel

# Test models
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import jaccard_score, accuracy_score, f1_score, precision_score, recall_score

# OneVsRest Classifications
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier


  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Read files
PATH_DATA = '/Users/alexandremonod/Python/OC_ML/projet_5_categoriser_automatiquement_questions/data'
PATH_MODELS = '/Users/alexandremonod/Python/OC_ML/projet_5_categoriser_automatiquement_questions/models'

data = pd.read_pickle(f'{PATH_DATA}/data.pkl')
w2v_data = pd.read_pickle(f'{PATH_DATA}/w2v.pkl')
cv_data = pd.read_pickle(f'{PATH_DATA}/cv_data.pkl')
ctf_data = pd.read_pickle(f'{PATH_DATA}/ctf_data.pkl')
use_data = pd.read_csv(f'{PATH_DATA}/df_use.csv')
bert_hf_data = pd.read_csv(f'{PATH_DATA}/df_bert_huggingface.csv')
bert_tf_data = pd.read_csv(f'{PATH_DATA}/df_bert_hub_tensorflow.csv')

In [6]:
import pickle

<a id="ch1"></a>
## I - LDA

In [4]:
id2word = corpora.Dictionary(data.body)

corpus = []
for body in data.body:
    new = id2word.doc2bow(body)
    corpus.append(new)

print (corpus[0][0:20])

word = id2word[[0][:1][0]]
print (word)

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)]
ability 


In [5]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=30,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto")

KeyboardInterrupt: 

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis

  by='saliency', ascending=False).head(R).drop('saliency', 1)
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  other = LooseVersion(other)
  other = LooseVersion(other)
  other = LooseVersion(other)
  other = LooseVersion(other)
  other = LooseVersion(other)
  other = LooseVersion(other)
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  other = LooseVersion(other

In [None]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data.body, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.3621676168196795


In [None]:
# Find optimal n_topics
dict_coherence = {}
for n_topics in range(5, 16):
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=n_topics,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto")
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data.body, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    dict_coherence.update({f'Coherence Score with {n_topics} topics : ': coherence_lda})

dict_coherence

{'Coherence Score with 5 topics : ': 0.41323368341140265,
 'Coherence Score with 6 topics : ': 0.4100261622297971,
 'Coherence Score with 7 topics : ': 0.47525819536648706,
 'Coherence Score with 8 topics : ': 0.41221334115205877,
 'Coherence Score with 9 topics : ': 0.4670340784747662,
 'Coherence Score with 10 topics : ': 0.39028121775147506,
 'Coherence Score with 11 topics : ': 0.39260031578108223,
 'Coherence Score with 12 topics : ': 0.3735052238244689,
 'Coherence Score with 13 topics : ': 0.33634904991982617,
 'Coherence Score with 14 topics : ': 0.33381663918714877,
 'Coherence Score with 15 topics : ': 0.3159956900429516}

In [None]:
max_value = max(dict_coherence.values())
max_key = dict_coherence.get(max_value)
best_score = [k for k,v in dict_coherence.items() if v == max_value]
print("Best coherence score is  :", best_score)

Best coherence score is  : ['Coherence Score with 7 topics : ']


In [None]:
# LDA with best model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=7,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto")
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis

  by='saliency', ascending=False).head(R).drop('saliency', 1)
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  other = LooseVersion(other)
  other = LooseVersion(other)
  other = LooseVersion(other)
  other = LooseVersion(other)
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__versi

5e groupe plus lié à Python/Pandas.

In [None]:
# Calculate Document/topic matrix with Gensim
doc_topic = pd.DataFrame(lda_model\
                             .get_document_topics(corpus,
                                                  minimum_probability=0))
for topic in doc_topic.columns:
    doc_topic[topic] = doc_topic[topic].apply(lambda x : x[1])

y = data["tags"]
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(y)
y_binarized = multilabel_binarizer.transform(y)
print('document/tag : ', y_binarized.shape)
print('document/topic : ', doc_topic.shape)

document/tag :  (27734, 100)
document/topic :  (27734, 7)


In [None]:
# Matricial multiplication with Document / Topics transpose
topic_tag = np.matmul(doc_topic.T, y_binarized)
topic_tag.shape

(7, 100)

In [None]:
topic_tag.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,47.688388,8.109638,26.873428,61.099993,7.961717,6.508955,7.136592,6.119793,12.412587,23.420475,...,24.370106,21.196874,14.105282,11.841306,14.331161,11.141632,28.837082,9.049625,24.801179,8.424082
1,416.011704,79.879247,188.429018,500.10919,39.278243,42.767777,69.981422,46.155208,45.367205,137.058062,...,142.799618,160.977227,45.816043,80.556388,64.941686,75.129242,146.208143,91.832601,156.93863,55.036014
2,121.637772,60.645695,30.030538,290.080688,23.352157,26.919918,37.39334,25.749458,10.931405,32.598423,...,36.861088,127.719222,16.172717,23.631448,16.585367,20.634834,42.481612,43.365244,103.844402,22.179612


Nous obtenons donc une matrice dont les lignes représentent les Topics créés et les colonnes les Tags associés et leurs distribution. Chaque valeur est un poids dont nous n'avons pas l'échelle : on peut juste les comparer entre eux, mais 47,68 par exemple, ne veut rien dire en soi, en-dehors de la comparaison avec les autres valeurs.  
Nous allons donc créer nos prédictions en prenant les  n premiers tags associés aux topics de chaque document :

In [None]:
y_results = pd.DataFrame(y)
y_results["best_topic"] = doc_topic.idxmax(axis=1).values
y_results["nb_tags"] = y_results["tags"].apply(lambda x : len(x))

df_y_bin = pd.DataFrame(y_binarized)
df_dict = dict(
    list(
        df_y_bin.groupby(df_y_bin.index)
    )
)

tags_num = []
for k, v in df_dict.items():
    check = v.columns[(v == 1).any()]
    tags_num.append(check.to_list())

y_results["y_true"] = tags_num
y_results.head(3)

Unnamed: 0,tags,best_topic,nb_tags,y_true
0,[c#],6,1,[17]
1,"[c, file]",6,2,"[16, 32]"
2,"[c#, .net]",6,2,"[0, 17]"


In [None]:
y_results.head(10)

Unnamed: 0,tags,best_topic,nb_tags,y_true
0,[c#],6,1,[17]
1,"[c, file]",6,2,"[16, 32]"
2,"[c#, .net]",6,2,"[0, 17]"
3,"[scala, apache-spark, dataframe]",6,3,"[8, 25, 81]"
4,"[php, mysql]",6,2,"[62, 71]"
5,"[.net, windows]",6,2,"[0, 96]"
6,"[c, image, opencv]",6,3,"[16, 41, 67]"
7,"[xcode, macos]",6,2,"[55, 98]"
8,"[php, mysql]",6,2,"[62, 71]"
9,[algorithm],6,1,[2]


In [None]:
# Select predicted tags in Topics / Tags matrix
list_tag = []
for row in y_results.itertuples():
    nb_tags = row.nb_tags
    best_topic = row.best_topic
    row_tags = list(topic_tag.iloc[best_topic]\
                    .sort_values(ascending=False)[0:nb_tags].index)
    list_tag.append(row_tags)
    
y_results["y_pred"] = list_tag
y_results.head(3)

Unnamed: 0,tags,best_topic,nb_tags,y_true,y_pred
0,[c#],6,1,[17],[46]
1,"[c, file]",6,2,"[16, 32]","[46, 73]"
2,"[c#, .net]",6,2,"[0, 17]","[46, 73]"


In [None]:
def metrics_score(model, df, y_true, y_pred):

    if(df is not None):
        temp_df = df
    else:
        temp_df = pd.DataFrame(index=["Accuracy", "F1",
                                      "Jaccard", "Recall",
                                      "Precision"],
                               columns=[model])
        
    scores = []
    scores.append(accuracy_score(y_true, 
                                         y_pred))
    scores.append(f1_score(y_pred, 
                                   y_true, 
                                   average='weighted'))
    scores.append(jaccard_score(y_true, 
                                        y_pred, 
                                        average='weighted'))
    scores.append(recall_score(y_true, 
                                       y_pred, 
                                       average='weighted'))
    scores.append(precision_score(y_true, 
                                          y_pred, 
                                          average='weighted'))
    temp_df[model] = scores
    
    return temp_df

In [None]:
# Create matrix for pred and true y LDA
lda_y_pred = np.zeros(y_binarized.shape)
n = 0
for row in y_results.y_pred.values:
    for i in range(len(row)):
        lda_y_pred[n,row[i]] = 1
    n+=1
    
lda_y_true = np.zeros(y_binarized.shape)
m = 0
for row in y_results.y_true.values:
    for i in range(len(row)):
        lda_y_true[m,row[i]] = 1
    m+=1

In [None]:
df_metrics_compare = metrics_score("LDA", df=None,
                                   y_true=lda_y_true,
                                   y_pred=lda_y_pred)
df_metrics_compare

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,LDA
Accuracy,0.090358
F1,0.213843
Jaccard,0.031305
Recall,0.135193
Precision,0.084118


### OneVsRest Classifications

#### CountVectorizer

In [10]:
# Logistic Regression

# CountVectorizer
X = cv_data.drop(columns=["tag"])
y = cv_data["tag"]
X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(X, y, test_size=0.2)

# Instanciation du transformer
multilabel_binarizer_cv = MultiLabelBinarizer()    # Comme ça on peut faire plsuieurs types de transformations : inverse par ex
y_train_binarized_cv = multilabel_binarizer_cv.fit_transform(y_train_cv)
y_test_binarized_cv = multilabel_binarizer_cv.fit_transform(y_test_cv)

In [11]:
param_logit = {"estimator__C": [0.1], #[1, 0.1, 0.01], # Plus C est petit, plus la régularisation est forte. C'est l'inverse la force de régularisation.
                                                    # Plus C est grand, plus on accorde de poids à la data d'entrainement

                                                    # A high value of C tells the model to give high weight to the training data, and a lower weight to the complexity penalty.
                                                    # A low value tells the model to give more weight to this complexity penalty at the expense of fitting to the training data. 
                                                    # Basically, a high C means "Trust this training data a lot", 
                                                    # while a low value says "This data may not be fully representative of the real world data, 
                                                    # so if it's telling you to make a parameter really large, don't listen to it".

               "estimator__penalty": ["l1", "l2"],  # Modèles qui pénalisent les poids nuls : ils diminuent les poids des vecteurs peu informatifs (avec ncp de zéros)
                                                    # elasticnet : combination of l1 and l2

               "estimator__solver": ["lbfgs"], ### Liblinear est un solver utilisé pour résoudre un pb d'accumulation. Il est limité pour OneVsRest --> regarder les autres dans la page
                                                   ### Essayer les autres (les mettre dans une liste, comme les autres params)

                                                   ### Liblinear est un solver utilisé pour résoudre un pb d'accumulation. Il est limité pour OneVsRest --> regarder les autres dans la page
                                                   ### Essayer les autres (les mettre dans une liste, comme les autres params)
                                                   ### newton-cg et lbfgs lent avec grands datasets (https://towardsdatascience.com/dont-sweat-the-solver-stuff-aea7cddc3451)
                                                   ### saga et sag ne convergent pas : 
                "estimator__class_weight": ["balanced"]}

multi_lr_cv = GridSearchCV(OneVsRestClassifier(LogisticRegression()),
                              param_grid=param_logit,
                              n_jobs=-1,
                              cv=3, # Le GridSearch garantit la cross-validation avec cet argument
                              scoring=["f1_weighted", "accuracy"], # Peut mettre "accuracy" aussi
                              return_train_score = True,
                              refit="f1_weighted", # Le modèel calcule les scorings avec tous les éléments de la liste scoring, mais en utilise un seul pour recalculer
                                                    # Quand il aura trouvé le meilleur estimator il va appliquer ses paramètres sur l'ensemble du dataframe ; mais en fonction du scoring mentiojné ici
                                                    # Ici : f1_weighted
                              #verbose=3            # Me donne le suivi de ce que fait l'algo pendant qu'il fonctionne
                              ) 

multi_lr_cv.fit(X_train_cv, y_train_binarized_cv)
pickle.dump(multi_lr_cv, open(f"{PATH_MODELS}/multi_lr_cv.pkl", 'wb'))

KeyboardInterrupt: 

In [None]:
# Récupération des best parameters
logit_best_params = multi_lr_cv.best_params_
logit_best_params

{'estimator__C': 0.1,
 'estimator__class_weight': 'balanced',
 'estimator__penalty': 'l2',
 'estimator__solver': 'lbfgs'}

In [None]:
# Predict
y_test_predicted_labels_lr_cv = multi_lr_cv.predict(X_test_cv)

# Inverse transform
y_test_pred_inversed_lr_cv = multilabel_binarizer_cv.inverse_transform(y_test_predicted_labels_lr_cv)
y_test_inversed_lr_cv = multilabel_binarizer_cv.inverse_transform(y_test_binarized_cv)

print("-"*50)
print("Print 5 first predicted Tags vs true Tags")
print("-" * 50)
print("Predicted:", y_test_pred_inversed_lr_cv[0:5])
print("True:", y_test_inversed_lr_cv[0:5])

--------------------------------------------------
Print 5 first predicted Tags vs true Tags
--------------------------------------------------
Predicted: [('linux', 'multithreading'), ('.net', 'asp.net', 'asp.net-mvc', 'authentication', 'c#'), ('cocoa-touch', 'ios', 'ipad', 'iphone', 'objective-c', 'swift', 'xcode'), ('machine-learning', 'numpy', 'python', 'r'), ('hibernate', 'java', 'jpa', 'json', 'spring')]
True: [('c', 'linux', 'multithreading'), ('asp.net', 'asp.net-mvc', 'authentication', 'c#'), ('cocoa-touch', 'ios', 'objective-c'), ('machine-learning', 'python'), ('java', 'json', 'spring-mvc')]


In [None]:
# Calcul du score Jaccard pour tous les tags
jac_scor_lr_cv = jaccard_score(y_test_binarized_cv, y_test_predicted_labels_lr_cv, average='weighted')
ac_scor_lr_cv = accuracy_score(y_test_binarized_cv, y_test_predicted_labels_lr_cv)
f1_scor_lr_cv = f1_score(y_test_binarized_cv, y_test_predicted_labels_lr_cv, average = "weighted")
prec_scor_lr_cv = precision_score(y_test_binarized_cv, y_test_predicted_labels_lr_cv, average = "weighted")
rec_scor_lr_cv = recall_score(y_test_binarized_cv, y_test_predicted_labels_lr_cv, average = "weighted")


scores_rl_cv = [jac_scor_lr_cv, ac_scor_lr_cv, f1_scor_lr_cv, prec_scor_lr_cv, rec_scor_lr_cv]
print(jac_scor_lr_cv, ac_scor_lr_cv, f1_scor_lr_cv, prec_scor_lr_cv, rec_scor_lr_cv)

0.42689026019827564 0.10041463854335676 0.5813910397855518 0.48599145107966824 0.7562807082599425


In [None]:
# GridSearch SGDClassifier

grid = {
    'estimator__alpha': [0.1], # learning rate
    #'estimator__n_iter': [1000], # number of epochs
    'estimator__loss': ['log'], # logistic regression,
    'estimator__penalty': ['l1', 'l2'],
    'estimator__n_jobs': [-1]
}

sgdc_cv = GridSearchCV(OneVsRestClassifier(SGDClassifier()),
                              param_grid=grid,
                              n_jobs=-1,
                              cv=5, # Le GridSearch garantit la cross-validation avec cet argument
                              scoring="accuracy", # Peut mettre "weighted_label" aussi
                              return_train_score = True,
                              refit=True,
                              verbose=3) # Me donne le suivi de ce que fait l'algo pendant qu'il fonctionne
sgdc_cv.fit(X_train_cv, y_train_binarized_cv)
pickle.dump(sgdc_cv, open(f"{PATH_MODELS}/sgdc_cv.pkl", 'wb'))

Fitting 5 folds for each of 2 candidates, totalling 10 fits


  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)


[CV 2/5] END estimator__alpha=0.1, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l1;, score=(train=0.057, test=0.060) total time= 1.2min
[CV 1/5] END estimator__alpha=0.1, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l1;, score=(train=0.057, test=0.058) total time= 1.2min
[CV 3/5] END estimator__alpha=0.1, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l1;, score=(train=0.057, test=0.059) total time= 1.2min


  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)


[CV 4/5] END estimator__alpha=0.1, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l1;, score=(train=0.059, test=0.051) total time= 1.2min
[CV 1/5] END estimator__alpha=0.1, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l2;, score=(train=0.061, test=0.063) total time= 1.2min
[CV 5/5] END estimator__alpha=0.1, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l1;, score=(train=0.057, test=0.058) total time= 1.2min
[CV 2/5] END estimator__alpha=0.1, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l2;, score=(train=0.061, test=0.064) total time= 1.2min
[CV 3/5] END estimator__alpha=0.1, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l2;, score=(train=0.061, test=0.062) total time= 1.2min
[CV 4/5] END estimator__alpha=0.1, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l2;, score=(train=0.063, test=0.055) total time=  42.0s
[CV 5/5] END estimator__alpha=0.1, estimator__loss=log, estimator__n_jobs=-1, esti

In [None]:
# Predict
y_test_predicted_labels_sgdc_cv = sgdc_cv.predict(X_test_cv)

# Inverse transform
y_test_pred_inversed_sgdc_cv = multilabel_binarizer_cv.inverse_transform(y_test_predicted_labels_sgdc_cv)
y_test_inversed_sgdc_cv = multilabel_binarizer_cv.inverse_transform(y_test_binarized_cv)

print("-"*50)
print("Print 5 first predicted Tags vs true Tags")
print("-" * 50)
print("Predicted:", y_test_pred_inversed_sgdc_cv[0:5])
print("True:", y_test_inversed_sgdc_cv[0:5])

--------------------------------------------------
Print 5 first predicted Tags vs true Tags
--------------------------------------------------
Predicted: [(), (), (), (), ()]
True: [('c#',), ('c',), ('ios', 'ipad', 'iphone', 'objective-c'), ('bash', 'linux', 'shell'), ('hibernate', 'java', 'spring')]


In [None]:
# Ici on a bcp de valeurs prédites manquantes : on utilise predict_proba pour voir quels sont les tags les plus probables
y_test_predicted_proba_labels_sgdc_cv = sgdc_cv.predict_proba(X_test_cv)
#y_test_pred_proba_inversed_lr_cv = multilabel_binarizer_cv.inverse_transform(y_test_predicted_proba_labels_lr_cv)
print("Predicted proba:", y_test_predicted_proba_labels_sgdc_cv[0:5])

In [None]:
# Calcul du score Jaccard pour tous les tags
jac_scor_sgdc_cv = jaccard_score(y_test_binarized_cv, y_test_predicted_labels_sgdc_cv, average='weighted') 
ac_scor_sgdc_cv = accuracy_score(y_test_binarized_cv, y_test_predicted_labels_sgdc_cv)
f1_scor_sgdc_cv = f1_score(y_test_binarized_cv, y_test_predicted_labels_sgdc_cv, average = "weighted")
prec_scor_sgdc_cv = precision_score(y_test_binarized_cv, y_test_predicted_labels_sgdc_cv, average = "weighted")
rec_scor_sgdc_cv = recall_score(y_test_binarized_cv, y_test_predicted_labels_sgdc_cv, average = "weighted")

scores_sgdc_cv = [jac_scor_sgdc_cv, ac_scor_sgdc_cv, f1_scor_sgdc_cv, prec_scor_sgdc_cv, rec_scor_sgdc_cv]
print(jac_scor_sgdc_cv, ac_scor_sgdc_cv, f1_scor_sgdc_cv, prec_scor_sgdc_cv, rec_scor_sgdc_cv)

0.008772028960545079 0.060393005228051196 0.016858882389728495 0.24043178477219454 0.008806897191063317


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# RandomForrest

param_rfc = {"estimator__max_depth": [5, 25, 50],  # Profondeur
             "estimator__min_samples_leaf": [1, 5, 10],  # Cb d'éléments dans chaque feuille
             "estimator__class_weight": ["balanced"]}  
multi_rfc_cv = GridSearchCV(OneVsRestClassifier(RandomForestClassifier()),
                            param_grid=param_rfc,
                            n_jobs=-1,
                            cv=5,
                            scoring="accuracy",
                            return_train_score = True,
                            refit=True,
                            #verbose=3
                            )
# Fit on Sample data
multi_rfc_cv.fit(X_train_cv, y_train_binarized_cv)
pickle.dump(multi_rfc_cv, open(f"{PATH_MODELS}/multi_rfc_cv.pkl", 'wb'))

  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)


In [None]:
# Récupération des best parameters
logit_best_params = multi_rfc_cv.best_params_
logit_best_params

{'estimator__class_weight': 'balanced',
 'estimator__max_depth': 25,
 'estimator__min_samples_leaf': 1}

In [None]:
# Predict
y_test_predicted_labels_rfc_cv = multi_rfc_cv.predict(X_test_cv)

# Inverse transform
y_test_pred_inversed_rfc_cv = multilabel_binarizer_cv.inverse_transform(y_test_predicted_labels_rfc_cv)
y_test_inversed_rfc_cv = multilabel_binarizer_cv.inverse_transform(y_test_binarized_cv)

print("-"*50)
print("Print 5 first predicted Tags vs true Tags")
print("-" * 50)
print("Predicted:", y_test_pred_inversed_rfc_cv[0:5])
print("True:", y_test_inversed_rfc_cv[0:5])

--------------------------------------------------
Print 5 first predicted Tags vs true Tags
--------------------------------------------------
Predicted: [('.net', 'c#', 'java'), ('c', 'linux'), ('ios', 'ipad', 'iphone', 'objective-c'), ('bash', 'file', 'linux', 'shell', 'unix'), ('hibernate', 'java', 'spring')]
True: [('c#',), ('c',), ('ios', 'ipad', 'iphone', 'objective-c'), ('bash', 'linux', 'shell'), ('hibernate', 'java', 'spring')]


In [None]:
# Calcul du score Jaccard pour tous les tags
jac_scor_rfc_cv = jaccard_score(y_test_binarized_cv, y_test_predicted_labels_rfc_cv, average='weighted')
ac_scor_rfc_cv = accuracy_score(y_test_binarized_cv, y_test_predicted_labels_rfc_cv)
f1_scor_rfc_cv = f1_score(y_test_binarized_cv, y_test_predicted_labels_rfc_cv, average = "weighted")
prec_scor_rfc_cv = precision_score(y_test_binarized_cv, y_test_predicted_labels_rfc_cv, average = "weighted")
rec_scor_rfc_cv = recall_score(y_test_binarized_cv, y_test_predicted_labels_rfc_cv, average = "weighted")

scores_rf_cv = [jac_scor_rfc_cv, ac_scor_rfc_cv, f1_scor_rfc_cv, prec_scor_rfc_cv, rec_scor_rfc_cv]
print(jac_scor_rfc_cv, ac_scor_rfc_cv, f1_scor_rfc_cv, prec_scor_rfc_cv, rec_scor_rfc_cv)

0.36649433405102916 0.16801874887326482 0.5109636487159239 0.6025683308852487 0.49541114304255124


  _warn_prf(average, modifier, msg_start, len(result))


#### TF-Idf

In [None]:
# LR

X = ctf_data.drop(columns=["tag"])
y = ctf_data["tag"]
X_train_tf, X_test_tf, y_train_tf, y_test_tf = train_test_split(X, y, test_size=0.2)

multilabel_binarizer_tf = MultiLabelBinarizer()
y_train_tf_binarized = multilabel_binarizer_tf.fit_transform(y_train_tf)
y_test_tf_binarized = multilabel_binarizer_tf.fit_transform(y_test_tf)

param_logit = {"estimator__C": [10, 1.0, 0.1],
               "estimator__penalty": ["l1", "l2"],
               "estimator__solver": ["lbfgs"]} 

multi_logit_tf = GridSearchCV(OneVsRestClassifier(LogisticRegression()),
                              param_grid=param_logit,
                              n_jobs=-1,
                              cv=5, 
                              scoring="accuracy",
                              return_train_score = True,
                              refit=True) 
multi_logit_tf.fit(X_train_tf, y_train_tf_binarized)
pickle.dump(multi_logit_tf, open(f"{PATH_MODELS}/multi_logit_tf.pkl", 'wb'))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [None]:
# Predict
y_test_rl_tf_predicted_labels = multi_logit_tf.predict(X_test_tf)

# Inverse transform
y_test_rl_tf_pred_inversed = multilabel_binarizer_tf.inverse_transform(y_test_rl_tf_predicted_labels)
y_test_rl_tf_inversed = multilabel_binarizer_tf.inverse_transform(y_test_tf_binarized)

print("-"*50)
print("Print 5 first predicted Tags vs true Tags")
print("-" * 50)
print("Predicted:", y_test_rl_tf_pred_inversed[0:5])
print("True:", y_test_rl_tf_inversed[0:5])

--------------------------------------------------
Print 5 first predicted Tags vs true Tags
--------------------------------------------------
Predicted: [('javascript', 'node.js'), (), (), ('python',), ('android',)]
True: [('javascript', 'node.js'), ('android',), ('macos', 'ruby'), ('algorithm', 'python'), ('android', 'android-studio')]


In [None]:
# Calcul du score Jaccard pour tous les tags
jac_scor_rl_tf = jaccard_score(y_test_tf_binarized, y_test_rl_tf_predicted_labels, average='weighted')
ac_scor_rl_tf = accuracy_score(y_test_tf_binarized, y_test_rl_tf_predicted_labels)
f1_scor_rl_tf = f1_score(y_test_tf_binarized, y_test_rl_tf_predicted_labels, average = "weighted")
prec_scor_rl_tf = precision_score(y_test_tf_binarized, y_test_rl_tf_predicted_labels, average = "weighted")
rec_scor_rl_tf = recall_score(y_test_tf_binarized, y_test_rl_tf_predicted_labels, average = "weighted")

scores_rl_tf = [jac_scor_rl_tf, ac_scor_rl_tf, f1_scor_rl_tf, prec_scor_rl_tf, rec_scor_rl_tf]
print(scores_rl_tf)

[0.3856214639735998, 0.22588786731566612, 0.5319935955655992, 0.7081304026403332, 0.43948926720947445]


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# SGDC

grid = {
    'estimator__alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3], # learning rate
    #'estimator__n_iter': [1000], # number of epochs
    'estimator__loss': ['log'], # logistic regression,
    'estimator__penalty': ['l1', 'l2'],
    'estimator__n_jobs': [-1]
}

sgdc_tf = GridSearchCV(OneVsRestClassifier(SGDClassifier()),
                              param_grid=grid,
                              n_jobs=-1,
                              cv=5, # Le GridSearch garantit la cross-validation avec cet argument
                              scoring="accuracy", # Peut mettre "weighted_label" aussi
                              return_train_score = True,
                              refit=True,
                              verbose=3) # Me donne le suivi de ce que fait l'algo pendant qu'il fonctionne
sgdc_tf.fit(X_train_tf, y_train_tf_binarized)
pickle.dump(sgdc_tf, open(f"{PATH_MODELS}/sgdc_tf.pkl", 'wb'))

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END estimator__alpha=0.0001, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l1;, score=(train=0.200, test=0.191) total time= 1.1min
[CV 3/5] END estimator__alpha=0.0001, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l1;, score=(train=0.198, test=0.196) total time= 1.2min
[CV 2/5] END estimator__alpha=0.0001, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l1;, score=(train=0.204, test=0.185) total time= 1.2min
[CV 4/5] END estimator__alpha=0.0001, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l1;, score=(train=0.200, test=0.191) total time= 1.2min
[CV 5/5] END estimator__alpha=0.0001, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l1;, score=(train=0.203, test=0.179) total time= 1.2min
[CV 2/5] END estimator__alpha=0.0001, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l2;, score=(train=0.157, test=0.130) total time= 1.2min
[CV

In [None]:
# Predict
y_test_sgdc_tf_predicted_labels = sgdc_tf.predict(X_test_tf)

# Inverse transform
y_test_sgdc_tf_pred_inversed = multilabel_binarizer_tf.inverse_transform(y_test_sgdc_tf_predicted_labels)
y_test_sgdc_tf_inversed = multilabel_binarizer_tf.inverse_transform(y_test_tf_binarized)

print("-"*50)
print("Print 5 first predicted Tags vs true Tags")
print("-" * 50)
print("Predicted:", y_test_sgdc_tf_pred_inversed[0:5])
print("True:", y_test_sgdc_tf_inversed[0:5])

--------------------------------------------------
Print 5 first predicted Tags vs true Tags
--------------------------------------------------
Predicted: [('javascript', 'node.js'), (), (), (), ('android',)]
True: [('javascript', 'node.js'), ('android',), ('macos', 'ruby'), ('algorithm', 'python'), ('android', 'android-studio')]


In [None]:
# Calcul du score Jaccard pour tous les tags
jac_scor_sgdc_tf = jaccard_score(y_test_tf_binarized, y_test_sgdc_tf_predicted_labels, average='weighted') 
ac_scor_sgdc_tf = accuracy_score(y_test_tf_binarized, y_test_sgdc_tf_predicted_labels)
f1_scor_sgdc_tf = f1_score(y_test_tf_binarized, y_test_sgdc_tf_predicted_labels, average = "weighted")
prec_scor_sgdc_tf = precision_score(y_test_tf_binarized, y_test_sgdc_tf_predicted_labels, average = "weighted")
rec_scor_sgdc_tf = recall_score(y_test_tf_binarized, y_test_sgdc_tf_predicted_labels, average = "weighted")

scores_sgdc_tf = [jac_scor_sgdc_tf, ac_scor_sgdc_tf, f1_scor_sgdc_tf, prec_scor_sgdc_tf, rec_scor_sgdc_tf]
print(scores_sgdc_tf)

[0.30818895451913186, 0.192897061474671, 0.4401287164406876, 0.6988109440561143, 0.3403034789045152]


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# RandomForrest

param_rfc = {"estimator__max_depth": [25, 50],  # Profondeur
             "estimator__min_samples_leaf": [5, 10],  # Cb d'éléments dans chaque feuille
             "estimator__class_weight": ["balanced"]}  

multi_rfc_tf = GridSearchCV(OneVsRestClassifier(RandomForestClassifier()),
                            param_grid=param_rfc,
                            n_jobs=-1,
                            cv=5,
                            scoring="accuracy",
                            return_train_score = True,
                            refit=True,
                            #verbose=3
                            )
# Fit on Sample data
multi_rfc_tf.fit(X_train_tf, y_train_tf_binarized)
pickle.dump(multi_rfc_tf, open(f"{PATH_MODELS}/multi_rfc_tf.pkl", 'wb'))

In [None]:
# Predict
y_test_rfc_tf_predicted_labels = multi_rfc_tf.predict(X_test_tf)

# Inverse transform
y_test_rfc_tf_pred_inversed = multilabel_binarizer_tf.inverse_transform(y_test_rfc_tf_predicted_labels)
y_test_rfc_tf_inversed = multilabel_binarizer_tf.inverse_transform(y_test_tf_binarized)

print("-"*50)
print("Print 5 first predicted Tags vs true Tags")
print("-" * 50)
print("Predicted:", y_test_rfc_tf_pred_inversed[0:5])
print("True:", y_test_rfc_tf_inversed[0:5])

--------------------------------------------------
Print 5 first predicted Tags vs true Tags
--------------------------------------------------
Predicted: [('javascript', 'node.js'), (), ('.net', 'c#', 'macos'), ('python',), ('android',)]
True: [('javascript', 'node.js'), ('android',), ('macos', 'ruby'), ('algorithm', 'python'), ('android', 'android-studio')]


In [None]:
# Calcul du score Jaccard pour tous les tags
jac_scor_rfc_tf = jaccard_score(y_test_tf_binarized, y_test_rfc_tf_predicted_labels, average='weighted') 
ac_scor_rfc_tf = accuracy_score(y_test_tf_binarized, y_test_rfc_tf_predicted_labels)
f1_scor_rfc_tf = f1_score(y_test_tf_binarized, y_test_rfc_tf_predicted_labels, average = "weighted")
prec_scor_rfc_tf = precision_score(y_test_tf_binarized, y_test_rfc_tf_predicted_labels, average = "weighted")
rec_scor_rfc_tf = recall_score(y_test_tf_binarized, y_test_rfc_tf_predicted_labels, average = "weighted")

scores_rfc_tf = [jac_scor_rfc_tf, ac_scor_rfc_tf, f1_scor_rfc_tf, prec_scor_rfc_tf, rec_scor_rfc_tf]
print(scores_rfc_tf)

[0.4077113836749434, 0.1838831800973499, 0.5562108806892139, 0.592309937079098, 0.5611584011843079]


#### Word2Vec

In [None]:
# LR

X = w2v_data.drop(columns=["tag"])
y = w2v_data["tag"]
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(X, y, test_size=0.2)

multilabel_binarizer_w2v = MultiLabelBinarizer()
y_train_w2v_binarized = multilabel_binarizer_w2v.fit_transform(y_train_w2v)
y_test_w2v_binarized = multilabel_binarizer_w2v.fit_transform(y_test_w2v)

param_logit = {"estimator__C": [10, 1.0, 0.1],
               "estimator__penalty": ["l1", "l2"],
               "estimator__solver": ["lbfgs"]} 

multi_logit_w2v = GridSearchCV(OneVsRestClassifier(LogisticRegression()),
                              param_grid=param_logit,
                              n_jobs=-1,
                              cv=5, 
                              scoring="accuracy",
                              return_train_score = True,
                              refit=True) 
multi_logit_w2v.fit(X_train_w2v, y_train_w2v_binarized)
pickle.dump(multi_logit_w2v, open(f"{PATH_MODELS}/multi_logit_w2v.pkl", 'wb'))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [None]:
# Predict
y_test_rl_w2v_predicted_labels = multi_logit_w2v.predict(X_test_w2v)

# Inverse transform
y_test_rl_w2v_pred_inversed = multilabel_binarizer_w2v.inverse_transform(y_test_rl_w2v_predicted_labels)
y_test_rl_w2v_inversed = multilabel_binarizer_w2v.inverse_transform(y_test_w2v_binarized)

print("-"*50)
print("Print 5 first predicted Tags vs true Tags")
print("-" * 50)
print("Predicted:", y_test_rl_w2v_pred_inversed[0:5])
print("True:", y_test_rl_w2v_inversed[0:5])

--------------------------------------------------
Print 5 first predicted Tags vs true Tags
--------------------------------------------------
Predicted: [(), (), (), (), ()]
True: [('ruby', 'ruby-on-rails'), ('c#', 'performance'), ('.net', 'c#'), ('mysql', 'python'), ('class', 'java', 'oop')]


In [None]:
# Calcul du score Jaccard pour tous les tags
jac_scor_rl_w2v = jaccard_score(y_test_w2v_binarized, y_test_rl_w2v_predicted_labels, average='weighted')
ac_scor_rl_w2v = accuracy_score(y_test_w2v_binarized, y_test_rl_w2v_predicted_labels)
f1_scor_rl_w2v = f1_score(y_test_w2v_binarized, y_test_rl_w2v_predicted_labels, average = "weighted")
prec_scor_rl_w2v = precision_score(y_test_w2v_binarized, y_test_rl_w2v_predicted_labels, average = "weighted")
rec_scor_rl_w2v = recall_score(y_test_w2v_binarized, y_test_rl_w2v_predicted_labels, average = "weighted")

scores_rl_w2v = [jac_scor_rl_w2v, ac_scor_rl_w2v, f1_scor_rl_w2v, prec_scor_rl_w2v, rec_scor_rl_w2v]
print(scores_rl_w2v)

[0.17118330299222523, 0.10996935280331711, 0.28187334456428226, 0.5468526226861323, 0.19423523942352394]


In [None]:
# SGDC

grid = {
    'estimator__alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3], # learning rate
    #'estimator__n_iter': [1000], # number of epochs
    'estimator__loss': ['log'], # logistic regression,
    'estimator__penalty': ['l1', 'l2'],
    'estimator__n_jobs': [-1]
}

sgdc_w2v = GridSearchCV(OneVsRestClassifier(SGDClassifier()),
                              param_grid=grid,
                              n_jobs=-1,
                              cv=5, # Le GridSearch garantit la cross-validation avec cet argument
                              scoring="accuracy", # Peut mettre "weighted_label" aussi
                              return_train_score = True,
                              refit=True,
                              verbose=3) # Me donne le suivi de ce que fait l'algo pendant qu'il fonctionne
sgdc_w2v.fit(X_train_w2v, y_train_w2v_binarized)
pickle.dump(sgdc_w2v, open(f"{PATH_MODELS}/sgdc_w2v.pkl", 'wb'))

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END estimator__alpha=0.0001, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l2;, score=(train=0.135, test=0.098) total time= 1.1min
[CV 2/5] END estimator__alpha=0.0001, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l2;, score=(train=0.137, test=0.108) total time= 1.1min
[CV 3/5] END estimator__alpha=0.0001, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l2;, score=(train=0.136, test=0.097) total time= 1.1min
[CV 1/5] END estimator__alpha=0.001, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l1;, score=(train=0.083, test=0.082) total time= 1.0min
[CV 4/5] END estimator__alpha=0.0001, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l2;, score=(train=0.131, test=0.090) total time= 1.1min
[CV 5/5] END estimator__alpha=0.0001, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l2;, score=(train=0.135, test=0.089) total time= 1.1min
[CV 

In [None]:
# Predict
y_test_sgdc_w2v_predicted_labels = sgdc_w2v.predict(X_test_w2v)

# Inverse transform
y_test_sgdc_w2v_pred_inversed = multilabel_binarizer_w2v.inverse_transform(y_test_sgdc_w2v_predicted_labels)
y_test_sgdc_w2v_inversed = multilabel_binarizer_w2v.inverse_transform(y_test_w2v_binarized)

print("-"*50)
print("Print 5 first predicted Tags vs true Tags")
print("-" * 50)
print("Predicted:", y_test_sgdc_w2v_pred_inversed[0:5])
print("True:", y_test_sgdc_w2v_inversed[0:5])

--------------------------------------------------
Print 5 first predicted Tags vs true Tags
--------------------------------------------------
Predicted: [(), (), (), (), ()]
True: [('ruby', 'ruby-on-rails'), ('c#', 'performance'), ('.net', 'c#'), ('mysql', 'python'), ('class', 'java', 'oop')]


In [None]:
# Calcul du score Jaccard pour tous les tags
jac_scor_sgdc_w2v = jaccard_score(y_test_w2v_binarized, y_test_sgdc_w2v_predicted_labels, average='weighted')
ac_scor_sgdc_w2v = accuracy_score(y_test_w2v_binarized, y_test_sgdc_w2v_predicted_labels)
f1_scor_sgdc_w2v = f1_score(y_test_w2v_binarized, y_test_sgdc_w2v_predicted_labels, average = "weighted")
prec_scor_sgdc_w2v = precision_score(y_test_w2v_binarized, y_test_sgdc_w2v_predicted_labels, average = "weighted")
rec_scor_sgdc_w2v = recall_score(y_test_w2v_binarized, y_test_sgdc_w2v_predicted_labels, average = "weighted")

scores_sgdc_w2v = [jac_scor_sgdc_w2v, ac_scor_sgdc_w2v, f1_scor_sgdc_w2v, prec_scor_sgdc_w2v, rec_scor_sgdc_w2v]
print(scores_sgdc_w2v)

[0.13300756113145193, 0.10347935821164593, 0.22306953692777257, 0.5667754212992165, 0.14579265457926546]


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# RandomForrest

param_rfc = {"estimator__max_depth": [25, 50],  # Profondeur
             "estimator__min_samples_leaf": [5, 10],  # Cb d'éléments dans chaque feuille
             "estimator__class_weight": ["balanced"]} 
multi_rfc_w2v = GridSearchCV(OneVsRestClassifier(RandomForestClassifier()),
                            param_grid=param_rfc,
                            n_jobs=-1,
                            cv=5,
                            scoring="accuracy",
                            return_train_score = True,
                            refit=True,
                            #verbose=3
                            )
# Fit on Sample data
multi_rfc_w2v.fit(X_train_w2v, y_train_w2v_binarized)
pickle.dump(multi_rfc_w2v, open(f"{PATH_MODELS}/multi_rfc_w2v.pkl", 'wb'))

In [None]:
# Predict
y_test_rfc_w2v_predicted_labels = multi_rfc_w2v.predict(X_test_w2v)

# Inverse transform
y_test_rfc_w2v_pred_inversed = multilabel_binarizer_w2v.inverse_transform(y_test_rfc_w2v_predicted_labels)
y_test_rfc_w2v_inversed = multilabel_binarizer_w2v.inverse_transform(y_test_w2v_binarized)

print("-"*50)
print("Print 5 first predicted Tags vs true Tags")
print("-" * 50)
print("Predicted:", y_test_rfc_w2v_pred_inversed[0:5])
print("True:", y_test_rfc_w2v_inversed[0:5])

--------------------------------------------------
Print 5 first predicted Tags vs true Tags
--------------------------------------------------
Predicted: [(), (), (), (), ()]
True: [('ruby', 'ruby-on-rails'), ('c#', 'performance'), ('.net', 'c#'), ('mysql', 'python'), ('class', 'java', 'oop')]


In [None]:
# Calcul du score Jaccard pour tous les tags
jac_scor_rfc_w2v = jaccard_score(y_test_w2v_binarized, y_test_rfc_w2v_predicted_labels, average='weighted')
ac_scor_rfc_w2v = accuracy_score(y_test_w2v_binarized, y_test_rfc_w2v_predicted_labels)
f1_scor_rfc_w2v = f1_score(y_test_w2v_binarized, y_test_rfc_w2v_predicted_labels, average = "weighted")
prec_scor_rfc_w2v = precision_score(y_test_w2v_binarized, y_test_rfc_w2v_predicted_labels, average = "weighted")
rec_scor_rfc_w2v = recall_score(y_test_w2v_binarized, y_test_rfc_w2v_predicted_labels, average = "weighted")

scores_rfc_w2v = [jac_scor_rfc_w2v, ac_scor_rfc_w2v, f1_scor_rfc_w2v, prec_scor_rfc_w2v, rec_scor_rfc_w2v]
print(scores_rfc_w2v)

[0.0658484284025793, 0.08094465476834325, 0.11765142125560502, 0.5137306622914951, 0.0708507670850767]


  _warn_prf(average, modifier, msg_start, len(result))


#### BERT Huggingface

In [16]:
from sklearn import preprocessing
X = bert_hf_data.drop(columns = ['tags'])
scaler = preprocessing.StandardScaler()
label_encoder = preprocessing.LabelEncoder()

data_transformed_bert_hf = scaler.fit_transform(X)

In [17]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(bert_hf_data['tags'])
y_binarized_bert_hf = multilabel_binarizer.transform(bert_hf_data['tags'])

In [18]:
# Create train and test split (20%)
X_train_bert_hf, X_test_bert_hf, y_train_bert_hf, y_test_bert_hf = train_test_split(data_transformed_bert_hf, y_binarized_bert_hf,
                                                    test_size=0.2, random_state=8)

# Model
param_logit = {"estimator__C": [1.0, 0.1],
               "estimator__penalty": ["l1", "l2"],
               "estimator__solver": ["lbfgs"]} 

multi_logit_bert_huggingface = GridSearchCV(OneVsRestClassifier(LogisticRegression()),
                              param_grid=param_logit,
                              n_jobs=-1,
                              cv=2, 
                              scoring="accuracy",
                              return_train_score = True,
                              refit=True,
                              #verbose=3
                              ) 

multi_logit_bert_huggingface.fit(X_train_bert_hf,y_train_bert_hf)

  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Label %s is present in all training examples." % str(classes[c])
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logi

GridSearchCV(cv=2,
             estimator=OneVsRestClassifier(estimator=LogisticRegression()),
             n_jobs=-1,
             param_grid={'estimator__C': [1.0, 0.1],
                         'estimator__penalty': ['l1', 'l2'],
                         'estimator__solver': ['lbfgs']},
             return_train_score=True, scoring='accuracy')

In [19]:
y_test_rl_bert_huggingface_predicted_labels = multi_logit_bert_huggingface.predict(X_test_bert_hf)

In [20]:
# Calcul du score Jaccard pour tous les tags
jac_scor_rl_bert_huggingface = jaccard_score(y_test_bert_hf, y_test_rl_bert_huggingface_predicted_labels, average='weighted')
ac_scor_rl_bert_huggingface = accuracy_score(y_test_bert_hf, y_test_rl_bert_huggingface_predicted_labels)
f1_scor_rl_bert_huggingface = f1_score(y_test_bert_hf, y_test_rl_bert_huggingface_predicted_labels, average = "weighted")
prec_scor_rl_bert_huggingface = precision_score(y_test_bert_hf, y_test_rl_bert_huggingface_predicted_labels, average = "weighted")
rec_scor_rl_bert_huggingface = recall_score(y_test_bert_hf, y_test_rl_bert_huggingface_predicted_labels, average = "weighted")

scores_rl_bert_huggingface = [jac_scor_rl_bert_huggingface, ac_scor_rl_bert_huggingface, f1_scor_rl_bert_huggingface, prec_scor_rl_bert_huggingface, rec_scor_rl_bert_huggingface]
print(scores_rl_bert_huggingface)

[0.6771800597884835, 0.0, 0.7645672369267188, 0.7928831036843809, 0.7458556028625781]


  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# SGDC

grid = {
    'estimator__alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3], # learning rate
    #'estimator__n_iter': [1000], # number of epochs
    'estimator__loss': ['log'], # logistic regression,
    'estimator__penalty': ['l1', 'l2'],
    'estimator__n_jobs': [-1]
}

sgdc_bert_huggingface = GridSearchCV(OneVsRestClassifier(SGDClassifier()),
                              param_grid=grid,
                              n_jobs=-1,
                              cv=3, # Le GridSearch garantit la cross-validation avec cet argument
                              scoring="accuracy", # Peut mettre "weighted_label" aussi
                              return_train_score = True,
                              refit=True,
                              #verbose=3
                              )
sgdc_bert_huggingface.fit(X_train_bert_hf,y_train_bert_hf)
pickle.dump(sgdc_bert_huggingface, open(f"{PATH_MODELS}/sgdc_bert_huggingface.pkl", 'wb'))

  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples.

In [22]:
# Predict
y_test_sgdc_bert_huggingface_predicted_labels = sgdc_bert_huggingface.predict(X_test_bert_hf)

In [23]:
# Calcul du score Jaccard pour tous les tags
jac_scor_sgdc_bert_huggingface = jaccard_score(y_test_bert_hf, y_test_sgdc_bert_huggingface_predicted_labels, average='weighted') 
ac_scor_sgdc_bert_huggingface = accuracy_score(y_test_bert_hf, y_test_sgdc_bert_huggingface_predicted_labels)
f1_scor_sgdc_bert_huggingface = f1_score(y_test_bert_hf, y_test_sgdc_bert_huggingface_predicted_labels, average = "weighted")
prec_scor_sgdc_bert_huggingface = precision_score(y_test_bert_hf, y_test_sgdc_bert_huggingface_predicted_labels, average = "weighted")
rec_scor_sgdc_bert_huggingface = recall_score(y_test_bert_hf, y_test_sgdc_bert_huggingface_predicted_labels, average = "weighted")

scores_sgdc_bert_huggingface = [jac_scor_sgdc_bert_huggingface, ac_scor_sgdc_bert_huggingface, f1_scor_sgdc_bert_huggingface, prec_scor_sgdc_bert_huggingface, rec_scor_sgdc_bert_huggingface]
print(scores_sgdc_bert_huggingface)

[0.6808551543570087, 0.0, 0.7719749573879391, 0.7818159699696138, 0.7634296584835583]


In [24]:
# RandomForrest

param_rfc = {"estimator__max_depth": [25, 50],  # Profondeur
             "estimator__min_samples_leaf": [5, 10],  # Cb d'éléments dans chaque feuille
             "estimator__class_weight": ["balanced"]}   

multi_rfc_bert_huggingface = GridSearchCV(OneVsRestClassifier(RandomForestClassifier()),
                            param_grid=param_rfc,
                            n_jobs=-1,
                            cv=2,
                            scoring="accuracy",
                            return_train_score = True,
                            refit=True,
                            #verbose=3
                            )
# Fit on Sample data
multi_rfc_bert_huggingface.fit(X_train_bert_hf,y_train_bert_hf)
pickle.dump(multi_rfc_bert_huggingface, open(f"{PATH_MODELS}/multi_rfc_bert_huggingface.pkl", 'wb'))

  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples.

In [25]:
# Predict
y_test_rfc_bert_huggingface_predicted_labels = multi_rfc_bert_huggingface.predict(X_test_bert_hf)

In [26]:
# Calcul du score Jaccard pour tous les tags
jac_scor_rfc_bert_huggingface = jaccard_score(y_test_bert_hf, y_test_rfc_bert_huggingface_predicted_labels, average='weighted')
ac_scor_rfc_bert_huggingface = accuracy_score(y_test_bert_hf, y_test_rfc_bert_huggingface_predicted_labels)
f1_scor_rfc_bert_huggingface = f1_score(y_test_bert_hf, y_test_rfc_bert_huggingface_predicted_labels, average = "weighted")
prec_scor_rfc_bert_huggingface = precision_score(y_test_bert_hf, y_test_rfc_bert_huggingface_predicted_labels, average = "weighted")
rec_scor_rfc_bert_huggingface = recall_score(y_test_bert_hf, y_test_rfc_bert_huggingface_predicted_labels, average = "weighted")

scores_rfc_bert_huggingface = [jac_scor_rfc_bert_huggingface, ac_scor_rfc_bert_huggingface, f1_scor_rfc_bert_huggingface, prec_scor_rfc_bert_huggingface, rec_scor_rfc_bert_huggingface]
print(scores_rfc_bert_huggingface)

[0.7303159161807505, 0.001, 0.7951860804060157, 0.7911668688253517, 0.8357641090678504]


  _warn_prf(average, modifier, msg_start, len(result))


#### BERT Tensorflow

In [27]:
X = bert_tf_data.drop(columns = ['tags'])
scaler = preprocessing.StandardScaler()
label_encoder = preprocessing.LabelEncoder()

data_transformed_bert_tf = scaler.fit_transform(X)

In [28]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(bert_tf_data['tags'])
y_binarized_bert_tf = multilabel_binarizer.transform(bert_tf_data['tags'])

In [29]:
# Create train and test split (20%)
X_train_bert_tf, X_test_bert_tf, y_train_bert_tf, y_test_bert_tf = train_test_split(data_transformed_bert_tf, y_binarized_bert_tf,
                                                    test_size=0.2, random_state=8)

# Model
param_logit = {"estimator__C": [10, 1.0, 0.1],
               "estimator__penalty": ["l1", "l2"],
               "estimator__solver": ["lbfgs"]} 

multi_logit_bert_hub_tf = GridSearchCV(OneVsRestClassifier(LogisticRegression()),
                              param_grid=param_logit,
                              n_jobs=-1,
                              cv=5, 
                              scoring="accuracy",
                              return_train_score = True,
                              refit=True,
                              #verbose=3
                              ) 

multi_logit_bert_hub_tf.fit(X_train_bert_tf,y_train_bert_tf)

  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples.

GridSearchCV(cv=5,
             estimator=OneVsRestClassifier(estimator=LogisticRegression()),
             n_jobs=-1,
             param_grid={'estimator__C': [10, 1.0, 0.1],
                         'estimator__penalty': ['l1', 'l2'],
                         'estimator__solver': ['lbfgs']},
             return_train_score=True, scoring='accuracy')

In [30]:
y_test_rl_bert_hub_tf_predicted_labels = multi_logit_bert_hub_tf.predict(X_test_bert_tf)

In [31]:
# Calcul du score Jaccard pour tous les tags
jac_scor_rl_bert_hub_tf = jaccard_score(y_test_bert_tf, y_test_rl_bert_hub_tf_predicted_labels, average='weighted') 
ac_scor_rl_bert_hub_tf = accuracy_score(y_test_bert_tf, y_test_rl_bert_hub_tf_predicted_labels)
f1_scor_rl_bert_hub_tf = f1_score(y_test_bert_tf, y_test_rl_bert_hub_tf_predicted_labels, average = "weighted")
prec_scor_rl_bert_hub_tf = precision_score(y_test_bert_tf, y_test_rl_bert_hub_tf_predicted_labels, average = "weighted")
rec_scor_rl_bert_hub_tf = recall_score(y_test_bert_tf, y_test_rl_bert_hub_tf_predicted_labels, average = "weighted")

scores_rl_bert_hub_tf = [jac_scor_rl_bert_hub_tf, ac_scor_rl_bert_hub_tf, f1_scor_rl_bert_hub_tf, prec_scor_rl_bert_hub_tf, rec_scor_rl_bert_hub_tf]
print(scores_rl_bert_hub_tf)

[0.6914991107827083, 0.0, 0.7812846933750901, 0.8071205040445759, 0.7621614276655494]


  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
# SGDC

grid = {
    'estimator__alpha': [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3], # learning rate
    #'estimator__n_iter': [1000], # number of epochs
    'estimator__loss': ['log'], # logistic regression,
    'estimator__penalty': ['l1', 'l2'],
    'estimator__n_jobs': [-1]
}

sgdc_bert_hub_tf = GridSearchCV(OneVsRestClassifier(SGDClassifier()),
                              param_grid=grid,
                              n_jobs=-1,
                              cv=2, # Le GridSearch garantit la cross-validation avec cet argument
                              scoring="accuracy", # Peut mettre "weighted_label" aussi
                              return_train_score = True,
                              refit=True,
                              #verbose=3
                              ) 
sgdc_bert_hub_tf.fit(X_train_bert_tf,y_train_bert_tf)
pickle.dump(sgdc_bert_hub_tf, open(f"{PATH_MODELS}/sgdc_bert_hub_tf.pkl", 'wb'))

  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples.

In [40]:
# Predict
y_test_sgdc_bert_hub_tf_predicted_labels = sgdc_bert_hub_tf.predict(X_test_bert_tf)

In [41]:
# Calcul du score Jaccard pour tous les tags
jac_scor_sgdc_bert_hub_tf = jaccard_score(y_test_bert_tf, y_test_sgdc_bert_hub_tf_predicted_labels, average='weighted') 
ac_scor_sgdc_bert_hub_tf = accuracy_score(y_test_bert_tf, y_test_sgdc_bert_hub_tf_predicted_labels)
f1_scor_sgdc_bert_hub_tf = f1_score(y_test_bert_tf, y_test_sgdc_bert_hub_tf_predicted_labels, average = "weighted")
prec_scor_sgdc_bert_hub_tf = precision_score(y_test_bert_tf, y_test_sgdc_bert_hub_tf_predicted_labels, average = "weighted")
rec_scor_sgdc_bert_hub_tf = recall_score(y_test_bert_tf, y_test_sgdc_bert_hub_tf_predicted_labels, average = "weighted")

scores_sgdc_bert_hub_tf = [jac_scor_sgdc_bert_hub_tf, ac_scor_sgdc_bert_hub_tf, f1_scor_sgdc_bert_hub_tf, prec_scor_sgdc_bert_hub_tf, rec_scor_sgdc_bert_hub_tf]
print(scores_sgdc_bert_hub_tf)

[0.7066453552879597, 0.0, 0.7733656515898996, 0.8318134999121913, 0.7753872633390706]


  _warn_prf(average, modifier, msg_start, len(result))


In [42]:
# RandomForrest

param_rfc = {"estimator__max_depth": [25, 50],  # Profondeur
             "estimator__min_samples_leaf": [5, 10],  # Cb d'éléments dans chaque feuille
             "estimator__class_weight": ["balanced"]} 

multi_rfc_bert_hub_tf = GridSearchCV(OneVsRestClassifier(RandomForestClassifier()),
                            param_grid=param_rfc,
                            n_jobs=-1,
                            cv=2,
                            scoring="accuracy",
                            return_train_score = True,
                            refit=True,
                            #verbose=3
                            )
# Fit on Sample data
multi_rfc_bert_hub_tf.fit(X_train_bert_tf,y_train_bert_tf)
pickle.dump(multi_rfc_bert_hub_tf, open(f"{PATH_MODELS}/multi_rfc_bert_hub_tf.pkl", 'wb'))

  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples.

In [44]:
# Predict
y_test_rfc_bert_hub_tf_predicted_labels = multi_rfc_bert_hub_tf.predict(X_test_bert_tf)

In [45]:
# Calcul du score Jaccard pour tous les tags
jac_scor_rfc_bert_hub_tf = jaccard_score(y_test_bert_tf, y_test_rfc_bert_hub_tf_predicted_labels, average='weighted') 
ac_scor_rfc_bert_hub_tf = accuracy_score(y_test_bert_tf, y_test_rfc_bert_hub_tf_predicted_labels)
f1_scor_rfc_bert_hub_tf = f1_score(y_test_bert_tf, y_test_rfc_bert_hub_tf_predicted_labels, average = "weighted")
prec_scor_rfc_bert_hub_tf = precision_score(y_test_bert_tf, y_test_rfc_bert_hub_tf_predicted_labels, average = "weighted")
rec_scor_rfc_bert_hub_tf = recall_score(y_test_bert_tf, y_test_rfc_bert_hub_tf_predicted_labels, average = "weighted")

scores_rfc_bert_hub_tf = [jac_scor_rfc_bert_hub_tf, ac_scor_rfc_bert_hub_tf, f1_scor_rfc_bert_hub_tf, prec_scor_rfc_bert_hub_tf, rec_scor_rfc_bert_hub_tf]
print(scores_rfc_bert_hub_tf)

[0.7373757807447812, 0.0, 0.8055020673591423, 0.8016298964573901, 0.841108796086602]


  _warn_prf(average, modifier, msg_start, len(result))


#### USE

In [12]:
# LR

X = use_data.drop(columns=["tags"])
y = use_data["tags"]
X_train_use, X_test_use, y_train_use, y_test_use = train_test_split(X, y, test_size=0.2)

multilabel_binarizer_use = MultiLabelBinarizer()
y_train_use_binarized = multilabel_binarizer_use.fit_transform(y_train_use)
y_test_use_binarized = multilabel_binarizer_use.fit_transform(y_test_use)

param_logit = {"estimator__C": [10, 1.0, 0.1],
               "estimator__penalty": ["l1", "l2"],
               "estimator__solver": ["lbfgs"]} 

multi_logit_use = GridSearchCV(OneVsRestClassifier(LogisticRegression()),
                              param_grid=param_logit,
                              n_jobs=-1,
                              cv=5, 
                              scoring="accuracy",
                              return_train_score = True,
                              refit=True) 
multi_logit_use.fit(X_train_use, y_train_use_binarized)
pickle.dump(multi_logit_use, open(f"{PATH_MODELS}/multi_logit_use.pkl", 'wb'))

  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples.

In [13]:
# Predict
y_test_rl_use_predicted_labels = multi_logit_use.predict(X_test_use)

# Inverse transform
y_test_rl_use_pred_inversed = multilabel_binarizer_use.inverse_transform(y_test_rl_use_predicted_labels)
y_test_rl_use_inversed = multilabel_binarizer_use.inverse_transform(y_test_use_binarized)

print("-"*50)
print("Print 5 first predicted Tags vs true Tags")
print("-" * 50)
print("Predicted:", y_test_rl_use_pred_inversed[0:5])
print("True:", y_test_rl_use_inversed[0:5])

--------------------------------------------------
Print 5 first predicted Tags vs true Tags
--------------------------------------------------
Predicted: [(' ', "'", ',', '-', '[', ']', 'a', 'c', 'd', 'e', 'i', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u'), (' ', "'", ',', '-', '[', ']', 'a', 'c', 'd', 'e', 'i', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u'), (' ', "'", ',', '[', ']', 'a', 'c', 'e', 'i', 'l', 'n', 'o', 'r', 's', 't'), (' ', "'", ',', '-', '[', ']', 'a', 'c', 'd', 'e', 'i', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u'), (' ', "'", ',', '-', '[', ']', 'a', 'c', 'e', 'i', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't')]
True: [(' ', "'", ',', '[', ']', 'a', 'd', 'e', 'h', 'i', 'j', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u'), (' ', "'", ',', '[', ']', 'a', 'c', 'e', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'p', 'r', 's', 't', 'v'), (' ', '#', "'", ',', '-', '.', '[', ']', 'a', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'r', 's', 't', 'u', 'w', 'y'), (' ', "'", ',', '9', '[', 

In [14]:
# Calcul du score Jaccard pour tous les tags
jac_scor_rl_use = jaccard_score(y_test_use_binarized, y_test_rl_use_predicted_labels, average='weighted')
ac_scor_rl_use = accuracy_score(y_test_use_binarized, y_test_rl_use_predicted_labels)
f1_scor_rl_use = f1_score(y_test_use_binarized, y_test_rl_use_predicted_labels, average = "weighted")
prec_scor_rl_use = precision_score(y_test_use_binarized, y_test_rl_use_predicted_labels, average = "weighted")
rec_scor_rl_use = recall_score(y_test_use_binarized, y_test_rl_use_predicted_labels, average = "weighted")

scores_rl_use = [jac_scor_rl_use, ac_scor_rl_use, f1_scor_rl_use, prec_scor_rl_use, rec_scor_rl_use]
print(scores_rl_use)

[0.6461268712830956, 0.0, 0.6884625479094949, 0.6880334899035757, 0.702215375081764]


  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
# SGDC

grid = {
    'estimator__alpha': [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3], # learning rate
    #'estimator__n_iter': [1000], # number of epochs
    'estimator__loss': ['log'], # logistic regression,
    'estimator__penalty': ['l1', 'l2'],
    'estimator__n_jobs': [-1]
}

sgdc_use = GridSearchCV(OneVsRestClassifier(SGDClassifier()),
                              param_grid=grid,
                              n_jobs=-1,
                              cv=2, # Le GridSearch garantit la cross-validation avec cet argument
                              scoring="accuracy", # Peut mettre "weighted_label" aussi
                              return_train_score = True,
                              refit=True,
                              verbose=3) # Me donne le suivi de ce que fait l'algo pendant qu'il fonctionne
sgdc_use.fit(X_train_use, y_train_use_binarized)
pickle.dump(sgdc_use, open(f"{PATH_MODELS}/sgdc_use.pkl", 'wb'))

Fitting 2 folds for each of 14 candidates, totalling 28 fits


  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples.

[CV 1/2] END estimator__alpha=0.001, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l2;, score=(train=0.000, test=0.000) total time= 1.6min


  "Label %s is present in all training examples." % str(classes[c])


[CV 2/2] END estimator__alpha=0.001, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l2;, score=(train=0.000, test=0.000) total time= 1.7min


  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])


[CV 1/2] END estimator__alpha=0.01, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l2;, score=(train=0.000, test=0.000) total time= 1.7min


  "Label %s is present in all training examples." % str(classes[c])


[CV 2/2] END estimator__alpha=0.01, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l2;, score=(train=0.000, test=0.000) total time= 1.8min


  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples.

[CV 1/2] END estimator__alpha=0.01, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l1;, score=(train=0.000, test=0.000) total time= 3.5min


  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])


[CV 2/2] END estimator__alpha=0.1, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l2;, score=(train=0.000, test=0.000) total time= 1.9min
[CV 1/2] END estimator__alpha=0.1, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l2;, score=(train=0.000, test=0.000) total time= 1.9min


  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])


[CV 2/2] END estimator__alpha=0.01, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l1;, score=(train=0.000, test=0.000) total time= 3.7min


  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])


[CV 1/2] END estimator__alpha=0.001, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l1;, score=(train=0.000, test=0.000) total time= 4.0min


  "Label %s is present in all training examples." % str(classes[c])


[CV 2/2] END estimator__alpha=0.001, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l1;, score=(train=0.000, test=0.000) total time= 4.3min


  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])


[CV 1/2] END estimator__alpha=0.1, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l1;, score=(train=0.000, test=0.000) total time= 3.6min


  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])


[CV 2/2] END estimator__alpha=0.1, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l1;, score=(train=0.000, test=0.000) total time= 3.8min


  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])


[CV 2/2] END estimator__alpha=1.0, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l2;, score=(train=0.000, test=0.000) total time= 2.4min
[CV 1/2] END estimator__alpha=1.0, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l2;, score=(train=0.000, test=0.000) total time= 2.4min


  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])


[CV 1/2] END estimator__alpha=10.0, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l2;, score=(train=0.000, test=0.000) total time= 2.3min


  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])


[CV 2/2] END estimator__alpha=10.0, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l2;, score=(train=0.000, test=0.000) total time= 2.3min


  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])


[CV 2/2] END estimator__alpha=1.0, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l1;, score=(train=0.000, test=0.000) total time= 4.9min


  "Label %s is present in all training examples." % str(classes[c])


[CV 1/2] END estimator__alpha=1.0, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l1;, score=(train=0.000, test=0.000) total time= 5.3min


  "Label %s is present in all training examples." % str(classes[c])


[CV 1/2] END estimator__alpha=100.0, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l2;, score=(train=0.000, test=0.000) total time= 1.4min


  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])


[CV 2/2] END estimator__alpha=100.0, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l2;, score=(train=0.000, test=0.000) total time= 1.4min


  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])


[CV 1/2] END estimator__alpha=1000.0, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l2;, score=(train=0.000, test=0.000) total time=  33.0s


  "Label %s is present in all training examples." % str(classes[c])


[CV 2/2] END estimator__alpha=1000.0, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l2;, score=(train=0.000, test=0.000) total time=  32.1s


  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])


[CV 2/2] END estimator__alpha=10.0, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l1;, score=(train=0.000, test=0.000) total time= 8.1min


  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])


[CV 1/2] END estimator__alpha=10.0, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l1;, score=(train=0.000, test=0.000) total time= 9.2min




[CV 1/2] END estimator__alpha=100.0, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l1;, score=(train=0.000, test=0.000) total time=13.5min




[CV 2/2] END estimator__alpha=100.0, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l1;, score=(train=0.000, test=0.000) total time=14.1min




[CV 1/2] END estimator__alpha=1000.0, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l1;, score=(train=0.000, test=0.000) total time=13.7min




[CV 2/2] END estimator__alpha=1000.0, estimator__loss=log, estimator__n_jobs=-1, estimator__penalty=l1;, score=(train=0.000, test=0.000) total time=14.8min


  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])


In [33]:
# Predict
y_test_sgdc_use_predicted_labels = sgdc_use.predict(X_test_use)

# Inverse transform
y_test_sgdc_use_pred_inversed = multilabel_binarizer_use.inverse_transform(y_test_sgdc_use_predicted_labels)
y_test_sgdc_use_inversed = multilabel_binarizer_use.inverse_transform(y_test_use_binarized)

print("-"*50)
print("Print 5 first predicted Tags vs true Tags")
print("-" * 50)
print("Predicted:", y_test_sgdc_use_pred_inversed[0:5])
print("True:", y_test_sgdc_use_inversed[0:5])

--------------------------------------------------
Print 5 first predicted Tags vs true Tags
--------------------------------------------------
Predicted: [(' ', "'", ',', '-', '[', ']', 'a', 'c', 'e', 'g', 'i', 'l', 'n', 'o', 'p', 'r', 's', 't', 'v'), (' ', "'", ',', '-', '[', ']', 'a', 'c', 'd', 'e', 'g', 'i', 'l', 'n', 'o', 'p', 'r', 's', 't', 'v'), (' ', "'", ',', '[', ']', 'a', 'e', 'i', 's'), (' ', "'", ',', '-', '[', ']', 'a', 'c', 'e', 'g', 'i', 'l', 'n', 'o', 'p', 'r', 's', 't', 'v'), (' ', "'", ',', '-', '[', ']', 'a', 'c', 'e', 'g', 'i', 'l', 'n', 'o', 'p', 'r', 's', 't')]
True: [(' ', "'", ',', '[', ']', 'a', 'd', 'e', 'h', 'i', 'j', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u'), (' ', "'", ',', '[', ']', 'a', 'c', 'e', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'p', 'r', 's', 't', 'v'), (' ', '#', "'", ',', '-', '.', '[', ']', 'a', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'r', 's', 't', 'u', 'w', 'y'), (' ', "'", ',', '9', '[', ']', 'a', 'b', 'c', 'e', 'f', 'i', 'l', 

In [34]:
# Calcul du score Jaccard pour tous les tags
jac_scor_sgdc_use = jaccard_score(y_test_use_binarized, y_test_sgdc_use_predicted_labels, average='weighted')
ac_scor_sgdc_use = accuracy_score(y_test_use_binarized, y_test_sgdc_use_predicted_labels)
f1_scor_sgdc_use = f1_score(y_test_use_binarized, y_test_sgdc_use_predicted_labels, average = "weighted")
prec_scor_sgdc_use = precision_score(y_test_use_binarized, y_test_sgdc_use_predicted_labels, average = "weighted")
rec_scor_sgdc_use = recall_score(y_test_use_binarized, y_test_sgdc_use_predicted_labels, average = "weighted")

scores_sgdc_use = [jac_scor_sgdc_use, ac_scor_sgdc_use, f1_scor_sgdc_use, prec_scor_sgdc_use, rec_scor_sgdc_use]
print(scores_sgdc_use)

[0.6099706801727053, 0.0, 0.6626550911969835, 0.6782379618192543, 0.6617636933246118]


  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
# RandomForrest

param_rfc = {"estimator__max_depth": [25, 50], 
             "estimator__min_samples_leaf": [5, 10],  
             "estimator__class_weight": ["balanced"]}   

multi_rfc_use = GridSearchCV(OneVsRestClassifier(RandomForestClassifier()),
                            param_grid=param_rfc,
                            n_jobs=-1,
                            cv=2,
                            scoring="accuracy",
                            return_train_score = True,
                            refit=True,
                            #verbose=3
                            )
# Fit on Sample data
multi_rfc_use.fit(X_train_use, y_train_use_binarized)
pickle.dump(multi_rfc_use, open(f"{PATH_MODELS}/multi_rfc_use.pkl", 'wb'))

  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples.

In [37]:
# Predict
y_test_rfc_use_predicted_labels = multi_rfc_use.predict(X_test_use)

# Inverse transform
y_test_rfc_use_pred_inversed = multilabel_binarizer_use.inverse_transform(y_test_rfc_use_predicted_labels)
y_test_rfc_use_inversed = multilabel_binarizer_use.inverse_transform(y_test_use_binarized)

print("-"*50)
print("Print 5 first predicted Tags vs true Tags")
print("-" * 50)
print("Predicted:", y_test_rfc_use_pred_inversed[0:5])
print("True:", y_test_rfc_use_inversed[0:5])

--------------------------------------------------
Print 5 first predicted Tags vs true Tags
--------------------------------------------------
Predicted: [(' ', "'", ',', '-', '[', ']', 'a', 'b', 'c', 'd', 'e', 'g', 'i', 'j', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'v'), (' ', "'", ',', '-', '[', ']', 'a', 'c', 'd', 'e', 'g', 'h', 'i', 'j', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v'), (' ', '#', "'", ',', '-', '.', '[', ']', 'a', 'c', 'd', 'e', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'v', 'w', 'y'), (' ', "'", ',', '-', '[', ']', 'a', 'b', 'c', 'd', 'e', 'h', 'i', 'l', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w'), (' ', "'", '+', ',', '-', '1', '[', ']', 'a', 'c', 'e', 'f', 'g', 'i', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u')]
True: [(' ', "'", ',', '[', ']', 'a', 'd', 'e', 'h', 'i', 'j', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u'), (' ', "'", ',', '[', ']', 'a', 'c', 'e', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'p', 'r', 's', 't', 'v'), (' ', '#', "'", ',', '-', '.', '[', ']

In [38]:
# Calcul du score Jaccard pour tous les tags
jac_scor_rfc_use = jaccard_score(y_test_use_binarized, y_test_rfc_use_predicted_labels, average='weighted') 
ac_scor_rfc_use = accuracy_score(y_test_use_binarized, y_test_rfc_use_predicted_labels)
f1_scor_rfc_use = f1_score(y_test_use_binarized, y_test_rfc_use_predicted_labels, average = "weighted")
prec_scor_rfc_use = precision_score(y_test_use_binarized, y_test_rfc_use_predicted_labels, average = "weighted")
rec_scor_rfc_use = recall_score(y_test_use_binarized, y_test_rfc_use_predicted_labels, average = "weighted")

scores_rfc_use = [jac_scor_rfc_use, ac_scor_rfc_use, f1_scor_rfc_use, prec_scor_rfc_use, rec_scor_rfc_use]
print(scores_rfc_use)

[0.7712308758985474, 0.0011470082202255784, 0.8412617399107408, 0.8490487791697129, 0.8685406410300548]


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Tableau récapitulatif

df_metrics_compare = pd.DataFrame(
    index =["W2V - RL", "W2V - SGDC", "W2V - RF"
                ,"CV - RL", "CV - SGDC", 'CV - RF'
                ,"TF - RL", "TF - SGDC", 'TF - RF'
                ,"USE - RL", "USE - SGDC", 'USE - RF'
                #,"BERT - RL", "BERT - SGDC", 'BERT - RF'
            ],
    
    data = [scores_rl_w2v, scores_sgdc_w2v, scores_rfc_w2v
                , scores_rl_cv, scores_sgdc_cv, scores_rf_cv
                , scores_rl_tf, scores_sgdc_tf, scores_rfc_tf
                #, scores_rl_bert, scores_sgdc_bert, scores_rf_bert
                , scores_rl_use, scores_sgdc_use, scores_rfc_use
               ],

                
    columns=["Jaccard", "Accuracy", "F1", "Precision", "Recall"]).T


In [None]:
# Mean scores
df_metrics_compare.loc["mean"] = df_metrics_compare.mean()

In [None]:
df_metrics_compare

Unnamed: 0,W2V - RL,W2V - SGDC,W2V - RF,CV - RL,CV - SGDC,CV - RF,TF - RL,TF - SGDC,TF - RF,USE - RL,USE - SGDC,USE - RF
Jaccard,0.160442,0.129794,0.060781,0.42233,0.010088,0.277058,0.379594,0.298339,0.396181,0.383616,0.268638,0.325982
Accuracy,0.108347,0.106544,0.076798,0.100775,0.061114,0.170723,0.224626,0.193258,0.180818,0.221561,0.173968,0.188751
F1,0.265168,0.217026,0.108966,0.577192,0.019411,0.393203,0.526503,0.430747,0.544787,0.533368,0.37777,0.463484
Precision,0.530518,0.535871,0.437806,0.482195,0.318617,0.660717,0.708699,0.716595,0.592142,0.69301,0.645767,0.673051
Recall,0.182031,0.144221,0.066168,0.755437,0.010124,0.319366,0.434787,0.328803,0.55134,0.449025,0.30354,0.382753
mean,0.249301,0.226691,0.150104,0.467586,0.083871,0.364213,0.454842,0.393548,0.453054,0.456116,0.353937,0.406804


In [46]:
# Tableau récapitulatif BERT - USE

df_metrics_compare_bert_use = pd.DataFrame(
    index =[ 
                "BERT HF - RL", "BERT HF - SGDC", 'BERT HF - RF'
                ,"BERT TF - RL", "BERT TF - SGDC", 'BERT TF - RF'
                ,"USE - RL", "USE - SGDC", 'USE - RF'
            ],
    
    data = [
                scores_rl_bert_huggingface, scores_sgdc_bert_huggingface, scores_rfc_bert_huggingface
                , scores_rl_bert_hub_tf, scores_sgdc_bert_hub_tf, scores_rfc_bert_hub_tf
                , scores_rl_use, scores_sgdc_use, scores_rfc_use
               ],

                
    columns=["Jaccard", "Accuracy", "F1", "Precision", "Recall"]).T

# Mean scores
df_metrics_compare_bert_use.loc["mean"] = df_metrics_compare_bert_use.mean()

df_metrics_compare_bert_use

Unnamed: 0,BERT HF - RL,BERT HF - SGDC,BERT HF - RF,BERT TF - RL,BERT TF - SGDC,BERT TF - RF,USE - RL,USE - SGDC,USE - RF
Jaccard,0.67718,0.680855,0.730316,0.691499,0.706645,0.737376,0.646127,0.609971,0.771231
Accuracy,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.001147
F1,0.764567,0.771975,0.795186,0.781285,0.773366,0.805502,0.688463,0.662655,0.841262
Precision,0.792883,0.781816,0.791167,0.807121,0.831813,0.80163,0.688033,0.678238,0.849049
Recall,0.745856,0.76343,0.835764,0.762161,0.775387,0.841109,0.702215,0.661764,0.868541
mean,0.596097,0.599615,0.630687,0.608413,0.617442,0.637123,0.544968,0.522525,0.666246


F1 : combinaison recall et precision

In [None]:
best_model = df_metrics_compare.loc["mean"].idxmax()
print(f"Best model is : {best_model}")

Best model is : CV - RL


In [None]:
# Export model
import joblib
joblib.dump(multi_lr_cv, f"{PATH_MODELS}/best_model.pkl")

['best_model.pkl']