# Projet 6 : Catégorisez automatiquement des questions
# <u>C. Méthodes supervisées</u> <br/>

In [1]:
#import os
import numpy as np
import pandas as pd
from collections import Counter
from ast import literal_eval
from time import time

from sklearn import model_selection, metrics
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import cross_val_predict

from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB


import warnings; 
#warnings.simplefilter('always') 
warnings.simplefilter('ignore') 

# 1. Chargement des données pré-traitées

Nos données sont réparties dans 5 fichiers représentant une taille totale de 0,12Go.

In [2]:
df = pd.read_csv('cleaned_data.csv')
#replace NaN by empty string
df = df.replace(np.nan, '', regex=True)
df['TAGS_P'] = df['TAGS_P'].apply(literal_eval)

In [3]:
df.shape

(64432, 7)

In [4]:
df.head()

Unnamed: 0,TITLE,BODY,SCORE,TAGS,TITLE_P,BODY_P,TAGS_P
0,Java generics variable <T> value,<p>At the moment I am using the following code...,6,<java><generics>,java gener variabl valu,moment use follow code filter jpa reduc block ...,"[java, generics]"
1,How a value typed variable is copied when it i...,<blockquote>\n <p>Swift's string type is a va...,6,<swift><function><value-type>,valu type variabl copi pass function hold copi,swift string type valu type creat new string v...,"[swift, function, value-type]"
2,Error while waiting for device: The emulator p...,<p>I am a freshman for the development of the ...,6,<android><android-studio><android-emulator><avd>,error wait devic emul process avd kill,freshman develop andriod suffer odd question r...,"[android, android-studio, android-emulator, avd]"
3,gulp-inject not working with gulp-watch,<p>I am using gulp-inject to auto add SASS imp...,10,<javascript><node.js><npm><gulp><gulp-watch>,gulp inject work gulp watch,use gulp inject auto add sass import newli cre...,"[javascript, node.js, npm, gulp, gulp-watch]"
4,React - Call function on props change,<p>My TranslationDetail component is passed an...,12,<reactjs><react-router>,react call function prop chang,translationdetail compon pass id upon open bas...,"[reactjs, react-router]"


# 2. Transformation des données

## 2.1 Echantillonage

Travaillons sur un échantillon de 25 000 posts.

In [5]:
df_sample = df.sample(25000)

In [6]:
df_sample.shape

(25000, 7)

- Gardons 15 000 données pour l'apprentissage'
- Et 10 000 pour vérifier la pertinence de  nos modèles

In [7]:
df_learn = df_sample.iloc[10000:, :].copy()
df_validation = df_sample.iloc[:10000, :].copy()

In [8]:
display(df_learn.shape)
display(df_validation.shape)

(15000, 7)

(10000, 7)

## 2.2 Filtre sur les tags les plus fréquents

Pour chaque tag on stocke son nombre d'occurences.

In [9]:
counts = Counter()
for tags_list in df['TAGS_P']:
    counts.update(tags_list)
tags_df = pd.DataFrame.from_dict(counts, orient='index')
tags_df.reset_index(drop = False, inplace = True)
tags_df= tags_df.rename(columns={'index':'tag', 0:'count'})

La structures tags_df contient pour chacun des tags son occurence. <br/>
Gardons que les tags qui sont présents dans au moins 50 documents pour l'apprentissage.

In [10]:
frequent_tags = tags_df[tags_df['count'] > 50]['tag'].tolist()
df_learn['TAGS_P'] = df_learn['TAGS_P'].apply(lambda x: [w for w in x if w in frequent_tags] )
# On supprime les lignes qui n'ont plus de tags associés (car aucun n'est présent dans la liste frequent_tags)
df_learn = df_learn[df_learn.astype(str)['TAGS_P'] != '[]']

In [11]:
len(frequent_tags)

540

Il nous reste un peu plus de 1100 tags différents.

In [12]:
df_learn.shape

(14529, 7)

## 2.3 Découpage en jeu entrainement et test

In [13]:
X = df_learn['TITLE_P'] + ' ' + df_learn['BODY_P']
Y = df_learn['TAGS_P']

Gardons 70% des données pour l'entrainement et 30% pour les tests.

In [14]:
x_train, x_test, y_train, y_test = model_selection.train_test_split(X,Y,test_size = 0.3,random_state = 0, shuffle = True)

In [15]:
print("train", x_train.shape)
print("test ",x_test.shape)

train (10170,)
test  (4359,)


Préparons également les données non filtrés (tags les plus fréquents) pour nos tests.

In [16]:
x_validation = df_validation['TITLE_P'] + ' ' + df_validation['BODY_P']
y_validation = df_validation['TAGS_P']

### Cible = Multi labels 

Notre variable cible est composée de plusieurs valeurs de tags.<br/>
Nous allons transformer nos tags en matrice binaire indiquant la présence ou pas d'un tag'

In [17]:
mlb = MultiLabelBinarizer(classes=frequent_tags)

In [18]:
y_train_mlb = mlb.fit_transform(y_train)
y_test_mlb = mlb.fit_transform(y_test)

# 3. Evaluation des modéles

In [19]:
def getClassifierScore(y_true, y_predicted) :
    return metrics.f1_score(y_true, y_predicted, average='micro')

'''
Méthode générique pour faire une recherche sur grille et évaluer le modèle de classification.
Affiche les meilleurs paramètres et la précision du modèle.
'''
def evaluateClassifier(model, extra_param, x_train, y_train, x_test, y_test) :
    t0 = time()
    Kfold = 5
    parameters = { 
              'vect__min_df': [5],
              'vect__max_df': [0.95],
              'tfidf__use_idf': [True],
              'tfidf__norm' : ['l2']
             }
    parameters.update(extra_param)
    classifier = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(model))])
    
    gs_classifier = GridSearchCV(estimator = classifier, param_grid = parameters, cv = Kfold,  n_jobs=-1)
    fit = gs_classifier.fit(x_train, y_train)
    print("Best params :", gs_classifier.best_params_)
    y_pred = gs_classifier.predict(x_test)
    print("Classification score: {:.2f} % ".format(100*getClassifierScore(y_test,y_pred)))
    print("done in %0.3fs." % (time() - t0))
    return gs_classifier

In [20]:
def predict_tags(clf, text_data, mlabel_bin, num_tags):
    if hasattr(clf, 'decision_function'):
        predictions = clf.decision_function(text_data)
    elif hasattr(clf, 'predict_proba'):
        predictions = clf.predict_proba(text_data)
    else :
        return None
    top_classes= np.argsort(-predictions)[:,:num_tags]
    tags_pred = mlabel_bin.classes_[top_classes]
    y_predicted_df = pd.DataFrame(index=text_data.index)
    y_predicted_df['TAGS_P']=tags_pred.tolist()
    return y_predicted_df

In [21]:
'''
Méthode permettant d'évaluer la qualité des prédictions en comparant les tags prédits aux tags réels.
calcule pour chaque post, le rapport entre le nombre de tags correctement prédits sur le nombre de tags réels.
retourne la moyenne de ces rapports.
'''
def predictionAccuracy(y_true, y_predicted) :
    tags_found=[]
    for index, row in y_predicted.iterrows():
        number_tags_found = 0
        for t in row['TAGS_P'] :
            if t in y_true.loc[index]['TAGS_P'] :
                number_tags_found +=1
        tags_found.append(number_tags_found/len(y_true.loc[index]['TAGS_P']))
    print("Prediction accuracy: {:.2f} % ".format(100*np.mean(tags_found)))

## 3.1 Gaussian Naive Bayes

In [22]:
from sklearn.base import TransformerMixin
class DenseTransformer(TransformerMixin):

    def transform(self, X, y=None, **fit_params):
        return X.todense()

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self

In [23]:
g_nb = GaussianNB()
g_nb_pipeline = Pipeline([
    ('vect', CountVectorizer(min_df = 5,max_df= 0.95)),
    ('tfidf', TfidfTransformer()),
    ('to_dense', DenseTransformer()), 
    ('clf', OneVsRestClassifier(g_nb))])

In [24]:
t0 = time()
g_nb_pipeline.fit(x_train, y_train_mlb)
print("done in %0.3fs." % (time() - t0))

done in 1264.820s.


In [25]:
y_pred = predict_tags(g_nb_pipeline, x_test, mlb, 5)
y_true = y_test.to_frame()
predictionAccuracy(y_true, y_pred)

Prediction accuracy: 18.98 % 


In [26]:
y_pred = predict_tags(g_nb_pipeline, x_validation, mlb, 5)
y_true = y_validation.to_frame()
predictionAccuracy(y_true, y_pred)

Prediction accuracy: 13.62 % 


## 3.2 SVM Linéaire

In [27]:
svc = LinearSVC()
parameters = {'clf__estimator__C':np.logspace(-1,3,10)}
svc_grid = evaluateClassifier(svc, parameters, x_train, y_train_mlb, x_test, y_test_mlb )

Best params : {'clf__estimator__C': 2.1544346900318834, 'tfidf__norm': 'l2', 'tfidf__use_idf': True, 'vect__max_df': 0.95, 'vect__min_df': 5}
Classification score: 53.11 % 
done in 584.405s.


In [28]:
y_pred = predict_tags(svc_grid, x_test, mlb, 5)
y_true = y_test.to_frame()
predictionAccuracy(y_true, y_pred)

Prediction accuracy: 75.24 % 


In [29]:
y_pred = predict_tags(svc_grid, x_validation, mlb, 5)
y_true = y_validation.to_frame()
predictionAccuracy(y_true, y_pred)

Prediction accuracy: 53.17 % 


## 3.3 Decision Tree

In [30]:
dtree = DecisionTreeClassifier()
parameters = {'clf__estimator__criterion' : ['entropy', 'gini'], 
              'clf__estimator__max_depth': [1, 2, 3, 4]}
dtree_grid = evaluateClassifier(dtree,parameters, x_train, y_train_mlb, x_test, y_test_mlb)

Best params : {'clf__estimator__criterion': 'entropy', 'clf__estimator__max_depth': 2, 'tfidf__norm': 'l2', 'tfidf__use_idf': True, 'vect__max_df': 0.95, 'vect__min_df': 5}
Classification score: 52.03 % 
done in 1614.793s.


In [31]:
y_pred = predict_tags(dtree_grid, x_test, mlb, 5)
y_true = y_test.to_frame()
predictionAccuracy(y_true, y_pred)

Prediction accuracy: 70.48 % 


In [32]:
y_pred = predict_tags(dtree_grid, x_validation, mlb, 5)
y_true = y_validation.to_frame()
predictionAccuracy(y_true, y_pred)

Prediction accuracy: 49.50 % 


## 3.4 SGD Classifier

In [33]:
sgd = SGDClassifier(loss='log', max_iter=5, tol=None)
parameters = {'clf__estimator__alpha': (0.00001, 0.000001), 'clf__estimator__penalty': ('l2', 'elasticnet')}
sgd_grid = evaluateClassifier(sgd, parameters, x_train, y_train_mlb, x_test, y_test_mlb )

Best params : {'clf__estimator__alpha': 1e-05, 'clf__estimator__penalty': 'elasticnet', 'tfidf__norm': 'l2', 'tfidf__use_idf': True, 'vect__max_df': 0.95, 'vect__min_df': 5}
Classification score: 49.53 % 
done in 223.929s.


In [34]:
y_pred = predict_tags(sgd_grid, x_test, mlb, 5)
y_true = y_test.to_frame()
predictionAccuracy(y_true, y_pred)

Prediction accuracy: 76.40 % 


In [35]:
y_pred = predict_tags(sgd_grid, x_validation, mlb, 5)
y_true = y_validation.to_frame()
predictionAccuracy(y_true, y_pred)

Prediction accuracy: 54.09 % 


## 3.5 Random Forest

In [77]:
rfc = RandomForestClassifier(oob_score = True)
rfc_pipeline = Pipeline([
    ('vect', CountVectorizer(max_df=0.95, min_df=5)),
    ('tfidf', TfidfTransformer()),
    ('to_dense', DenseTransformer()), 
    ('clf', OneVsRestClassifier(rfc))])

In [83]:
t0 = time()
rfc_pipeline.fit(x_train, y_train_mlb)
print("done in %0.3fs." % (time() - t0))

done in 2083.522s.


In [84]:
y_pred = predict_tags(rfc_pipeline, x_test, mlb, 5)
y_true = y_test.to_frame()
predictionAccuracy(y_true, y_pred)

Prediction accuracy: 58.62 % 


In [85]:
y_pred = predict_tags(rfc_pipeline, x_validation, mlb, 5)
y_true = y_validation.to_frame()
predictionAccuracy(y_true, y_pred)

Prediction accuracy: 41.22 % 


## 3.6 Gradient Boosting

In [86]:
gb = GradientBoostingClassifier()
gb_pipeline = Pipeline([
    ('vect', CountVectorizer(max_df=0.95, min_df=5)),
    ('tfidf', TfidfTransformer()),
    ('to_dense', DenseTransformer()), 
    ('clf', OneVsRestClassifier(rfc))])

In [87]:
t0 = time()
gb_pipeline.fit(x_train, y_train_mlb)
print("done in %0.3fs." % (time() - t0))

done in 2243.466s.


In [88]:
y_pred = predict_tags(gb_pipeline, x_test, mlb, 5)
y_true = y_test.to_frame()
predictionAccuracy(y_true, y_pred)

Prediction accuracy: 57.70 % 


In [89]:
y_pred = predict_tags(gb_pipeline, x_validation, mlb, 5)
y_true = y_validation.to_frame()
predictionAccuracy(y_true, y_pred)

Prediction accuracy: 41.02 % 


# 4. Analyse des résultats

Si on se concentre sur la fiabilité des résultats pour le jeu de validation, nous avons :

### Modèles supervisés


|         | Gaussian Naive Bayes |      SVM Linéaire    | Decision Tree | SGD       | Random Forest |Gradient Boosting |
|---------|:---------------------:|:--------:|:-------------:|:-----------:|:-----------:|:-------------:|
| Scores  |     13.62 %           | 53.17 %  |    49.50 %     | **54.09 %**    | 41.22 %     |41.02 %        |

### Modèles non supervisés


|           | LDA        |      NMF   | 
|:---------:|:----------:|:----------:|
| Scores    |    27.94 % |  33.68 %   | 


=> Nous avons la meilleure performance avec l'algorithme SGD. C'est celui que nous garderons pour l'API finale.

## Sauvegarde des données

On sauve le classifier.

In [117]:
from sklearn.externals import joblib
joblib.dump(sgd_grid, './data/tags_SGDClassifier.pkl')

['./data/tags_SVCClassifier.pkl']

On sauve aussi le MultiLabelBinarizer.

In [116]:
joblib.dump(mlb, './data/tags_multiLabelBin.pkl')

['./data/tags_multiLabelBin.pkl']