# Projet 6 : Catégorisez automatiquement des questions
# <u>C. Méthodes supervisées</u> <br/>

In [1]:
#import os
import numpy as np
import pandas as pd
from collections import Counter
from ast import literal_eval
from time import time

from sklearn import model_selection, metrics
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier

from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier


import warnings; 
#warnings.simplefilter('always') 
warnings.simplefilter('ignore') 

# 1. Chargement des données pré-traitées

Nos données sont réparties dans 5 fichiers représentant une taille totale de 0,12Go.

In [2]:
# Load the pre-processed posts and blank out missing values in one chain,
# then turn the stringified tag lists back into real Python lists.
df = pd.read_csv('cleaned_data.csv').replace(np.nan, '', regex=True)
df['TAGS_P'] = df['TAGS_P'].map(literal_eval)

In [3]:
df.shape

(64432, 7)

In [4]:
df.head()

Unnamed: 0,TITLE,BODY,SCORE,TAGS,TITLE_P,BODY_P,TAGS_P
0,Java generics variable <T> value,<p>At the moment I am using the following code...,6,<java><generics>,java gener variabl valu,moment use follow code filter jpa reduc block ...,"[java, generics]"
1,How a value typed variable is copied when it i...,<blockquote>\n <p>Swift's string type is a va...,6,<swift><function><value-type>,valu type variabl copi pass function hold copi,swift string type valu type creat new string v...,"[swift, function, value-type]"
2,Error while waiting for device: The emulator p...,<p>I am a freshman for the development of the ...,6,<android><android-studio><android-emulator><avd>,error wait devic emul process avd kill,freshman develop andriod suffer odd question r...,"[android, android-studio, android-emulator, avd]"
3,gulp-inject not working with gulp-watch,<p>I am using gulp-inject to auto add SASS imp...,10,<javascript><node.js><npm><gulp><gulp-watch>,gulp inject work gulp watch,use gulp inject auto add sass import newli cre...,"[javascript, node.js, npm, gulp, gulp-watch]"
4,React - Call function on props change,<p>My TranslationDetail component is passed an...,12,<reactjs><react-router>,react call function prop chang,translationdetail compon pass id upon open bas...,"[reactjs, react-router]"


# 2. Transformation des données

## 2.1 Échantillonnage

Travaillons sur un échantillon de 25 000 posts : 15 000 pour l'apprentissage et 10 000 pour la validation.

In [33]:
# Draw a reproducible random sample: without a fixed seed every re-run of the
# notebook would yield different learn/validation sets and different scores.
df_sample = df.sample(25000, random_state=0)

In [40]:
df_sample.shape

(25000, 7)

In [51]:
# The first 10 000 sampled rows are held out for validation; every remaining
# row of the sample goes to the learning set.
df_validation = df_sample.iloc[:10000].copy()
df_learn = df_sample.iloc[10000:].copy()

In [52]:
display(df_learn.shape)
display(df_validation.shape)

(15000, 7)

(10000, 7)

## 2.2 Filtre sur les tags les plus fréquents

Pour chaque tag, on stocke son nombre d'occurrences.

In [7]:
# Number of occurrences of each tag over the whole dataset.
counts = Counter(tag for post_tags in df['TAGS_P'] for tag in post_tags)

# One row per tag, with columns 'tag' and 'count'.
tags_df = (
    pd.DataFrame.from_dict(counts, orient='index')
    .reset_index(drop=False)
    .rename(columns={'index': 'tag', 0: 'count'})
)

La structure `tags_df` contient, pour chaque tag, son nombre d'occurrences. <br/>
Ne gardons que les tags présents dans plus de 20 documents.

In [53]:
# Keep only the tags whose count is strictly greater than 20.
# `frequent_tags` stays a list (it is later passed as the ordered `classes` of
# the MultiLabelBinarizer); a set copy gives constant-time membership tests
# instead of scanning the list for every tag of every post.
frequent_tags = tags_df[tags_df['count'] > 20]['tag'].tolist()
frequent_tags_set = set(frequent_tags)
df_learn['TAGS_P'] = df_learn['TAGS_P'].apply(
    lambda tags: [tag for tag in tags if tag in frequent_tags_set]
)
# Drop the posts left without any tag (none of their tags is frequent enough).
# Checking the list length avoids casting the whole frame to strings.
df_learn = df_learn[df_learn['TAGS_P'].map(len) > 0]

In [54]:
len(frequent_tags)

1182

In [55]:
df_learn.shape

(14770, 7)

## 2.3 Découpage en jeux d'entraînement et de test

In [56]:
X = df_learn['TITLE_P'] + ' ' + df_learn['BODY_P']
Y = df_learn['TAGS_P']

In [57]:
x_train, x_test, y_train, y_test = model_selection.train_test_split(X,Y,test_size = 0.3,random_state = 0, shuffle = True)

In [58]:
print("train", x_train.shape)
print("test ",x_test.shape)

train (10339,)
test  (4431,)


In [61]:
x_validation = df_validation['TITLE_P'] + ' ' + df_validation['BODY_P']
y_validation = df_validation['TAGS_P']

### Cible = Multi labels 

Notre variable cible est composée de plusieurs valeurs de tags.<br/>
Nous allons transformer nos tags en une matrice binaire indiquant la présence ou l'absence de chaque tag.

In [59]:
mlb = MultiLabelBinarizer(classes=frequent_tags)

In [60]:
# Fit the binarizer on the training labels only, then reuse the fitted
# binarizer to encode the test labels (no re-fitting on test data).
y_train_mlb = mlb.fit_transform(y_train)
y_test_mlb = mlb.transform(y_test)

# 3. Évaluation des modèles

In [16]:
def getClassifierScore(y_true, y_predicted):
    """Return the micro-averaged F1 score of `y_predicted` against `y_true`.

    Both arguments are forwarded unchanged to `metrics.f1_score`.
    """
    micro_f1 = metrics.f1_score(y_true, y_predicted, average='micro')
    return micro_f1

def evaluateClassifier(model, extra_param, x_train, y_train, x_test, y_test,
                       scoring=None, cv=5):
    """Grid-search a text-classification pipeline and report its test score.

    The pipeline is CountVectorizer -> TfidfTransformer ->
    OneVsRestClassifier(model). The best parameters, the score returned by
    `getClassifierScore` on the test set and the elapsed time are printed.

    Parameters
    ----------
    model : estimator wrapped in a OneVsRestClassifier.
    extra_param : dict
        Extra grid entries (e.g. ``{'clf__estimator__C': [...]}``) merged into
        the fixed vectorizer/tf-idf grid; entries with the same key override
        the defaults.
    x_train, y_train : data passed to ``GridSearchCV.fit``.
    x_test, y_test : held-out data used for the printed score.
    scoring : optional
        Forwarded to ``GridSearchCV``. The default ``None`` keeps the previous
        behaviour (the pipeline's own ``score`` method selects the best
        parameters). Pass e.g. ``'f1_micro'`` to select models with the same
        metric that is printed.
    cv : int, default 5
        Cross-validation setting forwarded to ``GridSearchCV``.

    Returns
    -------
    The fitted ``GridSearchCV`` object.
    """
    t0 = time()
    parameters = {
        'vect__min_df': [5],
        'vect__max_df': [0.95],
        'tfidf__use_idf': [True],
        'tfidf__norm': ['l2'],
    }
    parameters.update(extra_param)
    classifier = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(model)),
    ])

    gs_classifier = GridSearchCV(estimator=classifier, param_grid=parameters,
                                 cv=cv, scoring=scoring, n_jobs=-1)
    gs_classifier.fit(x_train, y_train)
    print("Best params :", gs_classifier.best_params_)
    y_pred = gs_classifier.predict(x_test)
    print("Classification score: {:.2f} % ".format(100*getClassifierScore(y_test, y_pred)))
    print("done in %0.3fs." % (time() - t0))
    return gs_classifier

In [17]:
def predict_tags(clf, text_data, mlabel_bin, num_tags):
    """Return, for each document, the `num_tags` highest-scoring tags.

    Parameters
    ----------
    clf : fitted classifier exposing ``decision_function`` or ``predict_proba``
        (``decision_function`` is preferred when both exist).
    text_data : object with an ``index`` attribute (e.g. a pandas Series),
        passed as-is to the classifier.
    mlabel_bin : object whose ``classes_`` sequence maps column positions of
        the score matrix to tag names.
    num_tags : int
        Number of tags kept per document.

    Returns
    -------
    pandas.DataFrame indexed like `text_data` with a single 'TAGS_P' column
    holding the list of predicted tags, or ``None`` when `clf` exposes neither
    scoring method.
    """
    if hasattr(clf, 'decision_function'):
        scores = clf.decision_function(text_data)
    elif hasattr(clf, 'predict_proba'):
        scores = clf.predict_proba(text_data)
    else:
        return None
    # Column indices sorted by decreasing score; keep the first num_tags per row.
    # np.asarray lets the scores be a nested list as well as an ndarray.
    top_classes = np.argsort(-np.asarray(scores))[:, :num_tags]
    # np.asarray lets classes_ be any sequence (list, tuple), not only an
    # ndarray, so that fancy indexing with the index matrix always works.
    tags_pred = np.asarray(mlabel_bin.classes_)[top_classes]
    y_predicted_df = pd.DataFrame(index=text_data.index)
    y_predicted_df['TAGS_P'] = tags_pred.tolist()
    return y_predicted_df

In [18]:
def predictionAccuracy(y_true, y_predicted):
    """Print the mean share of true tags recovered by the predictions.

    For each row of `y_predicted`, the number of predicted tags found in the
    true tags of the same index label is divided by the number of true tags;
    the mean of these ratios is printed as a percentage.

    Posts whose true tag list is empty are skipped: their ratio is undefined
    and would otherwise raise ZeroDivisionError.

    Parameters
    ----------
    y_true : DataFrame with a 'TAGS_P' column of tag lists, indexed like
        `y_predicted`.
    y_predicted : DataFrame with a 'TAGS_P' column of predicted tag lists.

    Returns
    -------
    None; the result is only printed.
    """
    ratios = []
    for index, row in y_predicted.iterrows():
        true_tags = y_true.loc[index]['TAGS_P']
        if len(true_tags) == 0:
            # No reference tag for this post: nothing to recover, skip it.
            continue
        found = sum(1 for tag in row['TAGS_P'] if tag in true_tags)
        ratios.append(found / len(true_tags))
    # Keep the "nan" output of an empty mean without triggering numpy's warning.
    mean_ratio = np.mean(ratios) if ratios else float('nan')
    print("Prediction accuracy: {:.2f} % ".format(100*mean_ratio))

## 3.1 SVM Linéaire

In [19]:
svc = LinearSVC()
parameters = {'clf__estimator__C':np.logspace(-1,3,10)}
svc_grid = evaluateClassifier(svc, parameters, x_train, y_train_mlb, x_test, y_test_mlb )

Best params : {'clf__estimator__C': 2.1544346900318834, 'tfidf__norm': 'l2', 'tfidf__use_idf': True, 'vect__max_df': 0.95, 'vect__min_df': 5}
Classification score: 48.34 % 
done in 1139.806s.


In [20]:
y_pred = predict_tags(svc_grid, x_test, mlb, 5)
y_true = y_test.to_frame()
predictionAccuracy(y_true, y_pred)

Prediction accuracy: 53.47 % 


In [65]:
y_pred = predict_tags(svc_grid, x_validation, mlb, 5)
y_true = y_validation.to_frame()
predictionAccuracy(y_true, y_pred)

Prediction accuracy: 47.83 % 


## 3.2 Decision Tree

In [21]:
dtree = DecisionTreeClassifier()
parameters = {'clf__estimator__criterion' : ['entropy', 'gini'], 
              'clf__estimator__max_depth': [1, 2, 3, 4]}
dtree_grid = evaluateClassifier(dtree,parameters, x_train, y_train_mlb, x_test, y_test_mlb)

Best params : {'clf__estimator__criterion': 'gini', 'clf__estimator__max_depth': 1, 'tfidf__norm': 'l2', 'tfidf__use_idf': True, 'vect__max_df': 0.95, 'vect__min_df': 5}
Classification score: 46.57 % 
done in 3046.049s.


In [22]:
y_pred = predict_tags(dtree_grid, x_test, mlb, 5)
y_true = y_test.to_frame()
predictionAccuracy(y_true, y_pred)

Prediction accuracy: 57.02 % 


In [67]:
y_pred = predict_tags(dtree_grid, x_validation, mlb, 5)
y_true = y_validation.to_frame()
predictionAccuracy(y_true, y_pred)

Prediction accuracy: 46.29 % 


## 3.3 SGD Classifier

In [23]:
# SGD-trained linear classifier, grid-searched over the regularisation
# strength (alpha) and the penalty type.
# NOTE(review): max_iter=5 with tol=None presumably forces exactly 5 passes over
# the data with no convergence check — confirm this is intended.
# NOTE(review): recent scikit-learn releases appear to spell this loss
# 'log_loss' instead of 'log' — confirm against the installed version.
sgd = SGDClassifier(loss='log', max_iter=5, tol=None)
parameters = {'clf__estimator__alpha': (0.00001, 0.000001), 'clf__estimator__penalty': ('l2', 'elasticnet')}
sgd_grid = evaluateClassifier(sgd, parameters, x_train, y_train_mlb, x_test, y_test_mlb )

Best params : {'clf__estimator__alpha': 1e-06, 'clf__estimator__penalty': 'elasticnet', 'tfidf__norm': 'l2', 'tfidf__use_idf': True, 'vect__max_df': 0.95, 'vect__min_df': 5}
Classification score: 47.31 % 
done in 465.765s.


In [24]:
y_pred = predict_tags(sgd_grid, x_test, mlb, 5)
y_true = y_test.to_frame()
predictionAccuracy(y_true, y_pred)

Prediction accuracy: 50.36 % 


In [68]:
y_pred = predict_tags(sgd_grid, x_validation, mlb, 5)
y_true = y_validation.to_frame()
predictionAccuracy(y_true, y_pred)

Prediction accuracy: 46.09 % 


## 3.4 Random Forest

In [25]:
rfc = RandomForestClassifier(oob_score = True)
parameters = { 
    'clf__estimator__n_estimators': [50, 100, 200]
}
rfc_grid = evaluateClassifier(rfc,parameters, x_train, y_train_mlb, x_test, y_test_mlb )

KeyboardInterrupt: 

In [None]:
y_pred = predict_tags(rfc_grid, x_test, mlb, 5)
y_true = y_test.to_frame()
predictionAccuracy(y_true, y_pred)

## 3.5 Gradient Boosting

In [43]:
#gb = GradientBoostingClassifier()
#parameters = {'clf__estimator__n_estimators' : [10, 30, 50, 70, 90]}
#parameters = {}
#gb_grid = evaluateClassifier(gb, parameters, x_train, y_train_mlb, x_test, y_test_mlb , mlb, 5 )