# Projet 6 : Catégorisez automatiquement des questions
# <u>C. Méthodes supervisées</u> <br/>

In [58]:
#import os
import numpy as np
import pandas as pd
from collections import Counter
from ast import literal_eval
from time import time

from sklearn import model_selection, metrics
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier

from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import tree


import warnings; 
#warnings.simplefilter('always') 
warnings.simplefilter('ignore') 

# 1. Chargement des données pré-traitées

Nos données sont réparties dans 5 fichiers représentant une taille totale de 0,12Go.

In [2]:
# Load the pre-processed posts.
df = pd.read_csv('cleaned_data.csv')
# Replace missing values (NaN) by empty strings.
df = df.fillna('')
# TAGS_P is read back as text; parse each cell into a Python object with
# literal_eval. A cell that was missing (now '') becomes an empty tag list
# instead of making literal_eval raise on the empty string.
df['TAGS_P'] = df['TAGS_P'].apply(lambda raw: literal_eval(raw) if raw else [])

In [3]:
# Display the (rows, columns) dimensions of the loaded dataset.
df.shape

(64432, 7)

In [4]:
# Preview the first rows: raw columns plus their pre-processed *_P counterparts.
df.head()

Unnamed: 0,TITLE,BODY,SCORE,TAGS,TITLE_P,BODY_P,TAGS_P
0,Java generics variable <T> value,<p>At the moment I am using the following code...,6,<java><generics>,java gener variabl valu,moment use follow code filter jpa reduc block ...,"[java, generics]"
1,How a value typed variable is copied when it i...,<blockquote>\n <p>Swift's string type is a va...,6,<swift><function><value-type>,valu type variabl copi pass function hold copi,swift string type valu type creat new string v...,"[swift, function, value-type]"
2,Error while waiting for device: The emulator p...,<p>I am a freshman for the development of the ...,6,<android><android-studio><android-emulator><avd>,error wait devic emul process avd kill,freshman develop andriod suffer odd question r...,"[android, android-studio, android-emulator, avd]"
3,gulp-inject not working with gulp-watch,<p>I am using gulp-inject to auto add SASS imp...,10,<javascript><node.js><npm><gulp><gulp-watch>,gulp inject work gulp watch,use gulp inject auto add sass import newli cre...,"[javascript, node.js, npm, gulp, gulp-watch]"
4,React - Call function on props change,<p>My TranslationDetail component is passed an...,12,<reactjs><react-router>,react call function prop chang,translationdetail compon pass id upon open bas...,"[reactjs, react-router]"


# 2. Transformation des données

## 2.1 Échantillonnage

Travaillons sur un échantillon de 10 000 posts.

In [70]:
# Draw a random sample of 10,000 posts. The seed is fixed so that the same
# rows (and therefore the same downstream scores) are obtained on every run.
df_sample = df.sample(n=10000, random_state=0)

In [71]:
# Check the dimensions of the sample.
df_sample.shape

(10000, 7)

## 2.2 Filtre sur les tags les plus fréquents

Pour chaque tag, on stocke son nombre d'occurrences.

In [7]:
# Count how many times each tag appears across the TAGS_P lists of the full dataset.
counts = Counter(tag for tags_list in df['TAGS_P'] for tag in tags_list)

# Turn the counter into a two-column frame: one row per tag with its count.
tags_df = (
    pd.DataFrame.from_dict(counts, orient='index')
    .reset_index(drop=False)
    .rename(columns={'index': 'tag', 0: 'count'})
)

La structure tags_df contient, pour chacun des tags, son nombre d'occurrences. <br/>
Ne gardons que les tags qui apparaissent dans plus de 50 documents.

In [72]:
# Minimum number of occurrences (exclusive) for a tag to be kept as a target label.
MIN_TAG_COUNT = 50

# Ordered list of frequent tags; kept as a list because it is reused later
# as the class list of the label binarizer.
frequent_tags = tags_df.loc[tags_df['count'] > MIN_TAG_COUNT, 'tag'].tolist()
# Set copy used only for fast membership tests while filtering each post's tags.
frequent_tags_set = set(frequent_tags)

# Keep, for every post, only the tags that are frequent enough.
df_sample = df_sample.assign(
    TAGS_P=df_sample['TAGS_P'].apply(
        lambda tags: [tag for tag in tags if tag in frequent_tags_set]
    )
)
# Drop the rows left without any tag (none of their tags is in frequent_tags).
df_sample = df_sample[df_sample['TAGS_P'].map(len) > 0]

In [9]:
# Number of distinct tags retained as target labels.
len(frequent_tags)

540

In [73]:
# Dimensions of the sample once posts without any frequent tag have been removed.
df_sample.shape

(9681, 7)

## 2.3 Découpage en jeu entrainement et test

In [74]:
# Features: pre-processed title and body joined into one text per post.
X = df_sample['TITLE_P'] + ' ' + df_sample['BODY_P']
# Targets: the list of (frequent) tags attached to each post.
Y = df_sample['TAGS_P']

In [75]:
# Hold out 30% of the posts for testing; the split is shuffled with a fixed seed.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
    shuffle=True,
)

In [76]:
# Report the size of each split.
for split_label, split_data in (("train", x_train), ("test ", x_test)):
    print(split_label, split_data.shape)

train (6776,)
test  (2905,)


### Cible = Multi labels 

Notre variable cible est composée de plusieurs valeurs de tags.<br/>
Nous allons transformer nos tags en une matrice binaire indiquant la présence ou l'absence de chaque tag.

In [77]:
# Binarizer with a fixed class list: one output column per entry of frequent_tags,
# in that order.
mlb = MultiLabelBinarizer(classes=frequent_tags)

In [78]:
# Fit the binarizer on the training targets and encode them as a 0/1 indicator matrix.
y_train_mlb = mlb.fit_transform(y_train)
# The test targets are only transformed with the already-fitted binarizer:
# nothing should ever be (re)fitted on test data.
y_test_mlb = mlb.transform(y_test)

# 3. Évaluation des modèles

In [31]:
def getClassifierScore(y_true, y_predicted) :
    """Return the micro-averaged F1 score of `y_predicted` against `y_true`."""
    micro_f1 = metrics.f1_score(y_true, y_predicted, average='micro')
    return micro_f1

def evaluateClassifier(model, extra_param, x_train, y_train, x_test, y_test,
                       cv=5, scoring=None, n_jobs=None):
    """Grid-search a text classification pipeline and evaluate it on a test set.

    The pipeline is CountVectorizer -> TfidfTransformer -> OneVsRestClassifier(model).
    The best parameters, the test score computed by getClassifierScore and the
    elapsed time are printed.

    Parameters
    ----------
    model : estimator wrapped in OneVsRestClassifier.
    extra_param : dict or None
        Extra grid entries merged into the base grid; keys use the pipeline
        step names (e.g. 'clf__estimator__C').
    x_train, y_train : training texts and their binarized targets.
    x_test, y_test : test texts and their binarized targets.
    cv : number of cross-validation folds (default 5, the previous fixed value).
    scoring : forwarded to GridSearchCV. The default None keeps the previous
        behaviour (GridSearchCV's default scoring); pass e.g. 'f1_micro' to
        select candidates with the same metric that is reported on the test set.
    n_jobs : forwarded to GridSearchCV (None keeps the previous behaviour).

    Returns
    -------
    The fitted GridSearchCV object.
    """
    t0 = time()
    # Base grid: fixed vectorizer / tf-idf settings shared by every model.
    parameters = {
        'vect__min_df': [5],
        'vect__max_df': [0.95],
        'tfidf__use_idf': [True],
        'tfidf__norm': ['l2'],
    }
    # Tolerate None so callers without model-specific parameters need no dummy dict.
    parameters.update(extra_param or {})

    classifier = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(model)),
    ])

    gs_classifier = GridSearchCV(
        estimator=classifier,
        param_grid=parameters,
        cv=cv,
        scoring=scoring,
        n_jobs=n_jobs,
    )
    gs_classifier.fit(x_train, y_train)
    print("Best params :", gs_classifier.best_params_)

    # Score the selected model on the held-out data.
    y_pred = gs_classifier.predict(x_test)
    print("Classification score: {:.2f} % ".format(100*getClassifierScore(y_test,y_pred)))
    print("done in %0.3fs." % (time() - t0))
    return gs_classifier

In [36]:
def get_pertinent_tags(clf, text_data, mlabel_bin, num_tags):
    """Return, for each input text, the `num_tags` best-scoring tags.

    Scores come from `clf.decision_function` when the classifier has one,
    otherwise from `clf.predict_proba`. If the classifier exposes neither,
    None is returned. Tags are looked up in `mlabel_bin.classes_` and listed
    from highest to lowest score.
    """
    # Try the scoring methods in order of preference.
    for method_name in ('decision_function', 'predict_proba'):
        if hasattr(clf, method_name):
            scores = getattr(clf, method_name)(text_data)
            break
    else:
        return None

    # Sort each row by decreasing score and keep the leading column indices.
    ranked_columns = np.argsort(-scores)
    best_columns = ranked_columns[:, :num_tags]
    return mlabel_bin.classes_[best_columns]

## 3.1 SVM Linéaire

In [27]:
# Linear SVM: search the regularisation strength C over 10 log-spaced values
# between 1e-3 and 1e3.
svc = LinearSVC()
parameters = {
    'clf__estimator__C': np.logspace(start=-3, stop=3, num=10),
}
svc_grid = evaluateClassifier(
    model=svc,
    extra_param=parameters,
    x_train=x_train,
    y_train=y_train_mlb,
    x_test=x_test,
    y_test=y_test_mlb,
)

Best params : {'clf__estimator__C': 2.154434690031882, 'tfidf__use_idf': True, 'vect__max_df': 0.95, 'vect__min_df': 5}
Classification score: 52.95 % 
done in 1238.230s.


## 3.2 Decision Tree

In [33]:
# Decision tree: search the split criterion, the depth and the number of
# features considered at each split.
dtree = tree.DecisionTreeClassifier()
parameters = {'clf__estimator__criterion' : ['entropy', 'gini'],
              'clf__estimator__max_depth': [1, 2, 3, 4, 5],
              'clf__estimator__max_features': [1,2,3]}
# The model must be scored against the *test* targets (y_test_mlb); the
# training targets were passed here by mistake.
dtree_grid = evaluateClassifier(dtree, parameters, x_train, y_train_mlb, x_test, y_test_mlb)

Best params : {'clf__estimator__criterion': 'gini', 'clf__estimator__max_depth': 2, 'clf__estimator__max_features': 3, 'tfidf__norm': 'l2', 'tfidf__use_idf': True, 'vect__max_df': 0.95, 'vect__min_df': 5}
Classification score: 0.10 % 
done in 2116.483s.


## 3.3 Random Forest

In [38]:
# Random forest: search the number of trees and the number of features
# considered at each split.
rfc = RandomForestClassifier(oob_score = True)
parameters = {
    'clf__estimator__n_estimators': [50, 100, 200, 700],
    # 'auto' was dropped from this list: for classifiers it is the same setting
    # as 'sqrt' (so it only duplicated work) and recent scikit-learn rejects it.
    'clf__estimator__max_features': ['sqrt', 'log2']
}
# The model must be scored against the *test* targets (y_test_mlb); the
# training targets were passed here by mistake.
rfc_grid = evaluateClassifier(rfc, parameters, x_train, y_train_mlb, x_test, y_test_mlb)

KeyboardInterrupt: 

## 3.4 Gradient Boosting

In [None]:
# Gradient boosting with its default hyper-parameters: no model-specific grid
# is searched (an n_estimators grid of [10, 30, 50, 70, 90] is left disabled).
gb = GradientBoostingClassifier()
parameters = dict()
gb_grid = evaluateClassifier(
    model=gb,
    extra_param=parameters,
    x_train=x_train,
    y_train=y_train_mlb,
    x_test=x_test,
    y_test=y_test_mlb,
)