# P6 - Catégorisez automatiquement des questions

## Importation des librairies

In [1]:
# Librairies classiques
import pandas as pd
import numpy as np
import pickle
from time import time

# Librairies graphiques
import matplotlib.pyplot as plt
%matplotlib inline

# Librairies de traitement de texte
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Import / export helpers
import pickle
try:
    # joblib ships as a standalone package; scikit-learn 0.23 removed its vendored copy.
    import joblib
except ImportError:
    # Older environments: fall back to the copy bundled with scikit-learn.
    from sklearn.externals import joblib
# Directory (relative to the notebook) holding the saved objects and CSV files
CT_DIR = '../autotag/save/'

def load_obj(name):
    """Unpickle and return the object stored as ``<CT_DIR><name>.pkl``.

    Parameters
    ----------
    name : str
        Base name of the pickle file, without directory or ``.pkl`` suffix.

    Returns
    -------
    object
        Whatever object was pickled in that file.
    """
    # NOTE: pickle.load executes arbitrary code; only use on files you produced yourself.
    pickle_path = ''.join((CT_DIR, name, '.pkl'))
    with open(pickle_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded


## Récupération et visualisation des données

In [3]:
# Pickled tag collection, used later to score topic keywords
all_tags = load_obj('all_tags')

# Train / test splits saved as CSV next to the pickles
df_train, df_test = (
    pd.read_csv(CT_DIR + csv_name)
    for csv_name in ('df_train.csv', 'df_test.csv')
)

In [4]:
print(df_train.shape)
print(df_test.shape)
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
# (row-wise stacking, original indices kept).
dataraw = pd.concat([df_train, df_test])
print(dataraw.shape)

# Work on a 100000-row subsample; the fixed seed makes every re-run score the
# same documents, so the notes printed below are reproducible.
dataraw = dataraw.sample(100000, random_state=0)

# Last expression of the cell so the preview is actually rendered.
dataraw.head()

(152855, 2)
(38214, 2)
(191069, 2)


# Mode non supervisé - Analyse de Title+Body

On va maintenant essayer d'extraire des clusters de notre nouvelle feature, d'abord avec LDA, ensuite avec NMF.

In [5]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF

# English stop words from NLTK, stored as a set
mystops = set(stopwords.words("english"))

In [6]:
# Number of topics each model extracts (passed as n_components to LDA / NMF)
no_topics = 15
# Number of highest-weighted words kept per topic when scoring and displaying
no_top_words = 10

# Display the key words of each topic and score them against the known tags
def display_topics(model, feature_names, name, no_top_words, display=True, tags=None):
    """Score a fitted topic model by how many of its top words are known tags.

    For every row of ``model.components_`` the ``no_top_words`` highest-weighted
    features are looked up in ``feature_names`` and counted when they belong to
    ``tags``. The note is the total number of matches divided by the number of
    possible matches (topics x ``no_top_words``).

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_`` (one weight vector per topic).
    feature_names : sequence of str
        Vocabulary, indexed like the columns of ``model.components_``.
    name : str
        Label appended to the summary line.
    no_top_words : int
        Number of top words considered per topic.
    display : bool, default True
        When true, print one line per topic with its match count and top words.
    tags : container of str, optional
        Reference tags. Defaults to the notebook-level ``all_tags``.

    Returns
    -------
    float
        Ratio of matches to possible matches (0.0 when nothing can be matched).
    """
    if tags is None:
        tags = all_tags

    total_matches = 0
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending: walk it backwards to get the heaviest words first
        topic_top_words_index = topic.argsort()[:-no_top_words - 1:-1]
        top_words = [feature_names[i] for i in topic_top_words_index]
        n = sum(1 for word in top_words if word in tags)
        total_matches += n
        if display:
            print("Topic %d " % (topic_idx), end=' ')
            print("(%d matchs) :" % (n), end=' ')
            print(" ".join(top_words))

    # Use the model's real topic count rather than a notebook global, so the
    # note stays correct for any n_components.
    nmax = len(model.components_) * no_top_words
    note = total_matches / nmax if nmax else 0.0
    print("Note=%.2f (%d matches sur %d possibles - %s)" % (note, total_matches, nmax, name))
    return note

def score(vect, mod, name, display=False):
    """Vectorize ``dataraw['TextCleaned']``, fit a topic model and return its note.

    Parameters
    ----------
    vect : vectorizer
        Unfitted text vectorizer providing ``fit_transform`` and a
        feature-name accessor.
    mod : estimator
        Unfitted topic model providing ``fit``, ``transform`` and, once fitted,
        ``components_``.
    name : str
        Label forwarded to ``display_topics`` for the summary line.
    display : bool, default False
        When true, also print the fit duration, the per-topic detail and the
        dominant topic of the first documents.

    Returns
    -------
    float
        The note computed by ``display_topics``.
    """
    t0 = time()
    tf = vect.fit_transform(dataraw['TextCleaned'])
    mod.fit(tf)
    if display:
        print("done in %0.3fs." % (time() - t0))

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back for older releases.
    if hasattr(vect, 'get_feature_names_out'):
        feature_names = vect.get_feature_names_out()
    else:
        feature_names = vect.get_feature_names()
    note = display_topics(mod, feature_names, name, no_top_words, display=display)

    if display:
        print()
        predict = mod.transform(tf)
        # Preview at most 10 documents, without failing on smaller corpora
        for n in range(min(10, predict.shape[0])):
            topic_most_pr = predict[n].argmax()
            print("doc {}, topic {}, {}...".format(n, topic_most_pr, dataraw.TextCleaned.iloc[n][:50]))

    return note

## LDA

In [7]:
# LDA on raw term counts (CountVectorizer)

def scoreLDA(max_df, min_df, max_features, display=False):
    """Build a CountVectorizer + LDA pair and return the note given by ``score``.

    Parameters
    ----------
    max_df, min_df, max_features
        Forwarded unchanged to ``CountVectorizer``.
    display : bool, default False
        Forwarded to ``score`` to switch on the detailed output.

    Returns
    -------
    float
        The note returned by ``score``.
    """
    counter = CountVectorizer(
        max_df=max_df,
        min_df=min_df,
        max_features=max_features,
        stop_words='english',
    )
    lda = LatentDirichletAllocation(
        n_components=no_topics,
        max_iter=10,
        learning_method='online',
        learning_offset=50.,
        random_state=0,
    )
    label = 'LDA max=%.1f - min=%i - feat=%i' % (max_df, min_df, max_features)
    return score(counter, lda, label, display)

## NMF

In [8]:
# NMF on TF-IDF weights (TfidfVectorizer)

def scoreNMF(max_df, min_df, max_features, loss='frobenius', display=False):
    """Build a TfidfVectorizer + NMF pair and return the note given by ``score``.

    Parameters
    ----------
    max_df, min_df, max_features
        Forwarded unchanged to ``TfidfVectorizer``.
    loss : str, default 'frobenius'
        Forwarded to ``NMF`` as ``beta_loss``.
    display : bool, default False
        Forwarded to ``score`` to switch on the detailed output.

    Returns
    -------
    float
        The note returned by ``score``.
    """
    vect = TfidfVectorizer(max_df=max_df, min_df=min_df, max_features=max_features, stop_words='english')

    # NOTE(review): `alpha` was deprecated in scikit-learn 1.0 in favour of
    # alpha_W / alpha_H, which are scaled differently. Confirm the intended
    # regularisation before porting this call to a recent release.
    mod = NMF(n_components=no_topics, alpha=.1, l1_ratio=.5, beta_loss=loss, solver='mu', max_iter=200, random_state=0)
    # Include the loss in the label so runs that differ only by beta_loss can be
    # told apart in the printed summary.
    name = 'NMF max=%.1f - min=%i - feat=%i - loss=%s' % (max_df, min_df, max_features, loss)
    return score(vect, mod, name, display)

## Comparaison

In [9]:
# Sweep one vectorizer parameter at a time around the baseline
# (max_df=.8, min_df=5, max_features=10000), scoring LDA then NMF for each value.
t0 = time()

p_min = [5, 10, 20]
p_max = [.5, .8, .9]
p_feat = [1000, 10000, 100000]

# (printed header, values to try, how a value maps to (max_df, min_df, max_features))
sweeps = [
    ("MIN", p_min, lambda value: (.8, value, 10000)),
    ("MAX", p_max, lambda value: (value, 5, 10000)),
    ("FEATURES", p_feat, lambda value: (.8, 5, value)),
]
for header, values, build_args in sweeps:
    print(header)
    for m in values:
        args = build_args(m)
        scoreLDA(*args)
        scoreNMF(*args)

# Compare the two NMF loss functions on the baseline settings
print("NMF loss")
for nmf_loss in ('frobenius', 'kullback-leibler'):
    scoreNMF(.8, 5, 10000, loss=nmf_loss)

print("done in %0.3fs." % (time() - t0))

MIN
Note=0.51 (77 matches sur 150 possibles - LDA max=0.8 - min=5 - feat=10000)
Note=0.47 (70 matches sur 150 possibles - NMF max=0.8 - min=5 - feat=10000)
Note=0.44 (66 matches sur 150 possibles - LDA max=0.8 - min=10 - feat=10000)
Note=0.48 (72 matches sur 150 possibles - NMF max=0.8 - min=10 - feat=10000)
Note=0.47 (70 matches sur 150 possibles - LDA max=0.8 - min=20 - feat=10000)
Note=0.48 (72 matches sur 150 possibles - NMF max=0.8 - min=20 - feat=10000)
MAX
Note=0.42 (63 matches sur 150 possibles - LDA max=0.5 - min=5 - feat=10000)
Note=0.49 (74 matches sur 150 possibles - NMF max=0.5 - min=5 - feat=10000)
Note=0.51 (77 matches sur 150 possibles - LDA max=0.8 - min=5 - feat=10000)
Note=0.47 (70 matches sur 150 possibles - NMF max=0.8 - min=5 - feat=10000)
Note=0.51 (77 matches sur 150 possibles - LDA max=0.9 - min=5 - feat=10000)
Note=0.47 (70 matches sur 150 possibles - NMF max=0.9 - min=5 - feat=10000)
FEATURES
Note=0.45 (68 matches sur 150 possibles - LDA max=0.8 - min=5 - fea

In [10]:
# Detailed output for the best-scoring configuration
scoreLDA(max_df=.8, min_df=5, max_features=10000, display=True)

done in 736.365s.
Topic 0  (7 matchs) : file line use text string data print read output format
Topic 1  (3 matchs) : test date 10 key 00 2009 time unit 12 datetim
Topic 2  (5 matchs) : page html form asp control javascript text id function click
Topic 3  (5 matchs) : file window use run project applic instal work visual version
Topic 4  (4 matchs) : tabl sql databas queri data row column id select use
Topic 5  (6 matchs) : list user view item id model field key custom select
Topic 6  (2 matchs) : error messag log event tri session connect rubi assembl bar
Topic 7  (5 matchs) : imag width div style color height text bind li background
Topic 8  (5 matchs) : thread product report process program start time excel year day
Topic 9  (5 matchs) : java xml org eclips apach class com xsl jar hibern
Topic 10  (1 matchs) : use like way need code know applic look want net
Topic 11  (6 matchs) : number match express point doubl regex anim draw frame algorithm
Topic 12  (7 matchs) : librari load me

0.5133333333333333