# P6 - Catégorisez automatiquement des questions

## Librairies et méthodes d'importation

In [1]:
# Librairies classiques
import pandas as pd
import numpy as np
import datetime
import pickle
import os
from time import time

# Librairies de traitement de texte
import nltk
from nltk.corpus import stopwords
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score

from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Import/export helpers
import pickle
try:
    # joblib is distributed as a standalone package; recent scikit-learn
    # releases no longer expose it under sklearn.externals.
    import joblib
except ImportError:
    # Fallback for old environments that only provide the vendored copy.
    from sklearn.externals import joblib
# Directory (relative to the notebook) used by the save/load helpers and
# for the CSV files read further down.
CT_DIR = '../autotag/save/'

def save_obj(obj, name):
    """Pickle ``obj`` into ``CT_DIR`` under ``<name>.pkl``.

    Any existing file at that path is removed first (errors raised by the
    removal are ignored), then the object is dumped with the highest pickle
    protocol and the path is printed followed by 'saved'.
    """
    target = ''.join((CT_DIR, name, '.pkl'))
    try:
        os.remove(target)
    except OSError:
        # Removal is best effort; the write below proceeds regardless.
        pass
    with open(target, 'wb+') as handle:
        pickle.dump(obj, handle, pickle.HIGHEST_PROTOCOL)
    print(target, 'saved')

def load_obj(name):
    """Return the object unpickled from ``<name>.pkl`` inside ``CT_DIR``."""
    source = ''.join((CT_DIR, name, '.pkl'))
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file; only use this on files produced by a trusted source.
    with open(source, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded
    
def save_sklearn_obj(obj, name):
    """Dump ``obj`` with joblib into ``CT_DIR`` under ``<name>.pkl``.

    Any existing file at that path is removed first (errors raised by the
    removal are ignored); the path is printed followed by 'saved'.
    """
    target = ''.join((CT_DIR, name, '.pkl'))
    try:
        os.remove(target)
    except OSError:
        # Removal is best effort; the dump below proceeds regardless.
        pass
    joblib.dump(obj, target)
    print(target, 'saved')

def load_sklearn_obj(name):
    """Return the object loaded by joblib from ``<name>.pkl`` inside ``CT_DIR``."""
    source = ''.join((CT_DIR, name, '.pkl'))
    return joblib.load(source)

## Choix d'un nombre de tags

In [3]:
# Number of tags kept as classification targets
max_features_tags = 400

# Full tag list; each entry is indexed with [0] to get the tag name
# (presumably [tag, count] pairs sorted by decreasing count -- see the
# sample printed in the next cell).
top_tags = load_obj('top_tags')

# Names of the first max_features_tags entries. Slicing instead of indexing
# over range(max_features_tags) avoids an IndexError when the saved list
# holds fewer entries than requested.
max_tags = [entry[0] for entry in top_tags[:max_features_tags]]

# Drop the tags that are not part of the official classification list
def cleanTags(p_val, allowed_tags=None):
    """Keep only the whitespace-separated tags of ``p_val`` that are allowed.

    Parameters
    ----------
    p_val : any
        Value holding the tags; it is converted with ``str()`` and split on
        whitespace, so non-string values (e.g. NaN) are accepted.
    allowed_tags : container of str, optional
        Tags to keep. Defaults to the notebook-level ``max_tags`` list, which
        is what a plain ``Series.apply(cleanTags)`` call uses.

    Returns
    -------
    str
        The retained tags, in their original order, joined by single spaces
        (an empty string when nothing is retained).
    """
    if allowed_tags is None:
        allowed_tags = max_tags
    return " ".join(word for word in str(p_val).split() if word in allowed_tags)

In [4]:
# Preview the first ten entries of top_tags
print(top_tags[:10])

[['c#', 26446], ['.net', 16452], ['java', 13844], ['asp.net', 12347], ['javascript', 9584], ['c++', 9410], ['php', 8607], ['python', 6953], ['sql', 6542], ['sql-server', 5972]]


## Récupération des données

In [50]:
# Load df_train.csv and df_test.csv from CT_DIR and report the shape of each frame
df_train = pd.read_csv(CT_DIR + 'df_train.csv')
df_test = pd.read_csv(CT_DIR + 'df_test.csv')
print(df_train.shape)
print(df_test.shape)

(152855, 2)
(38214, 2)


In [51]:

# Optional sub-sampling: keep only the first rows of each frame.
# Set USE_SAMPLE to False to work on the full data set.
USE_SAMPLE = True
N_TRAIN_SAMPLE = 30000
N_TEST_SAMPLE = 3000

if USE_SAMPLE:
    # .copy() detaches the subsets from the full frames, so the column
    # assignments done in the following cells operate on independent frames
    # rather than on slices of the originals.
    df_train = df_train.iloc[:N_TRAIN_SAMPLE].copy()
    df_test = df_test.iloc[:N_TEST_SAMPLE].copy()
    print(df_train.shape)
    print(df_test.shape)

(30000, 2)
(3000, 2)


## Nettoyage

In [52]:
# Copy the TagsCleaned column into a new Tags column on both frames
df_train['Tags'] = df_train['TagsCleaned']
df_test['Tags'] = df_test['TagsCleaned']

In [53]:
# Remove every tag that is not part of the selected top tags
df_train['TagsCleaned'] = df_train['TagsCleaned'].apply(cleanTags)
df_test['TagsCleaned'] = df_test['TagsCleaned'].apply(cleanTags)

In [54]:
# Drop from the training set the rows that are left without any target tag
# (only df_train is filtered here; df_test is left as is)
df_train = df_train[df_train['TagsCleaned'].str.len()>0]
print(df_train.shape)

(28437, 3)


In [34]:
# Display the first rows of the training frame
df_train.head()

Unnamed: 0,TextCleaned,TagsCleaned,Tags
0,appli opac form use decim doubl valu want use ...,c# winforms,c# winforms type-conversion decimal opacity
1,percentag width child element absolut posit pa...,html css internet-explorer-7,html css css3 internet-explorer-7
2,calcul someon age c# given datetim repres pers...,c# .net datetime,c# .net datetime
3,calcul rel time c# given specif datetim valu d...,c# datetime time,c# datetime time datediff relative-time-span
4,determin user timezon standard way web server ...,javascript html browser,javascript html browser timezone timezoneoffset


In [23]:
# Display the first rows of the test frame
df_test.head()

Unnamed: 0,TextCleaned,TagsCleaned,Tags
0,javascript access deni object support properti...,jquery,jquery
1,suspend weblog datasourc command line wonder a...,database oracle jdbc,database oracle jdbc
2,creat gener object base type variabl need crea...,c# .net generics,c# .net generics
3,test iphon app limit network access way simul ...,iphone testing networking,iphone testing networking
4,wpf multipl valu properti would like appli sty...,wpf properties,wpf properties


## Mode supervisé - Analyse de Title+Body vs Tags

In [24]:
# Return, for every sample, the tags that receive the highest classifier scores
def get_best_tags(clf, X, lb, n_tags=5, b_save=False):
    """Rank the classes of ``lb`` by the scores ``clf`` assigns to ``X``.

    Parameters
    ----------
    clf : object
        Fitted model. ``decision_function`` is used when available, otherwise
        ``predict_proba``.
    X : any
        Samples passed unchanged to the scoring method.
    lb : object
        Object exposing ``classes_``, indexed with the selected positions.
    n_tags : int, default 5
        Number of best-scoring classes returned per sample.
    b_save : bool, default False
        When true, ``clf`` and ``lb`` are saved with ``save_obj`` under the
        names 'Classifier' and 'MultiLabelBinarizer'.

    Returns
    -------
    The entries of ``lb.classes_`` for the ``n_tags`` highest scores of each
    row, best first, or ``None`` when ``clf`` offers neither scoring method
    (nothing is saved in that case).
    """
    for method_name in ('decision_function', 'predict_proba'):
        if hasattr(clf, method_name):
            scores = getattr(clf, method_name)(X)
            break
    else:
        return None

    # argsort is ascending: reverse each row, then keep the leading n_tags columns.
    ascending = np.argsort(scores)
    top_positions = ascending[:, ::-1][:, :n_tags]

    if b_save:
        save_obj(clf, 'Classifier')
        save_obj(lb, 'MultiLabelBinarizer')

    return lb.classes_[top_positions]

In [55]:
# Features (TextCleaned) and targets (TagsCleaned) are taken from the two
# already separated frames; no train_test_split is performed here.
X_train, y_train = df_train['TextCleaned'], df_train['TagsCleaned']
X_test, y_test = df_test['TextCleaned'], df_test['TagsCleaned']

In [56]:
# Binarise the targets: each space-separated tag string becomes a list of
# tags, which the binarizer encodes over the fixed max_tags classes.
train_tag_lists = [tags.split() for tags in y_train]
test_tag_lists = [tags.split() for tags in y_test]

mlb = preprocessing.MultiLabelBinarizer(classes=max_tags)
mlb.fit(train_tag_lists)

y_train_mlb = mlb.transform(train_tag_lists)
y_test_mlb = mlb.transform(test_tag_lists)

In [27]:
def test_model(mod):
    """Grid-search a text classification pipeline built around ``mod`` and print a report.

    The pipeline chains CountVectorizer, TfidfTransformer and a
    OneVsRestClassifier wrapping ``mod``. It is fitted on the notebook-level
    ``X_train`` / ``y_train_mlb`` with a 5-fold GridSearchCV, then the best
    estimator is scored with ``accuracy_score`` on ``X_test`` / ``y_test_mlb``.

    Printed, in order: best CV score, best value of each searched parameter,
    test accuracy, the best tags (via ``get_best_tags`` and ``mlb``) of the
    first five test samples, and the elapsed time. Nothing is returned.
    """
    started = time()

    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(mod)),
    ]
    text_clf = Pipeline(steps)

    # Every entry holds a single candidate value. Alternative candidates kept
    # for reference: min_df (5, 10, 15); max_df (0.5, 0.75, 1.0);
    # max_features (None, 5000, 10000); ngram_range ((1, 1), (1, 2));
    # use_idf (True, False); norm ['l1', 'l2'].
    parameters = {
        'vect__min_df': [5],
        'vect__max_df': [0.5],
        'vect__max_features': [None],
        'vect__ngram_range': [(1, 2)],  # unigrams and bigrams
        'tfidf__use_idf': [True],
        'tfidf__norm': ['l2'],
    }

    search = GridSearchCV(text_clf, parameters, cv=5)
    search.fit(X_train, y_train_mlb)
    best_model = search.best_estimator_

    print("Best score: %0.3f" % search.best_score_)
    print("Best parameters set:")
    tuned = best_model.get_params()
    for key in sorted(parameters):
        print("\t%s: %r" % (key, tuned[key]))

    test_predictions = best_model.predict(X_test)
    print ("Accuracy Score: ", accuracy_score(y_test_mlb, test_predictions))

    best_tags_per_sample = get_best_tags(best_model, X_test, mlb)
    print(best_tags_per_sample[:5])
    print("done in %0.3fs." % (time() - started))
    

### LinearSVC - bon score, bonnes perf

In [139]:
# Evaluate LinearSVC with its default settings through test_model
test_model(LinearSVC())

Best score: 0.199
Best parameters set:
	tfidf__norm: 'l2'
	tfidf__use_idf: True
	vect__max_df: 0.5
	vect__max_features: None
	vect__min_df: 5
	vect__ngram_range: (1, 2)
Accuracy Score:  0.2321400533835767
../autotag/save/Classifier.pkl saved
../autotag/save/MultiLabelBinarizer.pkl saved
[['javascript' 'ajax' 'jquery' 'asp.net' 'debugging']
 ['jdbc' 'batch-file' 'command-line' 'java' 'windows']
 ['c#' 'generics' 'reflection' 'java' 'asp.net']
 ['iphone' 'cocoa-touch' 'networking' 'testing' 'objective-c']
 ['wpf' 'xaml' 'listbox' 'triggers' 'controls']]
done in 2648.384s.


### RandomForestClassifier - score moyen, temps moyen

In [120]:
# Evaluate RandomForestClassifier with its default settings through test_model
test_model(RandomForestClassifier())

Best score: 0.046
Best parameters set:
	tfidf__norm: 'l2'
	tfidf__use_idf: True
	vect__max_df: 0.5
	vect__max_features: None
	vect__min_df: 5
	vect__ngram_range: (1, 2)
Accuracy Score:  0.075
../autotag/save/Classifier.pkl saved
../autotag/save/MultiLabelBinarizer.pkl saved
[['javascript' 'asp.net' '.net' 'css' 'database']
 ['java' 'linq' 'batch-file' 'vim' 'design-patterns']
 ['c#' 'interface' 'generics' 'user-interface' 'version-control']
 ['iphone' 'linux' 'database' 'c#' 'controls']
 ['wpf' 'perl' 'css' 'graphics' 'file-io']]
done in 363.318s.


### SGDClassifier - score moyen, bonne perf

In [121]:
# SGDClassifier is imported here, like the other benchmark cells import their
# estimator, so that this cell does not fail with a NameError on a fresh kernel.
from sklearn.linear_model import SGDClassifier
# Evaluate a logistic-loss SGD classifier limited to 5 iterations (no tolerance-based stop)
test_model(SGDClassifier(loss="log", max_iter=5, tol=None))

Best score: 0.059
Best parameters set:
	tfidf__norm: 'l2'
	tfidf__use_idf: True
	vect__max_df: 0.5
	vect__max_features: None
	vect__min_df: 5
	vect__ngram_range: (1, 2)
Accuracy Score:  0.082
../autotag/save/Classifier.pkl saved
../autotag/save/MultiLabelBinarizer.pkl saved
[['fluent-nhibernate' 'flex3' 'sharepoint-2007' 'uitableview' 'dynamic']
 ['dynamic' 'uitableview' 'fluent-nhibernate' 'flex3' 'silverlight-2.0']
 ['sharepoint-2007' 'flex3' 'uitableview' 'silverlight-2.0' 'dynamic']
 ['dynamic' 'uitableview' 'flex3' 'silverlight-2.0' 'fluent-nhibernate']
 ['sharepoint-2007' 'fluent-nhibernate' 'flex3' 'uitableview' 'dynamic']]
done in 64.331s.


### AdaBoostClassifier - bon score, temps médiocre

In [122]:
# Evaluate AdaBoostClassifier with its default settings through test_model
from sklearn.ensemble import AdaBoostClassifier
test_model(AdaBoostClassifier())

Best score: 0.137
Best parameters set:
	tfidf__norm: 'l2'
	tfidf__use_idf: True
	vect__max_df: 0.5
	vect__max_features: None
	vect__min_df: 5
	vect__ngram_range: (1, 2)
Accuracy Score:  0.146
../autotag/save/Classifier.pkl saved
../autotag/save/MultiLabelBinarizer.pkl saved
[['javascript' 'jquery' 'ajax' 'asp.net' 'dynamic']
 ['dynamic' 'flex3' 'uitableview' 'sharepoint-2007' 'silverlight-2.0']
 ['c#' 'fluent-nhibernate' 'flex3' 'dynamic' 'uitableview']
 ['networking' 'flex3' 'uitableview' 'dynamic' 'silverlight-2.0']
 ['colors' 'wpf' 'sharepoint-2007' 'silverlight-2.0' 'uitableview']]
done in 6639.907s.


### ExtraTreesClassifier - score faible, temps moyen

In [123]:
# Evaluate ExtraTreesClassifier with its default settings through test_model
from sklearn.ensemble import ExtraTreesClassifier
test_model(ExtraTreesClassifier())

Best score: 0.045
Best parameters set:
	tfidf__norm: 'l2'
	tfidf__use_idf: True
	vect__max_df: 0.5
	vect__max_features: None
	vect__min_df: 5
	vect__ngram_range: (1, 2)
Accuracy Score:  0.075
../autotag/save/Classifier.pkl saved
../autotag/save/MultiLabelBinarizer.pkl saved
[['asp.net' 'debugging' 'javascript' 'css' 'wpf']
 ['java' 'c#' 'java-ee' 'windows-vista' 'crystal-reports']
 ['c#' '.net' 'nhibernate' 'powershell' 'linq']
 ['iphone' 'sqlite' 'sockets' 'bash' 'ios']
 ['wpf' 'c#' 'internationalization' 'wcf' 'localization']]
done in 330.661s.


### GradientBoostingClassifier - score moyen, temps énorme

In [124]:
# Evaluate GradientBoostingClassifier with its default settings through test_model
from sklearn.ensemble import GradientBoostingClassifier
test_model(GradientBoostingClassifier())

Best score: 0.102
Best parameters set:
	tfidf__norm: 'l2'
	tfidf__use_idf: True
	vect__max_df: 0.5
	vect__max_features: None
	vect__min_df: 5
	vect__ngram_range: (1, 2)
Accuracy Score:  0.1
../autotag/save/Classifier.pkl saved
../autotag/save/MultiLabelBinarizer.pkl saved
[['javascript' 'flex3' 'dynamic' 'uitableview' 'silverlight-2.0']
 ['flex3' 'dynamic' 'uitableview' 'silverlight-2.0' 'fluent-nhibernate']
 ['interface' 'uitableview' 'flex3' 'sharepoint-2007' 'dynamic']
 ['ios' 'sharepoint-2007' 'dynamic' 'uitableview' 'silverlight-2.0']
 ['colors' 'wpf' 'uitableview' 'sharepoint-2007' 'silverlight-2.0']]
done in 62652.606s.


### GaussianProcessClassifier - incompatible avec les matrices creuses (TypeError)

In [125]:
# Attempt to evaluate GaussianProcessClassifier through test_model.
# NOTE: as the cell output shows, this call fails with
# "TypeError: A sparse matrix was passed, but dense data is required":
# the classifier rejects the sparse matrix it receives.
from sklearn.gaussian_process import GaussianProcessClassifier
test_model(GaussianProcessClassifier())

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

### Réduction dimensionnelle

In [None]:
# TF-IDF matrix of the training texts, built with the same vectorizer settings
# as the single-value grid used in test_model.
# ngram_range is passed as one (min_n, max_n) tuple: the one-element list
# form [(1, 2)] is only meaningful as a list of candidates in a parameter grid.
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=5, max_features=None, ngram_range=(1, 2))
tfidf = tfidf_vectorizer.fit_transform(X_train)

In [None]:
# Project the TF-IDF matrix onto 500 components and print the share of
# variance they retain.
from sklearn.decomposition import TruncatedSVD
# random_state is fixed so that re-running the cell gives the same projection
# and the same printed ratio.
svd = TruncatedSVD(n_components=500, random_state=0)
svdMatrix = svd.fit_transform(tfidf)
print(svd.explained_variance_ratio_.sum())

### Exploration LinearSVC

In [57]:
# Grid search dedicated to LinearSVC: same pipeline and report as test_model,
# but several max_features candidates are compared and the best estimator and
# the binarizer are saved (b_save=True below).
t0 = time()

# Bag-of-words counts -> TF-IDF weighting -> one LinearSVC per tag
pipeline = Pipeline([
('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
#('vect', TfidfVectorizer()),
('clf', OneVsRestClassifier(LinearSVC()))])

# Only vect__max_features has more than one candidate; the commented-out
# lines keep the alternative candidate sets for the other parameters.
parameters = {
    #'vect__min_df': (5, 10, 15),
    'vect__min_df': [5],
    #'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_df': [0.5],
    'vect__max_features': (None, 5000, 10000),
    #'vect__max_features': [None],
    #'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'vect__ngram_range': [(1, 2)],  # unigrams or bigrams
    #'tfidf__use_idf': (True, False),
    'tfidf__use_idf': [True],
    #'tfidf__norm': ['l1', 'l2'],
    'tfidf__norm': ['l2'],
}
# 5-fold cross-validated search; gs stays available to the following cells
gs = GridSearchCV(pipeline, parameters, cv=5)
gs.fit(X_train, y_train_mlb)

print("Best score: %0.3f" % gs.best_score_)
print("Best parameters set:")
best_parameters = gs.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

# Accuracy of the best estimator on the test set
print ("Accuracy Score: ", accuracy_score(y_test_mlb, gs.best_estimator_.predict(X_test)))

# Best tags per test sample; b_save=True also saves the estimator and mlb via save_obj
result = get_best_tags(gs.best_estimator_, X_test, mlb, b_save=True)
print(result[:5])
print("done in %0.3fs." % (time() - t0))
    

Best score: 0.169
Best parameters set:
	tfidf__norm: 'l2'
	tfidf__use_idf: True
	vect__max_df: 0.5
	vect__max_features: 5000
	vect__min_df: 5
	vect__ngram_range: (1, 2)
Accuracy Score:  0.19466666666666665
../autotag/save/Classifier.pkl saved
../autotag/save/MultiLabelBinarizer.pkl saved
[['javascript' 'ajax' 'jquery' 'asp.net' 'debugging']
 ['c#' 'asp.net' 'command-line' 'batch-file' 'windows']
 ['c#' 'generics' 'vb.net' 'code-generation' 'asp.net']
 ['iphone' 'ios' 'networking' 'cocoa-touch' 'xcode']
 ['wpf' 'colors' 'triggers' '.net' 'controls']]
done in 698.795s.


In [142]:
# Display the best pipeline found by the grid search
gs.best_estimator_

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=5,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip...lti_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=1))])