In [1]:
import numpy as np 
import pandas as pd 
pd.set_option('display.max_colwidth', -1)
import os
from time import time
from pprint import pprint
import collections

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")
sns.set(font_scale=1.3)

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib

import warnings
warnings.filterwarnings('ignore')

np.random.seed(37)

In [2]:
df_model = pd.read_pickle('../data/pickle_emotion/df_model_fr_train.p')
df_model_test = pd.read_pickle('../data/pickle_emotion/df_model_fr_test.p')
df_model.append(df_model_test)

Unnamed: 0,count_words,count_mentions,count_hashtags,count_capital_words,count_excl_quest_marks,count_urls,count_emojis,sentiment,origin_text,clean_text
0,24,3,0,0,1,0,0,MEPRIS,@PhanEd @AznAlainT @_Baekholic alors j\'ai une blague mais dure a comprendre On est ecologiques a nous 3 on se rebelle groupe Anti-Train !,alors blague dure comprendre ecologiques rebelle groupe anti train
1,21,0,0,0,0,0,0,DESACCORD,"Madame Ségolène Royale ministre de l\'écologie et du développement durable Non à la destruction des \""nuisibles\"" http//t.co/jUYWddlIMe",madame ségolène royale ministre écologie développement durable non destruction nuisibles juywddlime
2,20,0,0,0,0,0,0,VALORISATION,"Le nouveau Monsieur \"" développement durable \"" Jacques Tapin l’ex-élu municipal niortais vient d’être porté ... http//t.co/hrDGKOkyJd",nouveau monsieur développement durable jacques tapin l’ex élu municipal niortais vient d’être porté hrdgkokyjd
3,20,1,0,0,0,0,0,DESACCORD,@F_Choquette pourquoi une pétition contre le changement climatique si vous etes meme pas capable de traiter un simple dossier citoyen,pourquoi pétition contre changement climatique si etes meme pas capable traiter simple dossier citoyen
4,19,1,0,0,0,0,0,PLAISIR,Content de participer au groupe de travail @LaFabriqueEcolo sur mondialisation et développement durable beaucoup d\'échanges et d\'énergie,content participer groupe travail mondialisation développement durable beaucoup échanges énergie
5,26,0,2,0,0,0,0,DEVALORISATION,#prix des voyages #aériens départ France au même niveau en juin 2014 qu’en juin 2013 & baissent de 1\% au 1er semestre http//t.co/uU3djI8IU9,prix voyages aériens départ france niveau juin qu’en juin baissent er semestre uudjiiu
6,16,0,0,0,1,0,0,MEPRIS,La réponse aux ringards de Valeurs Actuelles http//t.co/v3Gc1L16M1 Les éoliennes défigurent la France ?,réponse ringards valeurs actuelles vgclm éoliennes défigurent france
7,12,2,0,1,0,0,0,VALORISATION,CC @SDE2014 @Solar_Decathlon // L’innovation rayonne à Versailles >> http//t.co/G0ShyTRJvq,cc l’innovation rayonne versailles gshytrjvq
8,14,0,1,0,0,0,0,MEPRIS,#développementdurable Les appareils connectés en mode veille un gâchis planétaire http//t.co/7fEIyJ3RsI,développementdurable appareils connectés mode veille gâchis planétaire feiyjrsi
9,25,0,0,0,0,0,0,INSATISFACTION,remarque si on utilisait ça pour les peintures ce serait plus écologiques bon l\'odeur serait pas excellente mais un peu de febreze et hop,remarque si utilisait ça peintures plus écologiques bon odeur pas excellente peu febreze hop


In [3]:
df_model.head()

Unnamed: 0,count_words,count_mentions,count_hashtags,count_capital_words,count_excl_quest_marks,count_urls,count_emojis,sentiment,origin_text,clean_text
0,24,3,0,0,1,0,0,MEPRIS,@PhanEd @AznAlainT @_Baekholic alors j\'ai une blague mais dure a comprendre On est ecologiques a nous 3 on se rebelle groupe Anti-Train !,alors blague dure comprendre ecologiques rebelle groupe anti train
1,21,0,0,0,0,0,0,DESACCORD,"Madame Ségolène Royale ministre de l\'écologie et du développement durable Non à la destruction des \""nuisibles\"" http//t.co/jUYWddlIMe",madame ségolène royale ministre écologie développement durable non destruction nuisibles juywddlime
2,20,0,0,0,0,0,0,VALORISATION,"Le nouveau Monsieur \"" développement durable \"" Jacques Tapin l’ex-élu municipal niortais vient d’être porté ... http//t.co/hrDGKOkyJd",nouveau monsieur développement durable jacques tapin l’ex élu municipal niortais vient d’être porté hrdgkokyjd
3,20,1,0,0,0,0,0,DESACCORD,@F_Choquette pourquoi une pétition contre le changement climatique si vous etes meme pas capable de traiter un simple dossier citoyen,pourquoi pétition contre changement climatique si etes meme pas capable traiter simple dossier citoyen
4,19,1,0,0,0,0,0,PLAISIR,Content de participer au groupe de travail @LaFabriqueEcolo sur mondialisation et développement durable beaucoup d\'échanges et d\'énergie,content participer groupe travail mondialisation développement durable beaucoup échanges énergie


In [4]:
X_train, X_test, y_train, y_test = train_test_split(df_model.drop('sentiment', axis=1), df_model.sentiment, test_size=0.1, random_state=37)

In [5]:
class ColumnExtractor(TransformerMixin, BaseEstimator):
    def __init__(self, cols):
        self.cols = cols

    def transform(self, X, **transform_params):
        return X[self.cols]

    def fit(self, X, y=None, **fit_params):
        return self

In [6]:
def grid_vect(clf, parameters_clf, X_train, X_test, parameters_text=None, vect=None):
    
    textcountscols = ['count_capital_words','count_emojis','count_excl_quest_marks','count_hashtags'
                      ,'count_mentions','count_urls','count_words']
    

    features = FeatureUnion([('textcounts', ColumnExtractor(cols=textcountscols))
                            , ('pipe', Pipeline([('cleantext', ColumnExtractor(cols='clean_text')), ('vect', vect)]))]
                            , n_jobs=-1)

    
    pipeline = Pipeline([
        ('features', features)
        , ('clf', clf)
    ])
    
    # Join the parameters dictionaries together
    parameters = dict()
    if parameters_text:
        parameters.update(parameters_text)
    parameters.update(parameters_clf)

    # initiate gridsearchCV with parameters and pipline
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=5)
    
    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)

    t0 = time()
    grid_search.fit(X_train, y_train)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best CV score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
        
    print("Test score with best_estimator_: %0.3f" % grid_search.best_estimator_.score(X_test, y_test))
    print("\n")
    print("Classification Report Test Data")
    print(classification_report(y_test, grid_search.best_estimator_.predict(X_test)))

    print("all results")
    means = grid_search.cv_results_['mean_test_score']
    stds = grid_search.cv_results_['std_test_score']
    params = grid_search.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
    
    return grid_search

In [7]:
# Parameter grid settings for the vectorizers (Count and TFIDF)
parameters_vect = {
    'features__pipe__vect__max_df': (0.25, 0.5, 0.75),
    'features__pipe__vect__ngram_range': ((1, 1), (1, 2)),
    'features__pipe__vect__min_df': (1,2)
}

# Parameter grid settings for MultinomialNB
parameters_mnb = {
    'clf__alpha': (0.25, 0.5, 0.75)
}
# Parameter grid settings for LogisticRegression
parameters_lg = {
    'clf__C': (0.25, 0.5, 1.0),
    'clf__penalty': ('l1', 'l2')
}

In [8]:
lg = LogisticRegression()

In [9]:
countvect = CountVectorizer()

In [10]:
# LogisticRegression
best_lg_countvect = grid_vect(lg, parameters_lg, X_train, X_test, parameters_text=parameters_vect, vect=countvect)

Performing grid search...
pipeline: ['features', 'clf']
parameters:
{'clf__C': (0.25, 0.5, 1.0),
 'clf__penalty': ('l1', 'l2'),
 'features__pipe__vect__max_df': (0.25, 0.5, 0.75),
 'features__pipe__vect__min_df': (1, 2),
 'features__pipe__vect__ngram_range': ((1, 1), (1, 2))}
Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   40.5s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  1.2min finished


done in 74.202s

Best CV score: 0.624
Best parameters set:
	clf__C: 1.0
	clf__penalty: 'l1'
	features__pipe__vect__max_df: 0.75
	features__pipe__vect__min_df: 2
	features__pipe__vect__ngram_range: (1, 2)
Test score with best_estimator_: 0.676


Classification Report Test Data
                   precision    recall  f1-score   support

           ACCORD       1.00      0.36      0.53        14
           COLERE       0.44      0.39      0.41        18
        DEPLAISIR       0.00      0.00      0.00         4
      DERANGEMENT       1.00      0.50      0.67         2
        DESACCORD       0.67      0.50      0.57        16
   DEVALORISATION       0.40      0.35      0.38        34
           MEPRIS       0.25      0.08      0.12        24
             PEUR       0.81      0.85      0.83        26
          PLAISIR       1.00      0.17      0.29         6
     SATISFACTION       0.60      0.50      0.55         6
SURPRISE_POSITIVE       0.00      0.00      0.00         1
        TRISTE

In [12]:
tfidfvect = TfidfVectorizer()

In [13]:
best_lg_tfidf = grid_vect(lg, parameters_lg, X_train, X_test, parameters_text=parameters_vect, vect=tfidfvect)

Performing grid search...
pipeline: ['features', 'clf']
parameters:
{'clf__C': (0.25, 0.5, 1.0),
 'clf__penalty': ('l1', 'l2'),
 'features__pipe__vect__max_df': (0.25, 0.5, 0.75),
 'features__pipe__vect__min_df': (1, 2),
 'features__pipe__vect__ngram_range': ((1, 1), (1, 2))}
Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   34.1s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  1.1min finished


done in 64.896s

Best CV score: 0.588
Best parameters set:
	clf__C: 1.0
	clf__penalty: 'l1'
	features__pipe__vect__max_df: 0.5
	features__pipe__vect__min_df: 2
	features__pipe__vect__ngram_range: (1, 1)
Test score with best_estimator_: 0.657


Classification Report Test Data
                   precision    recall  f1-score   support

           ACCORD       0.00      0.00      0.00        14
           COLERE       0.73      0.44      0.55        18
        DEPLAISIR       0.00      0.00      0.00         4
      DERANGEMENT       1.00      0.50      0.67         2
        DESACCORD       0.70      0.44      0.54        16
   DEVALORISATION       0.42      0.32      0.37        34
           MEPRIS       0.25      0.04      0.07        24
             PEUR       0.81      0.65      0.72        26
          PLAISIR       1.00      0.17      0.29         6
     SATISFACTION       0.75      0.50      0.60         6
SURPRISE_POSITIVE       0.00      0.00      0.00         1
        TRISTES

In [14]:
#max_df: 0.25 or maximum document frequency of 25%.
#min_df: 2 or the words need to appear in at least 2 tweets
#ngram_range: (1, 2), both single words as bi-grams are used
#clf__C: 1
#clf__penalty: l2
textcountscols = ['count_capital_words','count_emojis','count_excl_quest_marks','count_hashtags'
                      ,'count_mentions','count_urls','count_words']
    
features = FeatureUnion([('textcounts', ColumnExtractor(cols=textcountscols))
                         , ('pipe', Pipeline([('cleantext', ColumnExtractor(cols='clean_text'))
                                              , ('vect', CountVectorizer(max_df=0.75, min_df=2, ngram_range=(1,2)))]))]
                       , n_jobs=-1)

pipeline = Pipeline([
    ('features', features)
    , ('clf', LogisticRegression(C=1.0, penalty='l1'))
])

best_model = pipeline.fit(df_model.drop('sentiment', axis=1), df_model.sentiment)

In [15]:
df_model_pos = pd.read_pickle('../data/df_model_pos_fr.p')
best_model.predict(df_model_pos).tolist()

['VALORISATION', 'VALORISATION', 'VALORISATION']

In [16]:
df_model_neg = pd.read_pickle('../data/df_model_neg_fr.p')
best_model.predict(df_model_neg).tolist()

['VALORISATION', 'VALORISATION', 'VALORISATION']