In [1]:
import numpy as np 
import pandas as pd 
# Show full cell text instead of truncating long strings. ``None`` is the
# supported "no limit" value; the legacy ``-1`` sentinel is deprecated since
# pandas 1.0 and rejected by newer releases.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # Older pandas (< 1.0) validates this option as an int and only knows -1.
    pd.set_option('display.max_colwidth', -1)
import os
from time import time
from pprint import pprint
import collections

import matplotlib.pyplot as plt
import seaborn as sns
# Configure the seaborn theme in a single call: dark grid background with
# fonts scaled by 1.3.
sns.set(style="darkgrid", font_scale=1.3)

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib

import warnings
# Suppress every warning for the rest of the session (this also hides
# deprecation notices emitted by the libraries imported above).
warnings.filterwarnings('ignore')

# Seed NumPy's global random number generator.
np.random.seed(37)

In [2]:
# Load the pre-processed modelling frame.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# from a trusted source.
df_model = pd.read_pickle('../../data/final/NotIot_clean_FR_news_lexic.p')
# Preview the first rows only instead of rendering the whole frame.
df_model.head(10)

Unnamed: 0,count_words,count_capital_words,count_excl_quest_marks,sentiment,origin_text,clean_text,lex_anger,lex_sadness,lex_joy,lex_fear,lex_surprise,lex_disgust,lex_neutre
0,12,0,0,joie,Un test pour prédire la rechute du cancer du sein est approuvé,test prédire rechute cancer sein approuvé,0.166667,0.333333,0.000000,0.333333,0.166667,0.166667,0.000000
1,11,0,0,tristesse,"Un allié de Saddam Hussein est pendu, selon un responsable irakien",allié saddam hussein pendu selon responsable irakien,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,5,1,0,joie,Vues et sons du CES,vues sons,0.000000,0.000000,0.000000,0.000000,0.000000,0.500000,0.000000
3,7,0,0,joie,Schuey voit Ferrari dévoiler une nouvelle voiture,schuey voit ferrari dévoiler nouvelle voiture,0.000000,0.000000,0.000000,0.166667,0.166667,0.000000,0.000000
4,15,0,0,peur,Les clôtures et les annulations sont les meilleurs conseils en cas d’épidémie de grippe,clôtures annulations meilleurs conseils cas d’épidémie grippe,0.111111,0.222222,0.000000,0.222222,0.111111,0.111111,0.000000
5,7,0,0,tristesse,Camions avalés dans l'effondrement du métro,camions avalés effondrement métro,0.000000,0.250000,0.000000,0.250000,0.250000,0.250000,0.000000
6,12,0,0,surprise,Une lettre de Sarkozy surprend l'audience sur les dessins animés français,lettre sarkozy surprend audience dessins animés français,0.000000,0.000000,0.142857,0.000000,0.000000,0.000000,0.000000
7,11,0,0,tristesse,"Construire un monument à un fils, un enfant à la fois",construire monument fils enfant fois,0.000000,0.000000,0.200000,0.000000,0.000000,0.000000,0.000000
8,14,0,0,surprise,Un législateur veut interdire les iPod dans les passages pour piétons à New York,législateur veut interdire ipod passages piétons new york,0.125000,0.125000,0.000000,0.125000,0.000000,0.000000,0.000000
9,7,0,0,tristesse,Diabetic attend des lunettes pendant des mois,diabetic attend lunettes pendant mois,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [3]:
#df_model.head()

In [4]:
# Hold out 10% of the rows as a test set; the target is the `sentiment`
# column and every other column stays in X. The fixed random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    df_model.drop(columns='sentiment'),
    df_model['sentiment'],
    test_size=0.1,
    random_state=37,
)

In [5]:
class ColumnExtractor(TransformerMixin, BaseEstimator):
    """Stateless transformer that selects ``cols`` from its input.

    Parameters
    ----------
    cols : label or list of labels
        Key handed to ``X[...]`` when :meth:`transform` is called.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None, **fit_params):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return ``X[self.cols]``; extra keyword arguments are ignored."""
        selected = X[self.cols]
        return selected

In [6]:
def grid_vect(clf, parameters_clf, X_train, X_test, parameters_text=None, vect=None,
              y_train=None, y_test=None):
    """Grid-search a vectorizer + classifier pipeline and print a report.

    The pipeline is ``features -> clf`` where ``features`` is a FeatureUnion
    holding a single sub-pipeline named ``pipe``: the ``clean_text`` column is
    extracted and fed to ``vect``. Grid keys therefore look like
    ``features__pipe__vect__<param>`` and ``clf__<param>``.

    Parameters
    ----------
    clf : estimator
        Classifier used as the final ``clf`` step.
    parameters_clf : dict
        Grid of classifier parameters.
    X_train, X_test : indexable by column name
        Training / test inputs; must support ``X['clean_text']``.
    parameters_text : dict, optional
        Grid of vectorizer parameters, merged with ``parameters_clf``.
    vect : vectorizer
        Text vectorizer used as the ``vect`` step. Required despite the
        ``None`` default, which is kept for signature compatibility.
    y_train, y_test : array-like, optional
        Training / test labels. When omitted, the module-level variables
        ``y_train`` / ``y_test`` are used, which is what this function
        always did before these parameters existed.

    Returns
    -------
    GridSearchCV
        The fitted grid-search object.

    Raises
    ------
    ValueError
        If ``vect`` is None.
    """
    # Fail fast: without a vectorizer the 'vect' step is empty and the search
    # only fails later, deep inside the parallel fits.
    if vect is None:
        raise ValueError("grid_vect requires a vectorizer: pass vect=CountVectorizer() or similar")

    # Backward compatibility: older calls relied on notebook-level labels.
    if y_train is None:
        y_train = globals()['y_train']
    if y_test is None:
        y_test = globals()['y_test']

    features = FeatureUnion([ ('pipe', Pipeline([('cleantext', ColumnExtractor(cols='clean_text')), ('vect', vect)]))]
                            , n_jobs=-1)

    pipeline = Pipeline([
        ('features', features)
        , ('clf', clf)
    ])

    # Join the parameter dictionaries together
    parameters = dict()
    if parameters_text:
        parameters.update(parameters_text)
    parameters.update(parameters_clf)

    # Initiate GridSearchCV with the merged parameters and the pipeline
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=5)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)

    t0 = time()
    grid_search.fit(X_train, y_train)
    print("done in %0.3fs" % (time() - t0))
    print()

    best_estimator = grid_search.best_estimator_

    print("Best CV score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = best_estimator.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    print("Test score with best_estimator_: %0.3f" % best_estimator.score(X_test, y_test))
    print("\n")
    print("Classification Report Test Data")
    print(classification_report(y_test, best_estimator.predict(X_test)))

    # Mean and standard deviation of the CV score for every candidate
    print("all results")
    means = grid_search.cv_results_['mean_test_score']
    stds = grid_search.cv_results_['std_test_score']
    params = grid_search.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))

    return grid_search

In [7]:
# Search space for the text vectorizer (used with both the Count and the
# TF-IDF vectorizer). Keys follow the nested step names: features -> pipe -> vect.
parameters_vect = dict(
    features__pipe__vect__max_df=(0.25, 0.5, 0.75),
    features__pipe__vect__ngram_range=((1, 1), (1, 2)),
    features__pipe__vect__min_df=(1, 2),
)

# Search space for the MultinomialNB smoothing parameter.
parameters_mnb = dict(clf__alpha=(0.25, 0.5, 0.75))

In [8]:
# Multinomial Naive Bayes classifier with default settings; it is passed as
# the `clf` argument to the grid searches below.
mnb = MultinomialNB()

In [9]:
# Count vectorizer with default settings; its max_df / min_df / ngram_range
# are varied through parameters_vect during the grid search.
countvect = CountVectorizer()

In [10]:
# Grid search: MultinomialNB on CountVectorizer features
best_mnb_countvect = grid_vect(
    clf=mnb,
    parameters_clf=parameters_mnb,
    X_train=X_train,
    X_test=X_test,
    parameters_text=parameters_vect,
    vect=countvect,
)

Performing grid search...
pipeline: ['features', 'clf']
parameters:
{'clf__alpha': (0.25, 0.5, 0.75),
 'features__pipe__vect__max_df': (0.25, 0.5, 0.75),
 'features__pipe__vect__min_df': (1, 2),
 'features__pipe__vect__ngram_range': ((1, 1), (1, 2))}
Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:   27.8s finished


done in 28.712s

Best CV score: 0.505
Best parameters set:
	clf__alpha: 0.75
	features__pipe__vect__max_df: 0.25
	features__pipe__vect__min_df: 1
	features__pipe__vect__ngram_range: (1, 2)
Test score with best_estimator_: 0.465


Classification Report Test Data
              precision    recall  f1-score   support

      colère       0.00      0.00      0.00         7
    dégoûter       0.00      0.00      0.00         1
        joie       0.58      0.72      0.65        43
        peur       0.35      0.41      0.38        17
    surprise       0.57      0.20      0.30        20
   tristesse       0.22      0.36      0.28        11

   micro avg       0.46      0.46      0.46        99
   macro avg       0.29      0.28      0.27        99
weighted avg       0.45      0.46      0.44        99

all results
0.483653 (0.023609) with: {'clf__alpha': 0.25, 'features__pipe__vect__max_df': 0.25, 'features__pipe__vect__min_df': 1, 'features__pipe__vect__ngram_range': (1, 1)}
0.492672 (0.019689

In [11]:
# TF-IDF vectorizer with default settings; its max_df / min_df / ngram_range
# are varied through parameters_vect during the grid search.
tfidfvect = TfidfVectorizer()

In [12]:
# Grid search: MultinomialNB on TF-IDF features
best_mnb_tfidf = grid_vect(
    clf=mnb,
    parameters_clf=parameters_mnb,
    X_train=X_train,
    X_test=X_test,
    parameters_text=parameters_vect,
    vect=tfidfvect,
)

Performing grid search...
pipeline: ['features', 'clf']
parameters:
{'clf__alpha': (0.25, 0.5, 0.75),
 'features__pipe__vect__max_df': (0.25, 0.5, 0.75),
 'features__pipe__vect__min_df': (1, 2),
 'features__pipe__vect__ngram_range': ((1, 1), (1, 2))}
Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:   22.3s finished


done in 23.448s

Best CV score: 0.498
Best parameters set:
	clf__alpha: 0.25
	features__pipe__vect__max_df: 0.25
	features__pipe__vect__min_df: 1
	features__pipe__vect__ngram_range: (1, 2)
Test score with best_estimator_: 0.455


Classification Report Test Data
              precision    recall  f1-score   support

      colère       0.00      0.00      0.00         7
    dégoûter       0.00      0.00      0.00         1
        joie       0.53      0.74      0.62        43
        peur       0.35      0.35      0.35        17
    surprise       0.43      0.15      0.22        20
   tristesse       0.27      0.36      0.31        11

   micro avg       0.45      0.45      0.45        99
   macro avg       0.26      0.27      0.25        99
weighted avg       0.41      0.45      0.41        99

all results
0.487035 (0.023967) with: {'clf__alpha': 0.25, 'features__pipe__vect__max_df': 0.25, 'features__pipe__vect__min_df': 1, 'features__pipe__vect__ngram_range': (1, 1)}
0.498309 (0.020478

In [13]:
# Hyper-parameter notes for the final model.
# NOTE(review): these values differ from the best parameters printed by the
# grid searches above (min_df 1, ngram_range (1, 2)) -- confirm which set is intended.
# max_df: 0.25, i.e. a maximum document frequency of 25%.
# min_df: 2, i.e. a word must appear in at least 2 documents.
# ngram_range: (1, 1)
# clf__alpha: 0.75
#
# The string literal below is a disabled draft of the final-model fit: it is
# never executed, the cell only evaluates to the string itself.
# NOTE(review): the draft lists count_* columns (count_emojis, count_hashtags,
# count_mentions, count_urls) that are not among the columns in the df_model
# preview, and its settings (min_df=1, alpha=0.25) differ from the notes above.
""""textcountscols = ['count_capital_words','count_emojis','count_excl_quest_marks','count_hashtags'
                      ,'count_mentions','count_urls','count_words']
    
features = FeatureUnion([('textcounts', ColumnExtractor(cols=textcountscols))
                         , ('pipe', Pipeline([('cleantext', ColumnExtractor(cols='clean_text'))
                                              , ('vect', CountVectorizer(max_df=0.25, min_df=1, ngram_range=(1,1)))]))]
                       , n_jobs=-1)

pipeline = Pipeline([
    ('features', features)
    , ('clf', MultinomialNB(alpha=0.25))
])

best_model = pipeline.fit(df_model.drop('sentiment', axis=1), df_model.sentiment)"""""

'"textcountscols = [\'count_capital_words\',\'count_emojis\',\'count_excl_quest_marks\',\'count_hashtags\'\n                      ,\'count_mentions\',\'count_urls\',\'count_words\']\n    \nfeatures = FeatureUnion([(\'textcounts\', ColumnExtractor(cols=textcountscols))\n                         , (\'pipe\', Pipeline([(\'cleantext\', ColumnExtractor(cols=\'clean_text\'))\n                                              , (\'vect\', CountVectorizer(max_df=0.25, min_df=1, ngram_range=(1,1)))]))]\n                       , n_jobs=-1)\n\npipeline = Pipeline([\n    (\'features\', features)\n    , (\'clf\', MultinomialNB(alpha=0.25))\n])\n\nbest_model = pipeline.fit(df_model.drop(\'sentiment\', axis=1), df_model.sentiment)'

In [14]:
#df_model_pos = pd.read_pickle('../data/df_model_pos_fr.p')
#best_model.predict(df_model_pos).tolist()

In [15]:
#df_model_neg = pd.read_pickle('../data/df_model_neg_fr.p')
#best_model.predict(df_model_neg).tolist()