In [1]:
# Environment setup (kept disabled; enable once on a fresh environment).
# Only third-party packages need installing: time, pprint, collections and
# warnings ship with the Python standard library, and the PyPI distribution
# that provides the `sklearn` import is named scikit-learn.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# %pip install numpy pandas matplotlib seaborn scikit-learn



In [2]:
# --- Standard library ---
import collections
import os
import warnings
from pprint import pprint
from time import time

# --- Third-party ---
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone package and fall back only on old environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# --- Display / plotting configuration ---
# Show full cell contents instead of truncating long text columns.
# `None` is the supported spelling; `-1` has been deprecated since pandas 1.0,
# but releases before 1.0 only accept an integer here.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

sns.set(style="darkgrid", font_scale=1.3)

# NOTE(review): this silences every warning, including any convergence
# warnings emitted by the estimators fitted below.
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG so stochastic steps are repeatable.
np.random.seed(37)

In [4]:
# Load the modelling DataFrame from a pickle file; the path is relative to the
# notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code — only load this file if
# it comes from a trusted source.
df_model = pd.read_pickle('../../data/final/NotIot_clean_FR_tweets.p')


In [5]:
# Number of rows available for modelling.
df_model.shape[0]

4430

In [6]:
# Hold out 10% of the rows for testing. The target is the 'sentiment' column
# and every other column stays in the feature frame. The fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_model.drop(columns='sentiment'),
    df_model['sentiment'],
    test_size=0.1,
    random_state=37,
)

In [7]:
class ColumnExtractor(TransformerMixin, BaseEstimator):
    """Stateless transformer that selects ``cols`` from its input via ``X[cols]``.

    ``cols`` is stored untouched and used directly as the indexing key, so
    whatever ``X[cols]`` yields is what the next pipeline step receives.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return the instance unchanged."""
        return self

    def transform(self, X, **transform_params):
        """Return the ``cols`` selection of ``X``."""
        selection = X[self.cols]
        return selection

In [8]:
def grid_vect(clf, parameters_clf, X_train, X_test, parameters_text=None, vect=None,
              y_train=None, y_test=None):
    """Grid-search a "vectorize clean_text -> classify" pipeline and print a report.

    The pipeline extracts the ``clean_text`` column with ``ColumnExtractor``,
    feeds it to ``vect`` and passes the result to ``clf``. Every combination of
    ``parameters_text`` and ``parameters_clf`` is evaluated with 5-fold
    cross-validation on the training data, then the best estimator is scored
    on the test data.

    Parameters
    ----------
    clf : estimator
        Classifier used as the final ``'clf'`` pipeline step.
    parameters_clf : dict
        Grid for the classifier; keys carry the ``clf__`` prefix.
    X_train, X_test : indexable by ``'clean_text'``
        Training and test features.
    parameters_text : dict, optional
        Grid for the vectorizer; keys carry the ``features__pipe__vect__`` prefix.
    vect : estimator, optional
        Text vectorizer used as the ``'vect'`` step. The ``None`` default is
        kept for backward compatibility; callers are expected to supply one.
    y_train, y_test : array-like, optional
        Training and test labels. When omitted, the module-level variables of
        the same name are used, which is how earlier calls of this function
        obtained them.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    # Backward compatibility: earlier callers did not pass labels and relied on
    # notebook-level ``y_train`` / ``y_test``. Passing them explicitly is preferred.
    if y_train is None:
        y_train = globals()['y_train']
    if y_test is None:
        y_test = globals()['y_test']

    # Single-branch feature union: pick the clean_text column, then vectorize it.
    text_branch = Pipeline([
        ('cleantext', ColumnExtractor(cols='clean_text')),
        ('vect', vect),
    ])
    features = FeatureUnion([('pipe', text_branch)], n_jobs=-1)

    pipeline = Pipeline([
        ('features', features),
        ('clf', clf),
    ])

    # Join the parameter dictionaries together (classifier entries win on clashes).
    parameters = dict()
    if parameters_text:
        parameters.update(parameters_text)
    parameters.update(parameters_clf)

    # Initiate GridSearchCV with the merged parameters and the pipeline.
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=5)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)

    t0 = time()
    grid_search.fit(X_train, y_train)
    print("done in %0.3fs" % (time() - t0))
    print()

    best_model = grid_search.best_estimator_

    print("Best CV score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = best_model.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    print("Test score with best_estimator_: %0.3f" % best_model.score(X_test, y_test))
    print("\n")
    print("Classification Report Test Data")
    print(classification_report(y_test, best_model.predict(X_test)))

    # Mean / standard deviation of the CV score for every candidate tried.
    print("all results")
    cv_results = grid_search.cv_results_
    means = cv_results['mean_test_score']
    stds = cv_results['std_test_score']
    params = cv_results['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))

    return grid_search

In [9]:
# Search space for the text vectorizer step (shared by the Count and TF-IDF runs).
# Key names spell out the nested step path: features -> pipe -> vect.
parameters_vect = dict(
    features__pipe__vect__max_df=(0.25, 0.5, 0.75),
    features__pipe__vect__ngram_range=((1, 1), (1, 2)),
    features__pipe__vect__min_df=(1, 2),
)

# Search space for the SVM classifier step ('clf').
# NOTE(review): gamma is crossed with both kernels here; scikit-learn documents
# gamma as a coefficient for the 'rbf', 'poly' and 'sigmoid' kernels, so the
# linear-kernel candidates are likely repeated once per gamma value.
parameters_svm = dict(
    clf__kernel=('linear', 'rbf'),
    clf__C=(1, 0.25, 0.5, 0.75),
    clf__gamma=(1, 2, 3),
)

In [14]:
# Base SVM estimator handed to the grid search; C, kernel and gamma are
# overridden per candidate through the 'clf__' grid.
# NOTE(review): max_iter=100 caps the solver's iterations, so fits may stop
# before converging, and warnings are globally suppressed in this notebook —
# confirm the cap is intentional.
from sklearn.svm import SVC
svm = SVC(max_iter=100)
# List the parameter names that can be targeted from a parameter grid.
svm.get_params().keys()

dict_keys(['C', 'cache_size', 'class_weight', 'coef0', 'decision_function_shape', 'degree', 'gamma', 'kernel', 'max_iter', 'probability', 'random_state', 'shrinking', 'tol', 'verbose'])

In [15]:
# Bag-of-words vectorizer with default settings; max_df, min_df and
# ngram_range are overridden per candidate by the grid in parameters_vect.
countvect = CountVectorizer()

In [16]:
import sys
# Optional (disabled): uncomment to send everything printed by the following
# cells to a text file instead of the notebook output.
# NOTE(review): as written, the file handle is never closed and sys.stdout is
# never restored.
#sys.stdout = open("output_SVMModel_sentimentFR_BOW_count.txt", "w")

In [17]:
# SVM (SVC) on CountVectorizer features. The "mnb" in the variable name is a
# leftover; the estimator passed in is `svm`, not MultinomialNB.
best_mnb_countvect = grid_vect(svm, parameters_svm, X_train, X_test, parameters_text=parameters_vect, vect=countvect)

Performing grid search...
pipeline: ['features', 'clf']
parameters:
{'clf__C': (1, 0.25, 0.5, 0.75),
 'clf__gamma': (1, 2, 3),
 'clf__kernel': ('linear', 'rbf'),
 'features__pipe__vect__max_df': (0.25, 0.5, 0.75),
 'features__pipe__vect__min_df': (1, 2),
 'features__pipe__vect__ngram_range': ((1, 1), (1, 2))}
Fitting 5 folds for each of 288 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   47.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 13.1min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 29.1min
[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed: 36.2min finished


done in 2180.909s

Best CV score: 0.570
Best parameters set:
	clf__C: 0.5
	clf__gamma: 1
	clf__kernel: 'linear'
	features__pipe__vect__max_df: 0.25
	features__pipe__vect__min_df: 1
	features__pipe__vect__ngram_range: (1, 2)
Test score with best_estimator_: 0.587


Classification Report Test Data
                precision    recall  f1-score   support

        ACCORD       0.64      0.36      0.46        25
         AMOUR       0.00      0.00      0.00         1
    APAISEMENT       0.00      0.00      0.00         1
        COLERE       0.65      0.44      0.53        34
     DEPLAISIR       0.50      0.20      0.29         5
   DERANGEMENT       1.00      0.50      0.67         2
     DESACCORD       0.48      0.44      0.46        25
DEVALORISATION       0.18      0.04      0.06        54
         ENNUI       0.00      0.00      0.00         1
        MEPRIS       0.15      0.13      0.14        23
          PEUR       0.59      0.53      0.56        38
       PLAISIR       1.00     

In [18]:
# TF-IDF vectorizer with default settings; max_df, min_df and ngram_range are
# overridden per candidate by the grid in parameters_vect.
tfidfvect = TfidfVectorizer()

In [19]:
#sys.stdout = open("output_SVMModel_sentimentFR_BOW_tfidf.txt", "w")

In [20]:
# SVM (SVC) on TF-IDF features, using the same grids as the count-based run.
# The "mnb" in the variable name is a leftover; the estimator passed in is `svm`.
best_mnb_tfidf = grid_vect(svm, parameters_svm, X_train, X_test, parameters_text=parameters_vect, vect=tfidfvect)

Performing grid search...
pipeline: ['features', 'clf']
parameters:
{'clf__C': (1, 0.25, 0.5, 0.75),
 'clf__gamma': (1, 2, 3),
 'clf__kernel': ('linear', 'rbf'),
 'features__pipe__vect__max_df': (0.25, 0.5, 0.75),
 'features__pipe__vect__min_df': (1, 2),
 'features__pipe__vect__ngram_range': ((1, 1), (1, 2))}
Fitting 5 folds for each of 288 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 17.6min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 30.4min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 47.6min
[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed: 55.0min finished


done in 3305.496s

Best CV score: 0.550
Best parameters set:
	clf__C: 0.25
	clf__gamma: 2
	clf__kernel: 'rbf'
	features__pipe__vect__max_df: 0.25
	features__pipe__vect__min_df: 2
	features__pipe__vect__ngram_range: (1, 1)
Test score with best_estimator_: 0.562


Classification Report Test Data
                precision    recall  f1-score   support

        ACCORD       0.35      0.32      0.33        25
         AMOUR       0.00      0.00      0.00         1
    APAISEMENT       0.00      0.00      0.00         1
        COLERE       0.56      0.59      0.57        34
     DEPLAISIR       0.00      0.00      0.00         5
   DERANGEMENT       0.00      0.00      0.00         2
     DESACCORD       0.47      0.56      0.51        25
DEVALORISATION       0.23      0.20      0.22        54
         ENNUI       0.00      0.00      0.00         1
        MEPRIS       0.30      0.52      0.38        23
          PEUR       0.64      0.66      0.65        38
       PLAISIR       0.00      0

In [13]:
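# Disabled template: build one pipeline with fixed hyper-parameters and fit it
# on the full data set. Fill in the values chosen by the grid search first:
#   max_df:
#   min_df:
#   ngram_range:
#   clf__C:
#textcountscols = ['count_capital_words','count_emojis','count_excl_quest_marks','count_hashtags'
#                      ,'count_mentions','count_urls','count_words']
#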
#features = FeatureUnion([('textcounts', ColumnExtractor(cols=textcountscols))
#                         , ('pipe', Pipeline([('cleantext', ColumnExtractor(cols='clean_text'))
#                                              , ('vect', CountVectorizer(max_df=0.25, min_df=1, ngram_range=(1,2)))]))]
#                       , n_jobs=-1)
#
#pipeline = Pipeline([
#    ('features', features)
#    , ('clf', SVC(C=0.75))
#])

#best_model = pipeline.fit(df_model.drop('sentiment', axis=1), df_model.sentiment)

In [9]:
#!jupyter nbconvert --to script SVMModel_sentimentFR_BOW.ipynb

[NbConvertApp] Converting notebook SVMModel_sentimentFR_BOW.ipynb to script
[NbConvertApp] Writing 5804 bytes to SVMModel_sentimentFR_BOW.py
