In [1]:
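"""Environment setup (disabled): run these lines in a cell to install the third-party dependencies.
time, pprint, collections and warnings are part of the Python standard library and need no install;
the PyPI distribution that provides the `sklearn` import name is called scikit-learn.

%pip install numpy
%pip install pandas
%pip install matplotlib
%pip install seaborn
%pip install scikit-learn
"""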



In [3]:
import numpy as np 
import pandas as pd 
# Never truncate long text columns when a DataFrame is displayed.
# None is the supported "no limit" value; -1 has been deprecated since pandas 1.0
# and is only kept as a fallback for old pandas versions that do not accept None.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
import os
from time import time
from pprint import pprint
import collections

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")
sns.set(font_scale=1.3)

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
try:
    # joblib is a standalone package; the copy vendored in sklearn.externals
    # has been removed from recent scikit-learn releases.
    import joblib
except ImportError:
    from sklearn.externals import joblib

import warnings
warnings.filterwarnings('ignore')

np.random.seed(37)

In [5]:
# Load the preprocessed dataset used for modelling.
# Pickle files can execute arbitrary code when loaded, so only read files from a trusted source.
df_model = pd.read_pickle(
    '../data/final/NotIot_clean_EN_news_lexic.p'
)

In [6]:
# Preview the first rows to check the loaded columns before modelling.
df_model.head()

Unnamed: 0,count_words,count_capital_words,count_excl_quest_marks,sentiment,origin_text,clean_text,lex_anger,lex_sadness,lex_joy,lex_fear
0,8,0,0,joy,Test to predict breast cancer relapse is approved,test predict breast cancer relapse approved,0.096167,0.234167,0.081,0.224
1,8,0,0,sadness,"Two Hussein allies are hanged, Iraqi official says",two hussein allies hanged iraqi official says,0.0,0.0,0.0,0.0
2,5,1,0,joy,Sights and sounds from CES,sights sounds ces,0.0,0.0,0.0,0.0
3,6,0,0,joy,Schuey sees Ferrari unveil new car,schuey sees ferrari unveil new car,0.0,0.0,0.0,0.0
4,8,0,0,fear,Closings and cancellations top advice on flu outbreak,closings cancellations top advice flu outbreak,0.0,0.073,0.0,0.1745


In [13]:
# Hold out 10% of the rows as a test set; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_model.drop('sentiment', axis=1),  # feature columns: everything except the label
    df_model.sentiment,                  # target: the 'sentiment' label column
    test_size=0.1,
    random_state=37,
)

In [14]:
class ColumnExtractor(TransformerMixin, BaseEstimator):
    """Stateless transformer that selects column(s) of its input by label.

    Parameters
    ----------
    cols : label or list of labels
        Used as-is to index the input in ``transform`` (``X[cols]``).
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return ``X`` indexed by the configured ``cols``."""
        selected = X[self.cols]
        return selected

In [21]:
def _labels_or_global(labels, global_name):
    """Return `labels`, or the module-level variable `global_name` when `labels` is None."""
    if labels is not None:
        return labels
    try:
        return globals()[global_name]
    except KeyError:
        raise NameError(
            "name %r is not defined; pass it to grid_vect() explicitly" % global_name
        ) from None


def grid_vect(clf, parameters_clf, X_train, X_test, parameters_text=None, vect=None,
              y_train=None, y_test=None):
    """Grid-search `clf` on text-count, lexicon and vectorized-text features and print a report.

    A FeatureUnion combines three feature groups: the text-count columns, the
    lexicon score columns, and the 'clean_text' column passed through `vect`.
    The union feeds `clf` inside a Pipeline, which is tuned with a 5-fold
    GridSearchCV. The search progress, best parameters, test-set score,
    classification report and all per-candidate CV results are printed.

    Parameters
    ----------
    clf : estimator
        Classifier used as the final 'clf' step of the pipeline.
    parameters_clf : dict
        Grid for the classifier; keys are prefixed with 'clf__'.
    X_train, X_test : indexable by column label
        Must provide the count columns, the lexicon columns and 'clean_text'.
    parameters_text : dict, optional
        Grid for the vectorizer; keys are prefixed with 'features__pipe__vect__'.
    vect : transformer
        Text vectorizer applied to 'clean_text'. Required despite the None default.
    y_train, y_test : array-like, optional
        Labels matching X_train / X_test. When omitted, the module-level
        variables `y_train` / `y_test` are used, which keeps calls that rely
        on the notebook's global state working.

    Returns
    -------
    GridSearchCV
        The fitted grid-search object.

    Raises
    ------
    ValueError
        If `vect` is None.
    NameError
        If labels are neither passed nor defined at module level.
    """
    # Fail early with a clear message: without a vectorizer the raw text column
    # cannot be combined with the numeric feature blocks.
    if vect is None:
        raise ValueError("grid_vect() requires a text vectorizer: pass vect=<vectorizer instance>")

    # Prefer explicit labels; fall back to the notebook globals for older call sites.
    y_train = _labels_or_global(y_train, 'y_train')
    y_test = _labels_or_global(y_test, 'y_test')

    textcountscols = ['count_capital_words', 'count_excl_quest_marks', 'count_words']
    lexiccols = ['lex_anger', 'lex_joy', 'lex_sadness', 'lex_fear']

    text_pipe = Pipeline([
        ('cleantext', ColumnExtractor(cols='clean_text')),
        ('vect', vect),
    ])
    features = FeatureUnion(
        [
            ('textcounts', ColumnExtractor(cols=textcountscols)),
            ('lexiccols', ColumnExtractor(cols=lexiccols)),
            ('pipe', text_pipe),
        ],
        n_jobs=-1,
    )

    pipeline = Pipeline([
        ('features', features),
        ('clf', clf),
    ])

    # Join the parameter dictionaries together
    parameters = dict()
    if parameters_text:
        parameters.update(parameters_text)
    parameters.update(parameters_clf)

    # Initiate GridSearchCV with the parameters and the pipeline
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=5)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)

    t0 = time()
    grid_search.fit(X_train, y_train)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best CV score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_estimator = grid_search.best_estimator_
    best_parameters = best_estimator.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    # Evaluate the refitted best pipeline on the held-out test data.
    print("Test score with best_estimator_: %0.3f" % best_estimator.score(X_test, y_test))
    print("\n")
    print("Classification Report Test Data")
    print(classification_report(y_test, best_estimator.predict(X_test)))

    print("all results")
    cv_results = grid_search.cv_results_
    means = cv_results['mean_test_score']
    stds = cv_results['std_test_score']
    params = cv_results['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))

    return grid_search

In [22]:
# Parameter grid settings for the vectorizers (Count and TFIDF)
# Grid for the text vectorizer (shared by the Count and TF-IDF runs).
# Keys address the 'vect' step nested in the 'pipe' branch of the 'features' union.
parameters_vect = dict(
    features__pipe__vect__max_df=(0.25, 0.5, 0.75),
    features__pipe__vect__ngram_range=((1, 1), (1, 2)),
    features__pipe__vect__min_df=(1, 2),
)

# Grid for the SVM classifier (the 'clf' step of the pipeline).
# gamma only affects the 'rbf' kernel here; the 'linear' kernel ignores it,
# so every linear candidate is evaluated once per gamma value.
parameters_svm = dict(
    clf__kernel=('linear', 'rbf'),
    clf__C=(1, 0.25, 0.5, 0.75),
    clf__gamma=(1, 2, 3),
)

In [23]:
# NOTE(review): SVC is imported here rather than with the other imports at the top of the notebook.
from sklearn.svm import SVC
# Unfitted SVC with default settings; passed to grid_vect as the classifier in both grid searches.
svm = SVC()
# List the SVC parameter names; in the pipeline grid they are addressed as 'clf__<name>'.
svm.get_params().keys()

dict_keys(['C', 'cache_size', 'class_weight', 'coef0', 'decision_function_shape', 'degree', 'gamma', 'kernel', 'max_iter', 'probability', 'random_state', 'shrinking', 'tol', 'verbose'])

In [24]:
# Count vectorizer with default settings; passed to grid_vect as `vect` together with parameters_vect.
countvect = CountVectorizer()

In [25]:
import sys
#sys.stdout = open("output_SVMModel_sentimentEN_features_count.txt", "w")

In [26]:
# SVM (SVC) grid search on CountVectorizer features.
# NOTE(review): the "mnb" in the variable name suggests MultinomialNB, but the classifier passed is `svm`.
best_mnb_countvect = grid_vect(svm, parameters_svm, X_train, X_test, parameters_text=parameters_vect, vect=countvect)

Performing grid search...
pipeline: ['features', 'clf']
parameters:
{'clf__C': (1, 0.25, 0.5, 0.75),
 'clf__gamma': (1, 2, 3),
 'clf__kernel': ('linear', 'rbf'),
 'features__pipe__vect__max_df': (0.25, 0.5, 0.75),
 'features__pipe__vect__min_df': (1, 2),
 'features__pipe__vect__ngram_range': ((1, 1), (1, 2))}
Fitting 5 folds for each of 288 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   21.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   54.8s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed:  5.5min finished


done in 333.628s

Best CV score: 0.529
Best parameters set:
	clf__C: 0.25
	clf__gamma: 1
	clf__kernel: 'linear'
	features__pipe__vect__max_df: 0.25
	features__pipe__vect__min_df: 1
	features__pipe__vect__ngram_range: (1, 1)
Test score with best_estimator_: 0.455


Classification Report Test Data
              precision    recall  f1-score   support

       anger       1.00      0.07      0.13        14
        fear       0.44      0.47      0.45        15
         joy       0.47      0.83      0.60        35
     sadness       0.44      0.21      0.29        19
    surprise       0.36      0.25      0.30        16

   micro avg       0.45      0.45      0.45        99
   macro avg       0.54      0.37      0.35        99
weighted avg       0.52      0.45      0.40        99

all results
0.518059 (0.017089) with: {'clf__C': 1, 'clf__gamma': 1, 'clf__kernel': 'linear', 'features__pipe__vect__max_df': 0.25, 'features__pipe__vect__min_df': 1, 'features__pipe__vect__ngram_range': (1, 1)}
0.

In [27]:
# TF-IDF vectorizer with default settings; passed to grid_vect as `vect` together with parameters_vect.
tfidfvect = TfidfVectorizer()

In [28]:
#sys.stdout = open("output_SVMModel_sentimentEN_features_tfidf.txt", "w")

In [29]:
# SVM (SVC) grid search on TF-IDF features, using the same grids as the count-vectorizer run.
# NOTE(review): the "mnb" in the variable name suggests MultinomialNB, but the classifier passed is `svm`.
best_mnb_tfidf = grid_vect(svm, parameters_svm, X_train, X_test, parameters_text=parameters_vect, vect=tfidfvect)

Performing grid search...
pipeline: ['features', 'clf']
parameters:
{'clf__C': (1, 0.25, 0.5, 0.75),
 'clf__gamma': (1, 2, 3),
 'clf__kernel': ('linear', 'rbf'),
 'features__pipe__vect__max_df': (0.25, 0.5, 0.75),
 'features__pipe__vect__min_df': (1, 2),
 'features__pipe__vect__ngram_range': ((1, 1), (1, 2))}
Fitting 5 folds for each of 288 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   46.7s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed:  5.3min finished


done in 319.465s

Best CV score: 0.547
Best parameters set:
	clf__C: 1
	clf__gamma: 1
	clf__kernel: 'linear'
	features__pipe__vect__max_df: 0.25
	features__pipe__vect__min_df: 1
	features__pipe__vect__ngram_range: (1, 1)
Test score with best_estimator_: 0.475


Classification Report Test Data
              precision    recall  f1-score   support

       anger       1.00      0.07      0.13        14
        fear       0.44      0.53      0.48        15
         joy       0.50      0.83      0.62        35
     sadness       0.46      0.32      0.37        19
    surprise       0.33      0.19      0.24        16

   micro avg       0.47      0.47      0.47        99
   macro avg       0.55      0.39      0.37        99
weighted avg       0.53      0.47      0.42        99

all results
0.547404 (0.020494) with: {'clf__C': 1, 'clf__gamma': 1, 'clf__kernel': 'linear', 'features__pipe__vect__max_df': 0.25, 'features__pipe__vect__min_df': 1, 'features__pipe__vect__ngram_range': (1, 1)}
0.542

In [13]:
#max_df: 
#min_df: 
#ngram_range: 
#clf__C :
#textcountscols = ['count_capital_words','count_emojis','count_excl_quest_marks','count_hashtags'
#                      ,'count_mentions','count_urls','count_words']
#
#features = FeatureUnion([('textcounts', ColumnExtractor(cols=textcountscols))
#                         , ('pipe', Pipeline([('cleantext', ColumnExtractor(cols='clean_text'))
#                                              , ('vect', CountVectorizer(max_df=0.25, min_df=1, ngram_range=(1,2)))]))]
#                       , n_jobs=-1)
#
#pipeline = Pipeline([
#    ('features', features)
#    , ('clf', SVC(C=0.75))
#])

#best_model = pipeline.fit(df_model.drop('sentiment', axis=1), df_model.sentiment)

In [1]:
# Export this notebook to a plain Python script (written next to the notebook as a .py file).
!jupyter nbconvert --to script SVMModel_sentimentEN_features.ipynb

[NbConvertApp] Converting notebook SVMModel_sentimentEN_features.ipynb to script
[NbConvertApp] Writing 5772 bytes to SVMModel_sentimentEN_features.py
