In [1]:
import os
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
import numpy as np
import urllib.request
import pandas as pd
from gensim.test.utils import common_texts, get_tmpfile
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from tempfile import gettempdir
import zipfile
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, brier_score_loss
import matplotlib.pyplot as plt
import seaborn as sns

# Print the TensorFlow version in use so the run environment is recorded with the notebook.
print(tf.__version__)

  return f(*args, **kwds)
Using TensorFlow backend.


1.10.0


In [2]:
# TF Hub handle of the Universal Sentence Encoder variant to load.
# The trailing #@param annotation lists the selectable handles.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]

In [3]:
# Import the Universal Sentence Encoder's TF Hub module from `module_url`.
# The resulting `tf_use_embed` object is the one called by the embedding code below.
tf_use_embed = hub.Module(module_url)

INFO:tensorflow:Using /var/folders/9g/y473n1q933783ynddc4h9gwm0000gn/T/tfhub_modules to cache modules.


In [4]:
sentences = [
    "I love you so much",
    "I hate that movie, there are no plotline or even interesting scenes",
]

# Only log TensorFlow messages at ERROR level from here on.
tf.logging.set_verbosity(tf.logging.ERROR)

with tf.Session() as session:
    # Run the variable and table initialisers first, then evaluate the embeddings.
    init_ops = [tf.global_variables_initializer(), tf.tables_initializer()]
    session.run(init_ops)
    embedded_sentences = session.run(tf_use_embed(sentences))

# Show each sentence with its embedding length and its first three components.
for sentence, vector in zip(sentences, np.array(embedded_sentences).tolist()):
    preview = ", ".join(map(str, vector[:3]))
    print("Sentence: {}".format(sentence))
    print("Embedding size: {}".format(len(vector)))
    print("Embedding: [{}, ...]\n".format(preview))

Sentence: I love you so much
Embedding size: 512
Embedding: [-0.03638804703950882, -0.07312508672475815, 0.007526349741965532, ...]

Sentence: I hate that movie, there are no plotline or even interesting scenes
Embedding size: 512
Embedding: [0.016185158863663673, 0.059069760143756866, -0.005612711422145367, ...]



In [5]:
def embed_use(sentences):
    """Return the Universal Sentence Encoder embeddings for ``sentences``.

    Opens a fresh ``tf.Session``, runs the global-variable and table
    initialisers, then evaluates the module-level ``tf_use_embed`` on the
    given sentences.

    Args:
        sentences: the texts to embed; passed straight to ``tf_use_embed``.

    Returns:
        Whatever ``session.run`` yields for the embedding op.
    """
    with tf.Session() as sess:
        init_ops = [tf.global_variables_initializer(), tf.tables_initializer()]
        sess.run(init_ops)
        embedding_op = tf_use_embed(sentences)
        return sess.run(embedding_op)

In [6]:
from pathlib import Path

class EmoIntDataset(object):
    """Download, cache and load the EmoInt per-emotion train/test files.

    Each emotion in ``EMOTION_CLASSES`` has one tab-separated train file and
    one test file under ``BASE_URL``. Downloads are cached as CSV in
    ``./data`` and combined into one train and one test DataFrame holding the
    columns ``text`` and ``emotion``.
    """

    BASE_URL = "http://saifmohammad.com/WebDocs/"
    TRAIN_URI = "EmoInt%20Train%20Data/{}-ratings-0to1.train.txt"
    TEST_URI = "EmoInt%20Test%20Gold%20Data/{}-ratings-0to1.test.gold.txt"
    EMOTION_CLASSES = ["anger", "fear", "joy", "sadness"]
    DATASET_NAME = "EmoInt"

    # Cut-off used when ``load_data(set_threshold=True)``: only rows whose
    # ``emotion_level`` is strictly greater than this value are kept.
    THRESHOLD = 0.33

    # Column layout assigned to the tab-separated source files.
    _COLUMNS = ["id", "text", "emotion", "emotion_level"]

    def __load_data_per_class(self, category, class_name, url, threshold=0):
        """Load one (category, emotion) file, from the CSV cache if present.

        Args:
            category: "train" or "test"; only used to build the cache file name.
            class_name: emotion name; only used to build the cache file name.
            url: location of the tab-separated source file.
            threshold: rows with ``emotion_level <= threshold`` are dropped.

        Returns:
            DataFrame with the columns ``text`` and ``emotion``.
        """
        filename = "./data/{}_{}_{}.csv".format(category, self.DATASET_NAME, class_name)
        cache_path = Path(filename)
        if cache_path.is_file():
            df = pd.read_csv(filename)
        else:
            # Close the HTTP response as soon as the payload has been read.
            with urllib.request.urlopen(url) as resource:
                payload = resource.read().decode('utf-8')
            # Skip blank lines so every parsed row has the same number of fields.
            rows = [
                stripped.split('\t')
                for stripped in (line.strip() for line in payload.splitlines())
                if stripped
            ]
            df = pd.DataFrame(rows, columns=self._COLUMNS)
            df['emotion_level'] = df['emotion_level'].astype(float)
            # The cache holds the UNFILTERED download so that it stays valid
            # for any threshold; the directory is created on first use.
            cache_path.parent.mkdir(parents=True, exist_ok=True)
            df.to_csv(filename, index=False)

        # Filter after loading (not before caching): the cache file name does
        # not encode the threshold, so filtering must happen on every call.
        # NOTE: cache files written by the previous implementation may already
        # be filtered; delete ./data to force a fresh download.
        df = df[df['emotion_level'] > threshold]
        return df[["text", "emotion"]]

    def load_data(self, set_threshold=False):
        """Load and combine the train and test data of every emotion class.

        Args:
            set_threshold: when True, keep only rows whose ``emotion_level``
                exceeds ``THRESHOLD``; otherwise the cut-off is 0.

        Returns:
            ``(train_data, test_data)`` DataFrames with the columns ``text``
            and ``emotion`` (row indices of the per-emotion frames are kept).
            Both are ``None`` if ``EMOTION_CLASSES`` is empty.
        """
        threshold = self.THRESHOLD if set_threshold else 0
        train_frames = []
        test_frames = []

        for emotion in self.EMOTION_CLASSES:
            # load train dataset
            train_frames.append(self.__load_data_per_class(
                "train", emotion, self.BASE_URL + self.TRAIN_URI.format(emotion),
                threshold=threshold))

            # load test dataset
            test_frames.append(self.__load_data_per_class(
                "test", emotion, self.BASE_URL + self.TEST_URI.format(emotion),
                threshold=threshold))

        # One concat per split instead of repeated DataFrame.append calls
        # (append copies on every call and no longer exists in pandas 2.x).
        train_data = pd.concat(train_frames) if train_frames else None
        test_data = pd.concat(test_frames) if test_frames else None

        return train_data, test_data

In [7]:
emo_int_dataset = EmoIntDataset()
# set_threshold=True makes load_data pass EmoIntDataset.THRESHOLD as the emotion_level cut-off.
train_data, test_data = emo_int_dataset.load_data(set_threshold=True)

# Optional validation split (currently disabled):
# val_data = train_data.sample(frac=0.1, random_state=200)
# train_data = train_data.drop(val_data.index)

print(train_data.shape)
# print(val_data.shape)
print(test_data.shape)

(2901, 2)
(2508, 2)


In [8]:
# Assign each emotion name an integer class id in order of first appearance
# in the training data; `labels` keeps the names in that same order.
emotions = train_data.emotion.unique()
dic = {name: class_id for class_id, name in enumerate(emotions)}
labels = list(emotions)
print(dic)
print(labels)

{'anger': 0, 'fear': 1, 'joy': 2, 'sadness': 3}
['anger', 'fear', 'joy', 'sadness']


# Embed with Universal Sentence Encoder

In [9]:
# Embed the train and test texts; each embed_use call opens its own session.
train_embedded_use = embed_use(train_data.text.tolist())
test_embedded_use = embed_use(test_data.text.tolist())

In [10]:
# Translate emotion names into integer class ids via `dic`.
# An emotion missing from `dic` raises KeyError here.
train_labels = np.array(train_data.emotion.apply(lambda x:dic[x]))
test_labels = np.array(test_data.emotion.apply(lambda x:dic[x]))

# SVM Classifier

In [11]:
from sklearn import svm
# RBF-kernel SVM trained on the sentence-encoder embeddings of the training texts.
clfSVM = svm.SVC(C=10000.0, gamma='auto', kernel='rbf')
clfSVM.fit(train_embedded_use, train_labels)

SVC(C=10000.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [12]:
# Predict class ids for the test embeddings with the fitted `clfSVM`.
test_pred_labels = clfSVM.predict(test_embedded_use)

In [13]:
# Confusion matrix (true labels passed first, predictions second); the bare name displays it.
cmSVM = confusion_matrix(test_labels, test_pred_labels)
cmSVM

array([[429, 111,  28,  75],
       [131, 533,  46,  79],
       [ 48,  46, 410,  39],
       [ 80, 133,  40, 280]])

In [14]:
# Per-class precision/recall/F1 report; class ids are shown as the emotion names in `labels`.
reportSVM = classification_report(test_labels, test_pred_labels, target_names=labels)
print(reportSVM)

             precision    recall  f1-score   support

      anger       0.62      0.67      0.64       643
       fear       0.65      0.68      0.66       789
        joy       0.78      0.76      0.77       543
    sadness       0.59      0.53      0.56       533

avg / total       0.66      0.66      0.66      2508



# TfidfVectorizer + SVM Classifier

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over 1- to 3-grams. The vectorizer is fitted on the training
# texts only and then reused, unchanged, to transform the test texts.
tfidf_vectorizer = TfidfVectorizer(max_df=70, min_df=2, ngram_range=(1, 3))
train_tfidf_vectors = tfidf_vectorizer.fit_transform(train_data.text.tolist())
test_tfidf_vectors = tfidf_vectorizer.transform(test_data.text.tolist())

In [16]:
from sklearn import svm
# Same SVC settings as the embedding-based model, trained on the TF-IDF vectors instead.
clf_SVM_Tfidf = svm.SVC(C=10000.0, gamma='auto', kernel='rbf')
clf_SVM_Tfidf.fit(train_tfidf_vectors, train_labels)

SVC(C=10000.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [17]:
# Predict class ids for the TF-IDF test vectors.
test_pred_labels_Tfidf = clf_SVM_Tfidf.predict(test_tfidf_vectors)

In [18]:
# Confusion matrix for the TF-IDF + SVM predictions; the bare name displays it.
cmSVM_Tfidf = confusion_matrix(test_labels, test_pred_labels_Tfidf)
cmSVM_Tfidf

array([[512,  67,  20,  44],
       [ 48, 629,  29,  83],
       [ 21,  47, 455,  20],
       [ 32,  79,  15, 407]])

In [19]:
# Classification report for the TF-IDF + SVM predictions.
# NOTE: reuses the name `reportSVM`, replacing the report of the embedding-based SVM.
reportSVM = classification_report(test_labels, test_pred_labels_Tfidf, target_names=labels)
print(reportSVM)

             precision    recall  f1-score   support

      anger       0.84      0.80      0.82       643
       fear       0.77      0.80      0.78       789
        joy       0.88      0.84      0.86       543
    sadness       0.73      0.76      0.75       533

avg / total       0.80      0.80      0.80      2508



# CountVectorizer + SVM Classifier (plus a Multinomial Naive Bayes fit used for feature inspection)

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Initialize a CountVectorizer object: count_vectorizer
# (word 1- to 3-grams with English stop words removed)
count_vectorizer = CountVectorizer(stop_words="english", analyzer='word', 
                                   ngram_range=(1, 3), max_df=70, min_df=2, max_features=None)

# Fit the vocabulary on the training texts only, then reuse it for the test texts.
train_count_vectors = count_vectorizer.fit_transform(train_data.text.tolist())
test_count_vectors = count_vectorizer.transform(test_data.text.tolist())

# Two models are fitted on the same count vectors: the Naive Bayes model is
# only inspected for its coefficients below, the SVM is the one used to predict.
clf_NB_count = MultinomialNB(alpha=0.1)
clf_SVM_count = svm.SVC(C=10000.0, gamma='auto', kernel='rbf')
clf_NB_count.fit(train_count_vectors, train_labels)
clf_SVM_count.fit(train_count_vectors, train_labels)

SVC(C=10000.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [21]:
# Get the feature names as a numpy array, in vocabulary (column) order.
# Newer scikit-learn releases replaced get_feature_names() with
# get_feature_names_out(), so use whichever one this installation provides.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = np.asarray(count_vectorizer.get_feature_names_out())
else:
    feature_names = np.array(count_vectorizer.get_feature_names())

# Sort the per-feature log-probabilities of the FIRST class (labels[0]).
# feature_log_prob_ is read directly: for this model with more than two classes
# it holds the same values the older `coef_` attribute exposed, and unlike
# `coef_` it is still available in current scikit-learn releases.
sorted_coef_index = clf_NB_count.feature_log_prob_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1]
# so the list returned is in order of largest to smallest
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['सत य_ब ध_य' 'late worst fear' 'late worst' 'late night thoughts'
 'late night' 'late agent let' 'late agent' 'large' 'taken care'
 'taken care want']

Largest Coefs: 
['anger' 'angry' 'rage' 'bitter' 'offended' 'revenge' 'fuming' 'got' 'll'
 'outrage']


In [22]:
# Predict with the count-vector SVM (not the Naive Bayes model) and show the confusion matrix.
test_pred_labels_count = clf_SVM_count.predict(test_count_vectors)

cmSVM_count = confusion_matrix(test_labels, test_pred_labels_count)
cmSVM_count

array([[526,  39,  13,  65],
       [ 31, 634,  20, 104],
       [ 14,  37, 463,  29],
       [ 37,  55,  17, 424]])

In [23]:
# Classification report for the CountVectorizer + SVM predictions.
# NOTE: reuses the name `reportSVM` from the earlier sections.
reportSVM = classification_report(test_labels, test_pred_labels_count, target_names=labels)
print(reportSVM)

             precision    recall  f1-score   support

      anger       0.87      0.82      0.84       643
       fear       0.83      0.80      0.82       789
        joy       0.90      0.85      0.88       543
    sadness       0.68      0.80      0.73       533

avg / total       0.82      0.82      0.82      2508



# 5-fold CV with CountVectorizer + TfidfTransformer + SVM Classifier

In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# Define a pipeline combining a text feature extractor with a simple
# classifier
# NOTE: `CountVectorizer` and `svm` are not imported in this cell; they come
# from the imports of earlier cells.
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', svm.SVC()),
])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
# (3 max_df values x 3 n-gram ranges x 2 use_idf settings = 18 candidates)
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),  # unigrams, up to bigrams, or up to trigrams
    'tfidf__use_idf': (True, False),
    # 'tfidf__norm': ('l1', 'l2'),
    # The classifier settings are fixed to the values used in the earlier SVM sections.
    'clf__C': (10000.0,),
    'clf__gamma': ('auto',),
    'clf__kernel': ('rbf',),
}

# 5-fold cross-validation over the grid, using all available cores (n_jobs=-1).
grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=1)

grid_search.fit(train_data.text.tolist(), train_labels)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:   53.3s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vect__max_df': (0.5, 0.75, 1.0), 'vect__ngram_range': ((1, 1), (1, 2), (1, 3)), 'tfidf__use_idf': (True, False), 'clf__C': (10000.0,), 'clf__gamma': ('auto',), 'clf__kernel': ('rbf',)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [34]:
# Report the best cross-validation score and, for every searched parameter,
# the value held by the best estimator.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
# Only the keys of the search grid are printed, in alphabetical order.
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best score: 0.828
Best parameters set:
	clf__C: 10000.0
	clf__gamma: 'auto'
	clf__kernel: 'rbf'
	tfidf__use_idf: True
	vect__max_df: 0.5
	vect__ngram_range: (1, 1)


In [35]:
# Predict on the raw test texts through the grid search object, then show the confusion matrix.
test_pred_labels_grid = grid_search.predict(test_data.text.tolist())

cmSVM_grid = confusion_matrix(test_labels, test_pred_labels_grid)
cmSVM_grid

array([[498,  68,  19,  58],
       [ 56, 611,  32,  90],
       [ 22,  51, 458,  12],
       [ 50,  83,  16, 384]])

In [36]:
# Classification report for the grid-searched pipeline's test predictions.
reportSVM_grid = classification_report(test_labels, test_pred_labels_grid, target_names=labels)
print(reportSVM_grid)

             precision    recall  f1-score   support

      anger       0.80      0.77      0.78       643
       fear       0.75      0.77      0.76       789
        joy       0.87      0.84      0.86       543
    sadness       0.71      0.72      0.71       533

avg / total       0.78      0.78      0.78      2508



# 5-fold CV with CountVectorizer + TfidfTransformer + SGDClassifier

In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
# CountVectorizer is imported here as well so this cell does not depend on
# the import made in an earlier section.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Fixed seed for SGDClassifier. The classifier shuffles the training data, so
# without a seed the cross-validation scores (and therefore the selected
# parameters) change from run to run.
SGD_RANDOM_STATE = 42

# Define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(random_state=SGD_RANDOM_STATE)),
])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
# (3 max_df values x 2 n-gram ranges x 2 alphas x 2 penalties = 24 candidates)
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    # 'tfidf__use_idf': (True, False),
    # 'tfidf__norm': ('l1', 'l2'),
    'clf__max_iter': (5,),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    # 'clf__max_iter': (10, 50, 80),
}

# 5-fold cross-validation over the grid, using all available cores (n_jobs=-1).
# NOTE: `pipeline`, `parameters` and `grid_search` are the same names used by
# the SVC grid-search section; whichever section ran last owns them.
grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=1)

grid_search.fit(train_data.text.tolist(), train_labels)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    8.3s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vect__max_df': (0.5, 0.75, 1.0), 'vect__ngram_range': ((1, 1), (1, 2)), 'clf__max_iter': (5,), 'clf__alpha': (1e-05, 1e-06), 'clf__penalty': ('l2', 'elasticnet')},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [29]:
# Report the best cross-validation score and, for every searched parameter,
# the value held by the best estimator.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
# Only the keys of the search grid are printed, in alphabetical order.
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best score: 0.832
Best parameters set:
	clf__alpha: 1e-05
	clf__max_iter: 5
	clf__penalty: 'elasticnet'
	vect__max_df: 0.75
	vect__ngram_range: (1, 1)


In [30]:
# Predict on the TRAINING texts themselves, to see how closely the selected
# model fits the data it was trained on.
train_pred_labels_grid = grid_search.predict(train_data.text.tolist())

# NOTE: the name `cmSVM_grid` is shared with the SVC grid-search section.
cmSVM_grid = confusion_matrix(train_labels, train_pred_labels_grid)
cmSVM_grid

array([[720,   2,   1,   9],
       [  0, 909,   0,   6],
       [  1,   0, 635,   0],
       [  6,  11,   1, 600]])

In [31]:
# Predict on the raw test texts through the grid search object, then show the confusion matrix.
test_pred_labels_grid = grid_search.predict(test_data.text.tolist())

# NOTE: overwrites the training-set matrix stored under the same name.
cmSVM_grid = confusion_matrix(test_labels, test_pred_labels_grid)
cmSVM_grid

array([[443,  97,  39,  64],
       [ 32, 643,  38,  76],
       [  5,  45, 471,  22],
       [ 27,  79,  30, 397]])

In [32]:
# Classification report for the grid-searched pipeline's test predictions.
# NOTE: the name `reportSVM_grid` is shared with the SVC grid-search section.
reportSVM_grid = classification_report(test_labels, test_pred_labels_grid, target_names=labels)
print(reportSVM_grid)

             precision    recall  f1-score   support

      anger       0.87      0.69      0.77       643
       fear       0.74      0.81      0.78       789
        joy       0.81      0.87      0.84       543
    sadness       0.71      0.74      0.73       533

avg / total       0.79      0.78      0.78      2508

