In [1]:
import os
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
import numpy as np
import urllib.request
import pandas as pd
from gensim.test.utils import common_texts, get_tmpfile
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from tempfile import gettempdir
import zipfile
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, brier_score_loss
import matplotlib.pyplot as plt
import seaborn as sns

print(tf.__version__)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


1.10.0


In [2]:
# TF Hub handle of the Universal Sentence Encoder variant that is loaded below.
# The trailing `#@param [...]` comment lists the two selectable handles
# (it looks like a Colab form annotation — confirm before removing it).
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]

In [3]:
# Import the Universal Sentence Encoder's TF Hub module.
# The resulting callable is shared by every embedding cell in this notebook;
# per the log output it is downloaded once and cached under /tmp/tfhub_modules.
tf_use_embed = hub.Module(module_url)

INFO:tensorflow:Using /tmp/tfhub_modules to cache modules.
INFO:tensorflow:Downloading TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder-large/3'.
INFO:tensorflow:Downloaded TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder-large/3'.


In [4]:
# Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)

# Two sample sentences used to sanity-check the encoder.
sentences = [
    "I love you so much",
    "I hate that movie, there are no plotline or even interesting scenes",
]

with tf.Session() as session:
    # The hub module's variables and lookup tables are initialised before
    # the embedding op is evaluated.
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    embedded_sentences = session.run(tf_use_embed(sentences))

# Convert to nested Python lists first so every value prints as a built-in float.
for sentence, vector in zip(sentences, np.array(embedded_sentences).tolist()):
    leading_values = ", ".join(str(value) for value in vector[:3])
    print("Sentence: {}".format(sentence))
    print("Embedding size: {}".format(len(vector)))
    print("Embedding: [{}, ...]\n".format(leading_values))

Sentence: I love you so much
Embedding size: 512
Embedding: [-0.036388035863637924, -0.07312507927417755, 0.0075263469479978085, ...]

Sentence: I hate that movie, there are no plotline or even interesting scenes
Embedding size: 512
Embedding: [0.016185179352760315, 0.059069760143756866, -0.005612705834209919, ...]



In [5]:
def embed_use(sentences):
    """Embed ``sentences`` with the notebook-level ``tf_use_embed`` module.

    A fresh ``tf.Session`` is opened for the call, the global-variable and
    table initialisers are run in it, and the evaluated result of
    ``tf_use_embed(sentences)`` is returned.
    """
    with tf.Session() as sess:
        initializers = [tf.global_variables_initializer(), tf.tables_initializer()]
        sess.run(initializers)
        embeddings = sess.run(tf_use_embed(sentences))
    return embeddings

In [6]:
class EmoIntDataset(object):
    """Loader for the EmoInt train and test-gold rating files.

    One tab-separated file per emotion in ``EMOTION_CLASSES`` is fetched from
    ``BASE_URL`` and the per-emotion frames are stacked into a single train
    frame and a single test frame.
    """

    BASE_URL = "http://saifmohammad.com/WebDocs/"
    TRAIN_URI = "EmoInt%20Train%20Data/{}-ratings-0to1.train.txt"
    TEST_URI = "EmoInt%20Test%20Gold%20Data/{}-ratings-0to1.test.gold.txt"
    EMOTION_CLASSES = ["anger", "fear", "joy", "sadness"]

    # Exclusive lower bound on emotion_level applied by load_data(set_threshold=True).
    THRESHOLD = 0.33

    def __load_data_per_class(self, url, threshold=0):
        """Download one tab-separated rating file and return its filtered rows.

        Args:
            url: Address of a file whose lines hold the four tab-separated
                fields ``id``, ``text``, ``emotion`` and ``emotion_level``.
            threshold: Only rows with ``emotion_level`` strictly greater than
                this value are kept.

        Returns:
            A DataFrame with the columns ``text`` and ``emotion``.
        """
        # Use the response as a context manager so the connection is closed
        # even when reading or decoding fails.
        with urllib.request.urlopen(url) as resource:
            payload = resource.read().decode('utf-8')

        # Skip blank lines: they would produce one-field rows that do not fit
        # the four-column frame.
        rows = [
            stripped.split('\t')
            for stripped in (line.strip() for line in payload.splitlines())
            if stripped
        ]
        df = pd.DataFrame(rows, columns=["id", "text", "emotion", "emotion_level"])
        df['emotion_level'] = df['emotion_level'].astype(float)
        return df.loc[df['emotion_level'] > threshold, ["text", "emotion"]]

    def load_data(self, set_threshold=False):
        """Load and stack the train and test data of every emotion class.

        Args:
            set_threshold: When true, keep only rows whose emotion level
                exceeds ``THRESHOLD``; otherwise keep rows with a level above 0.

        Returns:
            A ``(train_data, test_data)`` tuple of DataFrames with the columns
            ``text`` and ``emotion``. Row indices of the individual files are
            preserved, so index values repeat across emotions. Both entries
            are ``None`` when ``EMOTION_CLASSES`` is empty.
        """
        threshold = self.THRESHOLD if set_threshold else 0
        train_frames = []
        test_frames = []

        for emotion in self.EMOTION_CLASSES:
            # load train dataset
            train_frames.append(self.__load_data_per_class(
                self.BASE_URL + self.TRAIN_URI.format(emotion), threshold=threshold))

            # load test dataset
            test_frames.append(self.__load_data_per_class(
                self.BASE_URL + self.TEST_URI.format(emotion), threshold=threshold))

        # Concatenate once instead of growing a frame inside the loop.
        train_data = pd.concat(train_frames) if train_frames else None
        test_data = pd.concat(test_frames) if test_frames else None
        return train_data, test_data

In [7]:
# Download the per-emotion train and test files; set_threshold=True keeps only
# rows whose emotion level exceeds EmoIntDataset.THRESHOLD.
emo_int_dataset = EmoIntDataset()
train_data, test_data = emo_int_dataset.load_data(set_threshold=True)

# Optional validation split (disabled).
# NOTE(review): the stacked frame appears to keep each file's own row index, so
# dropping by index may remove more rows than were sampled — confirm before enabling.
# valid_data=train_data.sample(frac=0.1,random_state=200)
# train_data=train_data.drop(valid_data.index)

print(train_data.shape)
# print(valid_data.shape)
print(test_data.shape)

(2901, 2)
(2508, 2)


In [8]:
# Emotion names in order of first appearance in the training data.
emotions = train_data.emotion.unique()
# labels[k] is the emotion name of class id k; dic is the reverse lookup.
labels = list(emotions)
dic = {name: class_id for class_id, name in enumerate(labels)}
print(dic)
print(labels)

{'anger': 0, 'fear': 1, 'joy': 2, 'sadness': 3}
['anger', 'fear', 'joy', 'sadness']


# Embed with Universal Sentence Encoder

In [9]:
# Encode the train and test texts with the Universal Sentence Encoder.
# Each embed_use call opens its own session and re-runs the initialisers.
train_embedded_use = embed_use(train_data.text.tolist())
# valid_embedded_use = embed_use(valid_data.text.tolist())
test_embedded_use = embed_use(test_data.text.tolist())

In [10]:
def _encode_emotions(frame):
    """Return the class ids (looked up in ``dic``) of ``frame.emotion`` as an array."""
    return np.array(frame.emotion.apply(lambda name: dic[name]))


train_labels = _encode_emotions(train_data)
# valid_labels = _encode_emotions(valid_data)
test_labels = _encode_emotions(test_data)

# SVM Classifier

In [11]:
from sklearn import svm
# SVC trained on the USE sentence embeddings. Only gamma and
# decision_function_shape are set here; the repr shown below the cell
# reports the remaining settings (kernel='rbf', C=1.0).
clfSVM = svm.SVC(gamma='scale', decision_function_shape='ovo')
# fit() is the last expression so the fitted estimator's repr is displayed.
clfSVM.fit(train_embedded_use, train_labels)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovo', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [25]:
# Predict class ids for the test embeddings with the USE + SVM model.
test_pred_labels = clfSVM.predict(test_embedded_use)

In [26]:
# Confusion matrix of the USE + SVM model: true labels are passed first, and
# the row sums in the displayed array match the per-class support in the report.
cmSVM = confusion_matrix(test_labels, test_pred_labels)
cmSVM

array([[422, 157,  37,  27],
       [125, 588,  33,  43],
       [ 28,  57, 436,  22],
       [ 80, 170,  43, 240]])

In [27]:
# Per-class precision / recall / F1 of the USE + SVM model; target_names
# replaces the integer class ids with the emotion names from `labels`.
reportSVM = classification_report(test_labels, test_pred_labels, target_names=labels)
print(reportSVM)

              precision    recall  f1-score   support

       anger       0.64      0.66      0.65       643
        fear       0.60      0.75      0.67       789
         joy       0.79      0.80      0.80       543
     sadness       0.72      0.45      0.55       533

   micro avg       0.67      0.67      0.67      2508
   macro avg       0.69      0.66      0.67      2508
weighted avg       0.68      0.67      0.67      2508



# TfidfVectorizer + SVM Classifier

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features of the raw text. The vectorizer is fitted on the training
# text only and then reused unchanged to transform the test text.
# NOTE(review): max_df=0.7 and min_df=3 are hand-set values with no tuning
# shown in this notebook — confirm how they were chosen.
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, min_df=3, use_idf=True) 
train_tfidf_vectors = tfidf_vectorizer.fit_transform(train_data.text.tolist())
test_tfidf_vectors = tfidf_vectorizer.transform(test_data.text.tolist())

In [18]:
from sklearn import svm
# RBF-kernel SVC trained on the TF-IDF vectors.
# NOTE(review): C=10000.0 is a hand-set value with no tuning shown in this
# notebook — confirm how it was chosen.
clf_SVM_Tfidf = svm.SVC(C=10000.0, gamma='auto', kernel='rbf')
# fit() is the last expression so the fitted estimator's repr is displayed.
clf_SVM_Tfidf.fit(train_tfidf_vectors, train_labels)

SVC(C=10000.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [29]:
# Predict class ids for the test TF-IDF vectors with the TF-IDF + SVM model.
test_pred_labels_Tfidf = clf_SVM_Tfidf.predict(test_tfidf_vectors)

In [30]:
# Confusion matrix of the TF-IDF + SVM model: true labels are passed first,
# predicted labels second.
cmSVM_Tfidf = confusion_matrix(test_labels, test_pred_labels_Tfidf)
cmSVM_Tfidf

array([[503,  63,  21,  56],
       [ 88, 562,  40,  99],
       [ 30,  58, 440,  15],
       [ 62,  78,  21, 372]])

In [31]:
# Per-class precision / recall / F1 of the TF-IDF + SVM model.
# Stored under its own name so the USE + SVM report held in `reportSVM`
# is not overwritten and both reports stay available for comparison.
reportSVM_Tfidf = classification_report(test_labels, test_pred_labels_Tfidf, target_names=labels)
print(reportSVM_Tfidf)

              precision    recall  f1-score   support

       anger       0.74      0.78      0.76       643
        fear       0.74      0.71      0.73       789
         joy       0.84      0.81      0.83       543
     sadness       0.69      0.70      0.69       533

   micro avg       0.75      0.75      0.75      2508
   macro avg       0.75      0.75      0.75      2508
weighted avg       0.75      0.75      0.75      2508



# CountVectorizer + TfidfVectorizer + SVM Classifier

In [None]:
# CountVectorizer is not imported in the cells above; import it here so this
# cell does not raise NameError on a fresh kernel.
from sklearn.feature_extraction.text import CountVectorizer

# Initialize a CountVectorizer object: count_vec
# Single-word tokens with English stop words removed; max_df, min_df and
# max_features are set so that they do not restrict the vocabulary.
count_vec = CountVectorizer(stop_words="english", analyzer='word',
                            ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None)