# Tweet Emotion Recognition: Natural Language Processing with TensorFlow

---

Dataset: [Tweet Emotion Dataset](https://github.com/dair-ai/emotion_dataset)

## Environment Setup

1. Installing Hugging Face's nlp package
2. Importing libraries

In [1]:
!pip install nlp

In [2]:
%matplotlib inline

import re
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import nlp
import random
import collections
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.corpus import wordnet
nltk.download("stopwords", quiet = True)
nltk.download("wordnet", quiet = True)
nltk.download("punkt", quiet = True)
lemmatizer = WordNetLemmatizer()
from nltk.tokenize import word_tokenize
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import Sequential
from keras.layers import Embedding
from keras.layers import Bidirectional
from keras.layers import LSTM
from keras.layers import Dense
from keras.callbacks import Callback

## Importing Data

1. Importing the Tweet Emotion dataset
2. Creating train, validation and test sets
3. Extracting tweets and labels from the examples

In [3]:
data = nlp.load_dataset('emotion')

In [4]:
data

In [5]:
train = data['train']
val = data['validation']
test = data['test']

In [6]:
train_tweet, train_label = train['text'], train['label']
val_tweet, val_label = val['text'], val['label']
test_tweet, test_label = test['text'], test['label']

In [7]:
train_tweet[0], train_label[0]

## Exploratory Data Analysis


1.   Initializing Hyperparameters
2.   Simple Data Visualization



In [8]:
# Initialising hyperparameters shared by the tokenizer, the padding step and the model

NUM_WORDS = 10000  # passed as the tokenizer's num_words and the Embedding layer's input_dim
OOV_TOKEN = '<OOV>'  # passed as the tokenizer's oov_token
MAX_LEN = 50  # passed as maxlen to pad_sequences and as input_length to the Embedding layer
TRUNCATING = 'post'  # passed as the truncating argument of pad_sequences
PADDING = 'post'  # passed as the padding argument of pad_sequences

In [9]:
length_of_words = [len(tweet.split(' ')) for tweet in train_tweet]

In [10]:
plt.hist(length_of_words, bins=len(set(length_of_words)))
plt.xlabel('Lengths of Tweets')
plt.ylabel('No. of Tweets')
plt.show()

In [11]:
all_labels = train_label + test_label + val_label

In [12]:
len(all_labels)

In [13]:
classes = set(train_label)
classes

In [14]:
length_labels = dict(collections.Counter(train_label))
length_labels

In [15]:
plt.hist(train_label, bins = 11)
plt.show()

## Data Preprocessing


1.   Removing URL links and punctuation from the tweets
2.   Removing stopwords
3.   Tokenizing and lemmatizing the tokens
4.   Creating padded sequences
5.   Creating classes to index and index to classes dictionaries
6.   Vectorizing text labels




In [16]:
# Strip links and punctuation, tokenize and lemmatize the tweet,
# then drop English stopwords.

_URL_PATTERN = re.compile(r"\S*https?:\S*")
_PUNCTUATION_PATTERN = re.compile(r'[^\w\s]')
_STOPWORDS = None  # built lazily on first use, then reused for every tweet


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them once."""
    global _STOPWORDS
    if _STOPWORDS is None:
        _STOPWORDS = frozenset(stopwords.words('english'))
    return _STOPWORDS


def preprocess(tweet):
    """Normalise a raw tweet into a space-separated string of lemmas.

    Steps, in order:
      1. lower-case the text;
      2. remove URLs -- this must happen *before* punctuation is stripped,
         because the URL pattern relies on the ``:`` after ``http``/``https``;
      3. remove every character that is neither a word character nor
         whitespace;
      4. tokenize with ``word_tokenize`` and lemmatize each token;
      5. drop tokens that are English stopwords, checking both the raw token
         and its lemma (the lemmatizer can alter a stopword so that the
         lemma alone would no longer be recognised as one).

    Args:
        tweet: The raw tweet text.

    Returns:
        The kept lemmas joined by single spaces (empty string if nothing
        is kept). No trailing space is emitted.
    """
    text = _URL_PATTERN.sub("", tweet.lower())
    text = _PUNCTUATION_PATTERN.sub('', text)

    stop_words = _english_stopwords()
    kept = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        if lemma not in stop_words:
            kept.append(lemma)
    return ' '.join(kept)

In [17]:
new_train_tweets = [preprocess(tweet) for tweet in train_tweet]
new_val_tweets = [preprocess(tweet) for tweet in val_tweet]

In [18]:
new_train_tweets[:2]

In [19]:
train_tweet[:2]

In [20]:
# Fit a Keras tokenizer on the tweets
def tokenize(num_words, oov_token, texts=None):
    """Create a ``Tokenizer`` and fit it on a corpus of texts.

    Args:
        num_words: Forwarded to ``Tokenizer`` as ``num_words``.
        oov_token: Forwarded to ``Tokenizer`` as ``oov_token``.
        texts: Texts to fit the tokenizer on. When omitted, the module-level
            ``new_train_tweets`` is used, which keeps existing two-argument
            calls working exactly as before.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    if texts is None:
        texts = new_train_tweets
    tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
    tokenizer.fit_on_texts(texts)
    return tokenizer

In [21]:
tokenizer = tokenize(NUM_WORDS, OOV_TOKEN)

In [22]:
# Encode texts as integer sequences and pad/truncate them to a fixed length

def padded_sequences(tokenizer, tweet, maxlen, truncating, padding):
    """Convert texts to sequences with ``tokenizer`` and pad the result.

    Args:
        tokenizer: Object providing ``texts_to_sequences``.
        tweet: The texts to encode (passed straight to ``texts_to_sequences``).
        maxlen: Forwarded to ``pad_sequences`` as ``maxlen``.
        truncating: Forwarded to ``pad_sequences`` as ``truncating``.
        padding: Forwarded to ``pad_sequences`` as ``padding``.

    Returns:
        Whatever ``pad_sequences`` returns for the encoded texts.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(tweet),
        maxlen=maxlen,
        padding=padding,
        truncating=truncating,
    )

In [23]:
train_padded_sequences = padded_sequences(tokenizer, new_train_tweets, MAX_LEN, TRUNCATING, PADDING)

In [24]:
new_train_tweets[0], train_padded_sequences[0]

In [25]:
# Creating classes to index and index to classes dictionaries.
# `classes` is a set, so enumerate it in sorted order: the class -> index
# mapping is then reproducible instead of depending on set iteration order.
class2idx = {c: idx for idx, c in enumerate(sorted(classes))}
# Derive the inverse from class2idx so the two mappings can never disagree.
idx2class = {idx: c for c, idx in class2idx.items()}

In [26]:
class2idx, idx2class

In [27]:
# Vectorizing labels by converting each one to its numeric class index
def names2ids(labels):
    """Map every label in ``labels`` to its index in ``class2idx``.

    Args:
        labels: Iterable of labels; each must be a key of ``class2idx``.

    Returns:
        A NumPy array holding the class index of each label, in order.

    Raises:
        KeyError: If a label is not present in ``class2idx``. Failing here is
            deliberate: a silent ``None`` entry would turn the result into an
            object array and only surface later as an obscure training error.
    """
    return np.array([class2idx[label] for label in labels])

In [28]:
train_vectorized_labels = names2ids(train_label)

In [29]:
train_vectorized_labels, len(train_vectorized_labels)

In [30]:
train_vectorized_labels[0]

## Building the Model

In [78]:
def build_compile_model(num_words, maxlen, num_classes=6):
    """Build and compile the bidirectional-LSTM classifier.

    The network is: Embedding (16-dimensional) -> Bidirectional LSTM(64)
    returning full sequences -> Bidirectional LSTM(32) -> Dense(12, relu)
    -> Dense(num_classes, softmax).

    Args:
        num_words: Vocabulary size, used as the Embedding layer's ``input_dim``.
        maxlen: Input sequence length, used as the Embedding layer's
            ``input_length``.
        num_classes: Number of units in the softmax output layer. Defaults
            to 6, the value that used to be hard-coded.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    model = Sequential([
        Embedding(input_dim=num_words, output_dim=16, input_length=maxlen),
        # The first recurrent layer must emit the whole sequence so that the
        # second recurrent layer has a sequence to consume.
        Bidirectional(LSTM(64, return_sequences=True)),
        Bidirectional(LSTM(32)),
        Dense(12, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ])

    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [79]:
model = build_compile_model(NUM_WORDS, MAX_LEN)

In [80]:
model.summary()

## Training the Model

In [81]:
val_padded_sequence = padded_sequences(tokenizer, new_val_tweets, MAX_LEN, TRUNCATING, PADDING)

In [82]:
val_vectorized_labels = names2ids(val_label)

In [83]:
class myCallback(Callback):
    """Stop training as soon as the training loss falls below a threshold.

    Args:
        loss_threshold: Training stops at the end of the first epoch whose
            reported ``loss`` is strictly below this value. Defaults to 0.1.
    """

    def __init__(self, loss_threshold=0.1):
        super().__init__()
        self.loss_threshold = loss_threshold

    def on_epoch_end(self, epoch, logs=None):
        """Check the epoch's training loss and request a stop if it is low enough."""
        # `logs` may be None, and 'loss' may be absent; in both cases there is
        # nothing to compare, so training simply continues.
        loss = (logs or {}).get('loss')
        if loss is not None and loss < self.loss_threshold:
            # Stop if threshold is met
            print(f"\nLoss is lower than {self.loss_threshold} so cancelling training!")
            self.model.stop_training = True


# Instantiate class
callbacks = myCallback()

In [84]:
# Fit the model on the padded training sequences and their numeric labels,
# passing the validation split as validation_data. The `callbacks` argument
# below is commented out, so the loss-threshold callback is not applied here.
history = model.fit(
                  train_padded_sequences, train_vectorized_labels,
                  validation_data=(val_padded_sequence, val_vectorized_labels),
                  epochs=15,
                  #callbacks=[callbacks]
)

## Evaluating the Model

1. Visualizing training history
2. Preparing a test set
3. A look at individual predictions on the test set
4. A look at all predictions on the test set

In [85]:
def show_history(h):
    """Plot training and validation curves for accuracy and loss side by side.

    Args:
        h: Object whose ``history`` attribute is a dict containing 'loss' and,
            for the curves drawn here, 'accuracy', 'val_accuracy' and
            'val_loss'.
    """
    records = h.history
    x_axis = range(0, len(records['loss']))
    plt.figure(figsize=(16, 6))

    # (subplot position, training key, validation key, y label, y limits)
    panels = (
        (1, 'accuracy', 'val_accuracy', 'Accuracy', [0., 1.]),
        (2, 'loss', 'val_loss', 'Loss', None),
    )
    for position, train_key, val_key, y_label, y_limits in panels:
        plt.subplot(1, 2, position)
        plt.plot(x_axis, records.get(train_key), label='Training')
        plt.plot(x_axis, records.get(val_key), label='Validation')
        if y_limits is not None:
            # Only the accuracy panel is pinned to a fixed range.
            plt.ylim(y_limits)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.legend()

    plt.show()

In [86]:
# LSTM model
# Visualizing training history

show_history(history)

In [87]:
# Preparing a test set
new_test_tweets = [preprocess(tweet) for tweet in test_tweet]

In [88]:
# Padding and vectorizing the test set.
# Encode the preprocessed tweets (new_test_tweets), not the raw ones, so the
# test inputs go through the same cleaning as the training/validation inputs.
test_padded_sequence = padded_sequences(tokenizer, new_test_tweets, MAX_LEN, TRUNCATING, PADDING)
test_vectorized_labels = names2ids(test_label)

In [89]:
# Evaluating the model on test set
_ = model.evaluate(test_padded_sequence, test_vectorized_labels)

In [90]:
# Prediction

i = random.randint(0, len(test_label) - 1)

print('Sentence:', test_tweet[i])
print('Emotion:', test_label[i])

p = model.predict(np.expand_dims(test_padded_sequence[i], axis=0))[0]
print('Predicted Emotion:', idx2class.get(np.argmax(p)))

In [91]:
preds = model.predict(test_padded_sequence)
preds.shape

In [92]:
# Index of the highest-scoring class for every row of `preds`
convert_preds = list(np.argmax(preds, axis=1))

In [93]:
new_preds = [idx2class[i] for i in convert_preds]

In [94]:
new_preds[:5]

In [95]:
def show_confusion_matrix(y_true, y_pred, classes):
    """Plot a row-normalised confusion matrix with one tick per class.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, using the same label values as ``y_true``.
        classes: The class labels. Their order defines the order of the
            matrix rows/columns and of the tick labels.
    """
    from sklearn.metrics import confusion_matrix

    classes = list(classes)
    # Pass `labels` explicitly so the matrix rows/columns follow the order of
    # `classes`; otherwise the matrix uses sorted label order and the tick
    # labels drawn below could be attached to the wrong rows/columns.
    cm = confusion_matrix(y_true, y_pred, labels=classes, normalize='true')

    plt.figure(figsize=(8, 8))
    sp = plt.subplot(1, 1, 1)
    ctx = sp.matshow(cm)
    # One tick per class instead of a hard-coded count of six.
    ticks = list(range(len(classes)))
    plt.xticks(ticks, labels=classes)
    plt.yticks(ticks, labels=classes)
    plt.colorbar(ctx)
    plt.show()

In [96]:
show_confusion_matrix(test_label, new_preds, list(classes))