# Text classification with RNNs
## Preamble: installing and importing packages

In [786]:
try:
    import datasets
except ModuleNotFoundError:
    !pip install datasets
    import datasets

In [787]:
try:
    import nltk
except ModuleNotFoundError:
    !pip install nltk
    import nltk

In [788]:
try:
    from unidecode import unidecode
except ModuleNotFoundError:
    !pip install unidecode
    from unidecode import unidecode

In [789]:
import os
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds



## Load training dataset

We are going to work with a [ dataset that contains 58k carefully curated Reddit comments labeled for 27 emotions](https://www.tensorflow.org/datasets/catalog/goemotions). 
This dataset can be retreived using the [`datasets` library from the catalog of tensorflow ](https://huggingface.co/docs/datasets/index).

The next cells load some information on the dataset:

In [790]:
SEED = 34

In [791]:
DATA_HANDLE = "go_emotions"

In [792]:
from datasets import load_dataset_builder
ds_builder = load_dataset_builder(DATA_HANDLE)




In [793]:
ds_builder.info.description

'The GoEmotions dataset contains 58k carefully curated Reddit comments labeled for 27 emotion categories or Neutral.\nThe emotion categories are admiration, amusement, anger, annoyance, approval, caring, confusion, curiosity, desire,\ndisappointment, disapproval, disgust, embarrassment, excitement, fear, gratitude, grief, joy, love, nervousness,\noptimism, pride, realization, relief, remorse, sadness, surprise.\n'

Each element in the dataset has two features: the review text itself, and the associated label:

In [794]:
ds_builder.info.features

{'text': Value(dtype='string', id=None),
 'labels': Sequence(feature=ClassLabel(names=['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'], id=None), length=-1, id=None),
 'id': Value(dtype='string', id=None)}

Now we are going to load the training data:

In [795]:
from datasets import load_dataset

train_ds = load_dataset(DATA_HANDLE, split="train")
test_ds =  load_dataset(DATA_HANDLE, split="test")




As seen in `ds_builder.info.features`, each data sample has three fields: the `text` and the `label` string and the id of the text. Here is the text for one particular sample

In [796]:
train_ds

Dataset({
    features: ['text', 'labels', 'id'],
    num_rows: 43410
})

### Normalizing characters
Some of the tools we'll be using later cannot flawlessly handle all unicode characters. To avoid problems, we will normalize all characters to their closest ASCII equivalent using the function `unidecode` (imported from [`unidecode` package](https://pypi.org/project/Unidecode/)).

The function basically replaces all characters bearing [diacritic signs](https://en.wikipedia.org/wiki/Diacritic) with their corresponding plain character, as well as any symbols with close ASCII equivalents. The result is a text with no accents, cedillas, no € symbol, etc.

In [797]:
unidecode(train_ds[10]['text'])

"Demographics? I don't know anybody under 35 who has cable tv."

In [798]:
train_ds = train_ds.map(lambda sample: {'text': unidecode(sample['text']), 'labels': sample['labels'],'id':sample['id']})



## Stemming



In [799]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [800]:
emotion_categories=['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']

In [808]:
print(len(emotion_categories))

28


In [801]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Preprocess the text data
def preprocess(text):
  lemmatizer = WordNetLemmatizer()
  tokens = nltk.word_tokenize(text)
  tokens = [token.lower() for token in tokens if token.isalpha()]
  stop_words = set(stopwords.words("english"))
  tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
  return " ".join(tokens)

In [802]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import itertools

# Preprocess the data
def tokenize(texts):
  # Tokenize the texts using the Tokenizer class
  tokenizer = Tokenizer(num_words=20000)
  tokenizer.fit_on_texts(texts)
  sequences = tokenizer.texts_to_sequences(texts)
  return sequences, tokenizer


def encode_labels(labels, num_classes):
  # Flatten the list of labels to a single list or array
  labels = list(itertools.chain.from_iterable(labels))
  # Or use a list comprehension: labels = [label for sublist in labels for label in sublist]
  
  # One-hot encode the labels
  one_hot_labels = to_categorical(labels, num_classes=num_classes)
  return one_hot_labels


def preprocess_(dataset, max_length):
  # Select only the examples with non-empty text and labels
  dataset = [example for example in dataset if (example["text"]!="" and example["labels"]!="")]

  # Tokenize and encode the text and labels
  texts = [example["text"] for example in dataset]
  print(f" text : {len(texts)}")
  sequences, tokenizer = tokenize(texts)
  sequences = pad_sequences(sequences, maxlen=max_length)
  print(sequences.shape)
  labels = [example["labels"] for example in dataset]
  print(f" labels : {len(labels)}")
  one_hot_labels = encode_labels(labels, num_classes=len(emotion_categories))
  print(f" after one hot labels {one_hot_labels.shape}")
  padded_sequences = pad_sequences(sequences, maxlen=max_length)
  padded_one_hot_labels = pad_sequences(one_hot_labels, maxlen=max_length)
  print(padded_one_hot_labels.shape)
  print(padded_sequences.shape)
  return padded_sequences, padded_one_hot_labels, tokenizer




In [803]:

def preprocess(dataset, max_length):
  # Select only the examples with non-empty text and labels
  dataset = [example for example in dataset if example["text"] and example["labels"]]
  # Tokenize and encode the text and labels
  texts = [example["text"] for example in dataset]
  sequences, tokenizer = tokenize(texts)
  sequences = pad_sequences(sequences, maxlen=max_length)
  labels = [example["labels"] for example in dataset]
  one_hot_labels = encode_labels(labels, num_classes=len(emotion_categories))
  # Create a TensorFlow dataset from the preprocessed data
  tf_dataset = tf.data.Dataset.from_tensor_slices((sequences, one_hot_labels))
  # Shuffle and batch the examples
  tf_dataset = tf_dataset.shuffle(buffer_size=len(sequences)).batch(BATCH_SIZE)
  return tf_dataset




In [804]:
BATCH_SIZE = 80


In [805]:

# Split the data into testing sets
test_ds = load_dataset(DATA_HANDLE, split="test")





In [806]:
!jupyter --config-dir
!echo "c.NotebookApp.iopub_data_rate_limit = 10000000" >> /root/.jupyter/jupyter_notebook_config.py


/root/.jupyter


In [807]:
# Preprocess the data

# Get the maximum length of the examples
max_length = max([len(example["text"]) for example in train_ds])

train_tfds = preprocess(train_ds, max_length)


ValueError: ignored

In [None]:
# Preprocess the data
max_length = 50  # maximum length of the sequences
X_train, y_train, tokenizer = preprocess(train_ds, max_length)
X_test, y_test, _ = preprocess(test_ds, max_length)

# Build the model
model = tf.keras.Sequential([
  tf.keras.layers.Embedding(20000, 128, input_length=max_length),
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(len(emotion_categories), activation='softmax')
])

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
print(X_train.shape)
print(y_train.shape)

In [None]:
X_train

In [None]:
# Train the model
history = model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print("Test loss:", loss)
print("Test accuracy:", accuracy)


OPTIMIZED MODEL 
