Use the Sarcasm Dataset to train a Bi-LSTM Model


In [1]:
## Download the dataset
!wget https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json

--2022-06-03 19:02:03--  https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.218.128, 142.251.31.128, 142.251.18.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.218.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘sarcasm.json’


2022-06-03 19:02:03 (91.4 MB/s) - ‘sarcasm.json’ saved [5643545/5643545]



In [2]:
import json

# Load the JSON file. The encoding is passed explicitly so decoding does not
# depend on the platform's default locale encoding.
with open('./sarcasm.json', 'r', encoding='utf-8') as f:
  datastore = json.load(f)

# Collect the headlines and their sarcasm labels into two parallel lists
# (entry i of `labels` belongs to entry i of `sentences`).
sentences = [item['headline'] for item in datastore]
labels = [item['is_sarcastic'] for item in datastore]

## Split the dataset

In [7]:
# Number of leading examples used for training; the remainder is held out
training_size = 20000

# Cut the sentences and the labels at the same index so the pairs stay aligned
training_sentences, testing_sentences = sentences[:training_size], sentences[training_size:]
training_labels, testing_labels = labels[:training_size], labels[training_size:]


## Data preprocessing

Generate the vocabulary and the padded sequences.

In [8]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences 

# Tokenization and padding hyperparameters
vocab_size = 10000
max_length = 120
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'

# Initialize the Tokenizer with the vocabulary cap and the out-of-vocabulary token
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)

# Build the word index dictionary from the training sentences only
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Both splits are padded and truncated with exactly the same settings
pad_options = dict(maxlen=max_length,
                   padding=padding_type,
                   truncating=trunc_type)

# Turn each split into integer sequences, then pad them to a fixed length
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, **pad_options)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, **pad_options)

# Convert the label lists into numpy arrays
training_labels, testing_labels = np.array(training_labels), np.array(testing_labels)

## Build and compile the model

The architecture is almost identical to the one used with the IMDB dataset.

In [9]:
import tensorflow as tf

#Parameters
embedding_dim = 16
lstm_dim = 32
dense_dim = 24

#Model definition with a bidirectional LSTM.
#The sequence length is declared with an explicit Input layer rather than the
#Embedding `input_length` argument, which newer Keras releases deprecate.
model_lstm = tf.keras.Sequential([
      tf.keras.Input(shape=(max_length,)),
      tf.keras.layers.Embedding(vocab_size, embedding_dim),
      tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_dim)),
      tf.keras.layers.Dense(dense_dim, activation='relu'),
      #Single sigmoid unit: the output is the probability of the positive (sarcastic) label
      tf.keras.layers.Dense(1, activation='sigmoid')
])

#Compile the model and set the training parameters
model_lstm.compile(loss = 'binary_crossentropy',
                   optimizer = 'adam',
                   metrics = ['accuracy'])


model_lstm.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 120, 16)           160000    
                                                                 
 bidirectional (Bidirectiona  (None, 64)               12544     
 l)                                                              
                                                                 
 dense (Dense)               (None, 24)                1560      
                                                                 
 dense_1 (Dense)             (None, 1)                 25        
                                                                 
Total params: 174,129
Trainable params: 174,129
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Number of full passes over the training split
NUM_EPOCHS = 10

# Fit on the training split; the held-out split is evaluated after every epoch
validation_data = (testing_padded, testing_labels)
history_lstm = model_lstm.fit(
    x=training_padded,
    y=training_labels,
    epochs=NUM_EPOCHS,
    validation_data=validation_data,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
