In [7]:
import json

import numpy as np
import tensorflow as tf
from keras.optimizers import *
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Record which TensorFlow release produced the outputs saved in this notebook.
print(tf.version.VERSION)

2.0.0-beta0


Using TensorFlow backend.


In [2]:
# Download the sarcasm-headlines dataset to /tmp/sarcasm.json.
# TLS certificate verification is left enabled: the file is served over HTTPS
# and its contents are parsed below, so the server identity should be checked.
!wget \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json \
    -O /tmp/sarcasm.json

--2019-11-04 07:51:31--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 2607:f8b0:4005:802::2010, 172.217.0.48
Connecting to storage.googleapis.com (storage.googleapis.com)|2607:f8b0:4005:802::2010|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘/tmp/sarcasm.json’


2019-11-04 07:51:34 (2.14 MB/s) - ‘/tmp/sarcasm.json’ saved [5643545/5643545]



In [17]:
# --- Hyperparameters and preprocessing settings ---
vocab_size = 1000       # passed to Tokenizer(num_words=...) and used as the Embedding input dimension
embedding_dim = 16      # output dimension of the Embedding layer
max_length = 120        # every headline is padded/truncated to this many token ids
trunc_type = 'post'     # `truncating` mode handed to pad_sequences
padding_type = 'post'   # `padding` mode handed to pad_sequences
oov_tok = "<OOV>"       # token the Tokenizer substitutes for out-of-vocabulary words
training_size = 20000   # records [0, training_size) train the model; the remainder validates it
batch_size = 32
epochs = 50

# The model in this notebook is built with tf.keras, so the optimizer has to be
# a tf.keras optimizer as well: tf.keras' compile() does not accept optimizer
# instances created by the standalone `keras` package. `learning_rate` replaces
# the deprecated `lr` alias, and the no-op `decay=0.0` argument is omitted.
# NOTE: the compile cell passes the string 'SGD'; pass this object there instead
# if these settings are ever changed from the defaults.
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.0, nesterov=False)

# Load the downloaded dataset. JSON text is UTF-8, so decode it explicitly
# instead of relying on the platform's default encoding.
with open("/tmp/sarcasm.json", 'r', encoding='utf-8') as f:
    datastore = json.load(f)

# One headline string and one is_sarcastic label per record, in file order.
sentences = [item['headline'] for item in datastore]
labels = [item['is_sarcastic'] for item in datastore]
urls = []  # never populated in this notebook; kept so the name still exists

# Sequential split: the first `training_size` records train, the rest test.
training_sentences = sentences[:training_size]
testing_sentences = sentences[training_size:]

# Labels are converted to NumPy arrays because newer tf.keras releases refuse
# plain Python lists of ints as `y` / validation targets in model.fit().
training_labels = np.array(labels[:training_size])
testing_labels = np.array(labels[training_size:])

# Fit the vocabulary on the training headlines only, so words that appear
# solely in the test split are encoded with the OOV token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

word_index = tokenizer.word_index

# Both splits are padded/truncated with exactly the same settings.
pad_kwargs = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, **pad_kwargs)
print("training_padded:", training_padded.shape)
print(training_padded)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, **pad_kwargs)
print("testing_padded:", testing_padded.shape)
print(testing_padded)


training_padded: (20000, 120)
[[328   1 799 ...   0   0   0]
 [  4   1   1 ...   0   0   0]
 [153 890   2 ...   0   0   0]
 ...
 [ 79   1   1 ...   0   0   0]
 [ 53   1   1 ...   0   0   0]
 [312 705   1 ...   0   0   0]]
testing_padded: (6709, 120)
[[  1   1   1 ...   0   0   0]
 [202   1   8 ...   0   0   0]
 [ 18 380 191 ...   0   0   0]
 ...
 [  1   9  67 ...   0   0   0]
 [  1 374   1 ...   0   0   0]
 [  1   1   6 ...   0   0   0]]


In [18]:
# Architecture: embedding -> three stacked bidirectional LSTMs -> dense head.
keras_layers = tf.keras.layers

layer_stack = [
    # Maps each token id to an `embedding_dim`-wide vector.
    keras_layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    # The first two recurrent layers return the full sequence so the next
    # recurrent layer receives one vector per time step.
    keras_layers.Bidirectional(keras_layers.LSTM(units=32, return_sequences=True)),
    keras_layers.Bidirectional(keras_layers.LSTM(units=32, return_sequences=True)),
    # The last recurrent layer returns only its final output.
    keras_layers.Bidirectional(keras_layers.LSTM(units=32)),
    keras_layers.Dense(units=128, activation='relu'),
    # Single sigmoid unit for the binary sarcastic / not-sarcastic output.
    keras_layers.Dense(units=1, activation='sigmoid'),
]
model = tf.keras.Sequential(layer_stack)

In [19]:
# Binary cross-entropy matches the single sigmoid output unit.
# The optimizer is selected by name, so Keras builds it with its default
# settings; the `optimizer` object from the settings cell is not used here.
model.compile(
    optimizer='SGD',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 120, 16)           16000     
_________________________________________________________________
bidirectional_9 (Bidirection (None, 120, 64)           12544     
_________________________________________________________________
bidirectional_10 (Bidirectio (None, 120, 64)           24832     
_________________________________________________________________
bidirectional_11 (Bidirectio (None, 64)                24832     
_________________________________________________________________
dense_6 (Dense)              (None, 128)               8320      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 129       
Total params: 86,657
Trainable params: 86,657
Non-trainable params: 0
__________________________________________________

In [None]:
# Train on the training split and evaluate on the held-out split after every
# epoch; the returned History object feeds the plots further down.
history = model.fit(
    x=training_padded,
    y=training_labels,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=1,
)

Train on 20000 samples, validate on 6709 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50

In [None]:
import matplotlib.pyplot as plt


def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart per epoch.

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch value lists (as returned by ``model.fit``).
        string: metric name to plot; the validation curve is read from the
            key ``'val_' + string``.

    Raises:
        KeyError: if either key is missing from ``history.history``.
    """
    val_key = 'val_' + string
    # Training curve first, validation second, matching the legend order.
    for key in (string, val_key):
        plt.plot(history.history[key])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()

# With metrics=['accuracy'], TF 2.x stores the curve under 'accuracy', whereas
# older Keras/TF releases abbreviated it to 'acc'. Pick whichever key this run
# actually recorded instead of hard-coding 'acc' (a KeyError on TF 2.x).
accuracy_key = 'accuracy' if 'accuracy' in history.history else 'acc'
plot_graphs(history, accuracy_key)
plot_graphs(history, 'loss')

In [None]:
# Persist the trained model to the working directory.
model.save(filepath="test.h5")