In [1]:
import json

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Load the dataset, which is stored as JSON Lines: one JSON object per line.
datastore = []
# Explicit UTF-8 keeps decoding independent of the platform's default locale.
with open("sarcasm.json", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # record and would make json.loads raise, so skip them.
            continue
        datastore.append(json.loads(line))

# Quick sanity check: show the record type and the fields of the first record.
print(type(datastore[0]))
for key in datastore[0].keys():
    print(key)

<class 'dict'>
is_sarcastic
headline
article_link


In [3]:
# Pull the target and the text out of every record, keeping the two lists aligned.
labels = [record["is_sarcastic"] for record in datastore]
headlines = [record["headline"] for record in datastore]

# Both lengths should match the number of records loaded.
len(labels), len(headlines)

(28619, 28619)

In [4]:
# TODO: hyperparameter tuning

# Vocabulary: passed to the Tokenizer (num_words / oov_token) and the Embedding layer.
vocab_size = 10000
oov_tok = "UNK"

# Sequence shaping: passed to pad_sequences for both splits.
max_length = 100
padding_type = 'post'
trunc_type = 'post'

# Embedding width and the number of leading examples used for training.
embedding_dim = 16
training_size = 20000

In [5]:
# Positional split: the first `training_size` examples train, the remainder is held out.
headlines_train, headlines_test = headlines[:training_size], headlines[training_size:]
labels_train, labels_test = labels[:training_size], labels[training_size:]

In [6]:
# Fit the vocabulary on the training headlines only; the held-out headlines are
# encoded with that same fitted tokenizer.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(headlines_train)
word_index = tokenizer.word_index

# One set of padding/truncation options shared by both splits.
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)
seqs_train = pad_sequences(tokenizer.texts_to_sequences(headlines_train), **pad_options)
seqs_test = pad_sequences(tokenizer.texts_to_sequences(headlines_test), **pad_options)

In [7]:
# Convert the label lists to NumPy arrays before they are passed to model.fit.
# (numpy is imported in the notebook's top import cell.)
labels_train = np.array(labels_train)
labels_test = np.array(labels_test)

In [8]:
# Embedding -> average over the sequence axis -> small dense head with a sigmoid output.
layer_stack = [
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=max_length,
    ),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
]
model = tf.keras.Sequential(layer_stack)

# Single sigmoid output, so train with binary cross-entropy and track accuracy.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 24)                408       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 160,433
Trainable params: 160,433
Non-trainable params: 0
_________________________________________________________________


In [9]:
num_epochs = 10

# The held-out split is supplied as validation data, so each epoch also
# records val_loss / val_accuracy in the returned history.
history = model.fit(
    x=seqs_train,
    y=labels_train,
    epochs=num_epochs,
    validation_data=(seqs_test, labels_test),
    verbose=2,
)

Train on 20000 samples, validate on 8619 samples
Epoch 1/10
20000/20000 - 5s - loss: 0.6500 - accuracy: 0.6220 - val_loss: 0.5138 - val_accuracy: 0.8105
Epoch 2/10
20000/20000 - 3s - loss: 0.4045 - accuracy: 0.8379 - val_loss: 0.3688 - val_accuracy: 0.8416
Epoch 3/10
20000/20000 - 4s - loss: 0.3046 - accuracy: 0.8769 - val_loss: 0.3537 - val_accuracy: 0.8412
Epoch 4/10
20000/20000 - 4s - loss: 0.2547 - accuracy: 0.9022 - val_loss: 0.3291 - val_accuracy: 0.8591
Epoch 5/10
20000/20000 - 4s - loss: 0.2188 - accuracy: 0.9176 - val_loss: 0.3353 - val_accuracy: 0.8538
Epoch 6/10
20000/20000 - 3s - loss: 0.1918 - accuracy: 0.9293 - val_loss: 0.3416 - val_accuracy: 0.8561
Epoch 7/10
20000/20000 - 3s - loss: 0.1717 - accuracy: 0.9365 - val_loss: 0.3556 - val_accuracy: 0.8530
Epoch 8/10
20000/20000 - 3s - loss: 0.1521 - accuracy: 0.9443 - val_loss: 0.3719 - val_accuracy: 0.8498
Epoch 9/10
20000/20000 - 3s - loss: 0.1363 - accuracy: 0.9521 - val_loss: 0.3941 - val_accuracy: 0.8448
Epoch 10/10
200

In [10]:
def plot_graphs(history, string):
    """Plot a training metric per epoch, with its validation counterpart when recorded.

    Args:
        history: Object exposing a ``history`` mapping from metric name to a
            per-epoch sequence of values (here, the value returned by
            ``model.fit``).
        string: Name of the metric to plot, e.g. ``"accuracy"`` or ``"loss"``.

    Raises:
        KeyError: If ``string`` is not a key of ``history.history``.
    """
    # matplotlib.pyplot is imported as `plt` in the notebook's top import cell.
    metrics = history.history
    val_key = 'val_' + string

    fig, ax = plt.subplots()
    ax.plot(metrics[string], label=string)
    # The validation series is only present when fit() was given validation
    # data; plot it when available instead of failing with a KeyError.
    if val_key in metrics:
        ax.plot(metrics[val_key], label=val_key)
    ax.set_xlabel("Epochs")
    ax.set_ylabel(string)
    ax.set_title(string + " per epoch")
    ax.legend()
    plt.show()
  
# One figure per tracked metric, in the same order as before: accuracy, then loss.
for metric_name in ("accuracy", "loss"):
    plot_graphs(history, metric_name)