In [None]:
import json

# Load the dataset, which is read line by line with one JSON object per line.
datastore = []
# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open('/content/Sarcasm_Headlines_Dataset.json', 'r', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline at end of file);
        # json.loads() raises on empty input.
        if line.strip():
            datastore.append(json.loads(line))


# Split each record into three parallel lists: headline text, sarcasm label,
# and source article URL.
sentences = [record['headline'] for record in datastore]
labels = [record['is_sarcastic'] for record in datastore]
urls = [record['article_link'] for record in datastore]

# Run this to ensure TensorFlow 2.x is used
# NOTE: per this cell's recorded output, Colab now ships only TensorFlow 2.x
# and the magic has no effect there.
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  # Best-effort: any failure of the magic is ignored and the cell continues.
  # NOTE(review): presumably meant for runs outside Colab -- confirm it is still needed.
  pass

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Exploratory pass: fit a tokenizer on every headline to inspect the encoding.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Encode each headline as a list of integer ids, then pad after the tokens
# ('post'); no maxlen is given here.
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')

# Show the first encoded headline, then the shape of the padded matrix.
print(padded[0], padded.shape, sep='\n')



[  308 15115   679  3337  2298    48   382  2576 15116     6  2577  8434
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
(26709, 40)


In [None]:
# The first `training_size` examples are used for training; everything after
# them is held out for testing. Sentences and labels are sliced identically so
# they stay aligned.
training_size = 20000

training_sentences, testing_sentences = sentences[:training_size], sentences[training_size:]
training_labels, testing_labels = labels[:training_size], labels[training_size:]

In [None]:
# Preprocessing settings shared by the training and testing pipelines.
vocab_size = 10000
max_length = 100
trunc_type = 'post'
padding_type = 'post'

# Fit the vocabulary on the training headlines only.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=vocab_size)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Encode both splits with the same fitted tokenizer.
training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

# Pad/truncate both splits to `max_length` with identical settings.
training_padded = pad_sequences(
    training_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)
testing_padded = pad_sequences(
    testing_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

In [None]:
import numpy as np
# Convert the padded sequences and the label lists to NumPy arrays before
# they are passed to model.fit().
training_padded, testing_padded = np.array(training_padded), np.array(testing_padded)
training_labels, testing_labels = np.array(training_labels), np.array(testing_labels)

In [None]:
# Classifier: embed each token id, average the embeddings over the sequence,
# then a small dense layer and a single sigmoid unit for the binary label.
embedding_dim = 16
model = tf.keras.Sequential([
    # Declare the input shape explicitly instead of passing
    # Embedding(input_length=...), which is deprecated in Keras 3.
    tf.keras.Input(shape=(max_length,)),
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)



In [None]:
# Train for a fixed number of epochs; the testing split is passed as
# validation data so val_loss / val_accuracy are reported after every epoch.
num_epochs = 30
history = model.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=2,
)

Epoch 1/30
625/625 - 8s - 12ms/step - accuracy: 0.6925 - loss: 0.5755 - val_accuracy: 0.8274 - val_loss: 0.4200
Epoch 2/30
625/625 - 10s - 17ms/step - accuracy: 0.8637 - loss: 0.3358 - val_accuracy: 0.8512 - val_loss: 0.3530
Epoch 3/30
625/625 - 9s - 14ms/step - accuracy: 0.9100 - loss: 0.2364 - val_accuracy: 0.8542 - val_loss: 0.3374
Epoch 4/30
625/625 - 5s - 9ms/step - accuracy: 0.9380 - loss: 0.1726 - val_accuracy: 0.8530 - val_loss: 0.3469
Epoch 5/30
625/625 - 9s - 15ms/step - accuracy: 0.9567 - loss: 0.1271 - val_accuracy: 0.8480 - val_loss: 0.3743
Epoch 6/30
625/625 - 6s - 10ms/step - accuracy: 0.9703 - loss: 0.0942 - val_accuracy: 0.8541 - val_loss: 0.3854
Epoch 7/30
625/625 - 4s - 7ms/step - accuracy: 0.9799 - loss: 0.0703 - val_accuracy: 0.8441 - val_loss: 0.4280
Epoch 8/30
625/625 - 6s - 10ms/step - accuracy: 0.9856 - loss: 0.0518 - val_accuracy: 0.8472 - val_loss: 0.4662
Epoch 9/30
625/625 - 9s - 15ms/step - accuracy: 0.9888 - loss: 0.0416 - val_accuracy: 0.8413 - val_loss: 

In [None]:
# Classify two hand-written headlines with the trained model.
sentence = [
    'granny starting to fear spiders in the garden might be real',
    'the weather today is bright and sunny',
]

# Preprocess the same way as the training data: same tokenizer and the same
# max_length / padding_type / trunc_type settings, instead of repeating the
# 'post' literals, which could drift from the training configuration.
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Round the model outputs to 0/1 so they read as class labels.
predictions = model.predict(padded)
round_predictions = np.round(predictions)
print(round_predictions)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[[0.]
 [0.]]


In [None]:
# Predict on the whole testing split and print the first five rounded outputs.
test_predictions = model.predict(testing_padded)
round_test_predictions = np.round(test_predictions)
print(round_test_predictions[0:5])

[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[[0.]
 [0.]
 [0.]
 [1.]
 [1.]]


In [None]:
# Display the first five testing headlines, for comparison with the
# rounded predictions printed above.
testing_sentences[0:5]

['pediatricians announce 2011 newborns are ugliest babies in 30 years',
 "don't rely on your fitness tracker to lose weight",
 'you might want to cut back on the soap',
 'bausch & lomb introduces line of aviator contacts',
 '7 everyday habits for glowing, younger-looking skin']

In [None]:
# Display the true labels of the first five testing examples, for comparison
# with the rounded predictions printed above.
testing_labels[0:5]

array([1, 0, 0, 1, 0])