In [56]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import requests

In [57]:
json_sarcastic_file = requests.get("https://storage.googleapis.com/learning-datasets/sarcasm.json")
sarcasm_data = json_sarcastic_file.json()

In [58]:
len(sarcasm_data), type(sarcasm_data),

(26709, list)

In [59]:
sarcasm_data[:5]

[{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5',
  'headline': "former versace store clerk sues over secret 'black code' for minority shoppers",
  'is_sarcastic': 0},
 {'article_link': 'https://www.huffingtonpost.com/entry/roseanne-revival-review_us_5ab3a497e4b054d118e04365',
  'headline': "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
  'is_sarcastic': 0},
 {'article_link': 'https://local.theonion.com/mom-starting-to-fear-son-s-web-series-closest-thing-she-1819576697',
  'headline': "mom starting to fear son's web series closest thing she will have to grandchild",
  'is_sarcastic': 1},
 {'article_link': 'https://politics.theonion.com/boehner-just-wants-wife-to-listen-not-come-up-with-alt-1819574302',
  'headline': 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
  'is_sarcastic': 1},
 {'article_link': 'https://www.huffingtonpost.com/entry/jk-rowling-w

In [60]:
sarcastic_sentences = []
is_sarcastic_lbl = []
article_links = []

In [61]:
for item in sarcasm_data:
  sarcastic_sentences.append(item["headline"])
  is_sarcastic_lbl.append(item["is_sarcastic"])
  article_links.append(item["article_link"])

In [62]:
TRAINING_SIZE = 20000
VOCAB_SIZE = 10000
OOV_TOK = "<OOV>"
MAX_LENGTH = 100
TRUNC_TYPE = "post"
PADDING_TYPE = "post"
EMBEDDING_DIM = 16

In [63]:
training_sentences = sarcastic_sentences[:TRAINING_SIZE]
testing_sentences = sarcastic_sentences[TRAINING_SIZE:]
training_lbls = is_sarcastic_lbl[:TRAINING_SIZE]
testing_lbls = is_sarcastic_lbl[TRAINING_SIZE:]

In [64]:
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token=OOV_TOK)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=MAX_LENGTH,
                                padding=PADDING_TYPE, truncating=TRUNC_TYPE)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=MAX_LENGTH,
                               padding=PADDING_TYPE, truncating=TRUNC_TYPE)

In [65]:
testing_padded, training_padded

(array([[   1, 1100, 6663, ...,    0,    0,    0],
        [ 202,    1,    8, ...,    0,    0,    0],
        [  18,  380,  191, ...,    0,    0,    0],
        ...,
        [   1,    9,   67, ...,    0,    0,    0],
        [1556,  374, 4114, ...,    0,    0,    0],
        [   1, 1700,    6, ...,    0,    0,    0]], dtype=int32),
 array([[ 328,    1,  799, ...,    0,    0,    0],
        [   4, 6840, 3096, ...,    0,    0,    0],
        [ 153,  890,    2, ...,    0,    0,    0],
        ...,
        [  79, 1729,    1, ...,    0,    0,    0],
        [  53, 5108, 4735, ...,    0,    0,    0],
        [ 312,  705,    1, ...,    0,    0,    0]], dtype=int32))

In [66]:
import numpy as np
training_padded = np.array(training_padded)
training_labels = np.array(training_lbls)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_lbls)

In [67]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_LENGTH),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [68]:
num_epochs = 30
history = model.fit(training_padded, training_labels, epochs=num_epochs,
                    validation_data=(testing_padded, testing_labels), verbose=2)

Epoch 1/30
625/625 - 25s - loss: 0.6703 - accuracy: 0.5771 - val_loss: 0.6000 - val_accuracy: 0.6330 - 25s/epoch - 40ms/step
Epoch 2/30
625/625 - 3s - loss: 0.4390 - accuracy: 0.8246 - val_loss: 0.3837 - val_accuracy: 0.8402 - 3s/epoch - 5ms/step
Epoch 3/30
625/625 - 3s - loss: 0.3119 - accuracy: 0.8760 - val_loss: 0.3552 - val_accuracy: 0.8474 - 3s/epoch - 5ms/step
Epoch 4/30
625/625 - 3s - loss: 0.2607 - accuracy: 0.8972 - val_loss: 0.3424 - val_accuracy: 0.8521 - 3s/epoch - 5ms/step
Epoch 5/30
625/625 - 3s - loss: 0.2232 - accuracy: 0.9133 - val_loss: 0.3417 - val_accuracy: 0.8556 - 3s/epoch - 4ms/step
Epoch 6/30
625/625 - 3s - loss: 0.1950 - accuracy: 0.9260 - val_loss: 0.3622 - val_accuracy: 0.8509 - 3s/epoch - 5ms/step
Epoch 7/30
625/625 - 3s - loss: 0.1730 - accuracy: 0.9362 - val_loss: 0.3635 - val_accuracy: 0.8538 - 3s/epoch - 4ms/step
Epoch 8/30
625/625 - 3s - loss: 0.1547 - accuracy: 0.9430 - val_loss: 0.3793 - val_accuracy: 0.8533 - 3s/epoch - 4ms/step
Epoch 9/30
625/625 - 

In [71]:
sentences = [
    "Congratulations on being five minutes early. You're a real hero.",
    "I love spending time with my friends and family.",
    "Sure, because I absolutely love spending my weekends doing chores.",
    "The weather is so nice today, perfect for a picnic.",
    "Oh, great. Another Monday. Just what I needed.",
    "I enjoy going for a walk in the park on a sunny day.",
    "Wow, your brilliant plan worked out perfectly, didn't it?",
    "I appreciate your help with the project; it made a big difference.",
    "Oh, wonderful! More paperwork to make our lives even more exciting.",
    "I'm really looking forward to the weekend getaway.",
    "Well, that was a fantastic idea. I can't believe no one else thought of it.",
    "Spending time with loved ones is truly the best part of life.",
    "Brilliant move, parking your car in the middle of the road.",
    "I'm grateful for the support of my colleagues at work.",
    "Oh, you're the expert on everything, aren't you?",
    "I can't wait to try that new restaurant everyone's talking about.",
    "Gee, thanks for the unsolicited advice. I couldn't survive without it.",
]

sentence_seq = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sentence_seq, maxlen=MAX_LENGTH, padding=PADDING_TYPE,
                       truncating=TRUNC_TYPE)
model.predict(padded)




array([[1.01818964e-01],
       [3.86433298e-04],
       [7.61609524e-04],
       [1.16263074e-03],
       [7.33894646e-01],
       [3.58694728e-04],
       [3.18806196e-05],
       [4.51463020e-05],
       [1.20013028e-01],
       [6.93548373e-06],
       [9.89354014e-01],
       [1.97016388e-01],
       [3.03525576e-05],
       [1.29536265e-05],
       [1.30575048e-04],
       [2.67289276e-03],
       [7.96675275e-04]], dtype=float32)