In [1]:
import json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

In [2]:
with open('Sarcasm_Headlines_Dataset.json', 'r') as f:
    datastore = json.load(f)

In [3]:
sentences = []
labels = []
urls = []

In [4]:
# adiciona os dados do json nas listas
for item in datastore:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])
    urls.append(item['article_link'])

In [5]:
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

In [6]:
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')

In [7]:
print(padded[0])
print(padded.shape)

[  308 15115   679  3337  2298    48   382  2576 15116     6  2577  8434
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
(26709, 40)


In [8]:
# variaveis
vocab_size = 10000
embedding_dim = 16
max_length = 100
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
training_size = 20000

In [9]:
# separa os dados de treino e teste

training_sentences = sentences[:training_size]
testing_sentences = sentences[training_size:]

training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

In [10]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

In [11]:
word_index = tokenizer.word_index

In [12]:
# transforma as frases em sequencias
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences,
                                maxlen=max_length,
                                padding=padding_type,
                                truncating=trunc_type)

In [13]:
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences,
                               maxlen=max_length,
                               padding=padding_type,
                               truncating=trunc_type)

In [14]:
import numpy as np

# transforma as listas em arrays
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

In [15]:
# definição do modelo
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 16)           160000    
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 128)               2176      
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                        

In [17]:
print(len(training_padded), len(training_labels))


20000 20000


In [18]:
num_epochs = 30

history = model.fit(training_padded,
                    training_labels,
                    epochs=num_epochs,
                    validation_data=(testing_padded, testing_labels), verbose=2)

Epoch 1/30
625/625 - 3s - loss: 0.5783 - accuracy: 0.6679 - val_loss: 0.4159 - val_accuracy: 0.8037 - 3s/epoch - 4ms/step
Epoch 2/30
625/625 - 1s - loss: 0.3204 - accuracy: 0.8695 - val_loss: 0.3596 - val_accuracy: 0.8504 - 1s/epoch - 2ms/step
Epoch 3/30
625/625 - 1s - loss: 0.2391 - accuracy: 0.9086 - val_loss: 0.3619 - val_accuracy: 0.8568 - 1s/epoch - 2ms/step
Epoch 4/30
625/625 - 1s - loss: 0.1988 - accuracy: 0.9265 - val_loss: 0.3939 - val_accuracy: 0.8468 - 1s/epoch - 2ms/step
Epoch 5/30
625/625 - 1s - loss: 0.1703 - accuracy: 0.9365 - val_loss: 0.4421 - val_accuracy: 0.8499 - 1s/epoch - 2ms/step
Epoch 6/30
625/625 - 1s - loss: 0.1490 - accuracy: 0.9473 - val_loss: 0.4575 - val_accuracy: 0.8453 - 1s/epoch - 2ms/step
Epoch 7/30
625/625 - 1s - loss: 0.1312 - accuracy: 0.9543 - val_loss: 0.5171 - val_accuracy: 0.8380 - 1s/epoch - 2ms/step
Epoch 8/30
625/625 - 1s - loss: 0.1092 - accuracy: 0.9629 - val_loss: 0.6555 - val_accuracy: 0.8083 - 1s/epoch - 2ms/step
Epoch 9/30
625/625 - 1s 

In [None]:
# a avaliação melhorou com a adição de mais camadas e dropout
sentence = [
    "granny starting to fear spiders in the garden migth be real",
    "the weather today is brigth and sunny"
]

sequences = tokenizer.texts_to_sequences(sentence)

padded = pad_sequences(sequences,
                       maxlen=max_length,
                       padding=padding_type,
                       truncating=trunc_type)

# preve a chance de sarcasmo de cada frase
print(model.predict(padded))

[[0.9917742 ]
 [0.00365997]]
