In [96]:
import json

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [97]:
# Load the raw dataset. JSON text is UTF-8, so the encoding is stated explicitly
# instead of relying on the platform default (which differs on Windows).
with open('Sarcasm_Headlines_Dataset.json', 'r', encoding='utf-8') as f:
    datastore = json.load(f)

In [98]:
# Parallel containers for headline text, sarcasm label and source URL.
sentences, labels, urls = [], [], []

In [99]:
# Copy the fields of every JSON record into three parallel lists.
# The lists are rebuilt from scratch (instead of appended to) so that
# re-running this cell does not duplicate the dataset.
sentences = [item['headline'] for item in datastore]
labels = [item['is_sarcastic'] for item in datastore]
urls = [item['article_link'] for item in datastore]

In [100]:
# Exploratory tokenizer over the whole corpus; no vocabulary cap is set here.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(texts=sentences)

# Word -> integer index mapping learned from the headlines.
word_index = tokenizer.word_index

In [101]:
# Encode each headline as a list of word indices, then pad with trailing zeros
# (padding='post') so all rows share one length.
sequences = tokenizer.texts_to_sequences(texts=sentences)
padded = pad_sequences(sequences=sequences, padding='post')

In [102]:
# Show the first encoded headline followed by the shape of the padded matrix.
print(padded[0], padded.shape, sep="\n")

[  308 15115   679  3337  2298    48   382  2576 15116     6  2577  8434
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
(26709, 40)


In [103]:
# Hyper-parameters and preprocessing settings used by the cells below.

# Vocabulary / embedding
vocab_size = 10000      # tokenizer word cap and embedding input dimension
embedding_dim = 16      # size of each embedding vector
oov_tok = "<OOV>"       # placeholder token for out-of-vocabulary words

# Sequence shaping
max_length = 100        # length every sequence is padded / truncated to
padding_type = 'post'   # pad at the end of the sequence
trunc_type = 'post'     # truncate at the end of the sequence

# Data split
training_size = 20000   # number of leading examples used for training

In [104]:
# Split into training and test data: the first `training_size` examples are
# used for training, everything after them is held out.
training_sentences, testing_sentences = (
    sentences[:training_size],
    sentences[training_size:],
)
training_labels, testing_labels = (
    labels[:training_size],
    labels[training_size:],
)

In [105]:
# Tokenizer used for the model, capped at `vocab_size` words.
# It is fitted on the training split only: fitting on every sentence would leak
# held-out vocabulary into preprocessing, so words seen only in the test
# headlines must fall back to the OOV token instead.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

In [106]:
# Keep a handle on the word -> index mapping of the model tokenizer.
# Not read by any of the cells visible below; kept for inspection.
# NOTE(review): presumably this mapping is not truncated by num_words — confirm.
word_index = tokenizer.word_index

In [107]:
# Convert the training sentences into index sequences, then pad / truncate
# each one to `max_length` using the configured strategies.
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(
    training_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

In [108]:
# Same encoding for the held-out sentences, using the same tokenizer and
# padding settings as the training split.
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(
    testing_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

In [109]:
# Make sure inputs and labels are NumPy arrays before handing them to Keras.
# `np` comes from the import cell at the top of the notebook.
# np.asarray leaves an existing ndarray untouched (no extra copy) and converts
# the plain Python label lists.
training_padded = np.asarray(training_padded)
training_labels = np.asarray(training_labels)
testing_padded = np.asarray(testing_padded)
testing_labels = np.asarray(testing_labels)

In [110]:
# Model definition: embedding -> average over the sequence axis -> small dense
# head with a sigmoid output for the binary sarcastic / not-sarcastic label.
model = tf.keras.Sequential([
    # The input shape is declared with an explicit Input instead of the
    # deprecated Embedding(input_length=...) argument; the model is still built
    # up front, so model.summary() works before training.
    tf.keras.Input(shape=(max_length,)),
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [111]:
# Print the layer-by-layer overview (output shapes and parameter counts).
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 100, 16)           160000    
                                                                 
 global_average_pooling1d_6   (None, 16)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_12 (Dense)            (None, 24)                408       
                                                                 
 dense_13 (Dense)            (None, 1)                 25        
                                                                 
Total params: 160,433
Trainable params: 160,433
Non-trainable params: 0
_________________________________________________________________


In [112]:
# Sanity check: the number of training inputs must match the number of labels.
n_inputs, n_targets = len(training_padded), len(training_labels)
print(n_inputs, n_targets)


20000 20000


In [113]:
num_epochs = 30

# Train the model, evaluating on the held-out split after every epoch.
# The returned History object is kept in `history`.
history = model.fit(
    x=training_padded,
    y=training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=2,
)

Epoch 1/30
625/625 - 2s - loss: 0.6692 - accuracy: 0.5856 - val_loss: 0.5836 - val_accuracy: 0.7751 - 2s/epoch - 3ms/step
Epoch 2/30
625/625 - 1s - loss: 0.4240 - accuracy: 0.8316 - val_loss: 0.3763 - val_accuracy: 0.8430 - 1s/epoch - 2ms/step
Epoch 3/30
625/625 - 1s - loss: 0.3057 - accuracy: 0.8770 - val_loss: 0.3472 - val_accuracy: 0.8560 - 1s/epoch - 2ms/step
Epoch 4/30
625/625 - 1s - loss: 0.2558 - accuracy: 0.8991 - val_loss: 0.3683 - val_accuracy: 0.8366 - 1s/epoch - 2ms/step
Epoch 5/30
625/625 - 1s - loss: 0.2190 - accuracy: 0.9136 - val_loss: 0.3434 - val_accuracy: 0.8560 - 1s/epoch - 2ms/step
Epoch 6/30
625/625 - 1s - loss: 0.1927 - accuracy: 0.9263 - val_loss: 0.3619 - val_accuracy: 0.8493 - 1s/epoch - 2ms/step
Epoch 7/30
625/625 - 1s - loss: 0.1718 - accuracy: 0.9349 - val_loss: 0.3638 - val_accuracy: 0.8569 - 1s/epoch - 2ms/step
Epoch 8/30
625/625 - 1s - loss: 0.1515 - accuracy: 0.9445 - val_loss: 0.3985 - val_accuracy: 0.8463 - 1s/epoch - 2ms/step
Epoch 9/30
625/625 - 1s 

In [None]:
# Two hand-written headlines to try the trained model on.
sentence = [
    "granny starting to fear spiders in the garden might be real",
    "the weather today is bright and sunny",
]

# Encode and pad them the same way as the training data.
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Predict the sarcasm score of each sentence (one sigmoid output per row).
print(model.predict(padded))

[[8.9687830e-01]
 [7.7226263e-04]]
