In [1]:
import json

In [2]:
#Abrindo o arquivo com as sentenças.
#Extraindo os dados, sentenças, colunas e urls.
with open("/tmp/sarcasm.json", 'r') as f:
    datastore = json.load(f)

sentences = []
labels = []
urls = []

for item in datastore:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])
    urls.append(item['article_link'])

In [3]:
"""
Importando as bibliotecas.
Padronizando o Out of vocabulary
Tokenizando as sentenças.
Recebendo qual palavra tem qual indice.
Adicionando o padding para todas terem o mesmo formato.
"""

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer(oov_token = '<OOV>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')
print(padded[0])
print(padded.shape)

2025-08-14 14:32:35.253767: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-14 14:32:35.331629: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-08-14 14:32:35.331664: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-08-14 14:32:35.350452: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-08-14 14:32:35.384536: I tensorflow/core/platform/cpu_feature_guar

[  308 15115   679  3337  2298    48   382  2576 15116     6  2577  8434
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
(26709, 40)


In [11]:
#Separando dados de treino e dados de teste
training_size = 20000
training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

In [13]:
"""
Tokenizando porém com limite de 10000 palavras
E separando melhor os dados de treino e os dados de teste
"""

tokenizer = Tokenizer(num_words = 10000, oov_token = '<OOV>')
tokenizer.fit_on_texts(training_sentences)

word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen = 100,
                                padding = 'post', truncating = 'post')

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen = 100,
                               padding = 'post', truncating = 'post')
                      

In [20]:
#Criação da rede neural, com uma camada de embedding, um pooling, uma camada densa com 24 neuronios e a camada de saída com um neuronio e a função de ativação sigmoid
import tensorflow as tf
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(10000, 16, input_length = 100),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

2025-08-14 14:52:39.776141: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-08-14 14:52:39.980822: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-08-14 14:52:39.984927: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [22]:
#Precisa dessa célular para formatar a entrada dos dados na rede neural
import numpy as np
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

In [23]:
#Treinando o modelo
num_epochs = 30
history = model.fit(training_padded, training_labels, epochs=num_epochs, 
                    validation_data = (testing_padded, testing_labels), verbose=2)

Epoch 1/30


2025-08-14 14:56:43.849063: I external/local_xla/xla/service/service.cc:168] XLA service 0x7a1bfc670b20 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-08-14 14:56:43.849087: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3050 6GB Laptop GPU, Compute Capability 8.6
2025-08-14 14:56:43.860698: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-08-14 14:56:43.888309: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8907
I0000 00:00:1755194203.945262    9903 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


625/625 - 9s - loss: 0.6665 - accuracy: 0.5818 - val_loss: 0.5931 - val_accuracy: 0.6606 - 9s/epoch - 14ms/step
Epoch 2/30
625/625 - 2s - loss: 0.4440 - accuracy: 0.8260 - val_loss: 0.3911 - val_accuracy: 0.8393 - 2s/epoch - 2ms/step
Epoch 3/30
625/625 - 2s - loss: 0.3182 - accuracy: 0.8744 - val_loss: 0.3554 - val_accuracy: 0.8542 - 2s/epoch - 3ms/step
Epoch 4/30
625/625 - 2s - loss: 0.2664 - accuracy: 0.8975 - val_loss: 0.3536 - val_accuracy: 0.8480 - 2s/epoch - 3ms/step
Epoch 5/30
625/625 - 1s - loss: 0.2298 - accuracy: 0.9119 - val_loss: 0.3469 - val_accuracy: 0.8517 - 1s/epoch - 2ms/step
Epoch 6/30
625/625 - 1s - loss: 0.2026 - accuracy: 0.9231 - val_loss: 0.3566 - val_accuracy: 0.8533 - 1s/epoch - 2ms/step
Epoch 7/30
625/625 - 2s - loss: 0.1804 - accuracy: 0.9317 - val_loss: 0.3654 - val_accuracy: 0.8499 - 2s/epoch - 3ms/step
Epoch 8/30
625/625 - 1s - loss: 0.1631 - accuracy: 0.9389 - val_loss: 0.3688 - val_accuracy: 0.8559 - 1s/epoch - 2ms/step
Epoch 9/30
625/625 - 2s - loss: 0.

In [26]:
#Testando a predição do modelo em frases criadas
sentence = [
    'granny starting to fear spiders in the garden might be real'
    'the weather today is bright and sunny'
]
sequences = tokenizer.texts_to_sequences(sentence)

padded = pad_sequences(sequences, maxlen=100,
                       padding = 'post' , truncating = 'post')

print(model.predict(padded))

[[0.00351257]]
