In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models
import numpy as np
import json
import re
from sklearn.model_selection import train_test_split

In [None]:
# STEP 1: Load the dataset

# Dataset: Sarcasm Headlines Dataset (downloadable here: https://www.kaggle.com/datasets/rmisra/news-headlines-dataset-for-sarcasm-detection?resource=download)
# Each line is a JSON object: {"headline": "text...", "is_sarcastic": 0 or 1}

with open("Sarcasm_Headlines_Dataset.json", 'r') as f:
    data = [json.loads(line) for line in f]

texts = [item['headline'] for item in data]
labels = np.array([item['is_sarcastic'] for item in data])

In [None]:
# STEP 2: Preprocess text

def clean_text(text):
    text = text.lower() # lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # remove punctuation
    return text

texts = [clean_text(t) for t in texts] # iterates through all texts, cleaning each one

In [None]:
# STEP 3: Tokenize and pad

vocab_size = 10000  # limit vocabulary to prevent overfitting
max_length = 20     # reasonable for short headlines
oov_token = "<OOV>" # token for out-of-vocabulary words

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token) # initialize tokenizer
tokenizer.fit_on_texts(texts) # fit tokenizer on texts

sequences = tokenizer.texts_to_sequences(texts) # convert texts to sequences of integers
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post') # pad sequences

In [6]:
# STEP 4: Split train/test

X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, labels, test_size=0.2, random_state=42
)

In [7]:
# STEP 5: Build the model

model = models.Sequential([
    layers.Embedding(vocab_size, 64, input_length=max_length),
    layers.Bidirectional(layers.LSTM(64, dropout=0.2, recurrent_dropout=0.2)),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [8]:
# STEP 6: Train the model

history = model.fit(
    X_train, y_train,
    epochs=5,
    validation_data=(X_test, y_test),
    batch_size=64,
    verbose=1
)

Epoch 1/5
[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 26ms/step - accuracy: 0.7916 - loss: 0.4274 - val_accuracy: 0.8514 - val_loss: 0.3322
Epoch 2/5
[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 24ms/step - accuracy: 0.9004 - loss: 0.2513 - val_accuracy: 0.8624 - val_loss: 0.3249
Epoch 3/5
[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 24ms/step - accuracy: 0.9342 - loss: 0.1761 - val_accuracy: 0.8594 - val_loss: 0.3636
Epoch 4/5
[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 24ms/step - accuracy: 0.9533 - loss: 0.1279 - val_accuracy: 0.8564 - val_loss: 0.4226
Epoch 5/5
[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 24ms/step - accuracy: 0.9667 - loss: 0.0944 - val_accuracy: 0.8540 - val_loss: 0.4589


In [9]:
# STEP 7: Evaluate

loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.3f}")

[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8540 - loss: 0.4589
Test Accuracy: 0.854


In [12]:
# STEP 8: Try a few samples

def predict_sarcasm(sentence):
    sentence = clean_text(sentence)
    seq = tokenizer.texts_to_sequences([sentence])
    pad = pad_sequences(seq, maxlen=max_length, padding='post')
    pred = model.predict(pad)[0][0]
    print(f"'{sentence}' → {'Sarcastic' if pred > 0.5 else 'Not Sarcastic'} ({pred:.2f})")

predict_sarcasm("Politician promises to fix everything, world breathes a sigh of relief.")
predict_sarcasm("Local business promises to stop scamming customers if they pay more.")
predict_sarcasm("AI Model fails to predict sarcasm, world is shocked.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
'politician promises to fix everything world breathes a sigh of relief' → Sarcastic (0.99)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
'local business promises to stop scamming customers if they pay more' → Sarcastic (1.00)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
'ai model fails to predict sarcasm world is shocked' → Not Sarcastic (0.01)
