In [None]:
import pandas as pd

df = pd.read_json('/content/drive/MyDrive/Sarcasm_Headlines_Dataset.json', lines=True)

df.head()


Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [None]:
df["headline"][2]

"mom starting to fear son's web series closest thing she will have to grandchild"

In [None]:
def get_optimal_values(headlines):
    # Calculate the average and maximum headline lengths
    avg_length = sum(len(headline.split()) for headline in headlines) / len(headlines)
    max_length = max(len(headline.split()) for headline in headlines)

    # Calculate the number of unique words
    all_words = set()
    for headline in headlines:
        words = headline.split()
        all_words.update(words)
    vocab_size = len(all_words)

    return int(avg_length), max_length, vocab_size


In [None]:
import pandas as pd

# Load JSON data into a Pandas DataFrame
data = pd.read_json('/content/drive/MyDrive/Sarcasm_Headlines_Dataset.json', lines=True)
headlines = data['headline'].values

# Get the optimal max_length and vocab_size values
avg_length, max_length, vocab_size = get_optimal_values(headlines)

print("Average length:", avg_length)
print("Max length:", max_length)
print("Vocab size:", vocab_size)


Average length: 9
Max length: 39
Vocab size: 36599


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26709 entries, 0 to 26708
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   article_link  26709 non-null  object
 1   headline      26709 non-null  object
 2   is_sarcastic  26709 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 626.1+ KB


In [None]:
df.shape

(26709, 3)

In [None]:
df.columns

Index(['article_link', 'headline', 'is_sarcastic'], dtype='object')

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# 1. Load JSON data into a Pandas DataFrame
data = pd.read_json('/content/drive/MyDrive/Sarcasm_Headlines_Dataset.json', lines=True)

# 2. Preprocess the data
max_length = 39
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
vocab_size = 36599

headlines = data['headline'].values
labels = data['is_sarcastic'].values

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(headlines)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(headlines)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)


In [None]:
# 3. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)


In [None]:
# 4. Build and train a TensorFlow model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 32, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

num_epochs = 10
history = model.fit(X_train, y_train, epochs=num_epochs, validation_data=(X_test, y_test))


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 39, 32)            1171168   
                                                                 
 bidirectional (Bidirectiona  (None, 128)              49664     
 l)                                                              
                                                                 
 dense (Dense)               (None, 24)                3096      
                                                                 
 dense_1 (Dense)             (None, 1)                 25        
                                                                 
Total params: 1,223,953
Trainable params: 1,223,953
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# 5. Make a prediction on a dummy headline
dummy_headline = "dad starting to fear son's web series closest thing she will have to grandchild"
dummy_sequence = tokenizer.texts_to_sequences([dummy_headline])
dummy_padded_sequence = pad_sequences(dummy_sequence, maxlen=max_length, padding=padding_type, truncating=trunc_type)

prediction = model.predict(dummy_padded_sequence)
predicted_label = int(np.round(prediction[0][0]))

if predicted_label == 1:
    print("The dummy headline is sarcastic.")
else:
    print("The dummy headline is not sarcastic.")


The dummy headline is sarcastic.


In [1]:
from google.colab import files
files.download('example.csv')


FileNotFoundError: ignored