# Sarcasm Detector

## Get and Load Data

In [2]:
!pip install --upgrade --no-cache-dir gdown



In [3]:
!gdown --id 1ytPDo88FEC2ArOjdqErAiarAZBNJzEJz

Downloading...
From: https://drive.google.com/uc?id=1ytPDo88FEC2ArOjdqErAiarAZBNJzEJz
To: /home/astrapi69/fake-news-detector/notebook/SarcasmDetect.json
100%|██████████████████████████████████████| 6.06M/6.06M [00:00<00:00, 6.57MB/s]


In [4]:
import pandas as pd

df = pd.read_json('./SarcasmDetect.json', lines=True)
df.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


## Remove all records with no headline text

In [6]:
df = df[df['headline'] != '']
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28619 entries, 0 to 28618
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   is_sarcastic  28619 non-null  int64 
 1   headline      28619 non-null  object
 2   article_link  28619 non-null  object
dtypes: int64(1), object(2)
memory usage: 670.9+ KB


Lets start with the key objectives

# Key Objectives

Train-Test Split should be 70:30, use random_state=42 for same data split throughout. Drop the article_link column as it can be used as a leakage to identify fake news directly (theonion makes only fake news)

In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Split the dataset
X = df['headline']
y = df['is_sarcastic']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [30]:
!pip install tensorflow



In [31]:
import tensorflow as tf
max_features = 10000  # Maximum vocab size
sequence_length = 250  # Maximum length of each headline

text_vectorization = tf.keras.layers.TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)


text_vectorization.adapt(X_train)

sample_headline = ["This is a sample headline"]
print(text_vectorization(sample_headline))

model = tf.keras.Sequential([
    text_vectorization,
    tf.keras.layers.Embedding(max_features + 1, 128),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])


tf.Tensor(
[[  20   11    7 9790 3001    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0  

In [32]:
from tensorflow.keras.callbacks import EarlyStopping

# Initialize the EarlyStopping callback
early_stopping = EarlyStopping(
    monitor='val_loss',  # Monitor the validation loss
    patience=3,          # Number of epochs with no improvement after which training will be stopped
    verbose=1,           # Log when training is stopped
    restore_best_weights=True  # Restore model weights from the epoch with the best value of the monitored quantity
)

history = model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test),
    callbacks=[early_stopping]  # Add the EarlyStopping callback here
)

Epoch 1/10


[1m627/627[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.5154 - loss: 0.6937 - val_accuracy: 0.5190 - val_loss: 0.6840
Epoch 2/10
[1m627/627[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.5565 - loss: 0.6830 - val_accuracy: 0.5945 - val_loss: 0.6533
Epoch 3/10
[1m627/627[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.6417 - loss: 0.6398 - val_accuracy: 0.6587 - val_loss: 0.5779
Epoch 4/10
[1m627/627[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.7438 - loss: 0.5376 - val_accuracy: 0.5262 - val_loss: 1.0149
Epoch 5/10
[1m627/627[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.7491 - loss: 0.5233 - val_accuracy: 0.8069 - val_loss: 0.4448
Epoch 6/10
[1m627/627[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.8112 - loss: 0.4268 - val_accuracy: 0.7982 - val_loss: 0.4324
Epoch 7/10
[1m627/627[0m [32m━━━━━━━

In [33]:
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2)
print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_acc}')

269/269 - 0s - 1ms/step - accuracy: 0.8379 - loss: 0.3781
Test Loss: 0.37808957695961
Test Accuracy: 0.8378756046295166


In [34]:
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

# Predict classes with the model
predictions = model.predict(X_test)
predicted_classes = np.round(predictions).astype(int).flatten()  # Round predictions to 0 or 1 and flatten the array

# Actual classes
actual_classes = y_test.values

# Confusion Matrix
conf_matrix = confusion_matrix(actual_classes, predicted_classes)
print('Confusion Matrix:')
print(conf_matrix)

# Classification Report
class_report = classification_report(actual_classes, predicted_classes)
print('Classification Report:')
print(class_report)


[1m269/269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Confusion Matrix:
[[3855  600]
 [ 792 3339]]
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.87      0.85      4455
           1       0.85      0.81      0.83      4131

    accuracy                           0.84      8586
   macro avg       0.84      0.84      0.84      8586
weighted avg       0.84      0.84      0.84      8586



NameError: name 'word_index' is not defined