# Set up environment

## Install lime

We use the lime library for explaining the model.

In [71]:
!pip install lime

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Download NLTK dictionaries

In [72]:
# NLTK resources are fetched at runtime into the local nltk_data directory.
import nltk
nltk.download('stopwords')  # stop-word lists
nltk.download('punkt')  # Punkt tokenizer data
nltk.download('wordnet')  # WordNet lexical database

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Connect to Google Drive

In [73]:
# Mount the user's Google Drive under /content/drive (Colab-only API);
# the dataset archive is read from there in the next step.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Unzip the data

In [74]:
! unzip drive/MyDrive/Sarcasm_Headlines_Dataset_v2.zip -d ./

Archive:  drive/MyDrive/Sarcasm_Headlines_Dataset_v2.zip
replace ./Sarcasm_Headlines_Dataset_v2.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

## Imports

In [None]:
import pandas as pd
import json
import re
import string
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from lime.lime_text import LimeTextExplainer

# Data preparation

## Load

In [76]:
# Load the dataset. The file holds one JSON object per line, so it is read
# line by line instead of with a single json.load().
# The encoding is given explicitly so the result does not depend on the
# platform's default locale.
with open('./Sarcasm_Headlines_Dataset_v2.json', encoding='utf-8') as f:
    # Skip blank lines (e.g. a trailing empty line), which json.loads rejects
    data = [line for line in f if line.strip()]

# Parse each JSON string in the list
parsed_data = [json.loads(d) for d in data]

In [77]:
# Build the working frame from the parsed records, then discard the article
# URL so that only the label and the headline text remain.
df = pd.DataFrame(
    parsed_data,
    columns=['is_sarcastic', 'headline', 'article_link'],
).drop(columns='article_link')

## Validate the data

In [78]:
# Number of missing values in each column
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Number of rows that exactly repeat an earlier row
duplicate_rows = df.duplicated().sum()
print(duplicate_rows)

is_sarcastic    0
headline        0
dtype: int64
116


## Preprocess the data

This part is commented out because the models give better results without the preprocessing.

In [79]:
# NOTE: this manual preprocessing is intentionally disabled - the models
# scored worse with it enabled.
# Lowercasing and stop-word removal are redundant here because the TF-IDF
# vectorizer already takes care of them.
# df['headline'] = df['headline'].str.lower()
# df['headline'] = df['headline'].apply(lambda x: x.split()) # use nltk tokenizer instead
# stop_words = set(stopwords.words('english'))
# df['headline'] = df['headline'].apply(lambda x: [word for word in x if word not in stop_words])

# # Stem the headlines (Porter stemmer), then join the tokens back into one string
# stemmer = PorterStemmer()
# df['headline'] = df['headline'].apply(lambda x: [stemmer.stem(word) for word in x])
# df['headline'] = df['headline'].apply(lambda x: ' '.join(x))
# print(df['headline'][0])

## Train test split

In [80]:
# Pull the headline strings and their labels out of the frame as arrays
X, y = df['headline'].values, df['is_sarcastic'].values
print(X[0], y[0])

# Hold out 20% of the headlines for testing; the fixed seed keeps the split repeatable
train_texts, test_texts, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Turn the headlines into TF-IDF features with English stop words removed.
# (A (1, 2) n-gram range was also tried as an alternative configuration.)
# The vocabulary is learned from the training headlines only and then
# re-used to encode the test headlines.
tfidf = TfidfVectorizer(stop_words='english')
X_train = tfidf.fit_transform(train_texts)
X_test = tfidf.transform(test_texts)
print(X_train[0], X_test[0])

thirtysomething scientists unveil doomsday clock of hair loss 1
  (0, 18734)	0.449539392876801
  (0, 7568)	0.5049956071401446
  (0, 15425)	0.28933989401228427
  (0, 16750)	0.400432629778379
  (0, 7880)	0.3925656546967618
  (0, 17027)	0.31798300078904773
  (0, 21773)	0.20882749752975768   (0, 22967)	0.40267627245985715
  (0, 20974)	0.3308502517601071
  (0, 20067)	0.3369928164965712
  (0, 19418)	0.4150957696653228
  (0, 19098)	0.37424283376461503
  (0, 16425)	0.4270135553300358
  (0, 6877)	0.34658764993000457


In [81]:
# Persist the fitted TF-IDF vectorizer to disk
tfidf_name = 'tfidf.sav'
# The context manager guarantees the file is flushed and closed after the
# dump (a bare open() handle was previously left unclosed).
with open(tfidf_name, 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)

# Models

We decided to start with some basic algorithms and then decide whether it is worth trying more complicated ones. We tested and compared three different models: Naive Bayes, Logistic Regression, and Random Forest. The evaluation metrics are accuracy, precision, recall, and F1.

## Naive Bayes algorithm

In [82]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training features
naive_bayes = MultinomialNB()
naive_bayes.fit(X=X_train, y=y_train)

In [83]:
# Score the Naive Bayes model on the held-out test set
y_pred = naive_bayes.predict(X_test)

# Compute the four metrics in one pass, keeping each under its own name
accuracy, precision, recall, f1 = (
    score_fn(y_test, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report every metric as a percentage with two decimals
for template, score in (
    ("Accuracy: {:.2f}%", accuracy),
    ("Precision: {:.2f}%", precision),
    ("Recall: {:.2f}%", recall),
    ("F1-score: {:.2f}%", f1),
):
    print(template.format(score * 100))

Accuracy: 80.49%
Precision: 82.14%
Recall: 75.49%
F1-score: 78.67%


## Logistic regression

In [84]:
# Fit a logistic regression classifier (seeded, with a raised iteration cap)
# on the TF-IDF training features
clf = LogisticRegression(max_iter=1000, random_state=42)
clf.fit(X=X_train, y=y_train)

In [85]:
# Score the logistic regression model on the held-out test set
y_pred = clf.predict(X_test)

# Compute the four metrics in one pass, keeping each under its own name
accuracy, precision, recall, f1 = (
    score_fn(y_test, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report every metric as a percentage with two decimals
for template, score in (
    ("Accuracy: {:.2f}%", accuracy),
    ("Precision: {:.2f}%", precision),
    ("Recall: {:.2f}%", recall),
    ("F1-score: {:.2f}%", f1),
):
    print(template.format(score * 100))

Accuracy: 79.87%
Precision: 80.65%
Recall: 76.04%
F1-score: 78.27%


In [86]:
# Persist the trained logistic regression model to disk
clf_name = 'clf.sav'
# The context manager guarantees the file is flushed and closed after the
# dump (a bare open() handle was previously left unclosed).
with open(clf_name, 'wb') as clf_file:
    pickle.dump(clf, clf_file)

## Random Forest

In [None]:
# Train a random forest model with 500 trees.
# random_state pins the bootstrap/feature sampling so the reported metrics are
# reproducible (matching the seeded split and logistic regression);
# n_jobs=-1 builds the trees on all available cores without changing the result.
rf = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

In [None]:
# Score the random forest model on the held-out test set
y_pred = rf.predict(X_test)

# Compute the four metrics in one pass, keeping each under its own name
accuracy, precision, recall, f1 = (
    score_fn(y_test, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report every metric as a percentage with two decimals
for template, score in (
    ("Accuracy: {:.2f}%", accuracy),
    ("Precision: {:.2f}%", precision),
    ("Recall: {:.2f}%", recall),
    ("F1-score: {:.2f}%", f1),
):
    print(template.format(score * 100))

## Conclusions

Random forest is the slowest model, and there is no significant difference in accuracy compared to the other two. Logistic Regression is the best overall choice, because it is fast and gives good results on all the metrics. Naive Bayes is better than Logistic Regression only in precision and worse in the three other metrics, so we decided to use Logistic Regression.

Since this model is fast with an accuracy of around 84%, we decided to stick with it, because more complicated models would take much longer to train.

**Note:** the figures quoted in this section do not match the outputs saved above, where Logistic Regression reaches 79.87% accuracy and Naive Bayes is slightly ahead on accuracy, precision, and F1. Re-run the notebook and update this comparison before relying on it.

# Explain the model with Lime

## Set up a Lime model

In [None]:
# Chain the already-fitted TF-IDF vectorizer and logistic regression model so
# that raw headline strings can be passed straight to predict_proba, which is
# the interface LIME calls on its perturbed texts.
c = make_pipeline(tfidf, clf)

# Explain the model's predictions using LIME.
# LIME samples random perturbations of the text, so the explainer is seeded to
# make the explanations (and the words discussed below) reproducible.
explainer = LimeTextExplainer(
    class_names=["Not Sarcastic", "Sarcastic"],
    random_state=42,
)

## Example with a not-sarcastic headline

In [None]:
# Pick the headline to explain, together with its true label
headline_idx = 10
headline = df.loc[headline_idx, 'headline']
label = df.loc[headline_idx, 'is_sarcastic']

# Class probabilities predicted by the pipeline for this headline
probabilities = c.predict_proba([headline])
print(headline)
print(probabilities)

# Ask LIME for one weight per whitespace-separated token of the headline
n_tokens = len(headline.split())
exp = explainer.explain_instance(headline, c.predict_proba, num_features=n_tokens)
print('Document id: %d' % headline_idx)
print('Probability(sarcastic) =', probabilities[0, 1])
print('True class: %s' % label)

In [None]:
# Render the LIME explanation inline in the notebook, including the headline text
exp.show_in_notebook(text=True)

In [None]:
# Display the explanation as a list of (word, weight) pairs
exp.as_list()

## Example with a sarcastic headline

In [None]:
# Pick the headline to explain, together with its true label
headline_idx = 4
headline = df.loc[headline_idx, 'headline']
label = df.loc[headline_idx, 'is_sarcastic']

# Ask LIME for one weight per whitespace-separated token of the headline
n_tokens = len(headline.split())
exp = explainer.explain_instance(headline, c.predict_proba, num_features=n_tokens)
sarcastic_probability = c.predict_proba([headline])[0, 1]
print('Document id: %d' % headline_idx)
print('Probability(sarcastic) =', sarcastic_probability)
print('True class: %s' % label)

In [None]:
# Render the LIME explanation inline in the notebook, including the headline text
exp.show_in_notebook(text=True)

In [None]:
# Display the explanation as a list of (word, weight) pairs
exp.as_list()

## Validate

We will try to remove some of the words in the headline that are marked sarcastic and non-sarcastic, and we expect to see that the prediction probabilities decrease and increase accordingly.

### Remove the most sarcastic word "pretty"

In [None]:
# Drop every occurrence of "pretty" from the selected headline
headline_new = headline.replace("pretty", "")

# Re-explain the shortened headline with one weight per remaining token
n_tokens = len(headline_new.split())
exp = explainer.explain_instance(headline_new, c.predict_proba, num_features=n_tokens)
sarcastic_probability = c.predict_proba([headline_new])[0, 1]
print('Document id: %d' % headline_idx)
print('Probability(sarcastic) =', sarcastic_probability)
print('True class: %s' % label)

In [None]:
# Render the LIME explanation of the modified headline inline in the notebook
exp.show_in_notebook(text=True)

### Remove the least sarcastic word "mother"

In [None]:
# Drop every occurrence of "mother" from the selected headline
headline_new = headline.replace("mother", "")

# Re-explain the shortened headline with one weight per remaining token
n_tokens = len(headline_new.split())
exp = explainer.explain_instance(headline_new, c.predict_proba, num_features=n_tokens)
sarcastic_probability = c.predict_proba([headline_new])[0, 1]
print('Document id: %d' % headline_idx)
print('Probability(sarcastic) =', sarcastic_probability)
print('True class: %s' % label)

In [None]:
# Render the LIME explanation of the modified headline inline in the notebook
exp.show_in_notebook(text=True)

### Remove the most non-sarcastic word "streaming"

In [None]:
# Drop every occurrence of "streaming" from the selected headline
headline_new = headline.replace("streaming", "")

# Re-explain the shortened headline with one weight per remaining token
n_tokens = len(headline_new.split())
exp = explainer.explain_instance(headline_new, c.predict_proba, num_features=n_tokens)
sarcastic_probability = c.predict_proba([headline_new])[0, 1]
print('Document id: %d' % headline_idx)
print('Probability(sarcastic) =', sarcastic_probability)
print('True class: %s' % label)

In [None]:
# Render the LIME explanation of the modified headline inline in the notebook
exp.show_in_notebook(text=True)

### Conclusions

As expected, the prediction probabilities changed. When we removed the most sarcastic word, "pretty", the sarcastic probability dropped from 88% to 78%. When we removed one of the least sarcastic words, "mother", the sarcastic probability dropped only to 85%. Finally, when we removed the most non-sarcastic word, "streaming", the sarcastic probability increased to 91%.