In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

In [54]:
import pandas as pd
import json
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression

In [65]:
# Load the JSON file into a list
with open('/Users/omarxbadawi/Downloads/datasets/archive/Sarcasm_Headlines_Dataset_v2.json') as f:
    data = f.readlines()

# Parse each JSON string in the list
parsed_data = [json.loads(d) for d in data]

In [66]:
# Create a DataFrame from the parsed data with the 'headline' column included
df = pd.DataFrame(parsed_data, columns=['is_sarcastic', 'headline', 'article_link'])

# Drop the 'article_link' column
df = df.drop('article_link', axis=1)

In [67]:
# Check for missing values
# print(df.isnull().sum())

# Check for duplicates
# print(df.duplicated().sum())

In [70]:
# Convert the 'headline' column to lowercase
df['headline'] = df['headline'].str.lower()

# Tokenize the 'headline' column
df['headline'] = df['headline'].apply(lambda x: x.split())

# Remove stop words from the 'headline' column
stop_words = set(stopwords.words('english'))
df['headline'] = df['headline'].apply(lambda x: [word for word in x if word not in stop_words])

# Perform stemming on the 'headline' column
stemmer = PorterStemmer()
df['headline'] = df['headline'].apply(lambda x: [stemmer.stem(word) for word in x])

In [71]:
# extract the labels as a separate array
y = df['is_sarcastic'].values

# Convert the 'headline' column to a numerical representation using TF-IDF
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['headline'].apply(lambda x: ' '.join(x)))

In [72]:
print (X[0], y[0])

  (0, 11054)	0.3513365891167093
  (0, 8298)	0.3316683359373775
  (0, 3782)	0.3815303693418395
  (0, 5641)	0.4421994333729299
  (0, 19627)	0.29966856917940493
  (0, 16289)	0.30349811302033614
  (0, 18619)	0.49349832676309063 1


In [73]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [74]:
# Train the model using Naive Bayes algorithm
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, y_train)

In [75]:
# Predict on the testing set and evaluate the model's performance
y_pred = naive_bayes.predict(X_test)

In [76]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy: {:.2f}%".format(accuracy*100))
print("Precision: {:.2f}%".format(precision*100))
print("Recall: {:.2f}%".format(recall*100))
print("F1-score: {:.2f}%".format(f1*100))

Accuracy: 79.66%
Precision: 80.11%
Recall: 76.29%
F1-score: 78.15%


In [77]:
# Train a logistic regression model
clf = LogisticRegression(random_state=42, max_iter=1000)
clf.fit(X_train, y_train)

In [78]:
# Predict on test set and calculate performance metrics
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the performance metrics
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1-score: ", f1)

Accuracy:  0.793151642208246
Precision:  0.8021118498240125
Recall:  0.751557347013558
F1-score:  0.7760121074536512


In [79]:
from lime.lime_text import LimeTextExplainer

# Select a headline to explain
headline_idx = 10
headline = df['headline'][headline_idx]

# Initialize the explainer
explainer = LimeTextExplainer(class_names=['not sarcastic', 'sarcastic'])

# Define a function to predict the label for a given headline
predict_fn = lambda x: clf.predict_proba(tfidf.transform(x))[0]

# Generate an explanation for the model's classification decision
explanation = explainer.explain_instance(headline, predict_fn, num_features=10)
explanation.show_in_notebook()

TypeError: expected string or bytes-like object

In [85]:
print(X_train[0])

  (0, 6566)	0.381607135106793
  (0, 16497)	0.41374303988032085
  (0, 14870)	0.41167413927344954
  (0, 13799)	0.4106672369393236
  (0, 15095)	0.3274803106740721
  (0, 6824)	0.33954219154350135
  (0, 19103)	0.21360037496695014
  (0, 13133)	0.2773303824698808


In [93]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from lime.lime_text import LimeTextExplainer

# Extract the headlines
headlines = df['headline'].apply(str).values

# Convert the headlines to TF-IDF representation
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(headlines)

# Get the labels
y = df['is_sarcastic'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a logistic regression model
model = make_pipeline(LogisticRegression())

# Train the model on the training set
model.fit(X_train, y_train)

# Test the model on the testing set
y_pred = model.predict(X_test)

# Evaluate the model's performance
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

# Explain the model's predictions using LIME
explainer = LimeTextExplainer(class_names=["Not Sarcastic", "Sarcastic"])

# Select a headline to explain
headline = headlines[0]

# Get the model's prediction for the selected headline
pred = model.predict(vectorizer.transform([headline]))[0]

# Explain the model's prediction for the selected headline
exp = explainer.explain_instance(headline, model.predict_proba, num_features=10)

# Print the explanation
exp.show_in_notebook(text=True)

Accuracy: 0.7928022361984626
Precision: 0.8017207665232694
Recall: 0.7511909124221327
F1 Score: 0.7756337495270526


ValueError: Expected 2D array, got 1D array instead:
array=["['thirtysometh', 'scientist', 'unveil', 'doomsday', 'clock', 'hair', 'loss']"
 "['thirtysometh', 'scientist', '', 'doomsday', '', 'hair', 'loss']"
 "['', '', '', '', '', '', 'loss']" ...
 "['thirtysometh', 'scientist', 'unveil', 'doomsday', '', '', 'loss']"
 "['', 'scientist', '', 'doomsday', 'clock', 'hair', 'loss']"
 "['', '', '', '', '', '', '']"].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.