<a href="https://colab.research.google.com/github/mchanwa/COS424/blob/main/COS424FinalProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Processing

In [None]:
import json
import pandas as pd

def parse_data(file):
    """Yield one parsed JSON object per line of a JSON Lines file.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the generator is exhausted (or garbage-collected), instead of being left
    open for the lifetime of the kernel. Blank lines (e.g. a trailing newline
    at the end of the file) are skipped rather than raising JSONDecodeError.

    Args:
        file: path to a text file containing one JSON document per line.

    Yields:
        The Python object decoded from each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(file, 'r', encoding='utf-8') as handle:
        for line in handle:
            if line.strip():
                yield json.loads(line)

# Read every JSON line of the dataset into memory, then build the working DataFrame.
# NOTE(review): the path lives under /content/drive, so Google Drive must already be
# mounted when this cell runs; the drive.mount(...) cell currently sits below this one.
data = list(parse_data('/content/drive/MyDrive/Sarcasm_Headlines_Dataset_v2.json'))
data_df = pd.DataFrame(data)

In [None]:
# Mount Google Drive at /content/drive (Colab only).
# NOTE(review): the dataset-loading cell reads from /content/drive and appears
# earlier in the notebook; on a fresh runtime this cell has to be executed first.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
data_df

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...
...,...,...,...
28614,1,jews to celebrate rosh hashasha or something,https://www.theonion.com/jews-to-celebrate-ros...
28615,1,internal affairs investigator disappointed con...,https://local.theonion.com/internal-affairs-in...
28616,0,the most beautiful acceptance speech this week...,https://www.huffingtonpost.com/entry/andrew-ah...
28617,1,mars probe destroyed by orbiting spielberg-gat...,https://www.theonion.com/mars-probe-destroyed-...


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import regex as re
import string

# Fetch the NLTK resources this notebook relies on: the stopword list, the
# punkt tokenizer models, WordNet, and the perceptron POS tagger.
for resource in ('stopwords', 'punkt', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# English stopwords minus 'all', so that 'all' survives stopword removal
# in preprocess_text.
stop_words = set(stopwords.words('english'))
stop_words.discard('all')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
#   https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
def get_wordnet_pos(word):
    """Return the WordNet POS constant for a single word.

    The word is tagged on its own with nltk.pos_tag and the first letter of
    the resulting tag selects the WordNet category. Anything other than
    J / N / V / R falls back to wordnet.NOUN.
    """
    _, tag = nltk.pos_tag([word])[0]
    initial = tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag both map to NOUN.
    return wordnet.NOUN

#   https://www.pluralsight.com/guides/building-a-twitter-sentiment-analysis-in-python
def preprocess_text(text):
    """Normalise one headline for the downstream models.

    Steps, in order: lowercase, delete punctuation, tokenize, drop tokens
    found in stop_words, lemmatize each remaining token using the POS
    returned by get_wordnet_pos, and join the lemmas with single spaces.

    Args:
        text: the raw headline string.

    Returns:
        The cleaned headline as a single space-separated string.
    """
    lowered = text.lower()
    # \p{P} is a Unicode-property class; it works because `re` here is the
    # third-party `regex` module, not the standard-library `re`.
    unpunctuated = re.sub(r'\p{P}+', '', lowered)

    wnl = WordNetLemmatizer()
    lemmas = []
    for token in word_tokenize(unpunctuated):
        if token in stop_words:
            continue
        lemmas.append(wnl.lemmatize(token, get_wordnet_pos(token)))
    return " ".join(lemmas)


# preprocess the whole dataframe
def preprocess_df(df):
  """Run preprocess_text over every entry of df['headline'], in place.

  Each value is passed through str() first, as before. The column is
  rewritten with a single map instead of a per-row df[col][i] / df.loc[i]
  loop, so the function no longer assumes the index is exactly
  0..len(df)-1 and avoids one label lookup plus one scalar write per row.

  Args:
    df: DataFrame with a 'headline' column. It is modified in place.

  Returns:
    None.
  """
  headline = 'headline'
  df[headline] = df[headline].map(lambda value: preprocess_text(str(value)))


preprocess_df(data_df)

# Analysis

In [None]:
import time

from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_validate

In [None]:
import statistics

def cross_validation(classifier, numfolds, X=None, y=None):
  """Print mean k-fold accuracy, precision, recall and F1 for a classifier.

  All four metrics are computed in a single cross_validate call, so the
  classifier is fitted once per fold instead of once per fold per metric.

  Args:
    classifier: an unfitted scikit-learn estimator.
    numfolds: value forwarded to cross_validate as ``cv``.
    X: feature matrix. Defaults to the notebook global ``X_train_df`` so
      existing two-argument calls keep working.
    y: target vector. Defaults to the notebook global ``y_train``.

  Returns:
    None. The four mean test scores are printed with four decimals.
  """
  if X is None:
    X = X_train_df
  if y is None:
    y = y_train

  metrics = ('accuracy', 'precision', 'recall', 'f1')
  # With a list of scorer names, the result dict exposes one
  # 'test_<name>' array per metric.
  cv_results = cross_validate(classifier, X, y, cv=numfolds, scoring=list(metrics))

  for metric in metrics:
    print("{} cv: {:.4f}".format(metric, statistics.mean(cv_results['test_' + metric])))

## LSTM

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Vocabulary cap handed to the Keras Tokenizer as num_words.
max_fatures = 2000
# NOTE(review): the tokenizer is fitted on every headline before the
# train/test split below, so test rows contribute to the vocabulary —
# confirm this is acceptable for the reported validation scores.
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data_df['headline'].values)

# Integer-encode each headline, then pad with pad_sequences' defaults.
X = tokenizer.texts_to_sequences(data_df['headline'].values)
X = pad_sequences(X)

# One indicator column per distinct is_sarcastic value; build_LSTM_model
# ends in a 2-unit softmax that is trained against these columns.
Y = pd.get_dummies(data_df['is_sarcastic']).values

# NOTE: X, Y, X_train, X_test, Y_train and Y_test are all rebound with
# different contents in the CNN section further down.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.25, random_state = 42)

In [None]:
%tensorflow_version 2.x
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D

import numpy as np
import matplotlib.pyplot as plt

In [None]:
def build_LSTM_model(vocab_size, input_length=None):
  """Build (without compiling) the Embedding -> SpatialDropout1D -> LSTM -> Dense model.

  Args:
    vocab_size: input_dim of the Embedding layer.
    input_length: length of the padded input sequences. When omitted, the
      function falls back to the notebook global ``X.shape[1]`` as it did
      before. Pass it explicitly (e.g. ``X_train.shape[1]``) to avoid
      depending on ``X``, which the CNN section rebinds to a pandas Series.

  Returns:
    An uncompiled Sequential model whose last layer is a 2-unit softmax.
  """
  embedding_dim = 128
  lstm_out = 196

  if input_length is None:
    # Backward-compatible default: read the width of the padded global X.
    input_length = X.shape[1]

  model = Sequential()
  model.add(Embedding(vocab_size, embedding_dim, input_length=input_length))
  model.add(SpatialDropout1D(0.4))
  # NOTE(review): activation='relu' and recurrent_dropout > 0 are both
  # non-default LSTM settings; confirm they are intentional, since they
  # may rule out the faster fused GPU implementation.
  model.add(LSTM(lstm_out, activation='relu', dropout=0.2, recurrent_dropout=0.2))
  model.add(Dense(2, activation='softmax'))

  return model

# Reuse the tokenizer's vocabulary cap (max_fatures, currently 2000) instead of
# repeating the literal, so the Embedding size cannot drift from the tokenizer.
model_lstm = build_LSTM_model(max_fatures)
# NOTE(review): the model ends in a 2-unit softmax trained on one-hot labels.
# 'Recall' / 'Precision' given by name are not tied to the sarcastic class —
# confirm whether a class-specific metric (class_id=1) was intended.
model_lstm.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy', 'Recall', 'Precision'])



In [None]:
# Train the LSTM for 10 epochs and measure wall-clock time around fit().
# X_test / Y_test are supplied as validation_data, so Keras reports metrics
# on that split after every epoch.
# NOTE(review): the label below says "fit and predict", but the timed region
# contains only the fit() call.
start = time.time()
history = model_lstm.fit(X_train, Y_train, epochs=10, validation_data=(X_test, Y_test))
end = time.time()

print()
print("fit and predict time (LSTM): {:.4f}".format(end-start))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

fit and predict time (LSTM): 1331.3954


## CNN

In [None]:
# Start again from the preprocessed headline text for the CNN.
# NOTE: this rebinds X, Y, Y_train and Y_test. In the LSTM section X was the
# padded integer array and Y the one-hot label matrix; here X is the text
# column and Y the single is_sarcastic column.
X = data_df['headline']
Y = data_df['is_sarcastic']

# Same test_size and random_state as the LSTM split.
X_train_sentences, X_test_sentences, Y_train, Y_test = train_test_split(X,Y, test_size = 0.25, random_state = 42)

In [None]:
# Fit a fresh tokenizer on the training sentences only (this replaces the
# tokenizer object created in the LSTM section).
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train_sentences)

# Encode both splits with the vocabulary learned from the training split.
X_train = tokenizer.texts_to_sequences(X_train_sentences)
X_test = tokenizer.texts_to_sequences(X_test_sentences)

# NOTE(review): vocab_size is taken from the full word_index, not from the
# num_words=5000 cap above — confirm the Embedding is meant to be sized by
# the full vocabulary. The +1 presumably accounts for the padding index 0.
vocab_size = len(tokenizer.word_index) + 1

# Pad/truncate every sequence to 100 tokens, padding at the end.
X_train = pad_sequences(X_train, padding='post', maxlen=100)
X_test = pad_sequences(X_test, padding='post', maxlen=100)

In [None]:
from tensorflow.keras import layers

def build_CNN_model():
  """Build (without compiling) the 1-D convolutional classifier.

  Layer stack: Embedding (sized by the notebook global ``vocab_size``) ->
  Conv1D(128 filters, kernel 5, relu) -> GlobalMaxPooling1D ->
  Dense(10, relu) -> Dense(1, sigmoid).

  Returns:
    An uncompiled Sequential model with a single sigmoid output unit.
  """
  embedding_width = 100

  stack = [
      layers.Embedding(vocab_size, embedding_width),
      layers.Conv1D(128, 5, activation='relu'),
      layers.GlobalMaxPooling1D(),
      layers.Dense(10, activation='relu'),
      layers.Dense(1, activation='sigmoid'),
  ]
  return Sequential(stack)

In [None]:
# Build the CNN; build_CNN_model reads the global vocab_size set by the
# tokenization cell above.
cnn_model = build_CNN_model()

# Single sigmoid output, so binary cross-entropy against the 0/1 labels.
cnn_model.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics = ['accuracy', 'Recall', 'Precision'])


# Train for 10 epochs and time the fit() call. This overwrites `history`,
# `start` and `end` from the LSTM run.
# NOTE(review): the label says "fit and predict", but only fit() is timed.
start = time.time()
history = cnn_model.fit(X_train, Y_train, epochs=10, validation_data=(X_test, Y_test))
end = time.time()

print()
print("fit and predict time (CNN): {:.4f}".format(end-start))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

fit and predict time (CNN): 155.3909


## Logistic Regression with Unigrams



In [None]:
# Split the preprocessed headline text and labels for the scikit-learn models.
# NOTE: this rebinds X_train / X_test from the CNN's padded integer arrays to
# text Series; every cell below relies on this version.
X_train, X_test, Y_train, Y_test = train_test_split(data_df['headline'], data_df['is_sarcastic'], test_size = 0.25, random_state = 42)

In [None]:
# bag of words

from sklearn.feature_extraction.text import CountVectorizer

# Learn the (at most 1000-term) vocabulary from the training split only.
matrix = CountVectorizer(max_features=1000)
X_train_bag = matrix.fit_transform(X_train).toarray()
# Use transform(), not fit_transform(), on the test split. Refitting would
# learn a different vocabulary from the test text, so column j of X_test_bag
# would no longer count the same word as column j of X_train_bag, and any
# classifier trained on X_train_bag would be scored on mismatched features.
X_test_bag = matrix.transform(X_test).toarray()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

# Fit logistic regression on the bag-of-words counts and predict the test
# split; the timer covers both the fit and the predict call.
start = time.time()
clf1 = LogisticRegression(random_state=0).fit(X_train_bag, Y_train)
prediction_clf1 = clf1.predict(X_test_bag)
end = time.time()

# Accuracy on both splits, then precision / F1 / recall on the test split.
print("Training Accuracy Score for Logistic Regresssion with Unigrams: {:.4f}".format(clf1.score(X_train_bag, Y_train)))
print("Validation Accuracy Score for Logistic Regresssion with Unigrams: {:.4f}".format(clf1.score(X_test_bag, Y_test)))
print("Precision Score: {:.4f}".format(precision_score(Y_test, prediction_clf1)))
print("F1 Score: {:.4f}".format(f1_score(Y_test, prediction_clf1)))
print("Recall Score: {:.4f}".format(recall_score(Y_test, prediction_clf1)))

print()
print("fit and predict time (Logistic Regresssion with Unigrams): {:.4f}".format(end-start))

Training Accuracy Score for Logistic Regresssion with Unigrams: 0.7653
Validation Accuracy Score for Logistic Regresssion with Unigrams: 0.5139
Precision Score: 0.4916
F1 Score: 0.4726
Recall Score: 0.4549

fit and predict time (Logistic Regresssion with Unigrams): 2.3109


## Logistic Regression with Sentence Transformers

In [None]:
!pip install sentence-transformers



In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression

In [None]:
model = SentenceTransformer('paraphrase-distilroberta-base-v1')

In [None]:
# Encode the train / test headlines with the SentenceTransformer loaded above.
# NOTE(review): X_train / X_test are the text Series from the latest
# train_test_split on data_df['headline'], i.e. text already rewritten by
# preprocess_df (punctuation and stopwords removed, lemmatized) — confirm
# that feeding this stripped text to the sentence encoder is intended.
X_train_embeddings = model.encode(X_train.to_list())
X_test_embeddings = model.encode(X_test.to_list())

In [None]:
# Fit logistic regression on the sentence embeddings and predict the test
# split; the timer covers the fit and predict calls but not the encoding
# done in the previous cell. max_iter is raised to 1000 for this model.
start = time.time()
clf2 = LogisticRegression(random_state=0, max_iter = 1000).fit(X_train_embeddings, Y_train)
prediction_clf2 = clf2.predict(X_test_embeddings)
end = time.time()

# Accuracy on both splits, then precision / F1 / recall on the test split.
print("Training Accuracy Score for Logistic Regresssion with Sentence Transformers: {:.4f}" .format(clf2.score(X_train_embeddings, Y_train)))
print("Validation Accuracy Score for Logistic Regresssion with Sentence Transformers: {:.4f}".format(clf2.score(X_test_embeddings, Y_test)))
print("Precision Score: {:.4f}".format(precision_score(Y_test, prediction_clf2)))
print("F1 Score: {:.4f}".format(f1_score(Y_test, prediction_clf2)))
print("Recall Score: {:.4f}".format(recall_score(Y_test, prediction_clf2)))

print()
print("fit and predict time (Logistic Regresssion with Sentence Transformers): {:.4f}".format(end-start))

Training Accuracy Score for Logistic Regresssion with Sentence Transformers: 0.7947
Validation Accuracy Score for Logistic Regresssion with Sentence Transformers: 0.7620
Precision Score: 0.7547
F1 Score: 0.7497
Recall Score: 0.7448

fit and predict time (Logistic Regresssion with Sentence Transformers): 4.0143


## SVM with Unigrams

In [None]:
from sklearn.svm import SVC

# Fit an SVC (default settings) on the bag-of-words counts and predict the
# test split; the timer covers both calls.
start = time.time()
clf_svm_1 = SVC().fit(X_train_bag, Y_train)
prediction_clf_svm_1 = clf_svm_1.predict(X_test_bag)
end = time.time()


# The labels previously read "Logistic Regresssion with Unigrams" / "SVG",
# which attributed these SVM scores to the wrong model in the output.
print("Training Accuracy Score for SVM with Unigrams: {:.4f}".format(clf_svm_1.score(X_train_bag, Y_train)))
print("Validation Accuracy Score for SVM with Unigrams: {:.4f}".format(clf_svm_1.score(X_test_bag, Y_test)))
print("Precision Score: {:.4f}".format(precision_score(Y_test, prediction_clf_svm_1)))
print("F1 Score: {:.4f}".format(f1_score(Y_test, prediction_clf_svm_1)))
print("Recall Score: {:.4f}".format(recall_score(Y_test, prediction_clf_svm_1)))

print()
print("fit and predict time (SVM with Unigrams): {:.4f}".format(end-start))

## SVM with Embeddings

In [None]:
# Fit an SVC (default settings) on the sentence embeddings and predict the
# test split; the timer covers both calls.
start = time.time()
clf_svm_2 = SVC().fit(X_train_embeddings, Y_train)
prediction_clf_svm_2 = clf_svm_2.predict(X_test_embeddings)
end = time.time()

# The labels previously read "Logistic Regresssion with Unigrams" / "SVG",
# which named neither the model nor the features actually used here.
print("Training Accuracy Score for SVM with Sentence Transformers: {:.4f}".format(clf_svm_2.score(X_train_embeddings, Y_train)))
print("Validation Accuracy Score for SVM with Sentence Transformers: {:.4f}".format(clf_svm_2.score(X_test_embeddings, Y_test)))
print("Precision Score: {:.4f}".format(precision_score(Y_test, prediction_clf_svm_2)))
print("F1 Score: {:.4f}".format(f1_score(Y_test, prediction_clf_svm_2)))
print("Recall Score: {:.4f}".format(recall_score(Y_test, prediction_clf_svm_2)))

print("fit and predict time (SVM with Sentence Transformers): {:.4f}".format(end-start))

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier


# Fit a 3-nearest-neighbours classifier on the sentence embeddings and
# predict the test split; the timer covers both calls. The results are
# printed in the next cell.
start = time.time()
clf_knn = KNeighborsClassifier(n_neighbors=3).fit(X_train_embeddings, Y_train)
prediction_clf_knn = clf_knn.predict(X_test_embeddings)
end = time.time()

In [None]:
# Accuracy on both splits, then precision / F1 / recall on the test split.
print("Training Accuracy Score for KNN with Sentence Transformers: {:.4f}".format(clf_knn.score(X_train_embeddings, Y_train)))
print("Validation Accuracy Score for KNN with Sentence Transformers: {:.4f}".format(clf_knn.score(X_test_embeddings, Y_test)))
print("Precision Score: {:.4f}".format(precision_score(Y_test, prediction_clf_knn)))
print("F1 Score: {:.4f}".format(f1_score(Y_test, prediction_clf_knn)))
print("Recall Score: {:.4f}".format(recall_score(Y_test, prediction_clf_knn)))

# The timing label previously said "SVG with Sentence Transformers"; start/end
# here come from the KNN fit cell directly above.
print("fit and predict time (KNN with Sentence Transformers): {:.4f}".format(end-start))

Training Accuracy Score for KNN with Sentence Transformers: 0.850633619083116
Validation Accuracy Score for KNN with Sentence Transformers: 0.6982529699510832
Precision Score:  0.7057867360208062
F1 Score:  0.6678972465774495
Recall Score:  0.6338686131386861
fit and predict time (SVG with Sentence Transformers): 288.61064887046814
