## 4.2 N-gram Features ##

#### Import Packages ####

In [1]:
import json
from collections import Counter

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import FeatureUnion, make_pipeline

#### Load and Split Data ####

In [2]:
# Load the JSON-lines dataset: headlines are the inputs, `is_sarcastic` the labels.
df = pd.read_json('../data/Sarcasm_Headlines_Dataset_v2.json', lines=True)
X, y = df['headline'], df['is_sarcastic']

# Stratified 80/10/10 split: hold out 20% first, then halve that hold-out
# into a validation part and a test part.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=50, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=50, stratify=y_temp
)
# Candidate seeds for repeated runs (generated using random seed=0):
# random_seeds = [50, 98, 54, 6, 34]

# Sanity checks: missing values per column and the class balance.
print("Dataset missing values:")
print(df.isnull().sum())
print("Sarcastic count:", int((y == 1).sum()))
print("Non-sarcastic count:", int((y == 0).sum()))

Dataset missing values:
is_sarcastic    0
headline        0
article_link    0
dtype: int64
Sarcastic count: 13634
Non-sarcastic count: 14985


In [3]:
def train_and_evaluate(model, X_train, y_train, X_val, y_val):
    """Fit `model` on the training split, then report metrics on both splits.

    Args:
        model: Estimator or pipeline exposing `fit` and `predict`.
        X_train, y_train: Inputs and labels the model is fitted on.
        X_val, y_val: Inputs and labels used for the validation report.

    Returns:
        None. The model is fitted in place and the metrics are printed by
        `show_metrics`, training split first and validation split second.
    """
    model.fit(X_train, y_train)
    splits = (
        (X_train, y_train, 'training set'),
        (X_val, y_val, "validation set"),
    )
    for features, labels, split_name in splits:
        show_metrics(model, features, labels, split_name)
    
def show_metrics(model, X, y, label_str):
    """Print accuracy and macro-averaged F1 for `model`'s predictions on `X`.

    Args:
        model: Fitted estimator or pipeline exposing `predict`.
        X: Inputs to predict on.
        y: True labels aligned with `X`.
        label_str: Name of the split, used in the header line of the report.

    Returns:
        None. The report is printed, followed by a 50-character dashed rule.
    """
    predicted = model.predict(X)

    print(f"{label_str} metrics:")
    print("Accuracy = {:.4f}".format(accuracy_score(y, predicted)))

    print("Macro F1 = {:.4f}".format(f1_score(y, predicted, average='macro')))
    print("-" * 50)

#### Choosing Models (NB, LR, NN) ####

In [4]:
# Candidate 1: unigram counts fed into multinomial Naive Bayes.
NB_model = make_pipeline(
    CountVectorizer(ngram_range=(1, 1)),
    MultinomialNB(),
)
train_and_evaluate(NB_model, X_train, y_train, X_val, y_val)

training set metrics:
Accuracy = 0.9300
Macro F1 = 0.9298
--------------------------------------------------
validation set metrics:
Accuracy = 0.8477
Macro F1 = 0.8470
--------------------------------------------------


In [5]:
# Candidate 2: unigram counts fed into logistic regression.
LR_model = make_pipeline(
    CountVectorizer(ngram_range=(1, 1)),
    LogisticRegression(max_iter=150),
)
train_and_evaluate(LR_model, X_train, y_train, X_val, y_val)

training set metrics:
Accuracy = 0.9610
Macro F1 = 0.9609
--------------------------------------------------
validation set metrics:
Accuracy = 0.8435
Macro F1 = 0.8429
--------------------------------------------------


In [6]:
# Candidate 3: unigram counts fed into a small MLP (one hidden layer of 15 units).
# random_state pins the weight initialisation, the batch shuffling and the
# internal early-stopping hold-out, so re-running this cell reproduces the
# same scores (the data split above is seeded with the same value).
NN_model = make_pipeline(
    CountVectorizer(ngram_range=(1, 1)),
    MLPClassifier(
        hidden_layer_sizes=(15,),
        max_iter=70,
        early_stopping=True,
        random_state=50,
    ),
)
train_and_evaluate(NN_model, X_train, y_train, X_val, y_val)

training set metrics:
Accuracy = 0.9281
Macro F1 = 0.9279
--------------------------------------------------
validation set metrics:
Accuracy = 0.8487
Macro F1 = 0.8483
--------------------------------------------------


#### Baseline Model ####
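Choose logistic regression (LR) as the baseline model:
- It is a linear model, so its feature weights can be interpreted directly.
- NB, NN and LR reach similar validation macro F1 (0.8470, 0.8483 and 0.8429 respectively), so little performance is given up by choosing LR.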

In [7]:
# Baseline: unigram counts + logistic regression (same configuration as the LR candidate).
LR_model = make_pipeline(
    CountVectorizer(ngram_range=(1, 1)),
    LogisticRegression(max_iter=150),
)
train_and_evaluate(LR_model, X_train, y_train, X_val, y_val)

training set metrics:
Accuracy = 0.9610
Macro F1 = 0.9609
--------------------------------------------------
validation set metrics:
Accuracy = 0.8435
Macro F1 = 0.8429
--------------------------------------------------


#### Experimental Model ####
Choose `ngram_range=(1, 2)` (unigrams + bigrams): it gives the highest validation macro F1 (0.8544) of the n-gram ranges tried below.

In [8]:
# Experiment 1: unigrams + bigrams.
model1 = make_pipeline(
    CountVectorizer(ngram_range=(1, 2)),
    LogisticRegression(max_iter=150),
)
train_and_evaluate(model1, X_train, y_train, X_val, y_val)

training set metrics:
Accuracy = 0.9990
Macro F1 = 0.9989
--------------------------------------------------
validation set metrics:
Accuracy = 0.8550
Macro F1 = 0.8544
--------------------------------------------------


In [9]:
# Experiment 2: bigrams only.
model2 = make_pipeline(
    CountVectorizer(ngram_range=(2, 2)),
    LogisticRegression(max_iter=150),
)
train_and_evaluate(model2, X_train, y_train, X_val, y_val)

training set metrics:
Accuracy = 0.9995
Macro F1 = 0.9995
--------------------------------------------------
validation set metrics:
Accuracy = 0.7764
Macro F1 = 0.7731
--------------------------------------------------


In [10]:
# Experiment 3: bigrams + trigrams.
model3 = make_pipeline(
    CountVectorizer(ngram_range=(2, 3)),
    LogisticRegression(max_iter=150),
)
train_and_evaluate(model3, X_train, y_train, X_val, y_val)

training set metrics:
Accuracy = 1.0000
Macro F1 = 1.0000
--------------------------------------------------
validation set metrics:
Accuracy = 0.7722
Macro F1 = 0.7681
--------------------------------------------------


In [11]:
# Experiment 4: trigrams only.
model4 = make_pipeline(
    CountVectorizer(ngram_range=(3, 3)),
    LogisticRegression(max_iter=150),
)
train_and_evaluate(model4, X_train, y_train, X_val, y_val)

training set metrics:
Accuracy = 0.9985
Macro F1 = 0.9985
--------------------------------------------------
validation set metrics:
Accuracy = 0.6279
Macro F1 = 0.5719
--------------------------------------------------


#### Preprocessing + Hyperparameter Fine-tuning ####

In [12]:
# Preprocessing variant: drop scikit-learn's built-in English stop words
# before building unigram + bigram counts.
model = make_pipeline(
    CountVectorizer(ngram_range=(1, 2), stop_words='english'),
    LogisticRegression(max_iter=150),
)
train_and_evaluate(model, X_train, y_train, X_val, y_val)

training set metrics:
Accuracy = 0.9976
Macro F1 = 0.9976
--------------------------------------------------
validation set metrics:
Accuracy = 0.8012
Macro F1 = 0.7988
--------------------------------------------------


In [9]:
# L1-regularised logistic regression (default C) on unigram + bigram counts.
# The saga solver shuffles the data while optimising, so random_state is fixed
# to make the reported scores reproducible across re-runs.
model = make_pipeline(
    CountVectorizer(ngram_range=(1, 2)),
    LogisticRegression(max_iter=200, penalty='l1', solver='saga', random_state=50),
)
train_and_evaluate(model, X_train, y_train, X_val, y_val)



training set metrics:
Accuracy = 0.9135
Macro F1 = 0.9133
--------------------------------------------------
validation set metrics:
Accuracy = 0.8393
Macro F1 = 0.8387
--------------------------------------------------


In [22]:
# L1-regularised logistic regression with weaker regularisation (C=10) and a
# larger iteration budget. The saga solver shuffles the data while optimising,
# so random_state is fixed to make the reported scores reproducible.
model = make_pipeline(
    CountVectorizer(ngram_range=(1, 2)),
    LogisticRegression(max_iter=300, penalty='l1', solver='saga', C=10, random_state=50),
)
train_and_evaluate(model, X_train, y_train, X_val, y_val)



training set metrics:
Accuracy = 0.9995
Macro F1 = 0.9995
--------------------------------------------------
validation set metrics:
Accuracy = 0.8512
Macro F1 = 0.8506
--------------------------------------------------


In [4]:
# Regularisation sweep (default penalty): C = 0.8.
model = make_pipeline(
    CountVectorizer(ngram_range=(1, 2)),
    LogisticRegression(max_iter=150, C=0.8),
)
train_and_evaluate(model, X_train, y_train, X_val, y_val)

training set metrics:
Accuracy = 0.9982
Macro F1 = 0.9982
--------------------------------------------------
validation set metrics:
Accuracy = 0.8546
Macro F1 = 0.8540
--------------------------------------------------


In [12]:
# Regularisation sweep (default penalty): C = 0.6.
model = make_pipeline(
    CountVectorizer(ngram_range=(1, 2)),
    LogisticRegression(max_iter=150, C=0.6),
)
train_and_evaluate(model, X_train, y_train, X_val, y_val)

training set metrics:
Accuracy = 0.9962
Macro F1 = 0.9962
--------------------------------------------------
validation set metrics:
Accuracy = 0.8539
Macro F1 = 0.8534
--------------------------------------------------


In [5]:
# Regularisation sweep (default penalty): C = 0.5.
model = make_pipeline(
    CountVectorizer(ngram_range=(1, 2)),
    LogisticRegression(max_iter=150, C=0.5),
)
train_and_evaluate(model, X_train, y_train, X_val, y_val)

training set metrics:
Accuracy = 0.9943
Macro F1 = 0.9943
--------------------------------------------------
validation set metrics:
Accuracy = 0.8550
Macro F1 = 0.8545
--------------------------------------------------


In [13]:
# Regularisation sweep (default penalty): C = 0.4.
model = make_pipeline(
    CountVectorizer(ngram_range=(1, 2)),
    LogisticRegression(max_iter=150, C=0.4),
)
train_and_evaluate(model, X_train, y_train, X_val, y_val)

training set metrics:
Accuracy = 0.9904
Macro F1 = 0.9904
--------------------------------------------------
validation set metrics:
Accuracy = 0.8522
Macro F1 = 0.8517
--------------------------------------------------


In [6]:
# Regularisation sweep (default penalty): C = 0.3.
model = make_pipeline(
    CountVectorizer(ngram_range=(1, 2)),
    LogisticRegression(max_iter=150, C=0.3),
)
train_and_evaluate(model, X_train, y_train, X_val, y_val)

training set metrics:
Accuracy = 0.9842
Macro F1 = 0.9842
--------------------------------------------------
validation set metrics:
Accuracy = 0.8515
Macro F1 = 0.8510
--------------------------------------------------


#### Model Performance on Test Data ####

In [14]:
# Final configuration: unigram + bigram counts with logistic regression at C=0.5.
# Refit here so that `model` is the pipeline evaluated on the test set below.
model = make_pipeline(
    CountVectorizer(ngram_range=(1, 2)),
    LogisticRegression(max_iter=150, C=0.5),
)
train_and_evaluate(model, X_train, y_train, X_val, y_val)

training set metrics:
Accuracy = 0.9943
Macro F1 = 0.9943
--------------------------------------------------
validation set metrics:
Accuracy = 0.8550
Macro F1 = 0.8545
--------------------------------------------------


In [15]:
# Report the final model's metrics on the held-out test split.
show_metrics(model, X=X_test, y=y_test, label_str='testing set')

testing set metrics:
Accuracy = 0.8498
Macro F1 = 0.8495
--------------------------------------------------


#### Top 10 Features ####

In [16]:
# make_pipeline names each step after its lowercased class name.
vectorizer, classifier = (
    model.named_steps[step] for step in ('countvectorizer', 'logisticregression')
)

feature_names = vectorizer.get_feature_names_out()
coefficients = classifier.coef_[0]

# argsort is ascending, so the last ten positions hold the largest coefficients.
# top10_features / top10_weights therefore run from smallest to largest weight.
top10_idx = np.argsort(coefficients)[-10:]
top10_features, top10_weights = feature_names[top10_idx], coefficients[top10_idx]

print("Top 10 features indicative of sarcasm:")
# Walk the selection backwards so the heaviest feature is printed first.
for feature, weight in zip(reversed(top10_features), reversed(top10_weights)):
    print(f"{feature}: {weight:.4f}")

Top 10 features indicative of sarcasm:
nation: 2.9191
area: 2.5409
local: 1.9461
onion: 1.7815
fucking: 1.7719
introduces: 1.7114
announces: 1.7062
report: 1.6971
only: 1.6423
clearly: 1.5863


## Research Question ##
#### N-gram Corpus Analysis ####

In [21]:
# The ten highest-weighted features, strongest first
# (top10_features is stored in ascending weight order, hence the reversal).
top_features = top10_features[::-1].tolist()

# Headlines of the whole dataset, separated by label.
sarcastic_headlines = df.loc[df['is_sarcastic'] == 1, 'headline']
nonsarcastic_headlines = df.loc[df['is_sarcastic'] == 0, 'headline']

# Only the tokenisation rules are needed for counting: build_analyzer() is
# driven by the constructor arguments alone, so the vectorizer is not fitted.
# The n-gram range matches the final model so tokens line up with its features.
vectorizer = CountVectorizer(ngram_range=(1, 2))

def count_words(texts, words, analyzer=None):
    """Count how often each n-gram in `words` occurs across `texts`.

    Args:
        texts: Iterable of strings (one headline per item).
        words: Iterable of n-gram strings to look for. Duplicate entries are
            counted once rather than once per repetition.
        analyzer: Optional callable mapping a string to a list of n-gram
            tokens. When omitted, the analyzer of the notebook-level
            `vectorizer` is used, so existing two-argument calls behave as
            before.

    Returns:
        collections.Counter mapping every entry of `words` to its total number
        of occurrences; words that never occur are present with a count of 0.
    """
    if analyzer is None:
        # Fall back to the notebook-level vectorizer for backward compatibility.
        analyzer = vectorizer.build_analyzer()

    # Seed each requested word so the result always has one entry per word,
    # even when `texts` is empty.
    counts = Counter({word: 0 for word in words})
    for line in texts:
        # Tally the headline once instead of rescanning its tokens per word.
        line_counts = Counter(analyzer(line))
        for word in counts:
            counts[word] += line_counts[word]
    return counts

# Occurrences of each top feature within each class of headlines.
sarcastic_counts = count_words(sarcastic_headlines, top_features)
nonsarcastic_counts = count_words(nonsarcastic_headlines, top_features)

# One row per feature, with both class counts side by side for comparison.
freq_df = pd.DataFrame(
    [
        {
            'word': w,
            'sarcastic_count': sarcastic_counts[w],
            'nonsarcastic_count': nonsarcastic_counts[w],
        }
        for w in top_features
    ],
    columns=['word', 'sarcastic_count', 'nonsarcastic_count'],
)
# Share of all occurrences that fall in sarcastic headlines (a fraction in [0, 1]).
total_count = freq_df['sarcastic_count'] + freq_df['nonsarcastic_count']
freq_df['sarcastic_percentage'] = freq_df['sarcastic_count'] / total_count

print(freq_df)

         word  sarcastic_count  nonsarcastic_count  sarcastic_percentage
0      nation              389                  23              0.944175
1        area              490                  11              0.978044
2       local              158                  14              0.918605
3       onion               56                   1              0.982456
4     fucking              102                   0              1.000000
5  introduces              112                   9              0.925620
6   announces              134                  19              0.875817
7      report              516                  89              0.852893
8        only              221                  56              0.797834
9     clearly               56                   0              1.000000
