In [8]:
import pandas as pd
import os
import re
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [9]:
import json

# Load both JSON files
def load_json(file_path):
    with open(file_path, 'r') as f:
        data = [json.loads(line) for line in f]
    return pd.DataFrame(data)

df1 = load_json(r"C:\Users\minha\Downloads\News Headlines Dataset\Sarcasm_Headlines_Dataset_v2.json")
df2 = load_json(r"C:\Users\minha\Downloads\News Headlines Dataset\Sarcasm_Headlines_Dataset_v2.json")

# Combine datasets
df = pd.concat([df1, df2], ignore_index=True)

# Keep only 'headline' and 'is_sarcastic' columns
df = df[['headline', 'is_sarcastic']]
df.rename(columns={'is_sarcastic': 'label'}, inplace=True)


In [10]:
df

Unnamed: 0,headline,label
0,thirtysomething scientists unveil doomsday clo...,1
1,dem rep. totally nails why congress is falling...,0
2,eat your veggies: 9 deliciously different recipes,0
3,inclement weather prevents liar from getting t...,1
4,mother comes pretty close to using word 'strea...,1
...,...,...
57233,jews to celebrate rosh hashasha or something,1
57234,internal affairs investigator disappointed con...,1
57235,the most beautiful acceptance speech this week...,0
57236,mars probe destroyed by orbiting spielberg-gat...,1


In [11]:
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

# Download required data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'@\w+|#', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # remove punctuation
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return ' '.join(words)

df['cleaned'] = df['headline'].astype(str).apply(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\minha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\minha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\minha\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [12]:
# Vectorize
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['cleaned'])
y = df['label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Save model
os.makedirs("sarcasm_model", exist_ok=True)
joblib.dump(model, "sarcasm_model/model.pkl")
joblib.dump(vectorizer, "sarcasm_model/vectorizer.pkl")
print("Model and vectorizer saved successfully.")


Accuracy: 0.851240391334731
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.88      0.86      5953
           1       0.86      0.82      0.84      5495

    accuracy                           0.85     11448
   macro avg       0.85      0.85      0.85     11448
weighted avg       0.85      0.85      0.85     11448

Model and vectorizer saved successfully.
