In [1]:
import kagglehub
import numpy as np
import re
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load data from kaggle
headlinespath = kagglehub.dataset_download("rmisra/news-headlines-dataset-for-sarcasm-detection")
twitterpath   = kagglehub.dataset_download("nikhiljohnk/tweets-with-sarcasm-and-irony")
redditpath    = kagglehub.dataset_download("sherinclaudia/sarcastic-comments-on-reddit")

# Headline datasets
headlines1 = pd.read_json(f"{headlinespath}/Sarcasm_Headlines_Dataset.json", lines=True)
headlines2 = pd.read_json(f"{headlinespath}/Sarcasm_Headlines_Dataset_v2.json", lines=True)
data_headlines = pd.concat([headlines1, headlines2], ignore_index=True)

label_headlines = (
    data_headlines
    .drop(columns=["article_link"], errors="ignore")
    .rename(columns={"headline": "text", "is_sarcastic": "is_sarcastic"})
    .assign(source="headline")
)

# Reddit dataset
reddit = pd.read_csv(f"{redditpath}/train-balanced-sarcasm.csv")

# Stratified 10% sampling for reddit
reddit_sample = reddit.groupby("label", group_keys=False).apply(
    lambda x: x.sample(frac=0.10, random_state=42)
).reset_index(drop=True)

label_reddit = (
    reddit_sample
    .rename(columns={"comment": "text", "label": "is_sarcastic"})
    .assign(source="reddit")
    [["text", "is_sarcastic", "source"]]
)

# Twitter dataset (train + test)
train_twitter = pd.read_csv(f"{twitterpath}/train.csv")
test_twitter  = pd.read_csv(f"{twitterpath}/test.csv")

# Filter out figurative & assign labels
def preprocess_twitter(df):
    df = df[df["class"] != "figurative"].copy()
    df["is_sarcastic"] = np.where(df["class"] == "regular", 0, 1)
    df = df.rename(columns={"tweets": "text"}).drop(columns=["class"])
    df["source"] = "twitter"
    return df[["text", "is_sarcastic", "source"]]

label_twitter = pd.concat([preprocess_twitter(train_twitter),
                           preprocess_twitter(test_twitter)],
                          ignore_index=True)


combine_df = pd.concat([label_reddit, label_headlines, label_twitter], ignore_index=True)
print("Combined size:", len(combine_df))
print(combine_df["source"].value_counts())
print(combine_df["is_sarcastic"].value_counts())

print(combine_df.shape)
print(combine_df.head())

  reddit_sample = reddit.groupby("label", group_keys=False).apply(


Combined size: 222664
source
reddit      101082
twitter      66254
headline     55328
Name: count, dtype: int64
is_sarcastic
1    121699
0    100965
Name: count, dtype: int64
(222664, 3)
                                                text  is_sarcastic  source
0                                                Yes             0  reddit
1  Is there a subreddit for innocent thumbnails t...             0  reddit
2                                   The truth sadly.             0  reddit
3            Yes, all of those countries I remember.             0  reddit
4  erm.. "the more u play the less RNG matters" ....             0  reddit


In [None]:
def clean_text(txt):
    if pd.isna(txt):
        return np.nan
    
    txt = str(txt)
    # Replace URLs
    txt = re.sub(r"http\S+|www\S+|https\S+", "<URL>", txt)
    # Replace user mentions
    txt = re.sub(r"@\w+", "<USER>", txt)
    # Remove explicit sarcasm/irony tags (case-insensitive)
    txt = re.sub(r"#\s*(sarcasm|irony)\b", "", txt, flags=re.IGNORECASE)
    # Remove '#' from hashtags (keep the word)
    txt = re.sub(r"#(\w+)", r"\1", txt)
    # Normalize whitespace
    txt = re.sub(r"\s+", " ", txt).strip()
    return txt


combine_df["text"] = combine_df["text"].progress_apply(clean_text)

combine_df.to_csv("combine_data_clean.csv", index=False)
print("Saved cleaned combined dataset to combine_data_clean.csv")

100%|██████████| 222664/222664 [00:01<00:00, 164336.64it/s]


Saved cleaned combined dataset to combine_data_clean.csv
