In [None]:
import os
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import tensorflow_text as text
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
df = pd.read_csv("/content/drive/My Drive/Kaggle Club/SARCASM PROJECT '25/train-balanced-sarcasm.csv").fillna(' ')
# side-by-side comparison
df.drop(columns=['author','subreddit','score','ups','downs','date','created_utc','parent_comment'], inplace=True)

df

Unnamed: 0,label,comment
0,0,NC and NH.
1,0,You do know west teams play against west teams...
2,0,"They were underdogs earlier today, but since G..."
3,0,"This meme isn't funny none of the ""new york ni..."
4,0,I could use one of those tools.
...,...,...
1010821,1,I'm sure that Iran and N. Korea have the techn...
1010822,1,"whatever you do, don't vote green!"
1010823,1,Perhaps this is an atheist conspiracy to make ...
1010824,1,The Slavs got their own country - it is called...


#1. Data Cleaning & Normalization
- Convert text to lowercase → Ensures consistency in tokenization (important for LSTM, less so for BERT/RoBERTa).
- Expand contractions → Convert "can't" to "cannot" for better tokenization.
- Remove URLs, special characters, and emojis → Ensures cleaner text input.
- Remove HTML tags (if applicable).
- Remove extra whitespaces and line breaks.
- Expand slang

In [None]:
!pip install contractions


Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (118 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.3/118.3 kB[0m 

In [None]:
slang_dict = {
    "gtfo": "get the fuck out",
    "idk": "i don't know",
    "idfk": "i don't fucking know",
    "tbh": "to be honest",
    "smh": "shaking my head",
    "lmk": "let me know",
    "brb": "be right back",
    "btw": "by the way",
    "imo": "in my opinion",
    "imho": "in my humble opinion",
    "stfu": "shut the fuck up",
    "kys": "kill yourself",
    "wth": "what the hell",
    "wtf": "what the fuck",
    "fyi": "for your information",
    "ikr": "i know right",
    "nvm": "never mind",
    "gg": "good game",
    "glhf": "good luck have fun",
    "afk": "away from keyboard",
    "rofl": "rolling on the floor laughing",
    "lmao": "laughing my ass off",
    "lmfao": "laughing my fucking ass off",
    "lol": "laugh out loud",
    "omg": "oh my god",
    "omfg": "oh my fucking god",
    "thx": "thanks",
    "ty": "thank you",
    "tysm": "thank you so much",
    "np": "no problem",
    "ggwp": "good game well played",
    "hmu": "hit me up",
    "dm": "direct message",
    "irl": "in real life",
    "asap": "as soon as possible",
    "jk": "just kidding",
    "wyd": "what are you doing",
    "wbu": "what about you",
    "hbu": "how about you",
    "g2g": "got to go",
    "gtg": "got to go",
    "tgif": "thank god it's friday",
    "rn": "right now",
    "u": "you",
    "ur": "your",
    "rly": "really",
    "gonna": "going to",
    "wanna": "want to",
    "lemme": "let me",
    "dunno": "do not know",
    "gimme": "give me",
    "cuz": "because"
}

def expand_slang(text):
    for slang, full in slang_dict.items():
        # \b means "word boundary", so it matches whole words only
        pattern = r'\b' + re.escape(slang) + r'\b'
        text = re.sub(pattern, full, text, flags=re.IGNORECASE)
    return text

stop_words = {"am", "the", "from", "on", "in", "at", "of", "a", "an"}
def remove_stopwords(text):
    # Add word boundaries so we match whole words even with punctuation
    for word in stop_words:
        pattern = r'\b' + re.escape(word) + r'\b'
        text = re.sub(pattern, '', text, flags=re.IGNORECASE)
    # Remove extra spaces left behind
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def expand_emoticons(text):
    # Define a dictionary mapping common emoticons to their corresponding words.
    emoticon_dict = {
        r":-\)": "smile",
        r":\)": "smile",
        r":-\d": "laugh",
        r":d": "laugh",
        r":-\(": "sad",
        r":\(": "sad",
        r";-\)": "wink",
        r";\)": "wink",
        r":-p": "playful",
        r":p": "playful",
        r":-O": "surprised",
        r":O": "surprised",
        r":-/": "skeptical",
        r":/": "skeptical",
        r":'\(": "crying",
        r":o": "surprised"
    }

    # Loop through each emoticon and replace it with its corresponding word.
    for emoticon, word in emoticon_dict.items():
        text = re.sub(emoticon, f" {word} ", text)
    return text

In [None]:
import pandas as pd
import re
import contractions

def preprocessing_text(s):
    if not isinstance(s, str):
        return ""

    # Convert to lowercase and strip spaces
    s = s.lower().strip()

    # Remove multiple spaces
    s = re.sub(r'\s+', ' ', s).strip()

    # Remove newlines
    s = re.sub(r'\n', ' ', s)

    # Expand contractions
    s = contractions.fix(s)

    # Remove URLs
    s = re.sub(r'http\S+|www\S+|https\S+', '', s)

    # Remove HTML tags
    s = re.sub(r'<.*?>', '', s)

    # (NEW for paper rep) remove punctuation not used in daily speech
    s = re.sub(r"[^\w\s.,!?\']", " ", s)

    # (NEW for paper rep) expand emoticons
    s = expand_emoticons(s)

    # expand slang
    s = expand_slang(s)

    # (NEW for paper rep) remove stop words
    s = remove_stopwords(s)

    # Remove emojis
    emoji_pattern = re.compile("["
      u"\U0001F000-\U0001F9FF"  # Most emojis and symbols
      u"\U00002702-\U000027B0"  # Dingbats
      u"\U000024C2-\U0001F251"  # Enclosed characters
      u"\U0001F926-\U0001F937"  # Extra emoticons
      u"\U0001F1E0-\U0001F1FF"  # Flags
      u"\U00002600-\U000026FF"  # Miscellaneous Symbols
      u"\U00002700-\U0000277F"  # More Dingbats
      u"\U00002B50"             # Star
      u"\U00002640-\U00002642"  # Gender symbols
      u"\U00002600-\U00002B55"  # Other symbols
      u"\U00002B05-\U00002B07"  # Arrows
      u"\U00002934-\U00002935"  # Other symbols
      u"\U00003030"             # Wavy dash
      u"\U0000303D"             # Part Alternation Mark
      u"\U0001F170-\U0001F171"  # Letter symbols
      u"\U0001F17E-\U0001F17F"  # More letter symbols
      u"\U0001F18E"             # AB button
      u"\U0001F191-\U0001F19A"  # Other symbols
      u"\U0001F1E6-\U0001F1FF"  # Regional indicators (flags)
      u"\U0001F201-\U0001F202"  # Squared katakana
      u"\U0001F21A"             # Other squared symbols
      u"\U0001F22F"             # Squared CJK symbols
      u"\U0001F232-\U0001F23A"  # More CJK squared symbols
      u"\U0001F250-\U0001F251"  # Other squared symbols
      "]+", flags=re.UNICODE)

    s = emoji_pattern.sub(r'', s)
    return s



# Apply preprocessing
df['cleaned_comment'] = df['comment'].astype(str).apply(preprocessing_text)

df


Unnamed: 0,label,comment,cleaned_comment
0,0,NC and NH.,nc and nh.
1,0,You do know west teams play against west teams...,you do know west teams play against west teams...
2,0,"They were underdogs earlier today, but since G...","they were underdogs earlier today, but since g..."
3,0,"This meme isn't funny none of the ""new york ni...",this meme is not funny none new york nigga one...
4,0,I could use one of those tools.,i could use one those tools.
...,...,...,...
1010821,1,I'm sure that Iran and N. Korea have the techn...,i sure that iran and n. korea have technology ...
1010822,1,"whatever you do, don't vote green!","whatever you do, do not vote green!"
1010823,1,Perhaps this is an atheist conspiracy to make ...,perhaps this is atheist conspiracy to make chr...
1010824,1,The Slavs got their own country - it is called...,slavs got their own country it is called kosovo




In [None]:
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize

# # Load stop words
# stop_words = set(stopwords.words('english'))

# # Assuming your DataFrame is called df and the column is 'comments'
# def remove_stopwords(text):
#     words = word_tokenize(str(text))  # tokenize the comment
#     filtered = [word for word in words if word.lower() not in stop_words]
#     return ' '.join(filtered)

# # Apply the function to the comments column
# df['comments_clean'] = df['comments'].apply(remove_stopwords)

# print("Preprocessing complete.")

In [None]:
# save just the cleaned comment
df['comment'] = df['cleaned_comment']
df.drop(columns=['cleaned_comment'], inplace=True)
df

Unnamed: 0,label,comment
0,0,nc and nh.
1,0,you do know west teams play against west teams...
2,0,"they were underdogs earlier today, but since g..."
3,0,this meme is not funny none new york nigga one...
4,0,i could use one those tools.
...,...,...
1010821,1,i sure that iran and n. korea have technology ...
1010822,1,"whatever you do, do not vote green!"
1010823,1,perhaps this is atheist conspiracy to make chr...
1010824,1,slavs got their own country it is called kosovo


In [23]:
# Lemmatize
import spacy
nlp = spacy.load("en_core_web_sm")

def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_punct])  # Remove punctuation

# (FOR TESTING SAMPLES)
# sample_comments = df["comment"].head(5)
# sample_comments_lemmatized = sample_comments.apply(lemmatize_text)
# for original, lemmatized in zip(sample_comments, sample_comments_lemmatized):
#     print(f"Original: {original}\nLemmatized: {lemmatized}\n")

df['comment'] = df['comment'].apply(lemmatize_text)
df


Unnamed: 0,label,comment
0,0,nc and nh
1,0,you do know west team play against west team m...
2,0,they be underdogs early today but since gronk ...
3,0,this meme be not funny none new york nigga one be
4,0,I could use one those tool
...,...,...
1010821,1,I sure that iran and n. korea have technology ...
1010822,1,whatever you do do not vote green
1010823,1,perhaps this be atheist conspiracy to make chr...
1010824,1,slavs get their own country it be call kosovo


In [24]:
output_path = "/content/drive/My Drive/Kaggle Club/SARCASM PROJECT '25/cleaned_reddit_comments_NEW.csv"
df.to_csv(output_path, index=False)