# Preprocessing 

In [17]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split

In [4]:
import re

def replace_emoticons(text):
    """
    Replace common emoticons and emojis in the text with their textual descriptions.
    """
    # Dictionary of common emoticons/emojis and their textual descriptions
    emoticons_dict = {
        r':\)': 'Smile',
        r':-\)': 'Smile',
        r':\(': 'Sad',
        r':-\(': 'Sad',
        r';\)': 'Wink',
        r';-\)': 'Wink',
        r':D': 'Laugh',
        r':-D': 'Laugh',
        r':P': 'Tongue',
        r':-P': 'Tongue',
        r':p': 'Tongue',
        r':-p': 'Tongue',
        r':O': 'Surprised',
        r':-O': 'Surprised',
        r':o': 'Surprised',
        r':-o': 'Surprised',
        r':\|': 'Straight',
        r':-\|': 'Straight',
        r':/': 'Skeptical',
        r':-\/': 'Skeptical',
        r':\\': 'Skeptical',
        r':-\\': 'Skeptical',
        r':\*': 'Kiss',
        r':-\*': 'Kiss',
        r'>:\(': 'Angry',
        r'>:-\(': 'Angry',
        r':-\]': 'Grin',
        r'8\)': 'Cool',
        r'8-\)': 'Cool',
        r'O:\)': 'Angel',
        r'O:-\)': 'Angel',
        r'3:\)': 'Devil',
        r'3:-\)': 'Devil',
        r'B\)': 'Cool',
        r'B-\)': 'Cool',
        r'X\(': 'Dead',
        r'X-\(': 'Dead',
        r':\'\(': 'Crying',
        r':\'-\(': 'Crying',
        r':-\[': 'Shy',
        r'<3': 'Heart',
        r'T_T': 'Crying',
        r'\\o/': 'Celebration',
        r'o/': 'Excited',
        r'm/': 'Monkey',
        r'-_-': 'Unamused',
        r':yum:': 'Tasty',
        r':X': 'Silent',
        r'zzZ': 'Sleeping',
        r'XD': 'Laughing',
        r'XP': 'Playful',
        r'n_n': 'Sleeping',
        r'-_- zzz': 'Tired',
        r':evil:': 'Evil',
        r':mask:': 'Masked',
        r'O_O': 'Shocked',
        r':hear_no_evil:': 'Hear No Evil',
        r'O:\)': 'Innocent',
        r'_/\\_': 'Bowing',
        r'😍': 'Heart Eyes',
        r'😭': 'Crying Loudly',
        r'🙏': 'Praying',
        r'🙌': 'Celebrating',
        r'🙈': 'See No Evil',
        r'😒': 'Unamused',
        r'😋': 'Yummy',
        r'🙊': 'Speak No Evil',
        r'😴': 'Sleeping',
        r'😆': 'Laughing',
        r'😜': 'Winking Tongue Out',
        r'😫': 'Exhausted',
        r'😈': 'Mischievous',
        r'😷': 'Sick',
        r'😲': 'Amazed',
        r'🙉': 'Hear No Evil',
        r'😇': 'Angel',
        r'🙇': 'Bowing'
        # Add more emojis as needed
    }

    # Replace each emoticon or emoji with its description
    for emoticon, description in emoticons_dict.items():
        text = re.sub(re.escape(emoticon), description, text)
    
    return text


In [5]:
def replace_token(token):
    """
    Replace tokens based on specific patterns.
    """
    # Check for punctuations
    if re.match(r'^[.,_-]+$', token):
        return 'punctuation'

    # Check for special characters
    if re.match(r'^[@%&*=]+$', token):
        return 'special'

    # Check for currency symbols
    if re.match(r'^[£$€¥₹₽₩₪₫₴₦₲₵₮₱₭₺₼₠₡₢₣₤₥₧₨₩₪₫€₭₮₯₰₱₲₳₴₵₶₷₸₹₺₻₼₽₾₿]+$', token):
        return 'currency'

    return token

def preprocess_text(text):
    """
    Preprocess the text by applying various transformations.
    """
    # Replace emoticons
    text = replace_emoticons(text)

    # Replace possible sarcasm expressions with possibility
    text = re.sub(r'\. \.\.', ' possibility ', text)
    text = re.sub(r'\. \.\.\.', ' possibility ', text)

    # Replace patterns like "250,000", "3,600" with "NUM"
    text = re.sub(r'\b\d{1,3}(?:,\d{3})*(?!\d)', 'NUM', text)

    # Replace alphanumeric patterns like "r800", "23pts" with "ALPHANUM"
    text = re.sub(r'\b(?:[a-zA-Z]+\d+|\d+[a-zA-Z]+)[a-zA-Z]*\b', 'ALPHANUM', text)

    # Replace range patterns like "8-13" with "RANGE"
    text = re.sub(r'\b\d+-\d+\b', 'RANGE', text)

    # Replace time patterns like 3:30, 1am, 2hrs, 2am, etc.
    text = re.sub(r'(?:(\d{1,2}:\d{1,2})|(\d{1,2}(?:am|pm))|(\d{1,2}hrs))', 'time', text)

    # Replace fractions or numbers like 1/2
    text = re.sub(r'\d{1,2}/\d{1,2}', 'fraction', text)

    # Replace year patterns like 2003
    text = re.sub(r'\b\d{4}\b', 'year', text)

    # Replace amount patterns like 136k, 500k, 7million, 8.25, etc.
    text = re.sub(r'(?:(\d+(?:\.\d+)?(?:k|m|million|billion))|(\d+\.\d{2}))', 'amount', text)

    # Replace version patterns like 4.0
    text = re.sub(r'\d+\.\d+', 'version', text)

    # Replace temperature patterns like 100.20, 971.2
    text = re.sub(r'(?:(\d+\.\d{1,2})|(\d{2,3}\.\d))\b', 'temperature', text)

    # Replace time or duration patterns like 21st, 90s, 200yrs
    text = re.sub(r'(?:(\d{1,3}(st|nd|rd|th))|(\d{1,2}s)|(\d+yrs))', 'duration', text)

    # Replace time patterns like 7:18, 12s, 24hr, 15minutes
    text = re.sub(r'(?:(\d{1,2}:\d{1,2})|(\d{1,2}s)|(\d{1,2}hr)|(\d{1,2}minutes))', 'time', text)

    # Replace position patterns like 18th
    text = re.sub(r'\d{1,2}(st|nd|rd|th)', 'position', text)

    # Replace speed patterns like 90kph
    text = re.sub(r'\d+kph', 'speed', text)

    # Replace distance patterns like 155m
    text = re.sub(r'\d+m', 'distance', text)

    # Replace URLs
    text = re.sub(r"http\S+|www\S+|https\S+", 'URL', text, flags=re.MULTILINE)

    # Replace standalone numbers
    text = re.sub(r'\b\d+\b', 'NUM', text)

    # Replace hyphens with spaces to break into multiple tokens
    text = text.replace("-", ' ')

    # Tokenize and replace tokens using replace_token function
    tokens = text.split()
    tokens = [replace_token(token) for token in tokens]
    text = ' '.join(tokens)

    return text

In [6]:
def load_dataset(file_path, text_column='text'):
    """
    Load a dataset from a CSV file.
    """
    try:
        df = pd.read_csv(file_path)
        if text_column not in df.columns:
            raise ValueError(f"Column '{text_column}' not found in {file_path}")
        return df
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return None

In [7]:
def preprocess_dataset(df, text_column='text'):
    """
    Apply preprocessing to the text column of the dataset.
    """
    df['preprocessed_text'] = df[text_column].astype(str).apply(preprocess_text)
    return df

In [8]:
def save_dataset(df, output_path):
    """
    Save the preprocessed dataset to a CSV file.
    """
    try:
        df.to_csv(output_path, index=False)
        print(f"Preprocessed data saved to {output_path}")
    except Exception as e:
        print(f"Error saving to {output_path}: {e}")

### Headlines Dataset

In [7]:
df = pd.read_csv('/kaggle/input/anlp-headlines/Headlines.csv')
processed_df = preprocess_dataset(df, text_column='headline')

In [8]:
def preprocess_labels(df, label_column='label'):
    """
    Converts the label column to binary values: 1 for sarcastic, 0 for non-sarcastic.
    """
    df['label'] = df[label_column].apply(lambda x: 1 if x.lower() == 'sarcastic' else 0)
    return df

# Apply the function to the DataFrame
processed_df = preprocess_labels(df, label_column='label')




In [9]:


# Save the processed DataFrame to CSV
output_path = '/kaggle/working/processed_news_headlines_preprocessed.csv'
processed_df.to_csv(output_path, index=False)

In [10]:
processed_df

Unnamed: 0,label,headline,valid,preprocessed_text
0,1,dateline nbc report inspired by actual events,False,dateline nbc report inspired by actual events
1,1,goldfish dying to be petted just once,False,goldfish dying to be petted just once
2,0,scalia's utter moral failure exposed,False,scalia's utter moral failure exposed
3,0,video captures courthouse beating of inmate ac...,False,video captures courthouse beating of inmate ac...
4,0,bernie sanders has a very lonely but very comm...,False,bernie sanders has a very lonely but very comm...
...,...,...,...,...
26704,0,"kim kardashian channels cruella de vil, plus m...",True,"kim kardashian channels cruella de vil, plus m..."
26705,1,students thankful standardized curriculum spar...,True,students thankful standardized curriculum spar...
26706,1,mortgage market collapse threatens nation's ba...,True,mortgage market collapse threatens nation's ba...
26707,0,the outrageous dessert you can make in a slow ...,True,the outrageous dessert you can make in a slow ...


In [12]:
train, remaining = train_test_split(processed_df, test_size=0.2, random_state=42)

# Split the remaining 20% equally into dev (10%) and test (10%)
dev, test = train_test_split(remaining, test_size=0.5, random_state=42)

# Save the splits into separate CSV files
train_path = '/kaggle/working/news_headlines_train.csv'
dev_path = '/kaggle/working/news_headlines_dev.csv'
test_path = '/kaggle/working/news_headlines_test.csv'

train.to_csv(train_path, index=False)
dev.to_csv(dev_path, index=False)
test.to_csv(test_path, index=False)

print(f"Training set saved to {train_path}")
print(f"Development set saved to {dev_path}")
print(f"Test set saved to {test_path}")

Training set saved to /kaggle/working/news_headlines_train.csv
Development set saved to /kaggle/working/news_headlines_dev.csv
Test set saved to /kaggle/working/news_headlines_test.csv


### Twitter Dataset

In [13]:
df = pd.read_csv('/kaggle/input/se4meval-task-3-irony-detection/SemEval2018-Task3-master/datasets/train/SemEval2018-T3-train-taskA_emoji.txt',sep='\t')
processed_df = preprocess_dataset(df, text_column='Tweet text')



In [14]:
processed_df

Unnamed: 0,Tweet index,Label,Tweet text,preprocessed_text
0,1,1,Sweet United Nations video. Just in time for C...,Sweet United Nations video. Just in time for C...
1,2,1,@mrdahl87 We are rumored to have talked to Erv...,@ALPHANUM We are rumored to have talked to Erv...
2,3,1,Hey there! Nice to see you Minnesota/ND Winter...,Hey there! Nice to see you Minnesota/ND Winter...
3,4,0,3 episodes left I'm dying over here,NUM episodes left I'm dying over here
4,5,1,I can't breathe! was chosen as the most notabl...,I can't breathe! was chosen as the most notabl...
...,...,...,...,...
3812,3830,0,@banditelli regarding what the PSU president does,@banditelli regarding what the PSU president does
3813,3831,0,@banditelli But still bothers me that I see no...,@banditelli But still bothers me that I see no...
3814,3832,0,well now that i've listened to all of into the...,well now that i've listened to all of into the...
3815,3833,0,Hummingbirds #Are #Experts #at #Hovering #Aft...,Hummingbirds #Are #Experts #at #Hovering #Afte...


In [15]:
train, remaining = train_test_split(processed_df, test_size=0.2, random_state=42)

# Split the remaining 20% equally into dev (10%) and test (10%)
dev, test = train_test_split(remaining, test_size=0.5, random_state=42)

# Save the splits into separate CSV files
train_path = '/kaggle/working/twitter_train.csv'
dev_path = '/kaggle/working/twitter_dev.csv'
test_path = '/kaggle/working/twitter_test.csv'

train.to_csv(train_path, index=False)
dev.to_csv(dev_path, index=False)
test.to_csv(test_path, index=False)

print(f"Training set saved to {train_path}")
print(f"Development set saved to {dev_path}")
print(f"Test set saved to {test_path}")

Training set saved to /kaggle/working/twitter_train.csv
Development set saved to /kaggle/working/twitter_dev.csv
Test set saved to /kaggle/working/twitter_test.csv


## Reddit

In [10]:
df_reddit = pd.read_csv('/kaggle/input/sarcasm/train-balanced-sarcasm.csv')

In [11]:
df_reddit.columns

Index(['label', 'comment', 'author', 'subreddit', 'score', 'ups', 'downs',
       'date', 'created_utc', 'parent_comment'],
      dtype='object')

In [12]:
df = df_reddit[['label', 'comment']]

In [13]:
df

Unnamed: 0,label,comment
0,0,NC and NH.
1,0,You do know west teams play against west teams...
2,0,"They were underdogs earlier today, but since G..."
3,0,"This meme isn't funny none of the ""new york ni..."
4,0,I could use one of those tools.
...,...,...
1010821,1,I'm sure that Iran and N. Korea have the techn...
1010822,1,"whatever you do, don't vote green!"
1010823,1,Perhaps this is an atheist conspiracy to make ...
1010824,1,The Slavs got their own country - it is called...


In [14]:
df = preprocess_dataset(df, text_column='comment')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['preprocessed_text'] = df[text_column].astype(str).apply(preprocess_text)


In [15]:
df

Unnamed: 0,label,comment,preprocessed_text
0,0,NC and NH.,NC and NH.
1,0,You do know west teams play against west teams...,You do know west teams play against west teams...
2,0,"They were underdogs earlier today, but since G...","They were underdogs earlier today, but since G..."
3,0,"This meme isn't funny none of the ""new york ni...","This meme isn't funny none of the ""new york ni..."
4,0,I could use one of those tools.,I could use one of those tools.
...,...,...,...
1010821,1,I'm sure that Iran and N. Korea have the techn...,I'm sure that Iran and N. Korea have the techn...
1010822,1,"whatever you do, don't vote green!","whatever you do, don't vote green!"
1010823,1,Perhaps this is an atheist conspiracy to make ...,Perhaps this is an atheist conspiracy to make ...
1010824,1,The Slavs got their own country - it is called...,The Slavs got their own country it is called K...


In [19]:
train, remaining = train_test_split(df, test_size=0.2, random_state=42)

# Split the remaining 20% equally into dev (10%) and test (10%)
dev, test = train_test_split(remaining, test_size=0.5, random_state=42)

# Save the splits into separate CSV files
train_path = '/kaggle/working/reddit_train.csv'
dev_path = '/kaggle/working/reddit_dev.csv'
test_path = '/kaggle/working/reddit_test.csv'

train.to_csv(train_path, index=False)
dev.to_csv(dev_path, index=False)
test.to_csv(test_path, index=False)

print(f"Training set saved to {train_path}")
print(f"Development set saved to {dev_path}")
print(f"Test set saved to {test_path}")

Training set saved to /kaggle/working/reddit_train.csv
Development set saved to /kaggle/working/reddit_dev.csv
Test set saved to /kaggle/working/reddit_test.csv
