# Database Assemble

## Twitter Emotion Classification Dataset
More information can be found at this link: https://www.kaggle.com/datasets/aadyasingh55/twitter-emotion-classification-dataset

In [42]:
import pandas as pd
import numpy as np

# Numeric label -> readable emotion name used by the Kaggle dataset
label_to_emotions = {0: "sadness", 1: "joy", 2: "love", 3: "anger", 4: "fear", 5: "surprise"}

# Read the parquet file and attach the emotion name next to each numeric label
df_kaggle = pd.read_parquet('../data/datasets/raw/train-00000-of-00001.parquet').assign(
    emotions=lambda frame: frame["label"].map(label_to_emotions)
)

df_kaggle.head()



Unnamed: 0,text,label,emotions
0,i feel awful about it too because it s my job ...,0,sadness
1,im alone i feel awful,0,sadness
2,ive probably mentioned this before but i reall...,1,joy
3,i was feeling a little low few days back,0,sadness
4,i beleive that i am much more sensitive to oth...,2,love


**Merge** the two datasets, first dropping the columns of the SemEval dataframe that are not present in the first dataset. Since the SemEval dataset is multi-label (i.e. a tweet may be labeled with both "anger" and "disgust"), I keep a single emotion per tweet (the first one that matches, in a fixed column order), so that each row ends up with exactly one emotion.

In [43]:
# Load the SemEval training file (tab-separated, one tweet per row)
sem_eval_path_train = "../data/datasets/raw/2018-E-c-En-train.txt"
df_semeval = pd.read_csv(sem_eval_path_train, sep='\t')


# Returns rows that have none of target emotions for the kaggle dataset (all 0s)
def filter_rows(target_df, emotion_mapping):
    """Return a copy of the rows of `target_df` whose target emotion columns sum to 0.

    Args:
        target_df: DataFrame holding one numeric column per emotion.
        emotion_mapping: dict whose values are the emotion (column) names to
            check. Names that are not columns of `target_df` are ignored.

    Returns:
        A copy of the matching rows, keeping the original index.
    """
    # Only look at emotion columns that really exist, to avoid KeyErrors
    tracked = [name for name in emotion_mapping.values() if name in target_df.columns]

    # A row is "negative" when its tracked emotion columns add up to 0
    row_totals = target_df[tracked].sum(axis=1)
    negative_df = target_df.loc[row_totals == 0].copy()

    print(f"Filtered {len(negative_df)} rows out of {len(target_df)}")
    return negative_df

# Rows whose target emotion columns are all 0 cannot be expressed with the
# Kaggle label set, so they are set aside.
negative = filter_rows(df_semeval, label_to_emotions)

# Take only compatible rows: everything that is not in `negative`
positive = df_semeval[~df_semeval.index.isin(negative.index)]

# SemEval column name -> emotion name used by the Kaggle dataset
alignment_map = {
    'anger': 'anger',
    'fear': 'fear',
    'joy': 'joy',
    'love': 'love',
    'sadness': 'sadness',
    'surprise': 'surprise',
    'disgust': 'anger',  # Common academic grouping: Disgust often maps to Anger
}
# NOTE(review): `negative` is computed from label_to_emotions, which has no
# "disgust" entry, so disgust-only rows are filtered out above. Assuming the
# emotion columns are 0/1 indicators, every row in `positive` matches one of the
# six earlier keys first, so the 'disgust' entry is never selected. Confirm
# whether disgust-only tweets were meant to be kept as "anger".

# Columns of alignment_map that exist in the data, in alignment_map order
present_cols = [col for col in alignment_map if col in positive.columns]

# Create semeval_subset directly from positive text, keeping the original row
# index. The emotion name and numeric label columns are added afterwards.
semeval_subset = pd.DataFrame({
    'text': positive['Tweet'].values
}, index=positive.index)
def map_to_primary(row):
    """Return the mapped emotion of the first column in `present_cols` set to 1.

    Columns are checked in `present_cols` order and translated through
    `alignment_map`. Returns None when no column equals 1.
    """
    matches = (alignment_map[name] for name in present_cols if row[name] == 1)
    return next(matches, None)


# Collapse every multi-label row into a single emotion name
semeval_subset['emotions'] = positive.apply(map_to_primary, axis=1)

# Emotion name -> numeric id, the inverse of the Kaggle label encoding
emotions_to_label = {
    "sadness": 0,
    "joy": 1,
    "love": 2,
    "anger": 3,
    "fear": 4,
    "surprise": 5,
}
semeval_subset["label"] = semeval_subset["emotions"].map(emotions_to_label)

semeval_subset.head()

Filtered 591 rows out of 6838


Unnamed: 0,text,emotions,label
1,Whatever you decide to do make sure it makes y...,joy,1
2,@Max_Kellerman it also helps that the majorit...,anger,3
3,Accept the challenges so that you can literall...,joy,1
4,My roommate: it's okay that we can't spell bec...,anger,3
5,No but that's so cute. Atsu was probably shy a...,joy,1


In [44]:
# Stack the Kaggle and SemEval rows into one frame with a fresh 0..n-1 index
frames_to_merge = [df_kaggle, semeval_subset]
merged_df = pd.concat(frames_to_merge, ignore_index=True)
merged_df.head()

Unnamed: 0,text,label,emotions
0,i feel awful about it too because it s my job ...,0,sadness
1,im alone i feel awful,0,sadness
2,ive probably mentioned this before but i reall...,1,joy
3,i was feeling a little low few days back,0,sadness
4,i beleive that i am much more sensitive to oth...,2,love


Now, the **ELTEA17 Dataset** is **adapted** to the format of the **merged dataset** and then merged.
- The ELTEA17 dataset flags sarcasm in a column called *Sarcasm*, with value "Y" if the tweet contains sarcasm and "N" otherwise
- Only the rows that contain "N" are kept
- Sarcasm detection is outside the scope of this project, and **sarcastic tweets would be misleading**

In [45]:
# The ELTEA file is pipe-separated with no header row: emotion code | sarcasm flag | tweet
eltea_dataset_path = "../data/datasets/raw/eltea_train.txt"
eltea_columns = ['emotions', 'sarcasm', 'text']
eltea_df = pd.read_csv(eltea_dataset_path, sep='|', header=None, names=eltea_columns)

# Keep only tweets WITHOUT sarcasm; the flag column is not needed afterwards
is_not_sarcastic = eltea_df['sarcasm'] == 'N'
eltea_df = eltea_df.loc[is_not_sarcastic].drop(columns=['sarcasm'])

# Emotion codes before the conversion
print(pd.unique(eltea_df["emotions"]))

# Short ELTEA codes -> emotion names of the merged dataset ("dis" is folded into "anger")
conversion_map_eltea = {
    "joy": "joy",
    "sad": "sadness",
    "dis": "anger",
    "ang": "anger",
    "fea": "fear",
    "sup": "surprise",
}
eltea_df["emotions"] = eltea_df["emotions"].map(conversion_map_eltea)

# Emotion names after the conversion
print(pd.unique(eltea_df["emotions"]))

# Add numeric label to dataset
emotions_to_label = {
    "sadness": 0,
    "joy": 1,
    "love": 2,
    "anger": 3,
    "fear": 4,
    "surprise": 5,
}
eltea_df["label"] = eltea_df["emotions"].map(emotions_to_label)


<ArrowStringArray>
['joy', 'dis', 'sad', 'fea', 'ang', 'sup']
Length: 6, dtype: str
<ArrowStringArray>
['joy', 'anger', 'sadness', 'fear', 'surprise']
Length: 5, dtype: str


In [46]:
# Preview the adapted ELTEA rows (columns: emotions, text, label)
eltea_df.head()

Unnamed: 0,emotions,text,label
0,joy,That is one #happy #dog who never ceases to ma...,1
2,anger,You say that I'm paranoid but I'm pretty sure ...,3
3,joy,One of London's best days and showing the worl...,1
4,sadness,More children will die because govt not trying...,0
5,fear,It's a slippery slope to open calls to shoot f...,4


In [47]:
# Merge the ELTEA rows into the dataset.
# The frame is rebuilt from its three sources instead of extending merged_df,
# so re-running this cell never appends the same rows twice.
merged_df = pd.concat([df_kaggle, semeval_subset, eltea_df], ignore_index=True)
merged_df.head()

Unnamed: 0,text,label,emotions
0,i feel awful about it too because it s my job ...,0,sadness
1,im alone i feel awful,0,sadness
2,ive probably mentioned this before but i reall...,1,joy
3,i was feeling a little low few days back,0,sadness
4,i beleive that i am much more sensitive to oth...,2,love


In [61]:
# Save to path
import os

save_path = "../data/datasets/process"

os.makedirs(save_path, exist_ok=True)  # Create the folder if it doesn't exist

# Derive the file path from save_path so the directory is defined only once
parquet_path = os.path.join(save_path, "merged_emotions.parquet")

# index=False: the row index is not written to the file
merged_df.to_parquet(parquet_path, index=False, engine='pyarrow')

print("Database successfully saved")

Database successfully saved


# Preprocessing Pipeline
**Tokenization**: in my experiments, I consider tokens to be the words of a tweet separated by whitespace.

Punctuation is dropped, except for the following:
- Keep "?" and "!" because they are important and are often found in surprise or anger tweets.
- Keep "@" because user mentions are different from ordinary words, even when a username contains one.
- Keep "#" because hashtags carry a different meaning from plain words.
- Keep "." so that ellipses ("..."), which also carry emotional information, are preserved.

The following class encapsulates the preprocessing logic.

In [48]:
import re
import string

import nltk
from nltk.tokenize import word_tokenize, TweetTokenizer

In [49]:
class PreprocessPipeline:
  """Clean raw tweet text and split it into tokens.

  `clean_text` spells emojis out as words, lower-cases the text, removes URLs,
  known markup residue and most punctuation. `transform` then tokenizes the
  cleaned text with NLTK's TweetTokenizer.

  The methods need `re`, `string`, `emoji` and `TweetTokenizer` to be imported
  before they run.
  """

  # Markup / feed residue found in the raw tweets. Patterns are OR-ed together.
  NOISE_PATTERNS = (
      # Residue of the HTML entity "&gt;". Anchored on word boundaries so that
      # "gt" inside ordinary words (e.g. "strength", "length") is left alone.
      r'\bgt\b',
      # Specific CSS/HTML strings
      r'class[^\w\s]*delicious[^\w\s]*title[^\w\s]*share[^\w\s]*del',
      r'rel[^\w\s]*nofollow[^\w\s]*target[^\w\s]*blank',
      r'languagedirection[^\w\s]*ltr',
      # Feed / link metadata keywords, removed as whole words only
      r'\b(type|application|atom|xml|feedlinks|href|http|https)\b',
  )

  def __init__(self):
    # NOTE(review): strip_handles=True makes the tokenizer drop @handles, while
    # clean_text deliberately keeps "@". Confirm which behaviour is intended.
    self.tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True)

    # List of chars to keep
    # point is kept for ellipses "..."
    chars_to_keep = "@#?!.'_"
    self.punct_to_remove = "".join([c for c in string.punctuation if c not in chars_to_keep])

  def clean_text(self, text):
    """Return `text` lower-cased and stripped of URLs, noise and punctuation.

    Args:
      text: raw tweet text (str).

    Returns:
      The cleaned text, with runs of whitespace collapsed to single spaces
      and no leading/trailing whitespace.
    """
    # Replace each emoji with its name surrounded by spaces instead of colons,
    # e.g. " face_with_tears_of_joy "
    text = emoji.demojize(text, delimiters=(" ", " "))

    # Lower
    text = text.lower()

    # Remove URLs, then leftover link keywords
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\b(href|http|https)\b', '', text)

    # Remove the known noise patterns
    combined_noise = '|'.join(self.NOISE_PATTERNS)
    text = re.sub(combined_noise, '', text)

    # Remove punctuation, keep some special characters
    # We use a translation table here; it's much faster than regex for single characters
    table = str.maketrans('', '', self.punct_to_remove)
    text = text.translate(table)

    # Re-apply: dropping punctuation can expose noise that was split before
    text = re.sub(combined_noise, '', text)

    # Remove extra space
    text = re.sub(r'\s+', ' ', text).strip()
    return text

  def transform(self, text):
    """Clean `text` and return the list of tokens produced by the tweet tokenizer."""
    text = self.clean_text(text)
    tokens = self.tweet_tokenizer.tokenize(text)
    return tokens