# Data Cleaning

---

## Imports

In [1]:
import json
import re
import string

import contractions
import emoji
import numpy as np
import pandas as pd

from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

## Data Load 

The train, validation (dev) and test splits are TSV files without a header row, so the column names are assigned after loading

In [2]:
def _read_split(path):
    """Read one header-less, tab-separated split file and name its three columns."""
    frame = pd.read_csv(path, sep="\t", header=None)
    frame.columns = ["text", "emotion", "code"]
    return frame


train = _read_split("../data/train.tsv")
val = _read_split("../data/dev.tsv")
test = _read_split("../data/test.tsv")

In [3]:
# within a split, every row must carry its own distinct code
for split in (train, val, test):
    assert split["code"].nunique() == split.shape[0]

# across splits, no code may appear twice (i.e. no leakage between sets)
for left, right in ((train, test), (train, val), (val, test)):
    assert set(left["code"]).isdisjoint(right["code"])

We load the emotion names; the position of each name in the file corresponds to the integer label used in the dataset

In [4]:
# One emotion name per line; the position in this list is what the integer
# labels of the dataset are matched against when the label columns are built.
with open("../data/emotions.txt") as r:
    # strip() first so a newline at the very end of the file does not produce
    # a spurious empty emotion name as the last element
    emotions = r.read().strip().split("\n")

We can also map the GoEmotions labels onto two other classification schemes: Ekman's emotions and sentiment

In [5]:
# Each JSON file is read into a dict; when applied later, every key is treated
# as a category and its value as the collection of GoEmotions labels it groups.
with open("../data/ekman_mapping.json") as ekman_file:
    ekman_map = json.load(ekman_file)

with open("../data/sentiment_mapping.json") as sentiment_file:
    sentiment_map = json.load(sentiment_file)

---

## Concatenate

We concatenate the train, validation and test data and extract every emotion present in each text. We then map these emotions to Ekman's classification and to the sentiment classification, and finally add a column recording which set (train, validation or test) each sentence belongs to

In [6]:
# NOTE(review): labels_map is not referenced anywhere in this cell; it is kept
# in case a cell outside this view expects the name to exist.
labels_map = dict()

# Stack the three splits into one frame; rows are identified by "code" from here on.
df = pd.concat([train, val, test])

# Multi-hot encode the comma-separated label ids: one row per text and one 0/1
# column per emotion. The width follows the loaded emotion list instead of a
# hard-coded 28, so it cannot drift from the column names used just below.
n_emotions = len(emotions)
arr = np.array(
    df["emotion"].apply(lambda x: [int(v) for v in str(x).split(",")])
    .apply(lambda x: [int(i in x) for i in range(n_emotions)])
    .tolist()
)

# Invert {category: [goemotion, ...]} into flat {goemotion: category} lookups.
goemotion_to_ekman = {k: e for e, l in ekman_map.items() for k in l}
goemotion_to_sentiment = {k: e for e, l in sentiment_map.items() for k in l}

labels_goemotion = pd.DataFrame(arr, index=df["code"], columns=emotions)
complete = (
    labels_goemotion.reset_index()
    # long format: one row per (code, emotion) pair, keeping only the emotions
    # that are actually flagged for that code
    .melt(id_vars=["code"], var_name="goemotion", value_name="flag")
    .loc[lambda f: f["flag"] == 1]
    .drop(columns=["flag"])
    # replace() leaves any label that is missing from a mapping unchanged
    .assign(ekman=lambda f: f["goemotion"].replace(goemotion_to_ekman))
    .assign(sentiment=lambda f: f["goemotion"].replace(goemotion_to_sentiment))
    # tag every row with the split its code came from
    .assign(
        set=lambda f: np.where(
            f["code"].isin(train.code), "train", np.where(f["code"].isin(val.code), "validation", "test")
        )
    )
    # bring the raw text back; "code" is the only column the two frames share,
    # so naming it explicitly matches what the implicit merge did
    .merge(df.drop(columns=["emotion"]), on="code")
)

---

## Clean Text 

We are going to clean text as an alternative approach to the baseline model. In this version, we are going to:
1. Replace contractions with the full word
2. Replace emojis with tags [EMOJI_...]
3. Remove handles such as @something
4. Normalize everything to lower case
5. Remove http and https links
6. Remove unwanted chars such as /, ;, etc.
7. Remove stop words
8. Lemmatize the remaining words
9. Remove punctuation

In [7]:
stop_words = set(stopwords.words("english"))


def clean_content(text):
    """Normalize one raw comment into a space-separated string of lemmatized words.

    Steps, in order: expand contractions, spell out emojis between
    `` _EMOJI_ `` markers, drop ``@handles``, lower-case, remove links and
    HTML entities, protect the ``[name]`` / ``[religion]`` placeholders as the
    bare words ``NAME`` / ``RELIGION``, delete the characters ``/ ; [ ] = #``,
    drop English stop words and lemmatize what is left.

    Punctuation is not removed here, and the emoji markers (lower-cased to
    ``_emoji_`` by this function) are not yet turned back into tags.

    Args:
        text: the raw comment as a ``str``.

    Returns:
        The cleaned words joined by single spaces.
    """
    # expand contractions into their full form (e.g. "can't" -> "cannot")
    clean_text = contractions.fix(text)

    # replace every character known to the emoji table by its English name,
    # with the name's ":" delimiters turned into " _EMOJI_ " markers
    clean_text = "".join(
        c if c not in emoji.EMOJI_DATA else emoji.EMOJI_DATA[c]["en"].replace(":", " _EMOJI_ ") for c in clean_text
    )

    # remove handles such as @something (and one trailing whitespace character)
    clean_text = re.sub(r"@\w+\s?", "", clean_text)

    # convert to lowercase
    clean_text = clean_text.lower()

    # remove links starting with http:// or https://
    clean_text = re.sub(r"https?:\/\/\S+", "", clean_text)

    # remove bare "<word>.com" links, with or without a leading "www."; the
    # optional prefix is consumed together with the domain so that
    # "www.example.com" no longer leaves a dangling "www." behind
    clean_text = re.sub(r"(?:www\.)?[a-z]+\.com", "", clean_text)

    # remove html character references such as &amp;
    clean_text = re.sub(r"&[a-z]+;", "", clean_text)

    # swap the bracketed placeholders for bare upper-case words so that the
    # bracket removal on the next line does not destroy them
    clean_text = re.sub(r"\[name\]", "NAME", clean_text)
    clean_text = re.sub(r"\[religion\]", "RELIGION", clean_text)
    # delete the unwanted characters "/", ";", "[", "]", "=", "#"
    clean_text = re.sub(r"[/;\[\]=#]", "", clean_text)

    # remove stop words (whitespace-separated words are compared as they are)
    kept_words = [word for word in clean_text.split() if word not in stop_words]

    # apply lemmatization; one lemmatizer per call instead of one per word
    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(word) for word in kept_words)


def remove_punctuation(word_list):
    """Join the tokens with spaces, skipping tokens that are one punctuation mark.

    Only tokens equal to a single character of ``string.punctuation`` are
    dropped; longer tokens made of punctuation (e.g. ``"..."``) are kept.
    """
    punctuation_marks = tuple(string.punctuation)
    kept_tokens = []
    for token in word_list:
        if token in punctuation_marks:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)


def readd_emoji_tags(text):
    """Turn ``_emoji_ name _emoji_`` spans into ``[EMOJI_NAME]`` tags.

    The text is split on ``_emoji_``; every second piece (the one enclosed by
    a pair of markers) is upper-cased, trimmed and wrapped in a tag, the other
    pieces are kept as they are. Runs of spaces are then collapsed and the
    result is stripped.
    """
    pieces = []
    for position, chunk in enumerate(text.split("_emoji_")):
        inside_markers = position % 2 == 1
        if inside_markers:
            pieces.append(" [EMOJI_" + chunk.upper().strip() + "] ")
        else:
            pieces.append(chunk)

    collapsed = re.sub(" +", " ", "".join(pieces))
    return collapsed.strip()

In [8]:
# run the whole cleaning pipeline on the raw text and store the result
complete["clean_text"] = (
    complete["text"]
    # apply cleaning process
    .apply(clean_content)
    # split into tokens with the tokenizer designed for Twitter-style text
    .apply(TweetTokenizer().tokenize)
    # remove punctuation marks and join the tokens back into one string
    .apply(remove_punctuation)
    # clean weird chars: keep only characters whose code point is below 2000
    .apply(lambda sentence: "".join(ch for ch in sentence if ord(ch) < 2000))
    # re-add the brackets around the name and religion tags
    .str.replace("NAME", "[NAME]")
    .str.replace("RELIGION", "[RELIGION]")
    # re-add the emoji tags
    .apply(readd_emoji_tags)
)

In [9]:
# Spot-check ten rows whose cleaned text contains an emoji tag.
# The pattern is a raw string because "\[" in a plain literal is an invalid
# escape sequence; random_state pins the sample so a re-run shows the same rows.
complete.loc[lambda f: f["clean_text"].str.contains(r"\[EMOJI")].sample(10, random_state=42)[["text", "clean_text"]].values

array([['In an unfunny situation you made me laugh, so thanks 😅',
        'unfunny situation made laugh thanks [EMOJI_GRINNING_FACE_WITH_SWEAT]'],
       ["I'm so glad I saw this!!! Now I know how to wash my face! I can't wait for the tooth brushing tutorial🤗",
        'glad saw this know wash face cannot wait tooth brushing tutorial [EMOJI_SMILING_FACE_WITH_OPEN_HANDS]'],
       ["Thank you! I'm going to do my very best. ❤❤❤",
        'thank you going best [EMOJI_RED_HEART] [EMOJI_RED_HEART] [EMOJI_RED_HEART]'],
       ['I’m so sorry 🤪', 'sorry [EMOJI_ZANY_FACE]'],
       ["I'm here for you😄😜",
        '[EMOJI_GRINNING_FACE_WITH_SMILING_EYES] [EMOJI_WINKING_FACE_WITH_TONGUE]'],
       ['Planning to make a comeback soon™',
        'planning make comeback soon [EMOJI_TRADE_MARK]'],
       ['Dam 😣 that was awesome!',
        'dam [EMOJI_PERSEVERING_FACE] awesome'],
       ['They likely just didn’t have enough forex to pay their bill 😂',
        'likely enough forex pay bill [EMOJI_FACE_W

---

## Export 

In [10]:
# Write the `complete` frame (labels, split, raw text and clean_text) to a parquet file.
complete.to_parquet("../data/clean_data.parquet")

---