### Augment Open Source Dataset of Tweets Containing Hate Speech

In [None]:
import json
import random
import pandas as pd
from datasets import load_dataset
from alphabet_detector import AlphabetDetector

In [None]:
# Load the "tweets_hate_speech_detection" dataset through `datasets.load_dataset`.
# The returned object is indexed by split name further down (dataset["train"]).
dataset = load_dataset("tweets_hate_speech_detection")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.58k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.07M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/31962 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/17197 [00:00<?, ? examples/s]

In [None]:
# Load the term -> homoglyph-substitutions mapping used by augment().
# The file is expected to hold non-ASCII homoglyph characters, so decode it as
# UTF-8 explicitly instead of relying on the platform's default locale encoding
# (which is not UTF-8 on every system and can fail or garble the characters).
with open('./real_world_terms_with_homoglyphs.json', encoding='utf-8') as json_file:
    homoglyph_terms = json.load(json_file)

In [None]:
# Intended share of eligible words to replace with homoglyphs (1 == 100 %).
# NOTE(review): no cell visible in this notebook reads this value -- augment()
# always replaces every matching word -- confirm whether it is still needed.
granularity = 1

In [None]:
def _normalize_term(text):
    """Lower-case, trim and drop apostrophes so words and keys compare equal."""
    return text.lower().strip().replace("'", "")


def augment(row, colName, homoglyph_terms):
    """Replace every known term in ``row[colName]`` with a random homoglyph variant.

    Args:
        row: Mapping holding the text to augment under ``colName``.
        colName: Key of the text field inside ``row``.
        homoglyph_terms: Mapping of plain term -> sequence of homoglyph
            spellings to choose from.

    Returns:
        The whitespace-joined text with each matching word replaced, or the
        empty string when no word could be replaced.
    """
    # Normalise the keys once per call instead of once per (word, key) pair.
    # setdefault keeps the first key for a given normalised form, matching the
    # first-match-wins order of a linear scan over the mapping.
    lookup = {}
    for key, substitutions in homoglyph_terms.items():
        normalized_key = _normalize_term(key)
        # Skip terms without any substitution: random.choice would raise
        # IndexError on an empty sequence.
        if normalized_key and substitutions:
            lookup.setdefault(normalized_key, substitutions)

    new_row = []
    good_output = False

    for word in row[colName].split():
        # Normalise the word exactly like the keys, so "can't" matches the
        # key "can't" (and "cant") instead of never matching at all.
        substitutions = lookup.get(_normalize_term(word))

        if substitutions:
            # pick a random substitution
            new_row.append(random.choice(substitutions))
            good_output = True
        else:
            new_row.append(word)

    # return empty string if no replacements were made
    return " ".join(new_row) if good_output else ""

In [None]:
# Parallel per-tweet lists; they become the DataFrame columns in the next cell.
ground_truth_labels = []
alteredTweets = []
originalTweets = []
altered = []

# augment the dataset
for example in dataset["train"]:
    originalTweets.append(example['tweet'])
    ground_truth_labels.append(example['label'])
    # augment() takes the term -> homoglyph mapping as a required third
    # argument; calling it without that argument raises TypeError.
    processed_example = augment(example, "tweet", homoglyph_terms)

    if processed_example == "":
        # these failed cases are removed later via data cleaning
        altered.append(0)
        alteredTweets.append("NONE")
    else:
        altered.append(1)
        alteredTweets.append(processed_example)

In [None]:
# Assemble the per-tweet lists into one table, one column per list.
df = pd.DataFrame(
    {
        "ground_truth": ground_truth_labels,
        "original_tweet": originalTweets,
        "altered": altered,
        "altered_tweet": alteredTweets,
    }
)

In [None]:
df.head()

Unnamed: 0,ground_truth,original_tweet,altered,altered_tweet
0,0,@user when a father is dysfunctional and is so...,1,@user whєn α father ιѕ dysfunctional αnd ιѕ sθ...
1,0,@user @user thanks for #lyft credit i can't us...,1,@user @user thanks ƒσя #lyft credit i canт usе...
2,0,bihday your majesty,1,bihday yоur majesty
3,0,#model i love u take with u all the time in ...,1,#model i lоvе υ tαkє wіth υ ѧʟʟ тне tímє ιn ur...
4,0,factsguide: society now #motivation,1,factsguide: sσcíєtч nоw #motivation


In [None]:
# Write the full table, including rows whose augmentation failed
# (altered == 0, altered_tweet == "NONE"); index=False omits the row index.
df.to_csv("./augmented_data.csv", index=False)

### Prep Data for Indirect Normalization

In [None]:
# Shared detector instance used by dropHomoglyphs below.
ad = AlphabetDetector()

def dropHomoglyphs(word, delimeter="_"):
  """Mask the characters of `word` that `ad.is_latin` does not accept.

  Each character is checked on its own: characters for which
  `ad.is_latin` returns a truthy value are kept as they are, every other
  character is replaced by `delimeter`.

  Returns the resulting string.
  """
  return "".join(
    letter if ad.is_latin(letter) else delimeter
    for letter in word
  )

In [None]:
print("Df len:", len(df))

# Keep only the rows that were augmented; the copy lets a column be
# added below without touching `df`.
was_altered = df['altered'] == 1
newDf = df[was_altered].copy()
print("Df len with failed augmentations dropped:", len(newDf))

# Masked version of every altered tweet, in row order.
altTweetDrops = [dropHomoglyphs(tweet) for tweet in newDf['altered_tweet']]

newDf['altered_drops'] = altTweetDrops

31962
29846


In [None]:
newDf.head()

Unnamed: 0,ground_truth,original_tweet,altered,altered_tweet,altered_drops
0,0,@user when a father is dysfunctional and is so...,1,@user whєn α father ιѕ dysfunctional αnd ιѕ sθ...,@user wh_n _ father __ dysfunctional _nd __ s_...
1,0,@user @user thanks for #lyft credit i can't us...,1,@user @user thanks ƒσя #lyft credit i canт usе...,@user @user thanks ƒ__ #lyft credit i can_ us_...
2,0,bihday your majesty,1,bihday yоur majesty,bihday y_ur majesty
3,0,#model i love u take with u all the time in ...,1,#model i lоvе υ tαkє wіth υ ѧʟʟ тне tímє ιn ur...,#model i l_v_ _ t_k_ w_th _ _ʟʟ ___ tím_ _n ur...
4,0,factsguide: society now #motivation,1,factsguide: sσcíєtч nоw #motivation,factsguide: s_cí_t_ n_w #motivation


In [None]:
# Write the filtered table (altered rows only, with the extra 'altered_drops'
# column); index=False omits the row index.
newDf.to_csv("./augmented_data_drops.csv", index=False)