In [1]:
import pandas as pd

# Load the toxic-word vocabulary from the interim data folder.
# NOTE(review): the first CSV column has no header and loads as "Unnamed: 0"
# (visible in the output below); it looks like a saved row index — confirm
# whether index_col=0 is wanted here.
# NOTE(review): read_csv's default NA parsing turns tokens such as "null",
# "nan" or "n/a" into NaN; confirm that is acceptable for a word list.
toxic_words_path = "../data/interim/toxic_words.csv"
df = pd.read_csv(toxic_words_path)

df

Unnamed: 0,toxic_words
0,colorblind.
1,one’s
2,gargantua's
3,chad!
4,nuisance
...,...
99053,drawings
99054,slapshot.
99055,anatoly...
99056,secong


## Filtering

In [6]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords


from nltk.stem.snowball import SnowballStemmer

# Build the English stopword set; if the corpus cannot be found locally,
# download it and read it again.
try:
    english_stopwords = stopwords.words("english")
except LookupError:
    print("Downloading NLTK Stopwords")
    nltk.download("stopwords")
    english_stopwords = stopwords.words("english")
stop_words = set(english_stopwords)

# English Snowball stemmer; clean_data reads it through the name `st`.
st = SnowballStemmer("english")


# function to clean data
def clean_data(df, col, clean_col):
    """Write a normalised, stemmed version of ``df[col]`` into ``df[clean_col]``.

    Every value is converted with ``str()``, lower-cased and stripped; each
    character outside ``a-z``/``A-Z`` is then replaced by a space, and the
    whitespace-separated tokens that are not in ``stop_words`` are stemmed
    with ``st`` and joined back together with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw text. It is modified in place.
    col : str
        Name of the column to read.
    clean_col : str
        Name of the column that receives the cleaned text.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Relies on the module-level ``stop_words`` collection and ``st`` stemmer.
    Because values go through ``str()``, a float NaN is processed as the text
    ``"nan"`` rather than being skipped. Digits and non-ASCII letters are
    replaced by spaces together with punctuation.
    """

    def _normalise(value):
        # lower-case and trim the raw value
        text = str(value).lower().strip()
        # collapse runs of spaces into one
        text = re.sub(" +", " ", text)
        # anything that is not an ASCII letter becomes a separator
        text = re.sub("[^a-zA-Z]", " ", text)
        # drop stopwords, stem whatever is left
        kept = [st.stem(token) for token in text.split() if token not in stop_words]
        return " ".join(kept)

    df[clean_col] = df[col].apply(_normalise)
    return df


# Add the 'clean' column derived from the raw 'toxic_words' column.
df = clean_data(df, col="toxic_words", clean_col="clean")

Downloading NLTK Stopwords


[nltk_data] Downloading package stopwords to /home/leo/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
# Keep only the first row for each distinct value of the 'clean' column.
is_repeat = df.duplicated(subset="clean", keep="first")
df = df[~is_repeat]

df

Unnamed: 0,toxic_words,clean
0,colorblind.,colorblind
1,one’s,one
2,gargantua's,gargantua
3,chad!,chad
4,nuisance,nuisanc
...,...,...
99044,rejoyla,rejoyla
99048,mahalalo's,mahalalo
99054,slapshot.,slapshot
99055,anatoly...,anatoli


In [4]:
# Write the vocabulary frame to the interim folder; index=False leaves the
# row index out of the file.
clean_vocab_path = "../data/interim/toxic_words_clean.csv"
df.to_csv(clean_vocab_path, index=False)

# Conclusion:

Arguably, this vocabulary contains many outliers, and a considerable number of non-toxic words are present.

However, this is not a problem for the idea of using BERT for word replacement. Even though some non-toxic words from this vocabulary will be replaced, doing so will not introduce any toxicity.

Moreover, the vocabulary should arguably be constructed from the training set only, not from the whole dataset. Otherwise, it will contain words from the test set, which compromises the evaluation.
