In [9]:
from dotenv import find_dotenv
import os
import sys

sys.path.append(os.path.dirname(find_dotenv()))

In [7]:
import pandas as pd
from nltk.corpus import wordnet
import nltk
import re

nltk.download("wordnet", quiet=True)

In [2]:
def get_nontoxic_synonym(word, blacklist):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            # Check against blacklist
            if lemma.name().lower() not in blacklist:
                synonyms.add(lemma.name())

    return list(synonyms)[0] if synonyms else None

In [3]:
df = pd.read_csv("../data/external/bad-words.csv", header=None, names=["tox"])
df.head()

Unnamed: 0,tox
0,jigaboo
1,mound of venus
2,asslover
3,s&m
4,queaf


In [4]:
# Using the first column as a blacklist
blacklist = set(df["tox"].str.lower())

# Apply the function to get non-toxic synonyms
df["ntox"] = df["tox"].apply(lambda x: get_nontoxic_synonym(x, blacklist))
df = df.dropna()  # some of the results are None, so drop them

In [5]:
df

Unnamed: 0,tox,ntox
0,jigaboo,spade
8,pimp,panderer
9,urine,piddle
10,whit,shred
11,randy,turned_on
...,...,...
1597,hiv,human_immunodeficiency_virus
1599,mad,unhinged
1600,sniggers,snicker
1602,testicle,ball


# Baseline correction


In [None]:
# Load the datasets
toxic_sentences_df = pd.read_csv("toxic_sentences.csv")
synonyms_df = pd.read_csv("synonyms.csv")

# Create a replacement dictionary
replacement_dict = dict(zip(synonyms_df["toxic_word"], synonyms_df["non_toxic"]))


# Function to replace toxic words in a sentence
def replace_toxic_words(sentence):
    for toxic, non_toxic in replacement_dict.items():
        # \b specifies word boundaries in regex, ensuring we're replacing whole words, not substrings
        sentence = re.sub(r"\b" + re.escape(toxic) + r"\b", non_toxic, sentence)
    return sentence


# Apply the function to the sentences
toxic_sentences_df["cleaned_sentence"] = toxic_sentences_df["sentence"].apply(
    replace_toxic_words
)

# Check the results
print(toxic_sentences_df.head())