In [4]:
import sys
# Add the parent directory to the import path so the `src` package imported
# below can be found. The path is relative to the working directory, so this
# must run before the `src...` import.
sys.path.append(r"../")

from src.models.classifier import classify_text

In [24]:
import warnings
import pandas as pd
from tqdm.notebook import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# NOTE(review): this silences every warning for the rest of the session, not
# just noisy ones; narrow the filter if deprecation notices matter.
warnings.filterwarnings('ignore')
# Register `progress_apply` on pandas objects (used further down).
tqdm.pandas()

In [6]:
# Base checkpoint name; used below to load the tokenizer.
model_checkpoint = "t5-small"

In [7]:
# Load the fine-tuned seq2seq weights from the local "best" directory and
# switch the module to evaluation mode in the same expression
# (`eval()` returns the module itself).
model = AutoModelForSeq2SeqLM.from_pretrained(
    "../models/t5-small-finetuned-toxic-en-to-neutral-en/best"
).eval()
# Turn off the `use_cache` flag on the model config.
model.config.use_cache = False

In [8]:
# The tokenizer is loaded from the base checkpoint name, not from the
# fine-tuned model directory.
# NOTE(review): presumably fine-tuning left the vocabulary unchanged — confirm.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [18]:
def translate(model, inference_request, tokenizer=tokenizer):
    """Run one input string through the model and return the decoded text.

    Args:
        model: Seq2seq model exposing ``generate``.
        inference_request: Input text, including any task prefix.
        tokenizer: Tokenizer used for both encoding and decoding. Defaults to
            the notebook-level tokenizer that existed when this cell was run.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens removed.
    """
    input_ids = tokenizer(inference_request, return_tensors="pt").input_ids
    # The decoding strategy is whatever the model's generation settings say.
    outputs = model.generate(input_ids=input_ids)
    # `decode` only maps token ids back to text; sampling options such as
    # temperature belong to `generate`, so none are passed here.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [10]:
# Task prefix prepended to every input before it is sent to the model.
prefix = "paraphrase: "

In [11]:
def detoxify(request):
    """Return the model's rewrite of ``request``.

    The notebook-level ``prefix`` is prepended to the text, and the result is
    passed to ``translate`` together with the notebook-level ``model``.
    """
    prompt = prefix + request
    return translate(model, prompt)

In [19]:
# Quick sanity check on a single sentence.
detoxify("The weather is very shitty today.")

'the weather is very bad today.'

In [46]:
# Load the combined corpus and keep only its `toxic-en` column, exposed
# under the name `text`.
df = (
    pd.read_csv('../data/interim/combined.tsv', sep='\t', header=0)[['toxic-en']]
    .rename(columns={'toxic-en': 'text'})
)
# Label every sentence with the project classifier, then keep the rows
# whose label is 1.
df = df.assign(toxic=classify_text(df['text'].to_numpy()))
df = df.loc[df['toxic'] == 1]
df.head()

Processing:   0%|          | 0/631231 [00:00<?, ?it/s]

Unnamed: 0,text,toxic
1,you're becoming disgusting.,1
4,I have orders to kill her.,1
5,I'm not gonna have a child... ...with the same...,1
6,"They're all laughing at us, so we'll kick your...",1
8,"Briggs, what the hell is going on?",1


In [54]:
# Fraction of the filtered rows to evaluate on.
eval_ratio = 0.01
# A fixed seed keeps the sample the same across runs.
eval_df = df.sample(random_state=42, frac=eval_ratio)
eval_df.shape[0]

4755

In [55]:
# Classifier labels for the sampled texts before they are rewritten (baseline).
classified_before = classify_text(eval_df['text'].to_numpy())

Processing:   0%|          | 0/4755 [00:00<?, ?it/s]

In [56]:
# count values before
print('Before:')
print(pd.Series(classified_before).value_counts())

Before:
1    4755
Name: count, dtype: int64


In [57]:
# Keep the untouched sentences in their own column the first time this cell
# runs. Re-executing the cell then rewrites the originals again instead of
# text that was already rewritten, and the originals stay available for
# side-by-side inspection.
if 'text_original' not in eval_df.columns:
    eval_df['text_original'] = eval_df['text']
# `text` ends up holding the rewritten sentences.
eval_df['text'] = eval_df['text_original'].progress_apply(detoxify)

  0%|          | 0/4755 [00:00<?, ?it/s]

In [58]:
# Classifier labels for the same rows, read from `text` after the
# detoxification cell has overwritten it.
classified_after = classify_text(eval_df['text'].to_numpy())

Processing:   0%|          | 0/4755 [00:00<?, ?it/s]

In [59]:
# count values after
print('After:')
print(pd.Series(classified_after).value_counts())

After:
0    2963
1    1792
Name: count, dtype: int64


In [60]:
# Relative drop in the summed classifier labels between the two runs,
# expressed as a percentage.
toxic_before = classified_before.sum()
toxic_after = classified_after.sum()
removed_pct = 100 * (1 - toxic_after / toxic_before)
print('Removed {:f}% of toxic comments'.format(removed_pct))

Removed 62.313354% of toxic comments
