In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import torch.nn as nn
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import datasets
from torch.utils.data import DataLoader
import numpy as np
from sentence_transformers import SentenceTransformer
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
from tqdm.notebook import tqdm
import wandb

### Model and eval data preparation

In [2]:
# Tokenizer and model weights used to calculate the toxicity score of a text.
TOXICITY_CHECKPOINT = "SkolkovoInstitute/roberta_toxicity_classifier"

tokenizer = RobertaTokenizer.from_pretrained(TOXICITY_CHECKPOINT)
# device_map='auto' leaves the placement of the weights to the library.
model = RobertaForSequenceClassification.from_pretrained(
    TOXICITY_CHECKPOINT, device_map='auto'
)

Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
def get_toxisity_score(model_output):
    """Turn the toxicity classifier's output for ONE text into per-class scores.

    Args:
        model_output: Either a logits tensor holding two values (singleton
            dimensions are squeezed away), or a model-output object exposing
            such a tensor as ``.logits`` (what ``get_classification`` returns).

    Returns:
        dict: ``{"neutral": score_0, "toxic": score_1}``, where each score is
        the element-wise sigmoid of the corresponding logit (NumPy scalars).
    """
    # Callers hand over the raw model output as well as bare tensors; the
    # output object itself has no .squeeze(), so unwrap its logits first.
    logits = getattr(model_output, "logits", model_output)
    sigmoid = nn.Sigmoid()
    # .cpu() is required before .numpy() whenever the logits live on a GPU.
    scores = sigmoid(logits.squeeze()).detach().cpu().numpy()
    return {"neutral": scores[0], "toxic": scores[1]}


def get_classification(
    text, classification_tokenizer=tokenizer, classification_model=model
):
    """Run a classifier on a single text and return its raw output.

    Args:
        text: The string to classify.
        classification_tokenizer: Object whose ``encode(text, return_tensors="pt")``
            yields the model input; defaults to the notebook-level ``tokenizer``.
        classification_model: Callable model; defaults to the notebook-level
            ``model``.

    Returns:
        Whatever ``classification_model`` returns for the encoded text.
    """
    tokenized_text = classification_tokenizer.encode(text, return_tensors="pt")
    # The default model is loaded with device_map='auto', so it may not live on
    # the CPU; put the input on the same device as the model.
    target_device = getattr(classification_model, "device", None)
    if target_device is not None:
        tokenized_text = tokenized_text.to(target_device)
    # Inference only: do not record an autograd graph for the forward pass.
    with torch.no_grad():
        predictions = classification_model(tokenized_text)
    return predictions


In [4]:
# Load the "s-nlp/paradetox" dataset and keep only its "train" split.
dataset_path = "s-nlp/paradetox"
paradetox_splits = datasets.load_dataset(dataset_path)
dataset = paradetox_splits["train"]

### Generation of the masked texts for the eval set

In [5]:
# Build the masked version of every toxic comment: each space-separated word is
# scored on its own and replaced by "***" with probability equal to its
# "toxic" score.
MASKING_SEED = 42  # fixed seed so a re-run produces the same masks
masking_rng = np.random.default_rng(MASKING_SEED)

resulting_texts = []
with torch.no_grad():  # inference only, no autograd graph needed
    for sample in tqdm(dataset["en_toxic_comment"]):
        words = sample.split(" ")
        for i, word in enumerate(words):
            output = get_classification(word, tokenizer, model)
            # Hand over the logits tensor (on the CPU), not the whole model
            # output: the scorer squeezes its argument and converts it to NumPy.
            result = get_toxisity_score(output.logits.cpu())
            toxicity_prob = result["toxic"]
            if masking_rng.random() <= toxicity_prob:
                words[i] = "***"
        resulting_texts.append(" ".join(words))

In [6]:
# Pack the masked sentences into a NumPy array.
paradetox_results = np.asarray(resulting_texts)

In [7]:
# Sanity check: there must be exactly one masked sentence per dataset row.
len(dataset) == len(paradetox_results)

True

### Evaluation of the generated results

In [33]:
# Sentence encoder used to embed texts, and a cosine similarity that compares
# two tensors along dimension 0.
SIMILARITY_CHECKPOINT = 'sentence-transformers/all-mpnet-base-v2'
simmilarity_model = SentenceTransformer(SIMILARITY_CHECKPOINT)
cosine_simmilarity = nn.CosineSimilarity(dim=0)

In [73]:
def get_sim(text1, text2):
    """Return the cosine similarity (a Python float) between two texts.

    Both texts are encoded together by ``simmilarity_model`` and the two
    resulting embeddings are compared with ``cosine_simmilarity``.
    """
    first_embedding, second_embedding = simmilarity_model.encode(
        [text1, text2], convert_to_tensor=True
    )
    similarity = cosine_simmilarity(first_embedding, second_embedding)
    return similarity.item()


In [8]:
# Attach the masked sentences as a new column and iterate over the dataset in
# its original order, in fixed-size batches.
EVAL_BATCH_SIZE = 16
dataset = dataset.add_column("masked_result", paradetox_results)
dataloader = DataLoader(dataset, shuffle=False, batch_size=EVAL_BATCH_SIZE)

### Count toxicity of predictions and references

In [10]:
def _toxicity_of(texts):
    """Return the "toxic" score of every text in ``texts`` (one forward pass).

    Args:
        texts: A batch (list) of strings.

    Returns:
        list: One "toxic" score per input text, in input order.
    """
    # padding=True pads only up to the longest text of the batch instead of the
    # tokenizer maximum; padded positions are excluded through the attention
    # mask that the tokenizer returns alongside the input ids.
    encoded = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    # The model is loaded with device_map='auto', so follow it to its device.
    encoded = encoded.to(model.device)
    # Inference only: skip autograd bookkeeping, then bring the logits back to
    # the CPU because the scorer converts them to NumPy.
    with torch.no_grad():
        logits = model(**encoded).logits.cpu()
    return [get_toxisity_score(row)['toxic'] for row in logits]


init_toxicity_val = []
detoxified_toxicity_val = []
masked_toxicity_val = []

# Score the toxic source, the human reference and the masked result of each row.
for batch in tqdm(dataloader):
    init_toxicity_val.extend(_toxicity_of(batch["en_toxic_comment"]))
    detoxified_toxicity_val.extend(_toxicity_of(batch["en_neutral_comment"]))
    masked_toxicity_val.extend(_toxicity_of(batch["masked_result"]))



  0%|          | 0/1234 [00:00<?, ?it/s]

In [14]:
# Store the three toxicity measurements next to the texts they belong to.
for column_name, column_values in (
    ('initial_toxicity', init_toxicity_val),
    ('ideal_toxicity', detoxified_toxicity_val),
    ('resulting_toxicity', masked_toxicity_val),
):
    dataset = dataset.add_column(column_name, column_values)

### Compute similarity of the references and samples

In [74]:
# For every row, compare the toxic source with (a) its masked version and
# (b) the human-written neutral version.
reference2masked_sim = []
reference2translation_sim = []

for row in tqdm(dataset):
    source_text = row['en_toxic_comment']
    reference2masked_sim.append(get_sim(source_text, row['masked_result']))
    reference2translation_sim.append(get_sim(source_text, row['en_neutral_comment']))

  0%|          | 0/19744 [00:00<?, ?it/s]

In [75]:
# Store both similarity measurements as dataset columns.
dataset = (
    dataset
    .add_column('reference2masked_sim', reference2masked_sim)
    .add_column('reference2translation_sim', reference2translation_sim)
)


## Fluency of the generated text

In [27]:
# Tokenizer and classifier used for the fluency scores; the model is moved to `device`.
FLUENCY_CHECKPOINT = "cointegrated/roberta-large-cola-krishna2020"
fluency_tokenizer = AutoTokenizer.from_pretrained(FLUENCY_CHECKPOINT)
fluency_model = AutoModelForSequenceClassification.from_pretrained(FLUENCY_CHECKPOINT)
fluency_model = fluency_model.to(device)

In [28]:
# Rebuild the loader so that it iterates over the current `dataset` object.
dataloader = DataLoader(dataset, shuffle=False, batch_size=16)

In [29]:
def get_fluency_score(logits):
    """Map the two logits of ONE text to ``{"Fluent": ..., "Non-Fluent": ...}``.

    Singleton dimensions are squeezed away, an element-wise sigmoid is applied,
    and the result is moved to the CPU and converted to NumPy; index 0 is
    reported as "Fluent" and index 1 as "Non-Fluent".
    """
    flat_logits = logits.squeeze()
    probabilities = torch.sigmoid(flat_logits).cpu().detach().numpy()
    fluent, non_fluent = probabilities[0], probabilities[1]
    return {"Fluent": fluent, "Non-Fluent": non_fluent}

In [31]:
# Score the fluency of every masked sentence, batch by batch.
fluency_scores = []
for batch in tqdm(dataloader):
    text = batch['masked_result']
    # padding=True pads only up to the longest sentence of the batch instead of
    # the tokenizer maximum; padded positions are excluded via the attention mask.
    tokens = fluency_tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(device)
    # Inference only: without no_grad every forward pass would record an
    # autograd graph that is never used.
    with torch.no_grad():
        result = fluency_model(**tokens).logits
    fluency_scores.extend(get_fluency_score(elem)['Fluent'] for elem in result)

  0%|          | 0/1234 [00:00<?, ?it/s]

In [33]:
# There must be exactly one fluency score per row before attaching the column.
assert len(fluency_scores) == len(dataset)
dataset = dataset.add_column('Fluency_scores', fluency_scores)

In [34]:
# Convert the dataset to a pandas DataFrame and preview its first five rows.
pandas_dataset_results = dataset.to_pandas()
pandas_dataset_results.head(5)

Unnamed: 0,en_toxic_comment,en_neutral_comment,masked_result,initial_toxicity,ideal_toxicity,resulting_toxicity,reference2masked_sim,reference2translation_sim,Fluency_scores
0,he had steel balls too !,he was brave too!,he had steel balls too !,0.912212,0.005482,0.912212,1.0,0.510213,0.880316
1,"dude should have been taken to api , he would ...",It would have been good if he went to api. He ...,"dude should have been taken to api , he would ...",0.744653,0.00685,0.744653,1.0,0.55702,0.89955
2,"im not gonna sell the fucking picture , i just...","I'm not gonna sell the picture, i just want to...","im not gonna sell the *** picture , i just wan...",0.919632,0.005215,0.015971,0.903694,0.887211,0.867077
3,the garbage that is being created by cnn and o...,the news that is being created by cnn and othe...,the *** that is being created by cnn and other...,0.955578,0.014164,0.063538,0.873326,0.912075,0.751078
4,the reason they dont exist is because neither ...,The reason they don't exist is because neither...,the reason they dont exist is because neither ...,0.94548,0.108274,0.765257,0.903425,0.895799,0.698454
