In [3]:
from dotenv import find_dotenv
import os
import sys

# Put the directory that holds the .env file on sys.path (presumably the
# project root, so the `src` package imported below can be found).
# find_dotenv() returns "" when no .env file is located; skip the append then
# instead of adding an empty entry, and do not stack duplicates on re-runs.
_project_root = os.path.dirname(find_dotenv())
if _project_root and _project_root not in sys.path:
    sys.path.append(_project_root)

In [4]:
import src.models.baseline_model as baseline
import src.data.preprocess_dataset as preprocess
import src.data.metrics as metrics

In [5]:
import pandas as pd
import nltk

# Fetch the NLTK "punkt" and "wordnet" resources.
# NOTE(review): presumably required by the scoring helpers in
# src.data.metrics (tokenization / METEOR) — confirm.
nltk.download("punkt")
nltk.download("wordnet")

# Load the dataset


In [8]:
# Load the test split; index_col=0 uses the first CSV column as the row index.
df = pd.read_csv("dataset/test.csv", index_col=0)
df.head()

Unnamed: 0,source,target
0,and you think grandpa is gonna protect us from...,you think your grandpa will protect us from eric
1,might i add very clever assholes,i can deliver very clever blunders
2,i hate dickheads,i hate bees
3,jason put down that stupid camera and come hel...,jason put the camera down and help me
4,what a scumbag,what a punk


# Baseline evaluation


In [10]:
# Run the baseline model over the test frame. The preview shows the result
# carrying a `baseline_pred` column next to `source` and `target`.
baseline_preds = baseline.predict(df)
baseline_preds.head()

Unnamed: 0,source,target,baseline_pred
0,and you think grandpa is gonna protect us from...,you think your grandpa will protect us from eric,and you think grandpa is gonna protect us from...
1,might i add very clever assholes,i can deliver very clever blunders,might i add very clever son of a beef
2,i hate dickheads,i hate bees,i hate dickheads
3,jason put down that stupid camera and come hel...,jason put the camera down and help me,jason put down that stupefied camera and come ...
4,what a scumbag,what a punk,what a scumbag


In [None]:
# Export the baseline output in the same (source, target) layout as the other
# prediction files, with `target` holding the model prediction.
# The rename is done on a derived frame so the reference `target` column of
# `baseline_preds` is not overwritten in the live notebook state.
baseline_preds[["source", "baseline_pred"]].rename(
    columns={"baseline_pred": "target"}
).to_csv("../data/interim/predictions/baseline-detoxify.csv")

In [13]:
# Cosine-similarity score between each source sentence and its baseline
# prediction (compared against `source`, not the reference `target`).
# NOTE(review): presumably a content-preservation measure — confirm in src.data.metrics.
semantic_res = metrics.cosine_similarity_score(
    baseline_preds["source"].tolist(), baseline_preds["baseline_pred"].tolist()
)

In [16]:
# Source-vs-prediction score from the helper this notebook calls as
# `blue_score`.
# NOTE(review): the name looks like a typo for BLEU; it lives in src.data.metrics.
bleu_res = metrics.blue_score(
    baseline_preds["source"].tolist(), baseline_preds["baseline_pred"].tolist()
)

In [18]:
# METEOR helper applied to (source, baseline prediction) pairs.
meteor_res = metrics.meteor_score(
    baseline_preds["source"].tolist(), baseline_preds["baseline_pred"].tolist()
)

In [20]:
# Toxicity is scored on the predictions alone (no source text is passed).
# The recorded run took about six minutes for 120 progress steps.
toxicity_res = metrics.toxicity_score(baseline_preds["baseline_pred"].tolist())

100%|██████████| 120/120 [06:00<00:00,  3.01s/it]


In [None]:
# Collect element 0 of every metric result as one column of a single frame.
# NOTE(review): element 0 is presumably the per-sentence score list — confirm
# against the return values in src.data.metrics.
baseline_metrics = pd.DataFrame(
    dict(
        semantic=semantic_res[0],
        bleu=bleu_res[0],
        meteor=meteor_res[0],
        toxicity=toxicity_res[0],
    )
)

In [22]:
# Spot-check the baseline scores before they are written to disk.
baseline_metrics.head()

Unnamed: 0,semantic,bleu,meteor,toxicity
0,0.994616,0.824237,0.999852,0.025401
1,0.902898,0.446324,0.790476,0.065259
2,1.0,0.57572,0.981481,0.99717
3,0.991394,0.658037,0.895062,0.0364
4,1.0,0.57572,0.981481,0.972177


In [24]:
# Persist the baseline scores; the frame index is written as the first column.
baseline_metrics.to_csv("../data/interim/metrics/baseline-metrics.csv")

# T5-ft evaluation


In [26]:
# Load the fine-tuned T5 output file (columns `source` and `target`).
# NOTE(review): `target` presumably holds the model prediction here, not the
# reference text — confirm. The file is read from the working directory,
# unlike the artifacts written under ../data/interim/.
t5df = pd.read_csv("t5-detoxify.csv", index_col=0)
t5df.head()

Unnamed: 0,source,target
0,and you think grandpa is gonna protect us from...,and you think grandpa is gonna protect us from...
1,might i add very clever assholes,i m sure i add some clever slacks
2,i hate dickheads,i hate dickheads
3,jason put down that stupid camera and come hel...,jason put down that camera and come help me
4,what a scumbag,what a scumbag


In [27]:
# Same similarity score as for the baseline, now on the T5 (source, target)
# pairs. This rebinds `semantic_res`; the baseline values are replaced.
semantic_res = metrics.cosine_similarity_score(
    t5df["source"].tolist(), t5df["target"].tolist()
)

In [28]:
# `blue_score` helper on the T5 (source, target) pairs; rebinds `bleu_res`.
bleu_res = metrics.blue_score(t5df["source"].tolist(), t5df["target"].tolist())

In [29]:
# METEOR helper on the T5 (source, target) pairs; rebinds `meteor_res`.
meteor_res = metrics.meteor_score(t5df["source"].tolist(), t5df["target"].tolist())

In [30]:
# Toxicity of the T5 `target` texts only; rebinds `toxicity_res`.
toxicity_res = metrics.toxicity_score(t5df["target"].tolist())

100%|██████████| 120/120 [05:02<00:00,  2.52s/it]


In [31]:
# Pair each column label with element 0 of the matching metric result and
# build the T5 score table from those pairs.
# NOTE(review): element 0 is presumably the per-sentence score list — confirm.
t5_metrics = pd.DataFrame(
    dict(
        zip(
            ("semantic", "bleu", "meteor", "toxicity"),
            (semantic_res[0], bleu_res[0], meteor_res[0], toxicity_res[0]),
        )
    )
)
t5_metrics.head()

Unnamed: 0,semantic,bleu,meteor,toxicity
0,0.986086,0.675405,0.814437,0.008657
1,0.936637,0.078826,0.412186,0.00067
2,1.0,0.57572,0.981481,0.99717
3,0.993081,0.669048,0.904103,0.001468
4,1.0,0.57572,0.981481,0.972177


In [36]:
# Persist the T5 scores next to the baseline metrics file.
t5_metrics.to_csv("../data/interim/metrics/t5-metrics.csv")

# T5-ft2 evaluation


In [6]:
# Load the output file of the second fine-tuned T5 run (columns `source` and
# `target`), read from the working directory.
# NOTE(review): `target` presumably holds the model prediction — confirm.
t52df = pd.read_csv("t5-ft2-detoxify.csv", index_col=0)
t52df.head()

Unnamed: 0,source,target
0,and you think grandpa is gonna protect us from...,and you think grandpa is gonna protect us from...
1,might i add very clever assholes,i m sure i m going to add some clever tricks
2,i hate dickheads,i hate dickheads
3,jason put down that stupid camera and come hel...,jason put down that camera and come help me
4,what a scumbag,what a scumbag


In [7]:
# Similarity score on the T5-ft2 (source, target) pairs; rebinds `semantic_res`.
semantic_res = metrics.cosine_similarity_score(
    t52df["source"].tolist(), t52df["target"].tolist()
)

100%|██████████| 3000/3000 [00:01<00:00, 2619.35it/s]


In [11]:
# `blue_score` helper on the T5-ft2 (source, target) pairs; rebinds `bleu_res`.
bleu_res = metrics.blue_score(t52df["source"].tolist(), t52df["target"].tolist())

100%|██████████| 3000/3000 [00:01<00:00, 2921.67it/s]


In [14]:
# METEOR helper on the T5-ft2 (source, target) pairs; rebinds `meteor_res`.
meteor_res = metrics.meteor_score(t52df["source"].tolist(), t52df["target"].tolist())

100%|██████████| 3000/3000 [00:04<00:00, 613.63it/s] 


In [16]:
# Toxicity of the T5-ft2 `target` texts only; rebinds `toxicity_res`.
toxicity_res = metrics.toxicity_score(t52df["target"].tolist())

100%|██████████| 120/120 [06:15<00:00,  3.13s/it]


In [17]:
# Build the T5-ft2 score table from element 0 of each metric result.
# NOTE(review): this rebinds `t5_metrics`, the name used earlier for the first
# T5 run, so from here on it refers to the T5-ft2 scores.
t5_metrics = pd.DataFrame(
    dict(
        semantic=semantic_res[0],
        bleu=bleu_res[0],
        meteor=meteor_res[0],
        toxicity=toxicity_res[0],
    )
)
t5_metrics.head()

Unnamed: 0,semantic,bleu,meteor,toxicity
0,0.986086,0.675405,0.814437,0.008657
1,0.923351,0.028425,0.230769,0.001101
2,1.0,0.57572,0.981481,0.99717
3,0.993081,0.669048,0.904103,0.001468
4,1.0,0.57572,0.981481,0.972177


In [19]:
# NOTE(review): `t5_metrics` holds the T5-ft2 scores at this point (the name
# was rebound in this section). The file goes to the working directory, while
# the other metrics files are written under ../data/interim/metrics/.
t5_metrics.to_csv("t5-ft2-metrics.csv")

# SOTA predictions and evaluation


In [37]:
from transformers import BartForConditionalGeneration, AutoTokenizer

# The tokenizer is taken from the base BART checkpoint, while the model
# weights come from the detox checkpoint.
# NOTE(review): presumably the detox checkpoint reuses the base vocabulary — confirm.
base_model_name = "facebook/bart-base"
model_name = "SkolkovoInstitute/bart-base-detox"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

(…)ebook/bart-base/resolve/main/config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

(…)cebook/bart-base/resolve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

(…)cebook/bart-base/resolve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

(…)ok/bart-base/resolve/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

(…)bart-base-detox/resolve/main/config.json:   0%|          | 0.00/1.71k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [38]:
from tqdm import tqdm

In [41]:
def test(model, tokenizer=tokenizer, batch_size=100, test_data=None):
    """Generate model output for every ``source`` sentence, in batches.

    Args:
        model: Model exposing ``generate`` (called with ``input_ids`` and
            ``attention_mask``).
        tokenizer: Tokenizer used to encode the inputs and decode the outputs.
            Defaults to the notebook-level ``tokenizer`` that exists when this
            cell is executed.
        batch_size: Number of sentences encoded and generated per step.
        test_data: DataFrame with a ``source`` column. Defaults to the
            notebook-level ``t5df`` so existing calls behave as before.

    Returns:
        DataFrame with the ``source`` column of ``test_data`` and a ``target``
        column holding the decoded generations, in the same row order.
    """
    if test_data is None:
        test_data = t5df

    sources = test_data["source"].tolist()
    generated = []

    for start in tqdm(range(0, len(sources), batch_size)):
        encoded = tokenizer(
            sources[start : start + batch_size],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=128,
        )
        # Batches are padded, so hand the tokenizer's attention mask to
        # generate() explicitly rather than relying on it being inferred from
        # the pad token id.
        outputs = model.generate(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
        )
        generated.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))

    res = pd.DataFrame({"source": test_data["source"]})
    res["target"] = generated
    return res

In [42]:
# Generate with the detox BART model. `test` reads its inputs from the
# notebook-level `t5df`; the recorded run took about 14 minutes for 30 batches.
sota_res = test(model, tokenizer)

100%|██████████| 30/30 [14:10<00:00, 28.37s/it]


In [45]:
# Preview the generated `target` texts next to their `source` sentences.
sota_res.head()

Unnamed: 0,source,target
0,and you think grandpa is gonna protect us from...,and you think grandpa is gonna protect us from...
1,might i add very clever assholes,might i add very clever people
2,i hate dickheads,i hate bad people
3,jason put down that stupid camera and come hel...,jason put down that camera and come help me
4,what a scumbag,What a bad person


In [46]:
# Similarity score on the SOTA (source, target) pairs; rebinds `semantic_res`.
semantic_res = metrics.cosine_similarity_score(
    sota_res["source"].tolist(), sota_res["target"].tolist()
)

In [47]:
# `blue_score` helper on the SOTA (source, target) pairs; rebinds `bleu_res`.
bleu_res = metrics.blue_score(sota_res["source"].tolist(), sota_res["target"].tolist())

In [48]:
# METEOR helper on the SOTA (source, target) pairs; rebinds `meteor_res`.
meteor_res = metrics.meteor_score(
    sota_res["source"].tolist(), sota_res["target"].tolist()
)

In [49]:
# Toxicity of the generated `target` texts only; rebinds `toxicity_res`.
toxicity_res = metrics.toxicity_score(sota_res["target"].tolist())

100%|██████████| 120/120 [05:10<00:00,  2.59s/it]


In [51]:
# Pair each column label with element 0 of the matching metric result and
# build the SOTA score table from those pairs.
# NOTE(review): element 0 is presumably the per-sentence score list — confirm.
sota_metrics = pd.DataFrame(
    dict(
        zip(
            ("semantic", "bleu", "meteor", "toxicity"),
            (semantic_res[0], bleu_res[0], meteor_res[0], toxicity_res[0]),
        )
    )
)

In [52]:
# Persist the generated texts.
# NOTE(review): this file name uses an underscore, while the baseline
# predictions file is hyphenated ("baseline-detoxify.csv").
sota_res.to_csv("../data/interim/predictions/sota_detoxify.csv")

In [53]:
# Persist the SOTA scores.
# NOTE(review): this file name uses an underscore, while the baseline and T5
# metrics files are hyphenated ("baseline-metrics.csv", "t5-metrics.csv").
sota_metrics.to_csv("../data/interim/metrics/sota_metrics.csv")