In [1]:
!7z x "text detoxification.zip"


7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.00GHz (50653),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan         
ERROR: No more files
text detoxification.zip



System ERROR:
Unknown error -2147024872


In [2]:
!pip install sentence_transformers



In [3]:
!pip install datasets



In [4]:
!pip install evaluate bert_score



In [5]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq
from torch.utils.data import DataLoader
import pandas as pd
from datasets import Dataset

# Задача 1

In [6]:

model_name = 't5-base'
tokenizer = AutoTokenizer.from_pretrained(model_name, return_tensors='pt')
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:

data = pd.read_csv('train_en.txt', sep='\t')
sentences = data['toxic'].values.tolist()[:1000]
translations = data['neutral'].values.tolist()[:1000]


In [8]:

inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
outputs = tokenizer(translations, padding=True, truncation=True, return_tensors='pt')


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [9]:

train_data = Dataset.from_dict({
    'input_ids': inputs['input_ids'],
    'attention_mask': inputs['attention_mask'],
    'labels': outputs['input_ids']
})


In [10]:

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

train_dataloader = DataLoader(train_data, batch_size=8, collate_fn=data_collator)


In [11]:

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

model.train()
for epoch in range(3):
    total_loss = 0
    for batch in train_dataloader:
        batch = {key: value.to(device) for key, value in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_dataloader)}")

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1, Loss: 1.9421620330810547
Epoch 2, Loss: 0.5521605021953583
Epoch 3, Loss: 0.454862149477005


In [12]:
from evaluate import load

In [13]:
bertscore = load("bertscore")
meteor = load("meteor")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [14]:
test_data = pd.read_csv('test_en.txt', sep='\t')

In [15]:
test_sentences = data['toxic'].values.tolist()[:1000]
test_translations = data['neutral'].values.tolist()[:1000]

In [24]:
def generate_predictions(model, tokenizer, sentences, device, batch_size=1):
    predictions = []
    for i in range(0, len(sentences), batch_size):
        batch_sentences = sentences[i:i+batch_size]
        inputs = tokenizer(batch_sentences, return_tensors='pt', padding=True, truncation=True, max_length=256)

        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        with torch.no_grad():
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=256)

        batch_predictions = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(batch_predictions)

        torch.cuda.empty_cache()

    return predictions

In [None]:
predictions = generate_predictions(model, tokenizer, test_sentences, device='cuda')


In [None]:
bertscore_results = bertscore.compute(predictions=predictions, references=test_translations, lang='en', device='cpu')

In [36]:
import numpy as np
avg_bertscore = np.mean(bertscore_results['precision'])
print(f"Average BERTScore: {avg_bertscore}")

Average BERTScore: 0.9307315553426743


In [32]:
meteor_results = meteor.compute(predictions=predictions, references=test_translations)

In [33]:
meteor_results

{'meteor': 0.6908625389350477}

# Задача 2

In [37]:
sentences = data['toxic'].values.tolist()[:1000]
neutral_sentences = data['neutral'].values.tolist()[:1000]

In [38]:
results = {}

In [56]:
def create_fewshot_prompt(examples, input_text):
    prompt = ""
    for toxic, neutral in examples:
        prompt += f"Here is a toxic example: {toxic}\nHere is a non-toxic example: {neutral}\n"

    prompt += f"Rewrite the following text into non-toxic: {input_text}"

    return prompt

In [57]:
def generate_predictions(model, tokenizer, sentences, examples, device='cuda', batch_size=8):
    model.to(device)
    predictions = []

    prompts = [create_fewshot_prompt(examples, sentence) for sentence in sentences]

    inputs = tokenizer(prompts, padding=True, truncation=True, return_tensors='pt').to(device)

    with torch.no_grad():
        for i in range(0, len(sentences), batch_size):
            batch = {key: value[i:i+batch_size] for key, value in inputs.items()}

            outputs = model.generate(batch['input_ids'], max_length=50, num_beams=5, early_stopping=True)

            for output in outputs:
                prediction = tokenizer.decode(output, skip_special_tokens=True)
                predictions.append(prediction)

    return predictions

In [55]:
model_name = 'google/flan-t5-small'
model=AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [58]:
import random

results = {}

for n_examples in [1, 2, 3, 5, 10]:
    print(f"Generating fewshot with {n_examples} examples")
    random_examples = random.sample(list(zip(sentences, neutral_sentences)), n_examples)

    predictions = generate_predictions(model, tokenizer, sentences, random_examples, device='cuda', batch_size=8)

    results[n_examples] = predictions

Generating fewshot with 1 examples
Generating fewshot with 2 examples
Generating fewshot with 3 examples
Generating fewshot with 5 examples
Generating fewshot with 10 examples


In [61]:
for key in results.keys():
  predictions = results[key]
  print(f"Bertscore for num_examples={key}")
  print(np.mean(bertscore.compute(predictions=predictions, references=neutral_sentences, lang='en', device='cpu')['precision']))
  print(f"Meteor for num_examples={key}")
  print(meteor.compute(predictions=predictions, references=neutral_sentences))

Bertscore for num_examples=1




0.8966792778372764
Meteor for num_examples=1
{'meteor': 0.5261145041361404}
Bertscore for num_examples=2




0.8778233479857445
Meteor for num_examples=2
{'meteor': 0.32537276921173097}
Bertscore for num_examples=3
0.8517684339284897
Meteor for num_examples=3
{'meteor': 0.05873968215711416}
Bertscore for num_examples=5
0.8651050456762314
Meteor for num_examples=5
{'meteor': 0.19587296391916376}
Bertscore for num_examples=10
0.8757851771116256
Meteor for num_examples=10
{'meteor': 0.32111996274488686}


Изгледа дека со овој промпт формат, подобро е со 1 пример наместо повеќе. Не можам да пробам со различни промптови бидејќи на сите кодови од оваа задача им е потребно премногу време за да се извршат

# Задача 3

In [68]:
instruction = "Turn this sentence into non-toxic: "

In [69]:

model_name = 't5-base'
tokenizer = AutoTokenizer.from_pretrained(model_name, return_tensors='pt')
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


In [70]:

data = pd.read_csv('train_en.txt', sep='\t')
sentences = data['toxic'].values.tolist()[:1000]
translations = data['neutral'].values.tolist()[:1000]


In [71]:
sentences_in = [f'{instruction}{s}' for s in sentences]

In [72]:

inputs = tokenizer(sentences_in, padding=True, truncation=True, return_tensors='pt')
outputs = tokenizer(translations, padding=True, truncation=True, return_tensors='pt')


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [73]:

train_data = Dataset.from_dict({
    'input_ids': inputs['input_ids'],
    'attention_mask': inputs['attention_mask'],
    'labels': outputs['input_ids']
})


In [74]:

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

train_dataloader = DataLoader(train_data, batch_size=8, collate_fn=data_collator)


In [75]:

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

model.train()
for epoch in range(3):
    total_loss = 0
    for batch in train_dataloader:
        batch = {key: value.to(device) for key, value in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_dataloader)}")

Epoch 1, Loss: 1.6979407331943512
Epoch 2, Loss: 0.5159563851356507
Epoch 3, Loss: 0.42888057899475096


In [76]:
from evaluate import load

In [77]:
bertscore = load("bertscore")
meteor = load("meteor")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [78]:
test_data = pd.read_csv('test_en.txt', sep='\t')

In [79]:
test_sentences = data['toxic'].values.tolist()[:1000]
test_sentences = [f'{instruction}{s}' for s in test_sentences]
test_translations = data['neutral'].values.tolist()[:1000]

In [80]:
def generate_predictions(model, tokenizer, sentences, device, batch_size=1):
    predictions = []
    for i in range(0, len(sentences), batch_size):
        batch_sentences = sentences[i:i+batch_size]
        inputs = tokenizer(batch_sentences, return_tensors='pt', padding=True, truncation=True, max_length=256)

        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        with torch.no_grad():
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=256)

        batch_predictions = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(batch_predictions)

        torch.cuda.empty_cache()

    return predictions

In [81]:
predictions = generate_predictions(model, tokenizer, test_sentences, device='cuda')


In [82]:
bertscore_results = bertscore.compute(predictions=predictions, references=test_translations, lang='en', device='cpu')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [83]:
import numpy as np
avg_bertscore = np.mean(bertscore_results['precision'])
print(f"Average BERTScore: {avg_bertscore}")

Average BERTScore: 0.9348160962462425


In [84]:
meteor_results = meteor.compute(predictions=predictions, references=test_translations)

In [85]:
meteor_results

{'meteor': 0.7037581588500109}

# Со овој метод се добиваат највисоки метрики