In [None]:
!7z x "text detoxification.zip"


7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.30GHz (306F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan         1 file, 507343 bytes (496 KiB)

Extracting archive: text detoxification.zip
--
Path = text detoxification.zip
Type = zip
Physical Size = 507343

  0%    Everything is Ok

Files: 3
Size:       1630442
Compressed: 507343


In [1]:
def replace_first_line(file_path, new_first_line):
    with open(file_path, 'r') as file:
        lines = file.readlines()

    if lines:
        lines[0] = new_first_line + '\n'

    with open(file_path, 'w') as file:
        file.writelines(lines)

file_paths = ['train_en.txt', 'test_en.txt', 'val_en.txt']
new_first_line = "toxic\tneutral"

for file_path in file_paths:
    replace_first_line(file_path, new_first_line)
    print(f"Updated first line in {file_path}.")

Updated first line in train_en.txt.
Updated first line in test_en.txt.
Updated first line in val_en.txt.


In [3]:
!pip install sentence_transformers



In [4]:
!pip install datasets



In [25]:
!pip install evaluate bert_score nltk




In [6]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq
from torch.utils.data import DataLoader
import pandas as pd
from datasets import Dataset

In [7]:

data = pd.read_csv('train_en.txt', sep='\t')
sentences = data['toxic'].values.tolist()[:1000]
translations = data['neutral'].values.tolist()[:1000]


# Задача 1

In [None]:

model_name = 't5-base'
tokenizer = AutoTokenizer.from_pretrained(model_name, return_tensors='pt')
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:

inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
outputs = tokenizer(translations, padding=True, truncation=True, return_tensors='pt')


In [None]:

train_data = Dataset.from_dict({
    'input_ids': inputs['input_ids'],
    'attention_mask': inputs['attention_mask'],
    'labels': outputs['input_ids']
})


In [None]:

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

train_dataloader = DataLoader(train_data, batch_size=8, collate_fn=data_collator)


In [None]:

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

model.train()
for epoch in range(3):
    total_loss = 0
    for batch in train_dataloader:
        batch = {key: value.to(device) for key, value in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_dataloader)}")

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1, Loss: 1.9421620330810547
Epoch 2, Loss: 0.5521605021953583
Epoch 3, Loss: 0.454862149477005


In [None]:
from evaluate import load

In [None]:
bertscore = load("bertscore")
meteor = load("meteor")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
test_data = pd.read_csv('test_en.txt', sep='\t')

In [None]:
test_sentences = data['toxic'].values.tolist()[:1000]
test_translations = data['neutral'].values.tolist()[:1000]

In [None]:
def generate_predictions(model, tokenizer, sentences, device, batch_size=1):
    predictions = []
    for i in range(0, len(sentences), batch_size):
        batch_sentences = sentences[i:i+batch_size]
        inputs = tokenizer(batch_sentences, return_tensors='pt', padding=True, truncation=True, max_length=256)

        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        with torch.no_grad():
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=256)

        batch_predictions = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(batch_predictions)

        torch.cuda.empty_cache()

    return predictions

In [None]:
predictions = generate_predictions(model, tokenizer, test_sentences, device='cuda')


In [None]:
bertscore_results = bertscore.compute(predictions=predictions, references=test_translations, lang='en', device='cpu')

In [None]:
import numpy as np
avg_bertscore = np.mean(bertscore_results['precision'])
print(f"Average BERTScore: {avg_bertscore}")

Average BERTScore: 0.9307315553426743


In [None]:
meteor_results = meteor.compute(predictions=predictions, references=test_translations)

In [None]:
meteor_results

{'meteor': 0.6908625389350477}

# Задача 2

In [None]:
sentences = data['toxic'].values.tolist()[:1000]
neutral_sentences = data['neutral'].values.tolist()[:1000]

In [None]:
results = {}

In [None]:
def create_fewshot_prompt(examples, input_text):
    prompt = ""
    for toxic, neutral in examples:
        prompt += f"Here is a toxic example: {toxic}\nHere is a non-toxic example: {neutral}\n"

    prompt += f"Rewrite the following text into non-toxic: {input_text}"

    return prompt

In [None]:
def generate_predictions(model, tokenizer, sentences, examples, device='cuda', batch_size=8):
    model.to(device)
    predictions = []

    prompts = [create_fewshot_prompt(examples, sentence) for sentence in sentences]

    inputs = tokenizer(prompts, padding=True, truncation=True, return_tensors='pt').to(device)

    with torch.no_grad():
        for i in range(0, len(sentences), batch_size):
            batch = {key: value[i:i+batch_size] for key, value in inputs.items()}

            outputs = model.generate(batch['input_ids'], max_length=50, num_beams=5, early_stopping=True)

            for output in outputs:
                prediction = tokenizer.decode(output, skip_special_tokens=True)
                predictions.append(prediction)

    return predictions

In [None]:
model_name = 'google/flan-t5-small'
model=AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
import random

results = {}

for n_examples in [1, 2, 3, 5, 10]:
    print(f"Generating fewshot with {n_examples} examples")
    random_examples = random.sample(list(zip(sentences, neutral_sentences)), n_examples)

    predictions = generate_predictions(model, tokenizer, sentences, random_examples, device='cuda', batch_size=8)

    results[n_examples] = predictions

Generating fewshot with 1 examples
Generating fewshot with 2 examples
Generating fewshot with 3 examples
Generating fewshot with 5 examples
Generating fewshot with 10 examples


In [None]:
for key in results.keys():
  predictions = results[key]
  print(f"Bertscore for num_examples={key}")
  print(np.mean(bertscore.compute(predictions=predictions, references=neutral_sentences, lang='en', device='cpu')['precision']))
  print(f"Meteor for num_examples={key}")
  print(meteor.compute(predictions=predictions, references=neutral_sentences))

Bertscore for num_examples=1




0.8966792778372764
Meteor for num_examples=1
{'meteor': 0.5261145041361404}
Bertscore for num_examples=2




0.8778233479857445
Meteor for num_examples=2
{'meteor': 0.32537276921173097}
Bertscore for num_examples=3
0.8517684339284897
Meteor for num_examples=3
{'meteor': 0.05873968215711416}
Bertscore for num_examples=5
0.8651050456762314
Meteor for num_examples=5
{'meteor': 0.19587296391916376}
Bertscore for num_examples=10
0.8757851771116256
Meteor for num_examples=10
{'meteor': 0.32111996274488686}


Изгледа дека со овој промпт формат, подобро е со 1 пример наместо повеќе. Не можам да пробам со различни промптови бидејќи на сите кодови од оваа задача им е потребно премногу време за да се извршат

# Задача 3

In [None]:
instruction = "Turn this sentence into non-toxic: "

In [None]:

model_name = 't5-base'
tokenizer = AutoTokenizer.from_pretrained(model_name, return_tensors='pt')
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


In [None]:

data = pd.read_csv('train_en.txt', sep='\t')
sentences = data['toxic'].values.tolist()[:1000]
translations = data['neutral'].values.tolist()[:1000]


In [None]:
sentences_in = [f'{instruction}{s}' for s in sentences]

In [None]:

inputs = tokenizer(sentences_in, padding=True, truncation=True, return_tensors='pt')
outputs = tokenizer(translations, padding=True, truncation=True, return_tensors='pt')


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:

train_data = Dataset.from_dict({
    'input_ids': inputs['input_ids'],
    'attention_mask': inputs['attention_mask'],
    'labels': outputs['input_ids']
})


In [None]:

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

train_dataloader = DataLoader(train_data, batch_size=8, collate_fn=data_collator)


In [None]:

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

model.train()
for epoch in range(3):
    total_loss = 0
    for batch in train_dataloader:
        batch = {key: value.to(device) for key, value in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_dataloader)}")

Epoch 1, Loss: 1.6979407331943512
Epoch 2, Loss: 0.5159563851356507
Epoch 3, Loss: 0.42888057899475096


In [None]:
from evaluate import load

In [None]:
bertscore = load("bertscore")
meteor = load("meteor")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
test_data = pd.read_csv('test_en.txt', sep='\t')

In [None]:
test_sentences = data['toxic'].values.tolist()[:1000]
test_sentences = [f'{instruction}{s}' for s in test_sentences]
test_translations = data['neutral'].values.tolist()[:1000]

In [None]:
def generate_predictions(model, tokenizer, sentences, device, batch_size=1):
    predictions = []
    for i in range(0, len(sentences), batch_size):
        batch_sentences = sentences[i:i+batch_size]
        inputs = tokenizer(batch_sentences, return_tensors='pt', padding=True, truncation=True, max_length=256)

        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        with torch.no_grad():
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=256)

        batch_predictions = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(batch_predictions)

        torch.cuda.empty_cache()

    return predictions

In [None]:
predictions = generate_predictions(model, tokenizer, test_sentences, device='cuda')


In [None]:
bertscore_results = bertscore.compute(predictions=predictions, references=test_translations, lang='en', device='cpu')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import numpy as np
avg_bertscore = np.mean(bertscore_results['precision'])
print(f"Average BERTScore: {avg_bertscore}")

Average BERTScore: 0.9348160962462425


In [None]:
meteor_results = meteor.compute(predictions=predictions, references=test_translations)

In [None]:
meteor_results

{'meteor': 0.7037581588500109}

# Со овој метод се добиваат највисоки метрики

# Задача 4

In [8]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
import torch.nn as nn

In [9]:
instruction_tox = 'Rewrite this text from toxic to neutral: '
instruction_classify = 'Classify the following text into toxic or neutral: %s (write "toxic" if it is toxic, or "neutral" if it is neutral)'

In [10]:
sentences_tox_in = [f'{instruction_tox}{s}' for s in sentences][:500]
sentences_classify = [f'{instruction_classify % s}' for s in sentences][:250]
sentences_classify += [f'{instruction_classify % t}' for t in translations][:250]

In [11]:
input_sentences = sentences_tox_in + sentences_classify
output_sentences = sentences[:500] + ['toxic'] * 250 + ['neutral'] * 250

In [12]:
model_name = 't5-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [13]:
encoded_inputs = tokenizer(input_sentences, padding=True, truncation=True, return_tensors="pt")
encoded_outputs = tokenizer(output_sentences, padding=True, truncation=True, return_tensors="pt")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [14]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [15]:
x_train, x_test, y_train, y_test = train_test_split(encoded_inputs["input_ids"], encoded_outputs["input_ids"], test_size=0.2, random_state=0)


In [16]:
x_train, x_test = torch.tensor(x_train).to(device), torch.tensor(x_test).to(device)
y_train, y_test = torch.tensor(y_train).to(device), torch.tensor(y_test).to(device)

  x_train, x_test = torch.tensor(x_train).to(device), torch.tensor(x_test).to(device)
  y_train, y_test = torch.tensor(y_train).to(device), torch.tensor(y_test).to(device)


In [17]:
train_dataset = TensorDataset(x_train, y_train)
test_dataset = TensorDataset(x_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)


In [18]:
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=5e-5)


In [19]:
criterion = nn.CrossEntropyLoss()


In [20]:
epochs = 3
for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    for batch in train_loader:
        input_ids, output_ids = batch

        input_ids = input_ids.to(device)
        output_ids = output_ids.to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, labels=output_ids)
        logits = outputs.logits

        loss = criterion(logits.view(-1, logits.size(-1)), output_ids.view(-1))
        loss.backward()

        optimizer.step()

        train_loss += loss.item()

    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss / len(train_loader):.4f}")


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1/3, Train Loss: 1.8019
Epoch 2/3, Train Loss: 0.3919
Epoch 3/3, Train Loss: 0.2041


In [22]:
x_train

tensor([[ 4501,  4921,     8,  ...,     0,     0,     0],
        [ 4501,  4921,     8,  ...,     0,     0,     0],
        [  419, 17504,    48,  ...,     0,     0,     0],
        ...,
        [ 4501,  4921,     8,  ...,     0,     0,     0],
        [ 4501,  4921,     8,  ...,     0,     0,     0],
        [ 4501,  4921,     8,  ...,     0,     0,     0]], device='cuda:0')

In [32]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [61]:
import torch
import bert_score
from nltk.translate.meteor_score import meteor_score

model.eval()

predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, output_ids = batch

        input_ids = input_ids.to(device)
        output_ids = output_ids.to(device)

        outputs = model(input_ids=input_ids, labels=output_ids)
        logits = outputs.logits
        loss = outputs.loss

        predicted_ids = torch.argmax(logits, dim=-1)

        predictions.extend(predicted_ids.cpu().numpy())
        true_labels.extend(output_ids.cpu().numpy())

predicted_texts = tokenizer.batch_decode(predictions, skip_special_tokens=True)
true_texts = tokenizer.batch_decode(true_labels, skip_special_tokens=True)

tox_predicted = predicted_texts[:500]
tox_true = true_texts[:500]

bert_precision, bert_recall, bert_f1 = bert_score.score(tox_predicted, tox_true, lang='en')

from nltk.tokenize import word_tokenize

tokenized_tox_predicted = [word_tokenize(pred) for pred in tox_predicted]
tokenized_tox_true = [word_tokenize(true) for true in tox_true]

meteor_scores = [meteor_score([true], pred) for true, pred in zip(tokenized_tox_true, tokenized_tox_predicted)]

avg_meteor = sum(meteor_scores) / len(meteor_scores)
avg_bert_f1 = sum(bert_f1) / len(bert_f1)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [62]:

print(f"Detoxification Metrics:")
print(f"Average BERTScore F1: {avg_bert_f1:.4f}")
print(f"Average METEOR Score: {avg_meteor:.4f}")


Detoxification Metrics:
Average BERTScore F1: 0.9343
Average METEOR Score: 0.5818


In [65]:
input_text = f"Classify the following text into toxic or neutral: {sentences[999]}"
print(input_text)
encoded_input = tokenizer(input_text, return_tensors='pt').to(device)

model.eval()
with torch.no_grad():
    outputs = model(input_ids=encoded_input['input_ids'], labels=encoded_input['input_ids'])
    logits = outputs.logits

    predicted_text = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)
    print(f"Predicted text: {predicted_text}")



Classify the following text into toxic or neutral: exploiting it and pumping it into the atmosphere is idiotic at best .
Predicted text: ify the following text into toxic or neutral: exploiting it and pumping it into the atmosphere is idiotic at best .


# Моделот само ги повтара оригиналните реченици кога се тестира за класификација, додека за трансформација добива добри резултати