In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from googletrans import Translator
from transformers import Trainer, TrainingArguments, T5Tokenizer, T5ForConditionalGeneration

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

## **Prueba traductor**

In [2]:
trans = Translator()
print(trans.translate('Pues hazme los deberes, furcia barata!').text)

Well, do my homework for me, you cheap whore!


## **Dataset**

In [3]:
df = pd.DataFrame(pd.read_csv('3_joined_dataset.csv'))
train_dataframe, val_dataframe = train_test_split(df, train_size=0.75, random_state=42) # random_state=42 for reproducibility

train_toxic_texts = list(train_dataframe['toxic_sentence'])
train_neutral_texts = list(train_dataframe['neutral_sentence'])

val_toxic_texts = list(val_dataframe['toxic_sentence'])
val_neutral_texts = list(val_dataframe['neutral_sentence'])

## **Detoxificador**

In [None]:
model_name = "google/flan-t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Congelar todos los parámetros del modelo
for param in model.parameters():
    param.requires_grad = False

# Descongelar las últimas 3 capas del decoder
for param in model.decoder.block[-3:].parameters():
    param.requires_grad = True

# Mantener la capa de salida (`lm_head`) entrenable
for param in model.lm_head.parameters():
    param.requires_grad = True

total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Total de parámetros: {total_params:,}")
print(f"Parámetros entrenables: {trainable_params:,}")
print(f"Parámetros congelados: {total_params - trainable_params:,}")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Total de parámetros: 76,961,152
Parámetros entrenables: 25,891,328
Parámetros congelados: 51,069,824


In [5]:
class DetoxDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, target_encodings):
        self.encodings = encodings
        self.target_encodings = target_encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # item['labels'] = torch.tensor(self.target_encodings['input_ids'][idx])
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['labels'] = self.target_encodings['input_ids'][idx].clone().detach()
        return item

In [7]:
toxic_encodings_train = tokenizer(train_toxic_texts, truncation=True, padding=True, return_tensors="pt")
neutral_encodings_train = tokenizer(train_neutral_texts, truncation=True, padding=True, return_tensors="pt")

toxic_encodings_val = tokenizer(val_toxic_texts, truncation=True, padding=True, return_tensors="pt")
neutral_encodings_val = tokenizer(val_neutral_texts, truncation=True, padding=True, return_tensors="pt")

train_dataset = DetoxDataset(toxic_encodings_train, neutral_encodings_train)
val_dataset = DetoxDataset(toxic_encodings_val, neutral_encodings_val)

training_args = TrainingArguments(
    output_dir='./checkpoints',
    evaluation_strategy="no", # NO validar en cada epoch
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    learning_rate=2e-4,
    num_train_epochs=5,
    report_to=["none"], # para no pedir login de 'wandb' y otros
    fp16=True, # acelerar entrenaminento 
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [8]:
trainer.train()

  0%|          | 0/685 [00:00<?, ?it/s]

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 0.0002, 'epoch': 3.65}
{'train_runtime': 173.9283, 'train_samples_per_second': 503.253, 'train_steps_per_second': 3.938, 'train_loss': 0.0, 'epoch': 5.0}


TrainOutput(global_step=685, training_loss=0.0, metrics={'train_runtime': 173.9283, 'train_samples_per_second': 503.253, 'train_steps_per_second': 3.938, 'train_loss': 0.0, 'epoch': 5.0})

## **GUARDAR Y CARGAR**

In [9]:
save_directory = "./modelo_local"

model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

# model = T5ForConditionalGeneration.from_pretrained(save_directory)
# tokenizer = T5Tokenizer.from_pretrained(save_directory)

('./modelo_local\\tokenizer_config.json',
 './modelo_local\\special_tokens_map.json',
 './modelo_local\\spiece.model',
 './modelo_local\\added_tokens.json')

## **PRUEBAS DETOX**

In [10]:
def detoxify_sentence(text: str):
    # Tokenizar la oración tóxica
    inputs: dict = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Mover los tensores a la GPU, si está disponible
    inputs = {key: val.to(device) for key, val in inputs.items()}

    outputs = model.generate(inputs["input_ids"])
    texto_neutralizado = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return texto_neutralizado

In [15]:
test_sentences = [
    "Well, do my homework, you cheap whore.",
    "shut your angry ass up",
]

for sentence in test_sentences:
    neutral_sentence = detoxify_sentence(sentence)
    print(f"Original: {sentence} -> Neutral: {neutral_sentence}") 

Original: Well, do my homework, you cheap whore. -> Neutral: i'm not a student
Original: shut your angry ass up -> Neutral: i'm gonna be a bit shit up


---

---

## **BUCLE DE TRADUCCIÓN** (26 mins)

In [23]:
csv = pd.read_csv('1_detox_dataset.csv')
df = pd.DataFrame(csv)

trans = Translator()
# print(trans.translate(df.iloc[400]['toxic_sentence'], dest='en').text)
print(trans.translate("ታምራት ነገራ ፅንፈኞችን ከላይ እስከታች ሰብስበህ ሁለት ሰንበት ፀበልና ስልጠና ስጥልን", dest='en').text)

Tamrat Negara gathered the extremists from top to bottom and gave us two Sabbath prayers and training.


---

In [None]:
df = pd.DataFrame(pd.read_csv("1_detox_dataset.csv"))
df_trans = pd.DataFrame(columns=["toxic_sentence", "neutral_sentence"])
traductor = Translator()

for index in range(len(df)):
    toxic_translated = traductor.translate(df.iloc[index]['toxic_sentence'], dest='en').text
    neutral_translated = traductor.translate(df.iloc[index]['neutral_sentence'], dest='en').text
    df_trans.loc[len(df_trans)] = [toxic_translated, neutral_translated]
    # break

df_trans.to_csv("2_detox_dataset_trans.csv", index=False)