# Librerías

In [1]:
!pip install datasets
!pip install sacremoses
!pip install sacrebleu
!pip install evaluate
!pip install transformers[sentencepiece]
!pip install transformers[torch]

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     

In [2]:
import pandas as pd
import numpy as np
import tqdm
import sys
import os

In [3]:
from datasets import load_dataset, DatasetDict, Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import pandas as pd

In [12]:
from transformers import Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM
from transformers import EarlyStoppingCallback
from transformers import Seq2SeqTrainer

import numpy as np
import pickle
import evaluate

In [5]:
# Mount Google Drive at /content/drive; the data and results folders configured
# further down in this notebook are located under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Funciones auxiliares

In [33]:
def preprocess_dataset(path_dataset: str, lang_output: str):
  """
  Read a parallel-corpus CSV and return it as a DatasetDict split into
  train / test / validation (80% / 10% / 10%, fixed random_state = 42).

  Every example has the form
  {"id": <int>, "translation": {"es": <sentence>, lang_output: <sentence>}}.
  CSV columns named 'esp', 'wayuu' or 'arh' are renamed to 'es' / lang_output
  before the pairs are built. Rows with a missing sentence on either side are
  dropped, because they cannot be tokenized as text later on.

  input:
  - path_dataset: path of the CSV file to process
  - lang_output: key under which the non-Spanish sentence is stored in each
    "translation" dict (the 'wayuu' / 'arh' column is renamed to this value)

  output:
  - dataset_dict: DatasetDict with "train", "test" and "validation" splits

  raises:
  - KeyError: if, after renaming, the CSV lacks the 'es' or lang_output column
  """
  # Read the data and normalise the column names (rename returns a new frame)
  conv = {'esp': 'es', 'wayuu': lang_output, 'arh': lang_output}
  dataset = pd.read_csv(path_dataset).rename(columns = conv)

  missing = [col for col in ('es', lang_output) if col not in dataset.columns]
  if missing:
    raise KeyError(f"Columns {missing} not found in {path_dataset}")

  # One {"es": ..., lang_output: ...} dict per row, built without a Python-level row loop
  pairs = dataset[['es', lang_output]].dropna().to_dict(orient = 'records')

  # Train / test / validation split: 20% is held out, then halved into validation and test
  train, held_out = train_test_split(pairs, test_size = 0.2, random_state = 42)
  val, test = train_test_split(held_out, test_size = 0.5, random_state = 42)

  # Build one Dataset per split; ids are positional within each split
  train = Dataset.from_dict({"id": list(range(len(train))), "translation": train})
  test = Dataset.from_dict({"id": list(range(len(test))), "translation": test})
  validation = Dataset.from_dict({"id": list(range(len(val))), "translation": val})

  # Bundle the three splits
  dataset_dict = DatasetDict({"train": train, "test": test, "validation": validation})

  return dataset_dict

def tokenizar(dataset_dict, model_checkpoint, max_length = 150, lang_output = 'fi'):
  """
  Tokenize every split of a DatasetDict with the tokenizer that belongs to
  the given model checkpoint.

  The Spanish sentence ("es") of each example is used as model input and the
  sentence stored under `lang_output` as the target text.

  input:
  - dataset_dict: DatasetDict whose examples carry a "translation" dict with
    the keys "es" and `lang_output`
  - model_checkpoint: identifier of the model whose tokenizer is loaded
  - max_length: maximum number of tokens kept per sentence (longer ones are truncated)
  - lang_output: key of the target sentence inside "translation"; it must match
    the value passed to preprocess_dataset. Defaults to 'fi', the key that was
    previously hard-coded here.

  output:
  - tokenized_datasets: the DatasetDict with the original columns replaced by
    the tokenizer output
  - tokenizer: the loaded tokenizer
  """
  # Load the tokenizer. `return_tensors` is an argument of the tokenization call,
  # not of from_pretrained, so it is no longer passed here.
  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

  def preprocess_function(examples):
      # `examples` is a batch: examples["translation"] is a list of dicts
      inputs = [ex["es"] for ex in examples["translation"]]
      targets = [ex[lang_output] for ex in examples["translation"]]
      model_inputs = tokenizer(
          inputs, text_target=targets, max_length=max_length, truncation=True
      )
      return model_inputs

  # Tokenize all splits, dropping the raw columns
  tokenized_datasets = dataset_dict.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset_dict["train"].column_names,
  )

  return tokenized_datasets, tokenizer

In [41]:
def get_model(tokenized_datasets, tokenizer, model_checkpoint,
              learning_rate = 2e-5, epochs = 3, weight_decay = 0.01,
              output_dir = "marian-finetuned-kde4-es-to-fi",
              train_batch_size = 32, eval_batch_size = 64):
  """
  Load a pretrained seq2seq model and wrap it in a Seq2SeqTrainer that reports
  the sacreBLEU score when evaluated.

  No evaluation or checkpointing happens during training (both strategies are
  "no"); call trainer.evaluate() / trainer.save_model() explicitly.

  input:
  - tokenized_datasets: DatasetDict with tokenized "train" and "validation" splits
  - tokenizer: tokenizer matching model_checkpoint
  - model_checkpoint: identifier of the pretrained model to load
  - learning_rate, epochs, weight_decay: optimisation hyper-parameters
  - output_dir: working directory handed to the training arguments
  - train_batch_size, eval_batch_size: per-device batch sizes

  output:
  - trainer: a configured, not yet trained, Seq2SeqTrainer
  """
  model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
  data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

  metric = evaluate.load("sacrebleu")

  def compute_metrics(eval_preds):
      preds, labels = eval_preds
      # In case the model returns more than the prediction logits
      if isinstance(preds, tuple):
          preds = preds[0]

      # -100 marks ignored positions and cannot be decoded. Labels always use it;
      # predictions gathered across batches may be padded with it as well, so both
      # arrays are mapped to the pad token before decoding.
      preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
      labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

      decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
      decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

      # Some simple post-processing; sacreBLEU expects a list of references per prediction
      decoded_preds = [pred.strip() for pred in decoded_preds]
      decoded_labels = [[label.strip()] for label in decoded_labels]

      result = metric.compute(predictions=decoded_preds, references=decoded_labels)
      return {"bleu": result["score"]}

  # Early stopping / best-model reloading are intentionally off: they need
  # evaluation_strategy and save_strategy set to the same non-"no" value
  # (e.g. "epoch"), load_best_model_at_end=True and an EarlyStoppingCallback
  # passed through `callbacks`.
  args = Seq2SeqTrainingArguments(
      output_dir,
      evaluation_strategy="no",
      save_strategy="no",
      learning_rate=learning_rate,
      per_device_train_batch_size=train_batch_size,
      per_device_eval_batch_size=eval_batch_size,
      weight_decay=weight_decay,
      save_total_limit=3,
      num_train_epochs=epochs,
      predict_with_generate=True,  # evaluate on generated text, needed for BLEU
      fp16=True,  # mixed precision; expects a GPU runtime such as Colab's
      push_to_hub=False,
      load_best_model_at_end=False,  # nothing is saved during training, so there is no checkpoint to reload
  )

  trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
  )

  return trainer

# Parámetros

In [42]:
# Pretrained checkpoint to fine-tune and the length limit passed to trainer.evaluate
model_checkpoint = 'Helsinki-NLP/opus-mt-es-fi'
max_length = 128

# Google Drive folders: experiment outputs and cleaned input data
path_out = '/content/drive/MyDrive/Colab Notebooks/Talleres NLP/Proyecto/results'
path = '/content/drive/MyDrive/Colab Notebooks/Talleres NLP/Proyecto/data_clean'

# Grid explored by the training loop: every dataset x epochs x learning rate.
# Dataset entries are appended to `path`, hence the leading slash.
params = dict(
    dataset = [
        '/wayuu/COMP_ND.csv',
        '/wayuu/COMP_NDU.csv',
        '/wayuu/COMP_NC.csv',
        '/wayuu/COMP.csv',
    ],
    epochs = [3, 5, 10],
    learning_rate = [2e-5, 2e-4],
)

In [None]:
# Grid search: one fine-tuning run per (dataset, epochs, learning rate) combination.
# Each run saves the trained model and a pickle with its validation metrics to path_out.
for d in params['dataset']:

  # Short dataset name used to label the outputs (file name without folder or extension).
  # Computed once per dataset and kept in its own variable so the loop variable `d`
  # is never overwritten.
  dataset_name = d.split('/')[-1].split('.')[0]

  # Preprocess and tokenize the data; the configured max_length is forwarded so that
  # tokenization and evaluation share the same length limit.
  dataset_dict = preprocess_dataset(path + d, lang_output = 'fi')
  tokenized_dataset, tokenizer = tokenizar(dataset_dict, model_checkpoint,
                                           max_length = max_length)

  for e in params['epochs']:
      for lr in params['learning_rate']:

          # Build a fresh model + trainer for this configuration
          trainer = get_model(tokenized_dataset, tokenizer, model_checkpoint,
            learning_rate = lr, epochs = e)

          # Baseline (pre-training) evaluation is currently disabled; to re-enable it,
          # call trainer.evaluate(max_length = max_length) here and pickle the result
          # to path_out + f'/metrica_antes_{dataset_name}_{e}_{lr}.pickle'.

          # Train and save the model
          trainer.train()
          trainer.save_model(path_out + f'/modelo_{dataset_name}_{e}_{lr}')

          # Metrics after training (on the trainer's eval dataset)
          metrics2 = trainer.evaluate(max_length = max_length)
          with open(path_out + f'/metrica_despues_{dataset_name}_{e}_{lr}.pickle', 'wb') as file:
            pickle.dump(metrics2, file)

          print(metrics2)