# Librerías

In [1]:
# %pip (instead of !pip) installs into the environment of the running kernel,
# not into whichever `pip` happens to be first on the shell PATH.
# NOTE(review): versions are not pinned — pin them once a working set is known.
%pip install datasets
%pip install sacremoses
%pip install sacrebleu
%pip install evaluate
%pip install transformers[sentencepiece]
%pip install transformers[torch]



In [2]:
import pandas as pd
import numpy as np
import tqdm
import sys
import os

In [3]:
import os
# Expose only GPUs 2 and 3 to this process (machine-specific, hard-coded indices).
# NOTE(review): presumably this has to run before any CUDA-using library is
# initialised for it to take effect — confirm.
os.environ["CUDA_VISIBLE_DEVICES"]="2,3"

In [4]:
from datasets import load_dataset, DatasetDict, Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from transformers import Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM
from transformers import EarlyStoppingCallback
from transformers import Seq2SeqTrainer

import numpy as np
import pickle
import evaluate

2023-11-28 14:41:08.381983: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-28 14:41:08.518604: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-28 14:41:09.116527: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-11-28 14:41:09.116585: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

# Funciones auxiliares

In [6]:
def preprocess_dataset(path_dataset: str, lang_output: str):
  """
  Read a parallel-corpus CSV and return it as a train/test/validation DatasetDict.

  The CSV columns 'esp', 'wayuu' and 'arh' are renamed (when present) to 'es',
  `lang_output` and `lang_output` respectively, so the non-Spanish sentence is
  exposed under whatever key the caller chooses. Every row then becomes one
  translation record of the form {'es': ..., lang_output: ...}.

  Args:
    path_dataset: path of the CSV file to load.
    lang_output: key under which the non-Spanish sentence is stored in each record.

  Returns:
    DatasetDict with the splits "train", "test" and "validation". The data is
    first split 80/20, and the 20% part is then split in half (first half ->
    validation, second half -> test). Each split has an "id" column numbered
    from 0 within the split and a "translation" column holding the records.

  Raises:
    KeyError: if the 'es' or `lang_output` column is missing after renaming.
  """
  # Read the data; rename into a new frame instead of mutating in place
  frame = pd.read_csv(path_dataset)
  column_map = {'esp': 'es', 'wayuu': lang_output, 'arh': lang_output}
  frame = frame.rename(columns=column_map)

  # One vectorised call builds all records (no per-row iterrows() overhead)
  records = frame[['es', lang_output]].to_dict('records')

  # Train / validation / test split (fixed seed for reproducibility)
  train, holdout = train_test_split(records, test_size = 0.2, random_state = 42)
  val, test = train_test_split(holdout, test_size = 0.5, random_state = 42)

  def _to_dataset(split_records):
    # Wrap a list of records as a Dataset with a running id per split
    return Dataset.from_dict({"id": list(range(len(split_records))), "translation": split_records})

  dataset_dict = DatasetDict({
    "train": _to_dataset(train),
    "test": _to_dataset(test),
    "validation": _to_dataset(val),
  })

  return dataset_dict

def tokenizar(dataset_dict, model_checkpoint, max_length = 150,
              source_lang = "fi", target_lang = "es"):
  """
  Tokenize every split of a DatasetDict with the tokenizer of `model_checkpoint`.

  Each example is expected to carry a "translation" dict; the text under
  `source_lang` is tokenized as model input and the text under `target_lang`
  as the target (labels). Both are truncated to `max_length` tokens.

  Args:
    dataset_dict: DatasetDict with (at least) a "train" split; its column
      names are the columns removed from the tokenized output.
    model_checkpoint: identifier of the pretrained model whose tokenizer is loaded.
    max_length: maximum number of tokens kept per sentence.
    source_lang: key of the input sentence inside each "translation" record.
      Defaults to "fi", the key that was previously hard-coded.
    target_lang: key of the target sentence inside each "translation" record.
      Defaults to "es", the key that was previously hard-coded.

  Returns:
    Tuple (tokenized_datasets, tokenizer).
  """
  # Load the tokenizer. `return_tensors` is an argument of the tokenizer call,
  # not of from_pretrained, so it is no longer passed here.
  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

  def preprocess_function(examples):
    pairs = examples["translation"]
    inputs = [pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]
    return tokenizer(
      inputs, text_target=targets, max_length=max_length, truncation=True
    )

  # Tokenize all splits, dropping the original (untokenized) columns
  tokenized_datasets = dataset_dict.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset_dict["train"].column_names,
  )

  return tokenized_datasets, tokenizer

In [7]:
def get_model(tokenized_datasets, tokenizer, model_checkpoint,
              learning_rate = 2e-5, epochs = 3, weight_decay = 0.01):
  """
  Build a Seq2SeqTrainer that fine-tunes `model_checkpoint` and scores it with sacreBLEU.

  A fresh copy of the pretrained model is loaded on every call, so successive
  calls do not share weights. Evaluation and checkpoint saving during training
  are both disabled ("no"); metrics are only produced when the caller invokes
  `trainer.evaluate()`. No early-stopping callback is registered.

  Args:
    tokenized_datasets: mapping with "train" and "validation" tokenized splits.
    tokenizer: tokenizer matching `model_checkpoint` (used for collation and decoding).
    model_checkpoint: identifier of the pretrained seq2seq model to load.
    learning_rate: optimizer learning rate.
    epochs: number of training epochs.
    weight_decay: weight-decay coefficient.

  Returns:
    A configured (not yet trained) Seq2SeqTrainer.
  """
  model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
  data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

  metric = evaluate.load("sacrebleu")

  def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
      preds = preds[0]

    # -100 marks ignored/padded positions and cannot be decoded; it can show up
    # in the predictions as well as in the labels, so replace it in both
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Simple post-processing; the metric takes a list of references per prediction
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

  args = Seq2SeqTrainingArguments(
    "marian-finetuned-kde4-fi-to-es",
    evaluation_strategy= "no",
    save_strategy="no", # "epoch"
    learning_rate = learning_rate,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=weight_decay,
    save_total_limit=3,
    num_train_epochs=epochs,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    # NOTE(review): with evaluation and saving both disabled there are no
    # checkpoints to choose from, so this presumably has no effect — confirm
    load_best_model_at_end = True
  )

  trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
  )

  return trainer

# Parámetros

In [8]:
# Input data root and output directory (relative paths)
path = 'data'
path_out = 'results'

# Search grid: dataset files (each is appended to `path`) and the
# epoch / learning-rate values that are tried for every dataset
params = {
    'dataset': ['/wayuu/COMP_ND.csv', '/wayuu/COMP_NDU.csv', '/wayuu/COMP_NC.csv', '/wayuu/COMP.csv'],
    'epochs': [3, 5, 10],
    'learning_rate' : [2e-5, 2e-4]
}

# Pretrained checkpoint that gets fine-tuned
model_checkpoint = 'Helsinki-NLP/opus-mt-fi-es'

# Passed to trainer.evaluate() in the experiment cells; tokenizar() is called
# there without max_length and therefore uses its own default instead
max_length = 128

In [9]:
d = params['dataset'][0]

# Dataset name (file stem) used in the output file names. Derived once, into
# its own variable, instead of overwriting `d` on every loop iteration.
dataset_name = d.split('/')[-1].split('.')[0]

# Load, split and tokenize the data once; every run below reuses it.
# NOTE(review): tokenizar() is called without max_length, so it truncates at its
# default (150) while evaluation below generates with max_length (128) — confirm
# this mismatch is intended.
dataset_dict = preprocess_dataset(path + d, lang_output = 'fi')
tokenized_dataset, tokenizer = tokenizar(dataset_dict, model_checkpoint)

# Grid over epochs x learning rate
for e in params['epochs']:
    for lr in params['learning_rate']:

        # Build a trainer (get_model loads the pretrained checkpoint again on each call)
        trainer = get_model(tokenized_dataset, tokenizer, model_checkpoint,
          learning_rate = lr, epochs = e)

        # Baseline (pre-training) metrics are intentionally not collected; to get
        # them, call trainer.evaluate(max_length = max_length) here before training.

        # Train and save the fine-tuned model
        trainer.train()
        trainer.save_model(path_out + f'/modelo_{dataset_name}_{e}_{lr}_REVES')

        # Metrics after training, persisted next to the model
        metrics2 = trainer.evaluate(max_length = max_length)
        with open(path_out + f'/metrica_despues_{dataset_name}_{e}_{lr}_REVES.pickle', 'wb') as file:
          pickle.dump(metrics2, file)

        print(metrics2)

Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7151/7151 [00:03<00:00, 1984.06 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 894/894 [00:00<00:00, 1786.24 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 894/894 [00:00<00:00, 2032.57 examples/s]


Step,Training Loss




{'eval_loss': 3.674457311630249, 'eval_bleu': 0.8020157512245715, 'eval_runtime': 47.2963, 'eval_samples_per_second': 18.902, 'eval_steps_per_second': 0.148, 'epoch': 3.0}


Step,Training Loss




{'eval_loss': 3.0449492931365967, 'eval_bleu': 3.5821907352866433, 'eval_runtime': 42.0343, 'eval_samples_per_second': 21.268, 'eval_steps_per_second': 0.167, 'epoch': 3.0}


Step,Training Loss
500,3.7947




{'eval_loss': 3.5203258991241455, 'eval_bleu': 1.2808105990353378, 'eval_runtime': 44.6812, 'eval_samples_per_second': 20.008, 'eval_steps_per_second': 0.157, 'epoch': 5.0}


Step,Training Loss
500,2.9025




{'eval_loss': 2.891594409942627, 'eval_bleu': 5.196829974673048, 'eval_runtime': 41.006, 'eval_samples_per_second': 21.802, 'eval_steps_per_second': 0.171, 'epoch': 5.0}


Step,Training Loss
500,3.7425
1000,3.2617




{'eval_loss': 3.287564992904663, 'eval_bleu': 2.6178198763210125, 'eval_runtime': 42.5402, 'eval_samples_per_second': 21.015, 'eval_steps_per_second': 0.165, 'epoch': 10.0}


Step,Training Loss
500,2.8582
1000,1.6458




{'eval_loss': 2.8723933696746826, 'eval_bleu': 8.298034099332504, 'eval_runtime': 40.6706, 'eval_samples_per_second': 21.981, 'eval_steps_per_second': 0.172, 'epoch': 10.0}


In [10]:
d = params['dataset'][1]

# Dataset name (file stem) used in the output file names. Derived once, into
# its own variable, instead of overwriting `d` on every loop iteration.
dataset_name = d.split('/')[-1].split('.')[0]

# Load, split and tokenize the data once; every run below reuses it.
# NOTE(review): tokenizar() is called without max_length, so it truncates at its
# default (150) while evaluation below generates with max_length (128) — confirm
# this mismatch is intended.
dataset_dict = preprocess_dataset(path + d, lang_output = 'fi')
tokenized_dataset, tokenizer = tokenizar(dataset_dict, model_checkpoint)

# Grid over epochs x learning rate
for e in params['epochs']:
    for lr in params['learning_rate']:

        # Build a trainer (get_model loads the pretrained checkpoint again on each call)
        trainer = get_model(tokenized_dataset, tokenizer, model_checkpoint,
          learning_rate = lr, epochs = e)

        # Baseline (pre-training) metrics are intentionally not collected; to get
        # them, call trainer.evaluate(max_length = max_length) here before training.

        # Train and save the fine-tuned model
        trainer.train()
        trainer.save_model(path_out + f'/modelo_{dataset_name}_{e}_{lr}_REVES')

        # Metrics after training, persisted next to the model
        metrics2 = trainer.evaluate(max_length = max_length)
        with open(path_out + f'/metrica_despues_{dataset_name}_{e}_{lr}_REVES.pickle', 'wb') as file:
          pickle.dump(metrics2, file)

        print(metrics2)

Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 63398/63398 [00:12<00:00, 5145.55 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7925/7925 [00:01<00:00, 5207.38 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7925/7925 [00:01<00:00, 5173.24 examples/s]


Step,Training Loss
500,3.9672
1000,3.5456
1500,3.3335
2000,3.2529
2500,3.1678




{'eval_loss': 3.097890853881836, 'eval_bleu': 3.671259436569692, 'eval_runtime': 313.8695, 'eval_samples_per_second': 25.249, 'eval_steps_per_second': 0.198, 'epoch': 3.0}


Step,Training Loss
500,3.6003
1000,3.0733
1500,2.6252
2000,2.5517
2500,2.2468




{'eval_loss': 2.589311122894287, 'eval_bleu': 8.816313070329533, 'eval_runtime': 273.3427, 'eval_samples_per_second': 28.993, 'eval_steps_per_second': 0.227, 'epoch': 3.0}


Step,Training Loss
500,3.9626
1000,3.5293
1500,3.2992
2000,3.2018
2500,3.0837
3000,3.0293
3500,2.9558
4000,2.937
4500,2.8998




{'eval_loss': 2.9202771186828613, 'eval_bleu': 5.624198988662807, 'eval_runtime': 301.1759, 'eval_samples_per_second': 26.314, 'eval_steps_per_second': 0.206, 'epoch': 5.0}


Step,Training Loss
500,3.6009
1000,3.0814
1500,2.6444
2000,2.5869
2500,2.2608
3000,2.2358
3500,1.9804
4000,1.9709
4500,1.788




{'eval_loss': 2.5654776096343994, 'eval_bleu': 9.51608827947945, 'eval_runtime': 255.5792, 'eval_samples_per_second': 31.008, 'eval_steps_per_second': 0.243, 'epoch': 5.0}


Step,Training Loss
500,3.9592
1000,3.5174
1500,3.2757
2000,3.1684
2500,3.0312
3000,2.9665
3500,2.8673
4000,2.8375
4500,2.7662
5000,2.7432




{'eval_loss': 2.7195394039154053, 'eval_bleu': 7.519504205930042, 'eval_runtime': 277.1831, 'eval_samples_per_second': 28.591, 'eval_steps_per_second': 0.224, 'epoch': 10.0}


Step,Training Loss
500,3.6063
1000,3.0968
1500,2.6691
2000,2.6266
2500,2.2987
3000,2.296
3500,2.0172
4000,2.0352
4500,1.7928
5000,1.8104




{'eval_loss': 2.754638671875, 'eval_bleu': 10.353825976866347, 'eval_runtime': 247.4774, 'eval_samples_per_second': 32.023, 'eval_steps_per_second': 0.251, 'epoch': 10.0}


In [11]:
d = params['dataset'][2]

# Dataset name (file stem) used in the output file names. Derived once, into
# its own variable, instead of overwriting `d` on every loop iteration.
dataset_name = d.split('/')[-1].split('.')[0]

# Load, split and tokenize the data once; every run below reuses it.
# NOTE(review): tokenizar() is called without max_length, so it truncates at its
# default (150) while evaluation below generates with max_length (128) — confirm
# this mismatch is intended.
dataset_dict = preprocess_dataset(path + d, lang_output = 'fi')
tokenized_dataset, tokenizer = tokenizar(dataset_dict, model_checkpoint)

# Grid over epochs x learning rate
for e in params['epochs']:
    for lr in params['learning_rate']:

        # Build a trainer (get_model loads the pretrained checkpoint again on each call)
        trainer = get_model(tokenized_dataset, tokenizer, model_checkpoint,
          learning_rate = lr, epochs = e)

        # Baseline (pre-training) metrics are intentionally not collected; to get
        # them, call trainer.evaluate(max_length = max_length) here before training.

        # Train and save the fine-tuned model
        trainer.train()
        trainer.save_model(path_out + f'/modelo_{dataset_name}_{e}_{lr}_REVES')

        # Metrics after training, persisted next to the model
        metrics2 = trainer.evaluate(max_length = max_length)
        with open(path_out + f'/metrica_despues_{dataset_name}_{e}_{lr}_REVES.pickle', 'wb') as file:
          pickle.dump(metrics2, file)

        print(metrics2)

Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 65027/65027 [00:12<00:00, 5188.07 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8129/8129 [00:01<00:00, 5209.69 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8128/8128 [00:01<00:00, 5311.46 examples/s]


Step,Training Loss
500,3.9694
1000,3.5565
1500,3.3365
2000,3.259
2500,3.1688
3000,3.1477




{'eval_loss': 3.0837199687957764, 'eval_bleu': 3.7172321488601896, 'eval_runtime': 323.9203, 'eval_samples_per_second': 25.093, 'eval_steps_per_second': 0.198, 'epoch': 3.0}


Step,Training Loss
500,3.6073
1000,3.0894
1500,2.6294
2000,2.562
2500,2.2486
3000,2.2191




{'eval_loss': 2.570875644683838, 'eval_bleu': 8.85820271815254, 'eval_runtime': 275.4117, 'eval_samples_per_second': 29.512, 'eval_steps_per_second': 0.232, 'epoch': 3.0}


Step,Training Loss
500,3.965
1000,3.5404
1500,3.3028
2000,3.2102
2500,3.0864
3000,3.0448
3500,2.9689
4000,2.9306
4500,2.8975
5000,2.8843




{'eval_loss': 2.903745174407959, 'eval_bleu': 5.691598195199764, 'eval_runtime': 306.8195, 'eval_samples_per_second': 26.491, 'eval_steps_per_second': 0.209, 'epoch': 5.0}


Step,Training Loss
500,3.6104
1000,3.1021
1500,2.6523
2000,2.604
2500,2.269
3000,2.2626
3500,2.0069
4000,1.9779
4500,1.802
5000,1.7791




{'eval_loss': 2.55483341217041, 'eval_bleu': 9.702644722599882, 'eval_runtime': 264.2101, 'eval_samples_per_second': 30.763, 'eval_steps_per_second': 0.242, 'epoch': 5.0}


Step,Training Loss
500,3.9617
1000,3.5289
1500,3.2798
2000,3.1782
2500,3.0347
3000,2.9848
3500,2.8831
4000,2.8348
4500,2.7674
5000,2.7426




{'eval_loss': 2.7007975578308105, 'eval_bleu': 7.4391912864367065, 'eval_runtime': 276.5354, 'eval_samples_per_second': 29.392, 'eval_steps_per_second': 0.231, 'epoch': 10.0}


Step,Training Loss
500,3.6222
1000,3.1174
1500,2.677
2000,2.6412
2500,2.3047
3000,2.3226
3500,2.0408
4000,2.0451
4500,1.8097
5000,1.8247




{'eval_loss': 2.7529571056365967, 'eval_bleu': 10.415236782448906, 'eval_runtime': 258.615, 'eval_samples_per_second': 31.429, 'eval_steps_per_second': 0.247, 'epoch': 10.0}


In [12]:
d = params['dataset'][3]

# Dataset name (file stem) used in the output file names. Derived once, into
# its own variable, instead of overwriting `d` on every loop iteration.
dataset_name = d.split('/')[-1].split('.')[0]

# Load, split and tokenize the data once; every run below reuses it.
# NOTE(review): tokenizar() is called without max_length, so it truncates at its
# default (150) while evaluation below generates with max_length (128) — confirm
# this mismatch is intended.
dataset_dict = preprocess_dataset(path + d, lang_output = 'fi')
tokenized_dataset, tokenizer = tokenizar(dataset_dict, model_checkpoint)

# Grid over epochs x learning rate
for e in params['epochs']:
    for lr in params['learning_rate']:

        # Build a trainer (get_model loads the pretrained checkpoint again on each call)
        trainer = get_model(tokenized_dataset, tokenizer, model_checkpoint,
          learning_rate = lr, epochs = e)

        # Baseline (pre-training) metrics are intentionally not collected; to get
        # them, call trainer.evaluate(max_length = max_length) here before training.

        # Train and save the fine-tuned model
        trainer.train()
        trainer.save_model(path_out + f'/modelo_{dataset_name}_{e}_{lr}_REVES')

        # Metrics after training, persisted next to the model
        metrics2 = trainer.evaluate(max_length = max_length)
        with open(path_out + f'/metrica_despues_{dataset_name}_{e}_{lr}_REVES.pickle', 'wb') as file:
          pickle.dump(metrics2, file)

        print(metrics2)

Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 65132/65132 [00:12<00:00, 5225.43 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8142/8142 [00:01<00:00, 5208.12 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8141/8141 [00:01<00:00, 5452.08 examples/s]


Step,Training Loss
500,3.9621
1000,3.5657
1500,3.344
2000,3.2565
2500,3.1702
3000,3.141




{'eval_loss': 3.1045448780059814, 'eval_bleu': 3.625051695401511, 'eval_runtime': 319.0478, 'eval_samples_per_second': 25.517, 'eval_steps_per_second': 0.201, 'epoch': 3.0}


Step,Training Loss
500,3.5902
1000,3.0952
1500,2.6331
2000,2.56
2500,2.2552
3000,2.2119




{'eval_loss': 2.5863845348358154, 'eval_bleu': 8.744490137366364, 'eval_runtime': 274.4765, 'eval_samples_per_second': 29.66, 'eval_steps_per_second': 0.233, 'epoch': 3.0}


Step,Training Loss
500,3.9576
1000,3.5494
1500,3.3103
2000,3.2074
2500,3.0887
3000,3.0377
3500,2.966
4000,2.9438
4500,2.8932
5000,2.8906




{'eval_loss': 2.9274864196777344, 'eval_bleu': 5.442678755107879, 'eval_runtime': 292.7428, 'eval_samples_per_second': 27.809, 'eval_steps_per_second': 0.219, 'epoch': 5.0}


Step,Training Loss
500,3.5924
1000,3.1093
1500,2.6567
2000,2.6031
2500,2.2733
3000,2.2536
3500,2.0001
4000,1.9882
4500,1.7998
5000,1.7823




{'eval_loss': 2.566084384918213, 'eval_bleu': 9.755120148458026, 'eval_runtime': 264.7892, 'eval_samples_per_second': 30.745, 'eval_steps_per_second': 0.242, 'epoch': 5.0}


Step,Training Loss
500,3.9543
1000,3.5382
1500,3.2875
2000,3.1754
2500,3.0376
3000,2.9777
3500,2.8798
4000,2.8483
4500,2.763
5000,2.7491




{'eval_loss': 2.7241714000701904, 'eval_bleu': 7.161899754884128, 'eval_runtime': 263.5224, 'eval_samples_per_second': 30.893, 'eval_steps_per_second': 0.243, 'epoch': 10.0}


Step,Training Loss
500,3.5958
1000,3.1189
1500,2.6788
2000,2.6408
2500,2.3109
3000,2.3149
3500,2.0366
4000,2.0562
4500,1.8074
5000,1.8276




{'eval_loss': 2.7642598152160645, 'eval_bleu': 10.414354893151309, 'eval_runtime': 248.6073, 'eval_samples_per_second': 32.746, 'eval_steps_per_second': 0.257, 'epoch': 10.0}
