# Librerías

In [None]:
!pip install datasets
!pip install sacremoses
!pip install sacrebleu
!pip install evaluate
!pip install transformers[sentencepiece]
!pip install transformers[torch]

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/521.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/521.2 kB[0m [31m4.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━[0m [32m317.4/521.2 kB[0m [31m4.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-a

In [None]:
import pandas as pd
import numpy as np
import tqdm
import sys
import os

In [None]:
from datasets import load_dataset, DatasetDict, Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import pandas as pd

In [None]:
from transformers import Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM
from transformers import EarlyStoppingCallback
from transformers import Seq2SeqTrainer

import numpy as np
import pickle
import evaluate

In [None]:
# Mount Google Drive at /content/drive; the data and results paths used
# further down live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Funciones auxiliares

In [None]:
def preprocess_dataset(path_dataset: str, lang_output: str,
                       test_size: float = 0.2, random_state: int = 42):
  """
  Read a parallel corpus from a CSV file and split it into train,
  validation and test sets wrapped in a DatasetDict.

  The CSV column 'esp' is renamed to 'es'; a column called 'wayuu' or 'arh'
  is renamed to `lang_output`. Every example is stored under a
  "translation" field as {'es': ..., lang_output: ...}.

  input:
  - path_dataset: path of the CSV file to process
  - lang_output: key under which the 'wayuu' / 'arh' column is stored
  - test_size: fraction of the rows held out from training; the held-out
    rows are then split in half between validation and test
  - random_state: seed used for both splits

  output:
  - dataset_dict: DatasetDict with "train", "test" and "validation"

  raises:
  - KeyError: if the CSV lacks one of the required columns after renaming
  """
  # Read the data and normalise the column names (without mutating in place)
  corpus = pd.read_csv(path_dataset)
  column_map = {'esp': 'es', 'wayuu': lang_output, 'arh': lang_output}
  corpus = corpus.rename(columns = column_map)

  # Fail early with a readable message instead of a bare per-row KeyError
  missing = [col for col in ('es', lang_output) if col not in corpus.columns]
  if missing:
    raise KeyError(
      f"Missing column(s) {missing} in {path_dataset}; found {list(corpus.columns)}"
    )

  # One {'es': ..., lang_output: ...} dict per row, built in a single vectorised call
  pairs = corpus[['es', lang_output]].to_dict('records')

  # Train / held-out split, then the held-out part is halved into validation / test
  train, holdout = train_test_split(pairs, test_size = test_size, random_state = random_state)
  val, test = train_test_split(holdout, test_size = 0.5, random_state = random_state)

  def to_dataset(examples):
    # Each split gets its own 0-based ids
    return Dataset.from_dict({"id": list(range(len(examples))), "translation": examples})

  dataset_dict = DatasetDict({
    "train": to_dataset(train),
    "test": to_dataset(test),
    "validation": to_dataset(val),
  })

  return dataset_dict

def tokenizar(dataset_dict, model_checkpoint, max_length = 150,
              source_lang = "fi", target_lang = "es"):
  """
  Tokenize every split of a DatasetDict with the tokenizer that belongs to
  `model_checkpoint`.

  Each example is expected to carry a "translation" dict holding the source
  sentence under `source_lang` and the target sentence under `target_lang`.

  input:
  - dataset_dict: DatasetDict with the train, test and validation splits
  - model_checkpoint: identifier of the pretrained model whose tokenizer is loaded
  - max_length: maximum number of tokens kept per sentence (longer ones are truncated)
  - source_lang: key of the source sentence inside "translation" (default "fi")
  - target_lang: key of the target sentence inside "translation" (default "es")

  output:
  - tokenized_datasets: the DatasetDict with the original columns replaced
    by the tokenizer outputs
  - tokenizer: the loaded tokenizer
  """
  # Load the tokenizer. `return_tensors` is an argument of the tokenizer *call*,
  # not of `from_pretrained`, so it is no longer passed here.
  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

  def preprocess_function(examples):
    pairs = examples["translation"]
    inputs = [pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]
    # `text_target` makes the tokenizer encode the targets as labels
    return tokenizer(
      inputs, text_target=targets, max_length=max_length, truncation=True
    )

  # Tokenize in batches and drop the raw columns ("id", "translation")
  tokenized_datasets = dataset_dict.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset_dict["train"].column_names,
  )

  return tokenized_datasets, tokenizer

In [None]:
def get_model(tokenized_datasets, tokenizer, model_checkpoint,
              learning_rate = 2e-5, epochs = 3, weight_decay = 0.01,
              output_dir = "marian-finetuned-kde4-fi-to-es"):
  """
  Build a Seq2SeqTrainer that fine-tunes `model_checkpoint` on the "train"
  split and evaluates on the "validation" split with a sacreBLEU metric.

  No evaluation and no checkpointing happen during training
  (evaluation_strategy="no", save_strategy="no"); metrics are only produced
  when the caller invokes `trainer.evaluate()`.

  input:
  - tokenized_datasets: tokenized DatasetDict with "train" and "validation"
  - tokenizer: tokenizer matching `model_checkpoint`
  - model_checkpoint: identifier of the pretrained seq2seq model
  - learning_rate: learning rate passed to the training arguments
  - epochs: number of training epochs
  - weight_decay: weight decay passed to the training arguments
  - output_dir: working directory handed to Seq2SeqTrainingArguments

  output:
  - trainer: a configured, not yet trained, Seq2SeqTrainer
  """
  model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
  data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

  metric = evaluate.load("sacrebleu")

  def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # The model may return more than the predictions; keep the first element
    if isinstance(preds, tuple):
      preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # -100 marks ignored label positions and cannot be decoded: swap in the pad token
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # sacreBLEU takes a list of references per prediction
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

  args = Seq2SeqTrainingArguments(
    output_dir,
    evaluation_strategy="no",
    save_strategy="no",
    learning_rate=learning_rate,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=weight_decay,
    save_total_limit=3,
    num_train_epochs=epochs,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    # No checkpoint is ever written (save_strategy="no"), so there is no
    # "best model" to reload; the flag is off to keep the config honest.
    load_best_model_at_end=False,
  )

  # Early stopping is deliberately not attached: it relies on periodic
  # evaluation and saved checkpoints, and both are disabled above.
  trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
  )

  return trainer

# Parámetros

In [None]:
# Input folder (cleaned corpora) and output folder (models and metrics),
# both on the mounted Google Drive.
path = '/content/drive/MyDrive/Colab Notebooks/Talleres NLP/Proyecto/data_clean'
path_out = '/content/drive/MyDrive/Colab Notebooks/Talleres NLP/Proyecto/results'

# Experiment grid: dataset files (relative to `path`), epoch counts and
# learning rates that the training cells iterate over.
params = {
    'dataset': ['/arhuaco/COMP.csv', '/arhuaco/BIBLIA.csv', '/arhuaco/COMP_NC.csv'],
    'epochs': [3, 5, 10],
    'learning_rate' : [2e-5, 2e-4]
}

# Pretrained checkpoint used for both the tokenizer and the model.
model_checkpoint = 'Helsinki-NLP/opus-mt-fi-es'

# Maximum sequence length; the training cells pass it to trainer.evaluate(max_length=...).
max_length = 128

In [None]:
# Fine-tune on the first dataset for every (epochs, learning rate) combination.
dataset_file = params['dataset'][0]

# File name without folder or extension (e.g. 'COMP'), used in the output names.
# Derived once, before the loops, so the dataset path is never overwritten mid-sweep.
d = dataset_file.split('/')[-1].split('.')[0]

# Load and split the corpus. The indigenous-language column is stored under
# the key 'fi' because tokenizar reads the source sentences from that key.
dataset_dict = preprocess_dataset(path + dataset_file, lang_output = 'fi')

# Tokenize with the configured max_length so truncation matches the length
# later used in trainer.evaluate (the function default would be 150).
tokenized_dataset, tokenizer = tokenizar(dataset_dict, model_checkpoint,
                                         max_length = max_length)

for e in params['epochs']:
    for lr in params['learning_rate']:

        # A fresh model/trainer per configuration so runs do not share weights
        trainer = get_model(tokenized_dataset, tokenizer, model_checkpoint,
          learning_rate = lr, epochs = e)

        # Train and persist the fine-tuned model
        trainer.train()
        trainer.save_model(path_out + f'/arhuaco_modelo_{d}_{e}_{lr}_REVES')

        # Metrics after training, computed on the trainer's eval dataset
        metrics2 = trainer.evaluate(max_length = max_length)
        with open(path_out + f'/arhuaco_metrica_despues_{d}_{e}_{lr}_REVES.pickle', 'wb') as file:
          pickle.dump(metrics2, file)

        print(metrics2)

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/847k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/827k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.57M [00:00<?, ?B/s]

Map:   0%|          | 0/4588 [00:00<?, ? examples/s]

Map:   0%|          | 0/574 [00:00<?, ? examples/s]

Map:   0%|          | 0/573 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/309M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Step,Training Loss


{'eval_loss': 3.663532018661499, 'eval_bleu': 0.6863517490896596, 'eval_runtime': 64.0529, 'eval_samples_per_second': 8.946, 'eval_steps_per_second': 0.141, 'epoch': 3.0}


Step,Training Loss


{'eval_loss': 3.263406753540039, 'eval_bleu': 1.9984474898278526, 'eval_runtime': 62.5825, 'eval_samples_per_second': 9.156, 'eval_steps_per_second': 0.144, 'epoch': 3.0}


Step,Training Loss
500,3.7688


{'eval_loss': 3.5439751148223877, 'eval_bleu': 0.7665199640397524, 'eval_runtime': 65.1476, 'eval_samples_per_second': 8.795, 'eval_steps_per_second': 0.138, 'epoch': 5.0}


Step,Training Loss
500,3.0137


{'eval_loss': 3.239234685897827, 'eval_bleu': 2.684949197916594, 'eval_runtime': 60.2437, 'eval_samples_per_second': 9.511, 'eval_steps_per_second': 0.149, 'epoch': 5.0}


Step,Training Loss
500,3.7343
1000,3.2151


{'eval_loss': 3.3946340084075928, 'eval_bleu': 0.9387018592509492, 'eval_runtime': 66.8422, 'eval_samples_per_second': 8.572, 'eval_steps_per_second': 0.135, 'epoch': 10.0}


Step,Training Loss
500,3.0042
1000,1.6844


{'eval_loss': 3.5056161880493164, 'eval_bleu': 3.6185704653853294, 'eval_runtime': 59.8403, 'eval_samples_per_second': 9.575, 'eval_steps_per_second': 0.15, 'epoch': 10.0}


In [None]:
# Fine-tune on the second dataset for every (epochs, learning rate) combination.
dataset_file = params['dataset'][1]

# File name without folder or extension (e.g. 'BIBLIA'), used in the output names.
# Derived once, before the loops, so the dataset path is never overwritten mid-sweep.
d = dataset_file.split('/')[-1].split('.')[0]

# Load and split the corpus. The indigenous-language column is stored under
# the key 'fi' because tokenizar reads the source sentences from that key.
dataset_dict = preprocess_dataset(path + dataset_file, lang_output = 'fi')

# Tokenize with the configured max_length so truncation matches the length
# later used in trainer.evaluate (the function default would be 150).
tokenized_dataset, tokenizer = tokenizar(dataset_dict, model_checkpoint,
                                         max_length = max_length)

for e in params['epochs']:
    for lr in params['learning_rate']:

        # A fresh model/trainer per configuration so runs do not share weights
        trainer = get_model(tokenized_dataset, tokenizer, model_checkpoint,
          learning_rate = lr, epochs = e)

        # Train and persist the fine-tuned model
        trainer.train()
        trainer.save_model(path_out + f'/modelo_{d}_{e}_{lr}_REVES')

        # Metrics after training, computed on the trainer's eval dataset
        metrics2 = trainer.evaluate(max_length = max_length)
        with open(path_out + f'/metrica_despues_{d}_{e}_{lr}_REVES.pickle', 'wb') as file:
          pickle.dump(metrics2, file)

        print(metrics2)

Map:   0%|          | 0/4450 [00:00<?, ? examples/s]

Map:   0%|          | 0/557 [00:00<?, ? examples/s]

Map:   0%|          | 0/556 [00:00<?, ? examples/s]

Step,Training Loss


{'eval_loss': 3.5357744693756104, 'eval_bleu': 0.8885029393645031, 'eval_runtime': 62.2815, 'eval_samples_per_second': 8.927, 'eval_steps_per_second': 0.145, 'epoch': 3.0}


Step,Training Loss


{'eval_loss': 3.0832858085632324, 'eval_bleu': 1.8579173253233743, 'eval_runtime': 62.9404, 'eval_samples_per_second': 8.834, 'eval_steps_per_second': 0.143, 'epoch': 3.0}


Step,Training Loss
500,3.7255


{'eval_loss': 3.408541679382324, 'eval_bleu': 0.7651877133916551, 'eval_runtime': 63.7107, 'eval_samples_per_second': 8.727, 'eval_steps_per_second': 0.141, 'epoch': 5.0}


Step,Training Loss
500,2.9515


{'eval_loss': 3.0298116207122803, 'eval_bleu': 3.354010837910484, 'eval_runtime': 60.9276, 'eval_samples_per_second': 9.126, 'eval_steps_per_second': 0.148, 'epoch': 5.0}


Step,Training Loss
500,3.6892
1000,3.18


{'eval_loss': 3.242452383041382, 'eval_bleu': 1.1355153311890158, 'eval_runtime': 63.8418, 'eval_samples_per_second': 8.709, 'eval_steps_per_second': 0.141, 'epoch': 10.0}


Step,Training Loss
500,2.9412
1000,1.6338


{'eval_loss': 3.25426983833313, 'eval_bleu': 4.508676371677167, 'eval_runtime': 51.3235, 'eval_samples_per_second': 10.833, 'eval_steps_per_second': 0.175, 'epoch': 10.0}


In [None]:
# Fine-tune on the third dataset for every (epochs, learning rate) combination.
dataset_file = params['dataset'][2]

# File name without folder or extension (e.g. 'COMP_NC'), used in the output names.
# Derived once, before the loops, so the dataset path is never overwritten mid-sweep.
d = dataset_file.split('/')[-1].split('.')[0]

# Load and split the corpus. The indigenous-language column is stored under
# the key 'fi' because tokenizar reads the source sentences from that key.
dataset_dict = preprocess_dataset(path + dataset_file, lang_output = 'fi')

# Tokenize with the configured max_length so truncation matches the length
# later used in trainer.evaluate (the function default would be 150).
tokenized_dataset, tokenizer = tokenizar(dataset_dict, model_checkpoint,
                                         max_length = max_length)

for e in params['epochs']:
    for lr in params['learning_rate']:

        # A fresh model/trainer per configuration so runs do not share weights
        trainer = get_model(tokenized_dataset, tokenizer, model_checkpoint,
          learning_rate = lr, epochs = e)

        # Train and persist the fine-tuned model
        trainer.train()
        trainer.save_model(path_out + f'/modelo_{d}_{e}_{lr}_REVES')

        # Metrics after training, computed on the trainer's eval dataset
        metrics2 = trainer.evaluate(max_length = max_length)
        with open(path_out + f'/metrica_despues_{d}_{e}_{lr}_REVES.pickle', 'wb') as file:
          pickle.dump(metrics2, file)

        print(metrics2)

Map:   0%|          | 0/4499 [00:00<?, ? examples/s]

Map:   0%|          | 0/563 [00:00<?, ? examples/s]

Map:   0%|          | 0/562 [00:00<?, ? examples/s]

Step,Training Loss


{'eval_loss': 3.622763156890869, 'eval_bleu': 0.5693466925365455, 'eval_runtime': 64.0606, 'eval_samples_per_second': 8.773, 'eval_steps_per_second': 0.14, 'epoch': 3.0}


Step,Training Loss


{'eval_loss': 3.192824602127075, 'eval_bleu': 1.4155850886576888, 'eval_runtime': 63.7607, 'eval_samples_per_second': 8.814, 'eval_steps_per_second': 0.141, 'epoch': 3.0}


Step,Training Loss
500,3.7428


{'eval_loss': 3.5007758140563965, 'eval_bleu': 0.7940745552566086, 'eval_runtime': 63.3543, 'eval_samples_per_second': 8.871, 'eval_steps_per_second': 0.142, 'epoch': 5.0}


Step,Training Loss
500,2.9773


{'eval_loss': 3.134737730026245, 'eval_bleu': 2.576465164614863, 'eval_runtime': 60.558, 'eval_samples_per_second': 9.28, 'eval_steps_per_second': 0.149, 'epoch': 5.0}


Step,Training Loss
500,3.7068
1000,3.1916


{'eval_loss': 3.343442440032959, 'eval_bleu': 0.7973419880033906, 'eval_runtime': 67.0436, 'eval_samples_per_second': 8.383, 'eval_steps_per_second': 0.134, 'epoch': 10.0}


Step,Training Loss
500,2.9657
1000,1.6575


{'eval_loss': 3.346017599105835, 'eval_bleu': 3.8491725195197555, 'eval_runtime': 57.9692, 'eval_samples_per_second': 9.695, 'eval_steps_per_second': 0.155, 'epoch': 10.0}
