# Revisión de resultados traducción español - arhuaco


## Librerías

In [19]:
!pip install datasets
!pip install sacremoses
!pip install sacrebleu
!pip install evaluate
!pip install transformers[sentencepiece]
!pip install transformers[torch]



In [20]:
from glob import glob
import pandas as pd
import numpy as np
from tqdm.auto import tqdm, trange
import sys
import os

In [21]:
from datasets import load_dataset, DatasetDict, Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import pandas as pd

In [22]:
from transformers import Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM
from transformers import EarlyStoppingCallback
from transformers import Seq2SeqTrainer

import torch

import numpy as np
import pickle
import evaluate

In [23]:
import sacrebleu

## Funciones

In [24]:
def preprocess_dataset(path_dataset: str, lang_output: str):
  """
  Read a parallel-corpus CSV and split it into train/validation/test sets.

  The CSV is loaded with pandas and the columns ``esp``, ``wayuu`` and ``arh``
  (whichever are present) are renamed so that the Spanish side ends up under
  ``'es'`` and the other language under ``lang_output``. Every row becomes a
  ``{'es': ..., lang_output: ...}`` record; the records are then split
  80/10/10 into train/validation/test with a fixed random seed (42).

  input:
  - path_dataset: path of the CSV file to process
  - lang_output: key under which the non-Spanish side of each pair is stored

  output:
  - dataset_dict: DatasetDict with ``train``, ``test`` and ``validation``
    splits, each one holding an ``id`` column and a ``translation`` column

  raises:
  - KeyError: if, after renaming, the ``'es'`` or ``lang_output`` column is
    not present in the CSV
  """
  # Read the data and normalise the column names (no in-place mutation)
  dataset = pd.read_csv(path_dataset)
  conv = {'esp': 'es', 'wayuu': lang_output, 'arh': lang_output}
  dataset = dataset.rename(columns = conv)

  # Fail early with a readable message instead of a KeyError raised per row
  missing = [col for col in ('es', lang_output) if col not in dataset.columns]
  if missing:
    raise KeyError(
      f"Missing column(s) {missing} in {path_dataset}; found {list(dataset.columns)}"
    )

  # One dict per row, built in a single call instead of looping with iterrows()
  records = dataset[['es', lang_output]].to_dict('records')

  # Train / validation / test split: 80% train, remaining 20% halved
  train, test = train_test_split(records, test_size = 0.2, random_state = 42)
  val, test = train_test_split(test, test_size = 0.5, random_state = 42)

  # Wrap every split in a Dataset with a running id and the translation pairs
  train = Dataset.from_dict({"id": list(range(len(train))), "translation": train})
  test = Dataset.from_dict({"id": list(range(len(test))), "translation": test})
  validation = Dataset.from_dict({"id": list(range(len(val))), "translation": val})

  # Bundle the three splits
  dataset_dict = DatasetDict({"train": train, "test": test, "validation": validation})

  return dataset_dict

In [25]:
def tokenizar(dataset_dict, tokenizer, max_length = 150, src_lang = 'fi', tgt_lang = 'es'):
  """
  Tokenize every split of a DatasetDict of translation pairs.

  Each example is expected to carry a ``translation`` dict; the text stored
  under ``src_lang`` is tokenized as model input and the text stored under
  ``tgt_lang`` as the target (passed to the tokenizer as ``text_target``).
  The original columns of the ``train`` split are removed from the result.

  input:
  - dataset_dict: object with the train, test and validation data
  - tokenizer: callable tokenizer accepting ``text_target``, ``max_length``
    and ``truncation`` keyword arguments
  - max_length: maximum length used for truncation
  - src_lang: key of the source text inside each ``translation`` dict
    (defaults to ``'fi'``, the previously hard-coded value)
  - tgt_lang: key of the target text inside each ``translation`` dict
    (defaults to ``'es'``, the previously hard-coded value)

  output:
  - (tokenized_datasets, tokenizer): the mapped dataset and the same
    tokenizer object that was passed in
  """

  def preprocess_function(examples):
    # Split every pair into its source and target side
    inputs = [ex[src_lang] for ex in examples["translation"]]
    targets = [ex[tgt_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(
      inputs, text_target=targets, max_length=max_length, truncation=True
    )
    return model_inputs

  # Tokenize the data in batches, dropping the raw columns
  tokenized_datasets = dataset_dict.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset_dict["train"].column_names,
  )

  return tokenized_datasets, tokenizer

In [26]:
# Mount Google Drive at /content/drive; the paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Lectura de datos

In [27]:
model_path = "/content/drive/MyDrive/Maestría/Sem2/Modelos/Datos traduccion wayuu - español"
eval_blues = {}

# Keep only the entries whose path mentions both 'pickle' and 'REVES'.
metric_files = [
  candidate
  for candidate in glob(model_path + '/*')
  if 'pickle' in candidate and 'REVES' in candidate
]

# Map every metrics file (full path) to the 'eval_bleu' value stored in it.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
for metric_file in metric_files:
  with open(metric_file, 'rb') as handle:
    eval_blues[metric_file] = pickle.load(handle)['eval_bleu']

In [28]:
# Show the collected 'eval_bleu' values, keyed by metrics-file path.
eval_blues

{'/content/drive/MyDrive/Maestría/Sem2/Modelos/Datos traduccion wayuu - español/metrica_despues_COMP_ND_3_0.0002_REVES.pickle': 3.5821907352866433,
 '/content/drive/MyDrive/Maestría/Sem2/Modelos/Datos traduccion wayuu - español/metrica_despues_COMP_10_0.0002_REVES.pickle': 10.414354893151309,
 '/content/drive/MyDrive/Maestría/Sem2/Modelos/Datos traduccion wayuu - español/metrica_despues_COMP_NC_5_2e-05_REVES.pickle': 5.691598195199764,
 '/content/drive/MyDrive/Maestría/Sem2/Modelos/Datos traduccion wayuu - español/metrica_despues_COMP_5_2e-05_REVES.pickle': 5.442678755107879,
 '/content/drive/MyDrive/Maestría/Sem2/Modelos/Datos traduccion wayuu - español/metrica_despues_COMP_NC_3_2e-05_REVES.pickle': 3.7172321488601896,
 '/content/drive/MyDrive/Maestría/Sem2/Modelos/Datos traduccion wayuu - español/metrica_despues_COMP_ND_3_2e-05_REVES.pickle': 0.8020157512245715,
 '/content/drive/MyDrive/Maestría/Sem2/Modelos/Datos traduccion wayuu - español/metrica_despues_COMP_ND_10_0.

## Métricas

In [29]:
# Corpus-level metric calculators reused for every evaluated model.
bleu_calc = sacrebleu.BLEU()
# word_order=2 adds word n-grams to the character n-grams (reported as chrF2++).
chrf_calc = sacrebleu.CHRF(word_order=2)

## Funciones de predicción

In [30]:
# Use the GPU when one is available, otherwise fall back to the CPU.
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
print(torch_device)

cuda


In [31]:
def translate(model, tokenizer, text, src_lang='fi', tgt_lang='es', a=32, b=3, max_input_length=128, num_beams=4, **kwargs):
    """Translate one text (or a batch of texts) with a seq2seq model.

    Args:
        model: model exposing ``device`` and ``generate``.
        tokenizer: tokenizer used both to encode ``text`` and to decode the
            generated ids; its ``src_lang`` / ``tgt_lang`` attributes are set
            on every call.
        text: a string or a sequence of strings to translate.
        src_lang: source language code assigned to ``tokenizer.src_lang``.
        tgt_lang: target language code assigned to ``tokenizer.tgt_lang`` and
            converted to the forced first generated token.
        a, b: the generation budget is ``a + b * input_length`` new tokens,
            where ``input_length`` is the padded length of the encoded batch.
        max_input_length: inputs are truncated to this many tokens.
        num_beams: beam-search width.
        **kwargs: extra arguments forwarded to ``model.generate``; they take
            precedence over the defaults built here.

    Returns:
        List of decoded strings with special tokens stripped.
    """
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_input_length)
    # Keep the inputs on the same device as the model weights.
    inputs = inputs.to(model.device)

    # NOTE(review): forcing the first token to ``tgt_lang`` presumably assumes a
    # tokenizer whose vocabulary contains language-code tokens - confirm this
    # holds for the checkpoints being evaluated. Callers can override it by
    # passing ``forced_bos_token_id`` explicitly.
    generation_args = {
        'forced_bos_token_id': tokenizer.convert_tokens_to_ids(tgt_lang),
        'max_new_tokens': int(a + b * inputs.input_ids.shape[1]),
        'num_beams': num_beams,
    }
    # Caller-supplied arguments win instead of raising a duplicate-keyword error.
    generation_args.update(kwargs)

    # The generated ids are decoded as returned; no transfer to a globally
    # configured device is performed, so the function has no hidden dependency
    # on notebook state.
    result = model.generate(**inputs, **generation_args)
    return tokenizer.batch_decode(result, skip_special_tokens=True)

def batched_translate(model, tokenizer, texts, batch_size=16, **kwargs):
    """Translate texts in batches of similar length.

    The texts are sorted from longest to shortest so that every batch groups
    items of comparable length, translated ``batch_size`` at a time with
    ``translate``, and finally returned in the original input order.

    Args:
        model: model forwarded to ``translate``.
        tokenizer: tokenizer forwarded to ``translate``.
        texts: iterable of strings to translate.
        batch_size: number of texts per ``translate`` call.
        **kwargs: extra arguments forwarded to ``translate``.

    Returns:
        List of translations aligned with ``texts``; an empty list when
        ``texts`` is empty.
    """
    # Materialise once so generators and other one-shot iterables are supported.
    texts = list(texts)
    if not texts:
        # zip(*...) on an empty sequence cannot be unpacked into two names.
        return []

    idxs, texts2 = zip(*sorted(enumerate(texts), key=lambda p: len(p[1]), reverse=True))
    results = []
    for i in trange(0, len(texts2), batch_size):
        results.extend(translate(model, tokenizer, texts2[i: i+batch_size], **kwargs))
    # Undo the length sort: order the translations by their original position.
    return [p for _, p in sorted(zip(idxs, results), key=lambda pair: pair[0])]

## Métrica para datos completos

In [32]:
# Directory with the cleaned data files.
path_data = '/content/drive/MyDrive/Maestría/Sem2/PROYECTO/data_clean/wayuu'
# Name of the dataset variant to evaluate.
d = 'COMP_NC'

In [33]:
# Rebuild the train/test/validation splits from the selected CSV.
dataset_dict = preprocess_dataset(f"{path_data}/{d}.csv", lang_output = 'fi')

# One DataFrame of translation pairs per split.
df_test, df_train, df_validation = (
  pd.DataFrame(dataset_dict[split_name]['translation'])
  for split_name in ('test', 'train', 'validation')
)

In [34]:
# Preview the first rows of the test split.
df_test.head()

Unnamed: 0,es,fi
0,renunciamos a actuar de forma oculta y avergon...,"tu tekirajakat anain jia, nnojotsu taattajaain..."
1,"si el mundo los odia a ustedes, sepan que prim...",aashajaashi jesus nuchikua namuin muleka kan...
2,como podemos ser mas compasivos,kasa waa'inrajatka supula alinjatuin wayuu wapula
3,jesus acababa de ense arles a sus discipulos q...,ni'ikuin jesuu na nikirajuinkana sunain achunt...
4,con que se divierten hoy dia muchas personas,kasa naainjaka ma'in na wayuukana maa'ulu yaa


In [35]:
# Metrics files belonging to the dataset variant selected in `d`. Filtering on
# `d` (instead of repeating the literal) keeps the evaluated models in sync
# with the CSV that the test split was built from.
resultados_completos = [c for c in eval_blues if d in c]
resultados_completos

['/content/drive/MyDrive/Maestría/Sem2/Modelos/Datos traduccion wayuu - español/metrica_despues_COMP_NC_5_2e-05_REVES.pickle',
 '/content/drive/MyDrive/Maestría/Sem2/Modelos/Datos traduccion wayuu - español/metrica_despues_COMP_NC_3_2e-05_REVES.pickle',
 '/content/drive/MyDrive/Maestría/Sem2/Modelos/Datos traduccion wayuu - español/metrica_despues_COMP_NC_5_0.0002_REVES.pickle',
 '/content/drive/MyDrive/Maestría/Sem2/Modelos/Datos traduccion wayuu - español/metrica_despues_COMP_NC_3_0.0002_REVES.pickle',
 '/content/drive/MyDrive/Maestría/Sem2/Modelos/Datos traduccion wayuu - español/metrica_despues_COMP_NC_10_2e-05_REVES.pickle',
 '/content/drive/MyDrive/Maestría/Sem2/Modelos/Datos traduccion wayuu - español/metrica_despues_COMP_NC_10_0.0002_REVES.pickle']

In [37]:
# Reference translations are the same for every model, so build them once.
references = [df_test['es'].tolist()]

# NOTE(review): the [4:] slice skips the first four entries and depends on the
# order in which the metrics files were listed - confirm this is intended.
for path in resultados_completos[4:]:

  print('\nResultados --- ', path, '----')
  # The model directory shares the metrics file name, minus the extension and
  # with the 'metrica_despues' prefix replaced by 'modelo'.
  name = path.split('.pickle')[0].replace('metrica_despues', 'modelo')
  print(name)

  tokenizer = AutoTokenizer.from_pretrained(name)
  # Use the device detected earlier so the cell also runs without a GPU.
  model = AutoModelForSeq2SeqLM.from_pretrained(name).to(torch_device)

  # Translate the test split and score it against the Spanish references.
  arh_translated_test = batched_translate(model, tokenizer, df_test.fi, src_lang='fi', tgt_lang='es')
  print(bleu_calc.corpus_score(arh_translated_test, references))
  print(chrf_calc.corpus_score(arh_translated_test, references))


Resultados ---  /content/drive/MyDrive/Maestría/Sem2/Modelos/Datos traduccion wayuu - español/metrica_despues_COMP_NC_10_2e-05_REVES.pickle ----
/content/drive/MyDrive/Maestría/Sem2/Modelos/Datos traduccion wayuu - español/modelo_COMP_NC_10_2e-05_REVES


  0%|          | 0/509 [00:00<?, ?it/s]

BLEU = 5.06 28.8/8.7/3.5/1.6 (BP = 0.825 ratio = 0.839 hyp_len = 104570 ref_len = 124672)
chrF2++ = 21.61

Resultados ---  /content/drive/MyDrive/Maestría/Sem2/Modelos/Datos traduccion wayuu - español/metrica_despues_COMP_NC_10_0.0002_REVES.pickle ----
/content/drive/MyDrive/Maestría/Sem2/Modelos/Datos traduccion wayuu - español/modelo_COMP_NC_10_0.0002_REVES


  0%|          | 0/509 [00:00<?, ?it/s]

BLEU = 7.37 31.0/11.0/5.3/2.9 (BP = 0.863 ratio = 0.872 hyp_len = 108656 ref_len = 124672)
chrF2++ = 24.75
