# Librerías

In [1]:
# !pip install datasets
# !pip install sacremoses
# !pip install sacrebleu
# !pip install evaluate
# !pip install transformers[sentencepiece]
# !pip install transformers[torch]

In [2]:
from glob import glob
import pandas as pd
import numpy as np
from tqdm.auto import tqdm, trange
import sys
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from datasets import load_dataset, DatasetDict, Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import pandas as pd

In [4]:
from transformers import Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM
from transformers import EarlyStoppingCallback
from transformers import Seq2SeqTrainer

import torch

import numpy as np
import pickle
import evaluate

In [5]:
#from google.colab import drive
#drive.mount('/content/drive')

In [6]:
import sacrebleu
# Corpus-level metric objects reused by the evaluation cells further down.
bleu_calc = sacrebleu.BLEU()
# word_order=2 adds word n-grams to the character n-grams (chrF++);
# sacrebleu prints this configuration as "chrF2++".
chrf_calc = sacrebleu.CHRF(word_order=2)

# Funciones auxiliares 

In [7]:
def preprocess_dataset(path_dataset: str, lang_output: str):
  """
  Read a parallel corpus from a CSV file and split it into train,
  validation and test sets wrapped in a DatasetDict.

  The Spanish sentence is stored under the key 'es' and the indigenous
  language sentence under `lang_output`; the rest of the notebook uses
  'es' as the source side and `lang_output` as the target side.

  input:
  - path_dataset: path of the CSV file; it must contain a Spanish column
    ('esp' or 'es') and a target column ('wayuu', 'arh' or `lang_output`)
  - lang_output: key under which the target-language sentence is stored

  output:
  - dataset_dict: DatasetDict with 'train', 'test' and 'validation' splits,
    each with the columns 'id' and 'translation'

  raises:
  - KeyError: if the 'es' or `lang_output` column is missing after renaming
  """
  # Read the data and normalise the column names (returns a new frame
  # instead of mutating in place)
  conv = {'esp': 'es', 'wayuu': lang_output, 'arh': lang_output}
  dataset = pd.read_csv(path_dataset).rename(columns = conv)

  # One {'es': ..., lang_output: ...} dict per sentence pair, built in a
  # single pass instead of materialising a Series per row with iterrows()
  dataset = dataset[['es', lang_output]].to_dict('records')

  # 80 / 10 / 10 split; the fixed seed keeps the split reproducible so the
  # same test sentences are recovered on every run
  train, test = train_test_split(dataset, test_size = 0.2, random_state = 42)
  val, test = train_test_split(test, test_size = 0.5, random_state = 42)

  # Build the datasets
  train = Dataset.from_dict({"id": list(range(len(train))), "translation": train})
  test = Dataset.from_dict({"id": list(range(len(test))), "translation": test})
  validation = Dataset.from_dict({"id": list(range(len(val))), "translation": val})

  # Build the dictionary of splits
  dataset_dict = DatasetDict({"train": train, "test": test, "validation": validation})

  return dataset_dict

# Tokenización

In [8]:
def tokenizar(dataset_dict, tokenizer, max_length = 150, src_lang = 'es', tgt_lang = 'fi'):
  """
  Tokenize every split of a DatasetDict for sequence-to-sequence training.

  input:
  - dataset_dict: splits whose 'translation' column holds one dict per
    sentence pair
  - tokenizer: callable tokenizer accepting `text_target`, `max_length`
    and `truncation`
  - max_length: maximum number of tokens kept per sentence (longer
    sentences are truncated)
  - src_lang: key of the source sentence inside each 'translation' dict
  - tgt_lang: key of the target sentence inside each 'translation' dict;
    the default 'fi' matches the previously hard-coded value

  output:
  - (tokenized_datasets, tokenizer): the mapped dataset with the original
    columns removed, and the tokenizer that was passed in
  """

  def preprocess_function(examples):
    # `examples` is a batch: a dict of column name -> list of values
    inputs = [ex[src_lang] for ex in examples["translation"]]
    targets = [ex[tgt_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(
      inputs, text_target=targets, max_length=max_length, truncation=True
    )
    return model_inputs

  # Tokenize the data, dropping the raw columns so only model inputs remain
  tokenized_datasets = dataset_dict.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset_dict["train"].column_names,
  )

  return tokenized_datasets, tokenizer

# Evaluamos el mejor modelo en cuanto a BLEU en entrenamiento

Veamos qué modelo obtuvo la mejor métrica BLEU sobre el conjunto de validación durante el entrenamiento.

In [9]:
model_path = "../results/wayuu"

# Map every metrics pickle in the results folder to the validation BLEU
# stored in it. Paths containing 'resultados' are skipped because they are
# not metrics files.
eval_blues = {}
for res in glob(model_path + '/*'):
  if 'pickle' not in res or 'resultados' in res:
    continue
  with open(res, 'rb') as file:
    blue_score = pickle.load(file)['eval_bleu']
  eval_blues[res] = blue_score

In [10]:
eval_blues

{'../results/wayuu/metrica_despues_COMP_5_0.0002.pickle': 8.46147848277743,
 '../results/wayuu/metrica_despues_COMP_NDU_10_2e-05.pickle': 3.823045364057283,
 '../results/wayuu/metrica_despues_COMP_NDU_5_0.0002.pickle': 8.358963761551946,
 '../results/wayuu/metrica_despues_COMP_10_2e-05.pickle': 3.841921542846619,
 '../results/wayuu/metrica_despues_COMP_ND_3_0.0002.pickle': 1.677752319229074,
 '../results/wayuu/metrica_despues_COMP_NC_3_0.0002.pickle': 6.985037271297072,
 '../results/wayuu/metrica_despues_COMP_10_0.0002.pickle': 9.944518207708816,
 '../results/wayuu/metrica_despues_COMP_ND_10_0.0002.pickle': 6.114102976300334,
 '../results/wayuu/metrica_despues_COMP_ND_5_2e-05.pickle': 0.3364212705823903,
 '../results/wayuu/metrica_despues_COMP_ND_5_0.0002.pickle': 3.3616074258638515,
 '../results/wayuu/metrica_despues_COMP_5_2e-05.pickle': 2.0283968999388584,
 '../results/wayuu/metrica_despues_COMP_NC_10_0.0002.pickle': 10.092185362846715,
 '../results/wayuu/metrica_despues_COMP_NC_5_2

In [15]:
# Path of the metrics file with the highest validation BLEU.
res_max = max(eval_blues, key=eval_blues.get)

Este es el modelo con el mejor score BLEU sobre el conjunto de validación entre todos los entrenados.

In [25]:
res_max, eval_blues[res_max]

('../results/wayuu/metrica_despues_COMP_NC_10_0.0002.pickle',
 10.092185362846715)

In [24]:
path_data = '../data/wayuu'

# Recover the dataset name from the metrics file name, which has the form
# 'metrica_despues_<dataset>_<n>_<lr>.pickle' (the last two fields are
# presumably epochs and learning rate). Dropping those two fields from the
# right works however many underscores <dataset> contains; the previous
# fixed slice [-4:-2] was only correct for two-part names such as 'COMP_NC'
# and produced 'despues_COMP' for a plain 'COMP' run.
run_id = res_max.split('/')[-1].split('.pickle')[0].replace('metrica_despues_', '')
d = run_id.rsplit('_', 2)[0]
print(d)

# Load the data (same seeded split as in training, so 'test' is held out)
dataset_dict = preprocess_dataset(path_data + '/' + d + '.csv', lang_output = 'fi')

# Load the model and tokenizer saved next to the metrics file
name = res_max.split('.pickle')[0].replace('metrica_despues', 'modelo')
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForSeq2SeqLM.from_pretrained(name)

df_test = pd.DataFrame(dataset_dict['test']['translation'])
df_train = pd.DataFrame(dataset_dict['train']['translation'])
df_validation = pd.DataFrame(dataset_dict['validation']['translation'])

COMP_NC


In [15]:
def translate(text, src_lang='es', tgt_lang='fi', a=32, b=3, max_input_length=128, num_beams=4, **kwargs):
    """Translate a sentence (or a batch of sentences) with beam search.

    Uses the module-level `tokenizer` and `model`.

    Args:
        text: a string or a sequence of strings to translate.
        src_lang: source language code assigned to the tokenizer.
        tgt_lang: target language code assigned to the tokenizer; its token
            id is also passed as `forced_bos_token_id`.
        a, b: the generation budget is `a + b * input_length` new tokens,
            where `input_length` is the padded input length of the batch.
        max_input_length: inputs are truncated to this many tokens.
        num_beams: beam width.
        **kwargs: forwarded to `model.generate`.

    Returns:
        A list with one decoded string per generated sequence.
    """
    tokenizer.src_lang, tokenizer.tgt_lang = src_lang, tgt_lang

    encoded = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_input_length,
    )
    # Let the output budget grow linearly with the input length.
    token_budget = int(a + b * encoded.input_ids.shape[1])
    target_bos = tokenizer.convert_tokens_to_ids(tgt_lang)

    generated = model.generate(
        **encoded.to(model.device),
        forced_bos_token_id=target_bos,
        max_new_tokens=token_budget,
        num_beams=num_beams,
        **kwargs
    )
    return tokenizer.batch_decode(generated, skip_special_tokens=True)

def batched_translate(texts, batch_size=16, **kwargs):
    """Translate texts in batches of similar length.

    Sentences are sorted longest-first so each batch needs little padding,
    translated with `translate`, and returned in the original input order.

    Args:
        texts: iterable of strings to translate.
        batch_size: number of sentences per call to `translate`.
        **kwargs: forwarded to `translate`.

    Returns:
        A list of translations aligned with `texts`; an empty list when
        `texts` is empty.
    """
    texts = list(texts)
    if not texts:
        # zip(*sorted(...)) over no items cannot be unpacked into two names.
        return []
    idxs, texts2 = zip(*sorted(enumerate(texts), key=lambda p: len(p[1]), reverse=True))
    results = []
    for i in trange(0, len(texts2), batch_size):
        results.extend(translate(list(texts2[i: i+batch_size]), **kwargs))
    # Restore the caller's order using the original positions only.
    return [p for _, p in sorted(zip(idxs, results), key=lambda pair: pair[0])]

Obtenemos las predicciones

In [39]:
df_test

Unnamed: 0,es,fi
0,renunciamos a actuar de forma oculta y avergon...,"tu tekirajakat anain jia, nnojotsu taattajaain..."
1,"si el mundo los odia a ustedes, sepan que prim...",aashajaashi jesus nuchikua namuin muleka kan...
2,como podemos ser mas compasivos,kasa waa'inrajatka supula alinjatuin wayuu wapula
3,jesus acababa de ense arles a sus discipulos q...,ni'ikuin jesuu na nikirajuinkana sunain achunt...
4,con que se divierten hoy dia muchas personas,kasa naainjaka ma'in na wayuukana maa'ulu yaa
...,...,...
8124,ma ana vere a mis hermanos,tereena huata tawalayu
8125,pero es preciso que sean constantes en el cump...,anakaja nnojorule juu'ulaain suulia anoujaa. m...
8126,pero poco a poco sus sentimientos se haran mas...,mapa ki'raleeshi'iya naya sunain muin nakuwa'ipa
8127,sin embargo eso no quiere decir que no tengamo...,kamaneepaja'a waya sumuin wayuu supushua'a


In [44]:
# Translate the test set one sentence at a time. Each translate() call
# returns a list, so wayuu_pred ends up as a list of lists.
wayuu_pred = []
for i in trange(len(df_test)):
    wayuu_pred.append(translate(df_test.es[i]))

100%|██████████| 8129/8129 [2:58:14<00:00,  1.32s/it]  


# Métricas en test

In [45]:
df_test.head()

Unnamed: 0,es,fi
0,renunciamos a actuar de forma oculta y avergon...,"tu tekirajakat anain jia, nnojotsu taattajaain..."
1,"si el mundo los odia a ustedes, sepan que prim...",aashajaashi jesus nuchikua namuin muleka kan...
2,como podemos ser mas compasivos,kasa waa'inrajatka supula alinjatuin wayuu wapula
3,jesus acababa de ense arles a sus discipulos q...,ni'ikuin jesuu na nikirajuinkana sunain achunt...
4,con que se divierten hoy dia muchas personas,kasa naainjaka ma'in na wayuukana maa'ulu yaa


In [23]:
import sacrebleu
# Re-create the metric objects with the same configuration as the setup
# cell near the top of the notebook.
bleu_calc = sacrebleu.BLEU()
# word_order=2 selects chrF++ (printed as "chrF2++" in the outputs).
chrf_calc = sacrebleu.CHRF(word_order=2)

In [50]:
# Keep the first hypothesis of each translate() result so sacrebleu
# receives a flat list of strings.
wayuu_pred2 = [hypotheses[0] for hypotheses in wayuu_pred]

In [51]:
# corpus_score takes a list of reference streams; here there is one
# reference per sentence.
references = [df_test['fi'].tolist()]
print(bleu_calc.corpus_score(wayuu_pred2, references))
print(chrf_calc.corpus_score(wayuu_pred2, references))

BLEU = 7.48 31.0/12.0/5.6/2.9 (BP = 0.849 ratio = 0.859 hyp_len = 95933 ref_len = 111620)
chrF2++ = 31.90


In [52]:
# Persist the raw predictions (the nested lists, not the flattened copy).
with open(f'{model_path}/resultados_traducciones_mejor_modelo_test_wayuu.pickle', 'wb') as file:
  pickle.dump(wayuu_pred, file)

# Revisión otros modelos - No diccionario

In [54]:
# Metrics files of the runs trained on the COMP_ND corpus.
keys_nd = [path for path in eval_blues if 'ND_' in path]
keys_nd

['../results/wayuu/metrica_despues_COMP_ND_3_0.0002.pickle',
 '../results/wayuu/metrica_despues_COMP_ND_10_0.0002.pickle',
 '../results/wayuu/metrica_despues_COMP_ND_5_2e-05.pickle',
 '../results/wayuu/metrica_despues_COMP_ND_5_0.0002.pickle',
 '../results/wayuu/metrica_despues_COMP_ND_3_2e-05.pickle',
 '../results/wayuu/metrica_despues_COMP_ND_10_2e-05.pickle']

In [55]:
path_data = '../data/wayuu'
d = 'COMP_ND'

# Load the data
dataset_dict = preprocess_dataset(f'{path_data}/{d}.csv', lang_output = 'fi')

# One DataFrame of sentence pairs per split
df_test, df_train, df_validation = (
    pd.DataFrame(dataset_dict[split]['translation'])
    for split in ('test', 'train', 'validation')
)

In [61]:
for model_name in keys_nd:
    # Load the model and tokenizer saved next to the metrics file
    name = model_name.split('.pickle')[0].replace('metrica_despues', 'modelo')
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModelForSeq2SeqLM.from_pretrained(name)

    # Translate the test set one sentence at a time
    print('Comienza a predecir')
    wayuu_pred = []
    for i in trange(0, len(df_test.values)):
        translated = translate(df_test.es[i])
        wayuu_pred.append(translated)
    # translate() returns a list per call; keep its first hypothesis
    wayuu_pred2 = [w[0] for w in wayuu_pred]

    # Results
    print(f'\n\n ---------------------- Resultados {model_name}\n')
    print(bleu_calc.corpus_score(wayuu_pred2, [df_test['fi'].tolist()]))
    print(chrf_calc.corpus_score(wayuu_pred2, [df_test['fi'].tolist()]))

    # Save. The path must be an f-string: without the prefix every model
    # wrote to the literal file 'resultados_traducciones_{name}_wayuu.pickle'
    # and overwrote the previous model's predictions.
    name = name.split('/')[-1]
    with open(model_path + f'/resultados_traducciones_{name}_wayuu.pickle', 'wb') as file:
        pickle.dump(wayuu_pred, file)

Comienza a predecir


100%|██████████| 894/894 [35:55<00:00,  2.41s/it]




 ---------------------- Resultados ../results/wayuu/metrica_despues_COMP_ND_3_0.0002.pickle

BLEU = 0.68 28.6/5.3/1.1/0.2 (BP = 0.281 ratio = 0.441 hyp_len = 22764 ref_len = 51637)
chrF2++ = 15.47
Comienza a predecir


100%|██████████| 894/894 [37:26<00:00,  2.51s/it]




 ---------------------- Resultados ../results/wayuu/metrica_despues_COMP_ND_10_0.0002.pickle

BLEU = 2.53 36.6/10.9/4.1/1.8 (BP = 0.342 ratio = 0.482 hyp_len = 24907 ref_len = 51637)
chrF2++ = 21.37
Comienza a predecir


100%|██████████| 894/894 [51:29<00:00,  3.46s/it]  




 ---------------------- Resultados ../results/wayuu/metrica_despues_COMP_ND_5_2e-05.pickle

BLEU = 0.17 8.1/0.9/0.1/0.0 (BP = 0.641 ratio = 0.692 hyp_len = 35726 ref_len = 51637)
chrF2++ = 9.15
Comienza a predecir


100%|██████████| 894/894 [37:06<00:00,  2.49s/it]




 ---------------------- Resultados ../results/wayuu/metrica_despues_COMP_ND_5_0.0002.pickle

BLEU = 1.47 31.7/7.7/2.3/0.7 (BP = 0.327 ratio = 0.472 hyp_len = 24381 ref_len = 51637)
chrF2++ = 18.49
Comienza a predecir


100%|██████████| 894/894 [50:45<00:00,  3.41s/it]  




 ---------------------- Resultados ../results/wayuu/metrica_despues_COMP_ND_3_2e-05.pickle

BLEU = 0.08 3.9/0.2/0.0/0.0 (BP = 0.610 ratio = 0.669 hyp_len = 34544 ref_len = 51637)
chrF2++ = 7.55
Comienza a predecir


100%|██████████| 894/894 [37:25<00:00,  2.51s/it]




 ---------------------- Resultados ../results/wayuu/metrica_despues_COMP_ND_10_2e-05.pickle

BLEU = 0.37 20.5/3.0/0.4/0.0 (BP = 0.348 ratio = 0.487 hyp_len = 25127 ref_len = 51637)
chrF2++ = 12.79


# Revisión otros modelos - No diccionario palabras únicas

In [12]:
path_data = '../data/wayuu'
d = 'COMP_NDU'
print(d)

# Load the data
dataset_dict = preprocess_dataset(f'{path_data}/{d}.csv', lang_output = 'fi')

# One DataFrame of sentence pairs per split
df_test, df_train, df_validation = (
    pd.DataFrame(dataset_dict[split]['translation'])
    for split in ('test', 'train', 'validation')
)

COMP_NDU


In [13]:
# Metrics files of the runs trained on the COMP_NDU corpus.
keys_nd = [path for path in eval_blues if 'NDU_' in path]
keys_nd

['../results/wayuu/metrica_despues_COMP_NDU_10_2e-05.pickle',
 '../results/wayuu/metrica_despues_COMP_NDU_5_0.0002.pickle',
 '../results/wayuu/metrica_despues_COMP_NDU_3_0.0002.pickle',
 '../results/wayuu/metrica_despues_COMP_NDU_3_2e-05.pickle',
 '../results/wayuu/metrica_despues_COMP_NDU_5_2e-05.pickle',
 '../results/wayuu/metrica_despues_COMP_NDU_10_0.0002.pickle']

In [16]:
for model_name in keys_nd:
    # Load the model and tokenizer saved next to the metrics file
    name = model_name.split('.pickle')[0].replace('metrica_despues', 'modelo')
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModelForSeq2SeqLM.from_pretrained(name)

    # Translate the test set one sentence at a time
    print('Comienza a predecir')
    wayuu_pred = []
    for i in trange(0, len(df_test.values)):
        translated = translate(df_test.es[i])
        wayuu_pred.append(translated)
    # translate() returns a list per call; keep its first hypothesis
    wayuu_pred2 = [w[0] for w in wayuu_pred]

    # Results
    print(f'\n\n ---------------------- Resultados {model_name}\n')
    print(bleu_calc.corpus_score(wayuu_pred2, [df_test['fi'].tolist()]))
    print(chrf_calc.corpus_score(wayuu_pred2, [df_test['fi'].tolist()]))

    # Save. The path must be an f-string: without the prefix every model
    # wrote to the literal file 'resultados_traducciones_{name}_wayuu.pickle'
    # and overwrote the previous model's predictions.
    name = name.split('/')[-1]
    with open(model_path + f'/resultados_traducciones_{name}_wayuu.pickle', 'wb') as file:
        pickle.dump(wayuu_pred, file)

Comienza a predecir


100%|██████████| 7925/7925 [4:48:02<00:00,  2.18s/it]     




 ---------------------- Resultados ../results/wayuu/metrica_despues_COMP_NDU_10_2e-05.pickle

BLEU = 2.65 24.2/6.2/1.8/0.5 (BP = 0.769 ratio = 0.792 hyp_len = 86944 ref_len = 109834)
chrF2++ = 24.93
Comienza a predecir


100%|██████████| 7925/7925 [2:44:16<00:00,  1.24s/it]     




 ---------------------- Resultados ../results/wayuu/metrica_despues_COMP_NDU_5_0.0002.pickle

BLEU = 6.47 30.4/11.1/4.8/2.3 (BP = 0.832 ratio = 0.845 hyp_len = 92780 ref_len = 109834)
chrF2++ = 30.82
Comienza a predecir


100%|██████████| 7925/7925 [6:44:44<00:00,  3.06s/it]     




 ---------------------- Resultados ../results/wayuu/metrica_despues_COMP_NDU_3_0.0002.pickle

BLEU = 5.33 28.5/9.7/3.8/1.6 (BP = 0.824 ratio = 0.838 hyp_len = 92056 ref_len = 109834)
chrF2++ = 29.27
Comienza a predecir


100%|██████████| 7925/7925 [3:05:25<00:00,  1.40s/it]  




 ---------------------- Resultados ../results/wayuu/metrica_despues_COMP_NDU_3_2e-05.pickle

BLEU = 0.89 17.6/2.7/0.4/0.1 (BP = 0.791 ratio = 0.810 hyp_len = 88985 ref_len = 109834)
chrF2++ = 20.20
Comienza a predecir


100%|██████████| 7925/7925 [5:51:16<00:00,  2.66s/it]     




 ---------------------- Resultados ../results/wayuu/metrica_despues_COMP_NDU_5_2e-05.pickle

BLEU = 1.49 20.6/4.0/0.8/0.2 (BP = 0.775 ratio = 0.797 hyp_len = 87567 ref_len = 109834)
chrF2++ = 22.22
Comienza a predecir


100%|██████████| 7925/7925 [5:09:23<00:00,  2.34s/it]    




 ---------------------- Resultados ../results/wayuu/metrica_despues_COMP_NDU_10_0.0002.pickle

BLEU = 7.53 32.1/12.4/5.8/3.0 (BP = 0.826 ratio = 0.839 hyp_len = 92163 ref_len = 109834)
chrF2++ = 31.98


# Revisión otros modelos - No constitución

In [17]:
path_data = '../data/wayuu'
d = 'COMP_NC'
print(d)

# Load the data
dataset_dict = preprocess_dataset(f'{path_data}/{d}.csv', lang_output = 'fi')

# One DataFrame of sentence pairs per split
df_test, df_train, df_validation = (
    pd.DataFrame(dataset_dict[split]['translation'])
    for split in ('test', 'train', 'validation')
)

COMP_NC


In [19]:
# Metrics files of the runs trained on the COMP_NC corpus.
keys_nc = [path for path in eval_blues if 'NC_' in path]
keys_nc

['../results/wayuu/metrica_despues_COMP_NC_3_0.0002.pickle',
 '../results/wayuu/metrica_despues_COMP_NC_10_0.0002.pickle',
 '../results/wayuu/metrica_despues_COMP_NC_5_2e-05.pickle',
 '../results/wayuu/metrica_despues_COMP_NC_5_0.0002.pickle',
 '../results/wayuu/metrica_despues_COMP_NC_3_2e-05.pickle',
 '../results/wayuu/metrica_despues_COMP_NC_10_2e-05.pickle']

In [21]:
for model_name in keys_nc:
    # Load the model and tokenizer saved next to the metrics file
    name = model_name.split('.pickle')[0].replace('metrica_despues', 'modelo')
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModelForSeq2SeqLM.from_pretrained(name)

    # Translate the test set one sentence at a time
    print('Comienza a predecir')
    wayuu_pred = []
    for i in trange(len(df_test)):
        translated = translate(df_test.es[i])
        wayuu_pred.append(translated)
    # translate() returns a list per call; keep its first hypothesis
    wayuu_pred2 = [hypotheses[0] for hypotheses in wayuu_pred]

    # Results
    print(f'\n\n ---------------------- Resultados {model_name}\n')
    for metric in (bleu_calc, chrf_calc):
        print(metric.corpus_score(wayuu_pred2, [df_test['fi'].tolist()]))

    # Save the raw predictions under a per-model file name
    name = name.split('/')[-1]
    with open(f'{model_path}/resultados_traducciones_{name}_wayuu.pickle', 'wb') as file:
        pickle.dump(wayuu_pred, file)

Comienza a predecir


100%|██████████| 8129/8129 [2:22:56<00:00,  1.06s/it]  




 ---------------------- Resultados ../results/wayuu/metrica_despues_COMP_NC_3_0.0002.pickle

BLEU = 5.02 28.8/9.7/3.8/1.6 (BP = 0.777 ratio = 0.799 hyp_len = 89152 ref_len = 111620)
chrF2++ = 28.88
Comienza a predecir


100%|██████████| 8129/8129 [2:22:13<00:00,  1.05s/it]  




 ---------------------- Resultados ../results/wayuu/metrica_despues_COMP_NC_10_0.0002.pickle

BLEU = 7.48 31.0/12.0/5.6/2.9 (BP = 0.849 ratio = 0.859 hyp_len = 95933 ref_len = 111620)
chrF2++ = 31.90
Comienza a predecir


100%|██████████| 8129/8129 [2:35:50<00:00,  1.15s/it]  




 ---------------------- Resultados ../results/wayuu/metrica_despues_COMP_NC_5_2e-05.pickle

BLEU = 1.49 20.8/4.1/0.9/0.2 (BP = 0.739 ratio = 0.768 hyp_len = 85680 ref_len = 111620)
chrF2++ = 21.88
Comienza a predecir


100%|██████████| 8129/8129 [2:23:39<00:00,  1.06s/it]  




 ---------------------- Resultados ../results/wayuu/metrica_despues_COMP_NC_5_0.0002.pickle

BLEU = 6.26 29.6/10.8/4.7/2.1 (BP = 0.832 ratio = 0.844 hyp_len = 94259 ref_len = 111620)
chrF2++ = 30.60
Comienza a predecir


100%|██████████| 8129/8129 [2:58:08<00:00,  1.31s/it]  




 ---------------------- Resultados ../results/wayuu/metrica_despues_COMP_NC_3_2e-05.pickle

BLEU = 0.87 17.8/2.7/0.5/0.1 (BP = 0.763 ratio = 0.787 hyp_len = 87866 ref_len = 111620)
chrF2++ = 20.01
Comienza a predecir


  0%|          | 23/8129 [00:32<3:08:53,  1.40s/it]


KeyboardInterrupt: 