# Global Setup

In [12]:
import collections

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
from datasets import load_dataset
import numpy as np
import torch
import evaluate

DATASET_NAME = "opus100"
SPLIT = "train" # train or validation
SOURCE_LANG = "en"
TGT_LANG = "es"
PREFIX = "translate English to Spanish: "
METRIC_TYPE = "sacrebleu"
CKPT_DIR = "opus100-ckpt"
CKPT_NUM = 99000
CKPT = f"./{CKPT_DIR}/checkpoint-{CKPT_NUM}"

# Load the model"./opus100-ckpt/checkpoint-99000"
model = AutoModelForSeq2SeqLM.from_pretrained(CKPT)

# Load the dataset
opus100 = load_dataset("opus100", f"{SOURCE_LANG}-{TGT_LANG}")
opus100 = opus100[SPLIT].train_test_split(test_size=0.2)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(CKPT)
metric = evaluate.load(METRIC_TYPE)

Found cached dataset opus100 (/home/mathadoor/.cache/huggingface/datasets/opus100/en-es/0.0.0/256f3196b69901fb0c79810ef468e2c4ed84fbd563719920b1ff1fdc750f7704)


  0%|          | 0/3 [00:00<?, ?it/s]

# Helper Functions

In [13]:
def preprocess_function(examples):
  # Prefix for tranlation is added in the beginning to the source text
  inputs = [PREFIX + example[SOURCE_LANG] for example in examples["translation"]]
  targets = [example[TGT_LANG] for example in examples["translation"]]
  model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)

  return model_inputs


def postprocess_text(preds, labels):

  # Post-processing: remove unnecessary whitespaces and combine bpe tokens
  preds = [pred.strip() for pred in preds]
  labels = [[label.strip()] for label in labels]

  return preds, labels

def compute_metrics(eval_preds):
  preds, labels = eval_preds
  if isinstance(preds, tuple):
    preds = preds[0]
  decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

  result = metric.compute(predictions=decoded_preds, references=decoded_labels)
  result = {"bleu": result["score"]}

  prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
  result["gen_len"] = np.mean(prediction_lens)
  result = {k: round(v, 4) for k, v in result.items()}
  return result

In [14]:
encoder_in = tokenizer(opus100["train"][0]["translation"]["en"], return_tensors="pt").input_ids
decoder_in = tokenizer(opus100["train"][0]["translation"]["es"], return_tensors="pt").input_ids


In [15]:
dir(model.decoder)

['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_auto_class',
 '_backward_compatibility_gradient_checkpointing',
 '_backward_hooks',
 '_backward_pre_hooks',
 '_buffers',
 '_call_impl',
 '_convert_head_mask_to_5d',
 '_create_repo',
 '_expand_inputs_for_generation',
 '_extract_past_from_model_output',
 '_forward_hooks',
 '_forward_hooks_with_kwargs',
 '_forward_pre_hooks',
 '_forward_pre_hooks_with_kwargs',
 '_from_config',
 '_get_backward_hooks',
 '_get_backward_pre_hooks',
 '_get_decoder_start_token_id',
 '_get_files_timestamps',
 '_get_logits_processor',
 '_get_logits_warpe

In [16]:
encoder_in

tensor([[12707,   107,   127,     7,    15,     6,    62,    31,    60,   365,
          3211,    55,     1]])

In [5]:
logits = model(input_ids=encoder_in, decoder_input_ids=decoder_in)["logits"]


In [6]:
probs = torch.softmax(logits, dim=2)


In [7]:
torch.sum(probs, dim=2)


tensor([[1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000]], grad_fn=<SumBackward1>)

In [25]:
from tqdm import tqdm
token_set = set()

i = 0
counts = collections.Counter([])
for i in tqdm(range(100)):
  data_point = opus100["train"]['translation'][i]
  input = tokenizer(data_point['en']).input_ids
  output = tokenizer(data_point['es']).input_ids
  counts.update(input)
  counts.update(output)

print(len(counts))


  0%|          | 0/100 [00:02<?, ?it/s]


KeyboardInterrupt: 

In [None]:
input

In [1]:
from model import TinyTransformer, TinyTransformerConfig


In [2]:
student_config = TinyTransformerConfig()


In [3]:
student_model = TinyTransformer(student_config)


In [4]:
out = student_model(input_ids=encoder_in, decoder_input_ids=decoder_in)


NameError: name 'encoder_in' is not defined

In [9]:
out.shape


NameError: name 'out' is not defined

In [10]:
decoder_in


NameError: name 'decoder_in' is not defined

In [11]:
torch.save(student_model, "test_transformer.ckpt")
