In [1]:
from google.colab import drive

import json
import time

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import numpy as np

# Mount Google Drive so the project resources below are reachable from the
# Colab runtime; force_remount=True re-mounts even if a mount already exists.
drive.mount('/content/drive', force_remount=True)
# Base directory (on the mounted drive) that all resource paths are built from.
root_filepath = '/content/drive/MyDrive/CS685-NLP-GroupProject/Code/singleNotebookFromPythonFilesStructure/'
# Flag read by DatasetHolder and by the translation loop to enable the
# CUDA cache flushing / memory reporting paths.
is_remote_execution = True

# NOTE: the former bare `torch.device("cuda")` statement was removed. It only
# constructed a device object and discarded it, so it selected nothing; the
# model and inputs are moved to the GPU explicitly with `.to("cuda")`.


class DatasetHolder:
    """Mutable container for a parsed source/target dataset.

    Stores the special vocabulary types, the source and target vocabularies
    with their counts, the encoded sentences (complete set plus train/test
    splits) and the longest observed source/target sequence lengths.

    The accessor names and attribute names are part of the stored object's
    contract (instances are restored elsewhere in this file via
    ``torch.load``), so they are intentionally left unchanged.

    The encoding setters read the module-level flag ``is_remote_execution``
    and, when it is truthy, call ``torch.cuda.empty_cache()`` after replacing
    the stored value.
    """

    def __init__(self):
        # Special vocabulary entries.
        self.unknown_vocabulary_type = None
        self.padding_vocabulary_type = None
        self.end_of_sequence_type = None
        # Target-side vocabulary, its lazily built numpy view, and counts.
        self.target_vocab = None
        self.target_vocab_array = None
        self.target_vocab_counts = None
        # Source-side vocabulary, its lazily built numpy view, and counts.
        self.source_vocab = None
        self.source_vocab_array = None
        self.source_vocab_counts = None
        # Encoded sentences: full set and train/test splits.
        self.target_encodings = None
        self.target_encodings_train = None
        self.target_encodings_test = None
        self.source_encodings = None
        self.source_encodings_train = None
        self.source_encodings_test = None
        # Longest sequence lengths observed so far.
        self.max_src_seq_obs = 0
        self.max_tgt_seq_obs = 0

    @staticmethod
    def _release_cuda_cache():
        """Call ``torch.cuda.empty_cache()`` when ``is_remote_execution`` is set."""
        if is_remote_execution:
            torch.cuda.empty_cache()

    # ----------------------------------------------------------------- #
    # Special vocabulary types
    # ----------------------------------------------------------------- #
    def get_unknown_vocabulary_type(self):
        """Return the stored unknown-vocabulary type."""
        return self.unknown_vocabulary_type

    def set_unknown_vocabulary_type(self, unknown_vocabulary_type):
        """Store the unknown-vocabulary type."""
        self.unknown_vocabulary_type = unknown_vocabulary_type

    def get_padding_vocabulary_type(self):
        """Return the stored padding-vocabulary type."""
        return self.padding_vocabulary_type

    def set_padding_vocabulary_type(self, padding_vocabulary_type):
        """Store the padding-vocabulary type."""
        self.padding_vocabulary_type = padding_vocabulary_type

    def get_end_of_sequence_vocabulary_type(self):
        """Return the stored end-of-sequence type."""
        return self.end_of_sequence_type

    def set_end_of_sequence_vocabulary_type(self, end_of_sequence_type):
        """Store the end-of-sequence type."""
        self.end_of_sequence_type = end_of_sequence_type

    # ----------------------------------------------------------------- #
    # Target vocabulary
    # ----------------------------------------------------------------- #
    def get_target_vocab(self):
        """Return the target vocabulary as it was stored."""
        return self.target_vocab

    def set_target_vocab(self, target_vocab):
        """Store the target vocabulary and drop the cached numpy view.

        The cached array is reset so that ``get_target_vocab_numpy`` never
        returns an array built from a previously stored vocabulary.
        """
        self.target_vocab = target_vocab
        self.target_vocab_array = None

    def get_target_vocab_numpy(self):
        """Return the target vocabulary as a numpy array (built once, cached)."""
        if self.target_vocab_array is None:
            self.target_vocab_array = np.array(self.target_vocab)
        return self.target_vocab_array

    def get_target_vocab_counts(self):
        """Return the stored target vocabulary counts."""
        return self.target_vocab_counts

    def set_target_vocab_counts(self, target_vocab_counts):
        """Store the target vocabulary counts."""
        self.target_vocab_counts = target_vocab_counts

    # ----------------------------------------------------------------- #
    # Source vocabulary
    # ----------------------------------------------------------------- #
    def get_source_vocab(self):
        """Return the source vocabulary as it was stored."""
        return self.source_vocab

    def set_source_vocab(self, source_vocab):
        """Store the source vocabulary and drop the cached numpy view.

        The cached array is reset so that ``get_source_vocab_numpy`` never
        returns an array built from a previously stored vocabulary.
        """
        self.source_vocab = source_vocab
        self.source_vocab_array = None

    def get_source_vocab_numpy(self):
        """Return the source vocabulary as a numpy array (built once, cached)."""
        if self.source_vocab_array is None:
            self.source_vocab_array = np.array(self.source_vocab)
        return self.source_vocab_array

    def get_source_vocab_counts(self):
        """Return the stored source vocabulary counts."""
        return self.source_vocab_counts

    def set_source_vocab_counts(self, source_vocab_counts):
        """Store the source vocabulary counts."""
        self.source_vocab_counts = source_vocab_counts

    # ----------------------------------------------------------------- #
    # Encodings
    #
    # Each setter simply rebinds the attribute: rebinding already drops the
    # reference to the previous value, so the former explicit `del` (which
    # raised AttributeError when the attribute was absent) is not needed.
    # The CUDA cache is flushed afterwards when running remotely.
    # ----------------------------------------------------------------- #
    def get_target_encodings(self):
        """Return the stored target encodings."""
        return self.target_encodings

    def set_target_encodings(self, target_encodings):
        """Replace the target encodings, then flush the CUDA cache if enabled."""
        self.target_encodings = target_encodings
        self._release_cuda_cache()

    def get_source_encodings(self):
        """Return the stored source encodings."""
        return self.source_encodings

    def set_source_encodings(self, source_encodings):
        """Replace the source encodings, then flush the CUDA cache if enabled."""
        self.source_encodings = source_encodings
        self._release_cuda_cache()

    def get_target_encodings_train(self):
        """Return the stored target training encodings."""
        return self.target_encodings_train

    def set_target_encodings_train(self, target_encodings_train):
        """Replace the target training encodings, then flush the CUDA cache if enabled."""
        self.target_encodings_train = target_encodings_train
        self._release_cuda_cache()

    def get_source_encodings_train(self):
        """Return the stored source training encodings."""
        return self.source_encodings_train

    def set_source_encodings_train(self, source_encodings_train):
        """Replace the source training encodings, then flush the CUDA cache if enabled."""
        self.source_encodings_train = source_encodings_train
        self._release_cuda_cache()

    def get_target_encodings_test(self):
        """Return the stored target test encodings."""
        return self.target_encodings_test

    def set_target_encodings_test(self, target_encodings_test):
        """Replace the target test encodings, then flush the CUDA cache if enabled."""
        self.target_encodings_test = target_encodings_test
        self._release_cuda_cache()

    def get_source_encodings_test(self):
        """Return the stored source test encodings."""
        return self.source_encodings_test

    def set_source_encodings_test(self, source_encodings_test):
        """Replace the source test encodings, then flush the CUDA cache if enabled."""
        self.source_encodings_test = source_encodings_test
        self._release_cuda_cache()

    # ----------------------------------------------------------------- #
    # Observed sequence lengths
    # ----------------------------------------------------------------- #
    def get_max_src_seq_obs(self):
        """Return the stored maximum observed source sequence length."""
        return self.max_src_seq_obs

    def set_max_src_seq_obs(self, max_src_seq_obs):
        """Store the maximum observed source sequence length."""
        self.max_src_seq_obs = max_src_seq_obs

    def get_max_tgt_seq_obs(self):
        """Return the stored maximum observed target sequence length."""
        return self.max_tgt_seq_obs

    def set_max_tgt_seq_obs(self, max_tgt_seq_obs):
        """Store the maximum observed target sequence length."""
        self.max_tgt_seq_obs = max_tgt_seq_obs



# Restore the previously parsed dataset. The file holds a pickled
# DatasetHolder instance, so the class must be defined before this call.
# SECURITY NOTE(review): unpickling executes arbitrary code from the file;
# this is only acceptable because the file is a project-generated artifact.
# `weights_only=False` is passed explicitly because newer PyTorch releases
# default to `weights_only=True`, which refuses to unpickle custom classes.
dataset_holder: DatasetHolder = torch.load(
    root_filepath + "resources/parsed_datasets/setimes/setimes_parsed-1715586974",
    weights_only=False,
)

# NLLB tokenizer configured with `tur_Latn` as the source language, and the
# matching seq2seq model moved to the GPU in evaluation mode.
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M", src_lang="tur_Latn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M").to("cuda").eval()

def _decode_sentences(vocab_array, encodings):
    """Turn index encodings back into strings using ``vocab_array``.

    Each encoding is detached, moved to the CPU and flattened; its values are
    used as indices into ``vocab_array`` and the selected entries are joined
    without a separator. The final character of the joined string is dropped
    (presumably the end-of-sequence marker -- TODO confirm against the parser).
    """
    return [
        "".join(np.take(vocab_array, encoding.detach().to(device="cpu").flatten().numpy()))[:-1]
        for encoding in encodings
    ]


# Rebuild the plain-text test sentences for both sides of the corpus.
source_sentences = _decode_sentences(
    dataset_holder.get_source_vocab_numpy(), dataset_holder.get_source_encodings_test())
target_sentences = _decode_sentences(
    dataset_holder.get_target_vocab_numpy(), dataset_holder.get_target_encodings_test())
assert len(source_sentences) == len(target_sentences)

# Token id that forces generation to start in `eng_Latn`. Looked up once via
# convert_tokens_to_ids because `tokenizer.lang_code_to_id` is deprecated.
english_bos_token_id = tokenizer.convert_tokens_to_ids('eng_Latn')

outputs = list()
sentence_count = len(source_sentences)
for i, (source_sentence, target_sentence) in enumerate(zip(source_sentences, target_sentences)):
    tokenization = tokenizer(source_sentence, return_tensors="pt").to("cuda")
    generated_output = model.generate(**tokenization, forced_bos_token_id=english_bos_token_id)
    # Drop the input tensors right away and hand cached blocks back to CUDA.
    del tokenization
    torch.cuda.empty_cache()
    # Strip the end-of-sequence / language-code markers from the decoded text.
    # NOTE(review): this leaves a leading space in the translation;
    # batch_decode(..., skip_special_tokens=True) would avoid that, but it
    # changes the stored baseline text, so it is left for a deliberate decision.
    decoded_output = tokenizer.batch_decode(generated_output)[0].replace('</s>eng_Latn', '').replace('</s>', '')
    outputs.append({'source': source_sentence, 'target': target_sentence, 'baseline': decoded_output})
    print(f"completed processing {i+1} of {sentence_count} at {time.time()}")
    if is_remote_execution and i % 100 == 0:
        # Periodic sample translation plus a CUDA memory report.
        print(f"Translation{i}: {outputs[i]}")
        print("Memory usage summary:")
        print(f"{torch.cuda.memory_summary()}")
        # reset_peak_memory_stats() resets every peak counter; the deprecated
        # reset_max_memory_allocated()/reset_max_memory_cached() wrappers that
        # used to precede it were redundant.
        torch.cuda.reset_peak_memory_stats()

# Persist all collected translations as one JSON array. The context manager
# guarantees the file is closed even if serialisation or the write fails, and
# plain "w" is enough because the file is never read back here.
with open(
        root_filepath + "resources/baseline_translations/setimes/setimes_parsed-1715586974-NLLB.json",
        "w", encoding="utf-8") as output_file:
    output_file.write(json.dumps(outputs))



Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

the `lang_code_to_id` attribute is deprecated. The logic is natively handled in the `tokenizer.adder_tokens_decoder` this attribute will be removed in `transformers` v4.38


completed processing 1 of 9084 at 1715971490.2717373
Translation0: {'source': "BH Başbakanı USS Harry S. Truman'ı Ziyaret Etti", 'target': 'BiH Prime Minister Visits USS Harry S. Truman', 'baseline': ' The Prime Minister of the United Kingdom visited the USS Harry S. Truman'}
Memory usage summary:
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   2362 MiB |   2369 MiB |   4171 MiB |   1808 MiB |
|       from large pool |   2360 MiB |   2360 MiB |   2360 MiB |      0 MiB |
|       from small pool |      1 MiB |      8 MiB |   1810 MiB |   1808 MiB |
|---------------------------------------------------------------------------|



Streaming output truncated to the last 5000 lines.
completed processing 5735 of 9084 at 1715973821.4737961
completed processing 5736 of 9084 at 1715973822.0919003
completed processing 5737 of 9084 at 1715973822.6663759
completed processing 5738 of 9084 at 1715973823.3852081
completed processing 5739 of 9084 at 1715973823.9040952
completed processing 5740 of 9084 at 1715973824.488824
completed processing 5741 of 9084 at 1715973824.9803872
completed processing 5742 of 9084 at 1715973825.6475637
completed processing 5743 of 9084 at 1715973826.2131426
completed processing 5744 of 9084 at 1715973826.7342162
completed processing 5745 of 9084 at 1715973827.201504
completed processing 5746 of 9084 at 1715973827.8837397
completed processing 5747 of 9084 at 1715973828.4361265
completed processing 5748 of 9084 at 1715973829.100638
completed processing 5749 of 9084 at 1715973829.6861944
completed processing 5750 of 9084 at 1715973830.0674627
completed processing 5751 of 9084 at 17159