## GPT-2

In [1]:
from transformers import GPT2Tokenizer, AutoModel, GPT2TokenizerFast

In [2]:
# Fetch the pretrained "gpt2" checkpoint: the bare model plus both tokenizer
# classes (regular and "Fast") so their outputs can be compared below.
gpt2_model = AutoModel.from_pretrained("gpt2")
gpt2_tokenizer_fast = GPT2TokenizerFast.from_pretrained("gpt2")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [3]:
# Sample sentence fed to both GPT-2 tokenizers in the following cells.
text = "I love my dog, because he is brave."
# Disabled experiment: wrap the sentence in literal "[SOS]" / "[EOS]" markers.
# text = "[SOS] " + text + " [EOS]"
text

'I love my dog, because he is brave.'

In [4]:
# Encode the same sentence with the regular and the fast GPT-2 tokenizer.
# Special tokens are switched off so only the raw sub-word ids are returned.
encoding, encoding_fast = (
    gpt2_tokenizer.encode(text, add_special_tokens=False),
    gpt2_tokenizer_fast.encode(text, add_special_tokens=False),
)

In [5]:
# Ids produced by the regular GPT-2 tokenizer.
encoding

[40, 1842, 616, 3290, 11, 780, 339, 318, 14802, 13]

In [6]:
# Ids produced by the fast GPT-2 tokenizer (expected to match `encoding`).
encoding_fast

[40, 1842, 616, 3290, 11, 780, 339, 318, 14802, 13]

In [7]:
# Round-trip check: turn each id list back into text with the tokenizer that
# produced it, then show both strings side by side.
decoding, decoding_fast = (
    gpt2_tokenizer.decode(encoding),
    gpt2_tokenizer_fast.decode(encoding_fast),
)
decoding, decoding_fast

('I love my dog, because he is brave.', 'I love my dog, because he is brave.')

In [8]:
from transformers import GPT2Tokenizer

# Build a GPT-2 tokenizer for the batch-padding demo.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Reuse the end-of-sequence (EOS) token as the padding token so that
# `padding=True` below has a token to pad with. Consequence: pad ids and real
# EOS ids share the same value; only the attention mask tells them apart.
tokenizer.pad_token = tokenizer.eos_token

# Sentences of different token lengths, so padding is actually exercised.
sentences = [
    "This is an example sentence.",
    "Another example sentence.",
    "A short one.",
]

# Encode and pad the whole batch in one call. Calling the tokenizer object
# directly is the supported API; `batch_encode_plus` is deprecated.
encoded_batch = tokenizer(
    sentences,
    padding=True,  # Pad every row to the longest sentence in the batch
    return_tensors="pt",  # Returns PyTorch tensors (use "tf" for TensorFlow tensors)
)

# Padded input ids and the matching attention mask (1 = real token, 0 = pad).
input_ids = encoded_batch["input_ids"]
attention_masks = encoded_batch["attention_mask"]

print("Input IDs:", input_ids)
print("Attention masks:", attention_masks)

# Decode the batch back to text. Padding disappears because
# `skip_special_tokens=True` drops special tokens (pad == EOS here); the
# attention mask is not passed to, or used by, `batch_decode`.
tokenizer.batch_decode(input_ids, skip_special_tokens=True)

Input IDs: tensor([[ 1212,   318,   281,  1672,  6827,    13],
        [ 6610,  1672,  6827,    13, 50256, 50256],
        [   32,  1790,   530,    13, 50256, 50256]])
Attention masks: tensor([[1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 0, 0]])


['This is an example sentence.', 'Another example sentence.', 'A short one.']

### Test word-label to token distribution

In [11]:
from src.utils.text_processing import distribute_word_label_to_token

In [14]:
# Try the project helper that spreads one label per word over that word's
# GPT-2 tokens. The sentence has eight whitespace-separated words and eight
# word-level labels.
text = "I love my dog, because he is brave."
labels_per_word = [1, 2, 3, 4, 5, 6, 7, 8]
# NOTE(review): in the recorded output the helper returns 11 token ids (an
# extra id 837 follows the comma id 11, and the first id is 314 rather than the
# 40 produced by plain `encode`), 9 word groups and 9 token labels for these
# 8 words - confirm that this alignment is what the helper is meant to produce.
tokens, token_labels, word_to_token, grouped_tokens = distribute_word_label_to_token(
    text, labels_per_word, gpt2_tokenizer, "gpt2"
)
# Display all four return values together.
tokens, token_labels, word_to_token, grouped_tokens

([314, 1842, 616, 3290, 11, 837, 780, 339, 318, 14802, 13],
 [1, 2, 3, 4, 4, 5, 6, 7, 8],
 [[0], [1], [2], [3, 4], [5], [6], [7], [8], [9, 10]],
 [[314], [1842], [616], [3290, 11], [837], [780], [339], [318], [14802, 13]])

## BERT

In [51]:
# bert tokenizer and fast
from transformers import BertTokenizer, BertTokenizerFast

# Uncased BERT tokenizers (fast and regular class) for side-by-side checks.
bert_tokenizer_fast = BertTokenizerFast.from_pretrained("bert-base-uncased", do_lower_case=True)
bert_tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
    do_lower_case=True,
)

In [52]:
# Same sample sentence as in the GPT-2 section, now for the BERT tokenizers.
text = "I love my dog, because he is brave."
# Disabled experiment: wrap the sentence in literal "[SOS]" / "[EOS]" markers.
# text = "[SOS] " + text + " [EOS]"
text

'I love my dog, because he is brave.'

In [53]:
# Encode the sentence with both BERT tokenizers, this time letting the
# tokenizer add its special tokens around the sequence.
encodings, encodings_fast = (
    bert_tokenizer.encode(text, add_special_tokens=True),
    bert_tokenizer_fast.encode(text, add_special_tokens=True),
)

In [54]:
# Ids from the regular BERT tokenizer; the recorded output starts with 101 and
# ends with 102, the ids added by `add_special_tokens=True`.
encodings

[101, 1045, 2293, 2026, 3899, 1010, 2138, 2002, 2003, 9191, 1012, 102]

In [12]:
from transformers import BertTokenizer

# Build an uncased BERT tokenizer for the batch-padding demo.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Sentences of different token lengths, so padding is actually exercised.
sentences = [
    "This is an example sentence.",
    "Another example sentence.",
    "A short one.",
]

# Encode and pad the whole batch in one call. Calling the tokenizer object
# directly is the supported API; `batch_encode_plus` is deprecated.
encoded_batch = tokenizer(
    sentences,
    padding=True,  # Pad every row to the longest sentence in the batch
    return_tensors="pt",  # Returns PyTorch tensors (use "tf" for TensorFlow tensors)
)

# Padded input ids and the matching attention mask (1 = real token, 0 = pad).
input_ids = encoded_batch["input_ids"]
attention_masks = encoded_batch["attention_mask"]

print("Input IDs:", input_ids)
print("Attention masks:", attention_masks)

# Decode back to text; `skip_special_tokens=True` removes the special tokens
# (including padding) from the decoded strings.
decoding = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
print(decoding)

Input IDs: tensor([[ 101, 2023, 2003, 2019, 2742, 6251, 1012,  102],
        [ 101, 2178, 2742, 6251, 1012,  102,    0,    0],
        [ 101, 1037, 2460, 2028, 1012,  102,    0,    0]])
Attention masks: tensor([[1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0]])
['this is an example sentence.', 'another example sentence.', 'a short one.']


In [1]:
from transformers import AutoTokenizer

# Fast GPT-2 tokenizer resolved through AutoTokenizer.
tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=True)
# Reuse EOS as the padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

# Three demo sentences, repeated once so the dataset holds six samples.
sentences = 2 * [
    "This is an example sentence.",
    "Another example sentence.",
    "A short one.",
]

# One dummy label (1) per whitespace-separated word of the matching sentence;
# each inner list is a distinct object.
labels = [[1] * word_count for word_count in (5, 3, 3, 5, 3, 3)]

Downloading (…)/main/tokenizer.json: 0.00B [00:00, ?B/s]

In [2]:
# NOTE(review): the recorded run of this cell failed with
# "ModuleNotFoundError: No module named 'src'". The import presumably only
# resolves when the project root is importable (kernel started from the
# repository root, or the package installed) - confirm the intended setup.
from src.data.components.datasets import encode_and_pad_batch, TokenTaggingDataset

# Wrap the demo sentences and their per-word labels in the project's dataset.
# NOTE(review): other cells in this notebook call TokenTaggingDataset with
# different arguments (an extra cfg object, or a model-name string plus keyword
# options); this three-argument form may be stale - verify against the current
# signature.
dataset = TokenTaggingDataset(sentences, labels, tokenizer)

ModuleNotFoundError: No module named 'src'

In [20]:
from torch.utils.data import DataLoader

# Serve the dataset in shuffled mini-batches of two samples. Each list of
# samples is handed to the project's `encode_and_pad_batch` together with the
# tokenizer (looked up when the collate function runs).
dataloader = DataLoader(
    dataset,
    collate_fn=lambda samples: encode_and_pad_batch(samples, tokenizer),
    shuffle=True,
    batch_size=2,
)

In [21]:
from typing import List, Tuple, Union
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, BertTokenizer, AutoTokenizer, AutoModel
from src.data.components.datasets import encode_and_pad_batch, TokenTaggingDataset
from omegaconf import DictConfig, OmegaConf


# End-to-end check: sentences + word-level labels -> TokenTaggingDataset ->
# padded batches -> BERT hidden states.
# `TokenTaggingDataset` and `encode_and_pad_batch` are the project helpers
# imported from src.data.components.datasets at the top of this cell.

# Tokenizer under test. Alternatives tried earlier: GPT2Tokenizer (with
# pad_token = eos_token) and the regular BertTokenizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)

model = AutoModel.from_pretrained("bert-base-uncased")

# Minimal config object handed to the dataset; it only names the model.
cfg = DictConfig({"model": "bert-base-uncased"})

sentences = [
    "This is an example sentence.",
    "Another example sentence.",
    "A short one.",
    "What a long sentence this here is incredible.",
]
# One dummy label per whitespace-separated word (5, 3, 3 and 8 words).
labels = [[1, 1, 1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]

dataset = TokenTaggingDataset(sentences, labels, tokenizer, cfg)
dataloader = DataLoader(
    dataset,
    batch_size=2,
    collate_fn=lambda batch: encode_and_pad_batch(batch, tokenizer),
)

# NOTE(review): in the recorded output the label rows are padded with -999 and
# start at position 0, while the input ids start with the [CLS] id - confirm
# that labels and tokens are aligned the way the training code expects.
for input_ids, attention_masks, padded_labels in dataloader:
    print("Input IDs:\n", input_ids)
    print(
        "Decoded input:\n", tokenizer.batch_decode(input_ids, skip_special_tokens=False)
    )
    print("Attention masks:\n", attention_masks)
    print("Padded labels:\n", padded_labels)
    # Inference only: disable autograd so the forward pass does not build a
    # graph or keep activations alive for a backward pass that never happens.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_masks).last_hidden_state
    print("Outputs:\n", outputs)
    print("Outputs shape:\n", outputs.shape)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


text 
 This is an example sentence.
word encodings 
 [[2023], [2003], [2019], [2742], [6251, 1012]]
word_to_token
 [[0], [1], [2], [3], [4, 5]]
#tokens 
 6
algined tokens 
 [2023, 2003, 2019, 2742, 6251, 1012]
aligned decoded
 this is an example sentence.
text 
 Another example sentence.
word encodings 
 [[2178], [2742], [6251, 1012]]
word_to_token
 [[0], [1], [2, 3]]
#tokens 
 4
algined tokens 
 [2178, 2742, 6251, 1012]
aligned decoded
 another example sentence.
Input IDs:
 tensor([[ 101, 2023, 2003, 2019, 2742, 6251, 1012,  102],
        [ 101, 2178, 2742, 6251, 1012,  102,    0,    0]])
Decoded input:
 ['[CLS] this is an example sentence. [SEP]', '[CLS] another example sentence. [SEP] [PAD] [PAD]']
Attention masks:
 tensor([[1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0]])
Padded labels:
 tensor([[   1,    1,    1,    1,    1,    1, -999, -999],
        [   1,    1,    1,    1, -999, -999, -999, -999]])
Outputs:
 tensor([[[-0.3774, -0.3350, -0.3206,  ..., -0.5255,  0.2590

## Unified tokenizer function

In [7]:
from src.data.components.datasets import tokenize_text_with_labels
from transformers import BertTokenizer, GPT2Tokenizer

#### Examples with different models 

In [8]:
# Shared inputs for the model-specific examples below.
# Six whitespace-separated words and one label per word.
text = "Hello, world! This is a test."
labels = [0, 1, 2, 2, 2, 5]
# Options forwarded unchanged to `tokenize_text_with_labels`.
# In the recorded GPT-2 output, with score_first_token=True each word's label
# sits on its first token and the remaining tokens get -1 with mask 0.
score_first_token = True
# NOTE(review): meaning of the next two options is not visible from this
# notebook - presumably labels relative to the previous `n_prev` words; confirm
# against the helper's documentation.
relative_to_prev = False
n_prev = 3

In [9]:
# GPT-2: align the word-level labels with GPT-2 sub-word tokens.
model_type = "gpt2"
# This tokenizer is only used at the end of the cell to turn ids back into
# text; it is not passed to `tokenize_text_with_labels`.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", add_prefix_space=True)

input_text, tokenized_text, tokenized_labels, token_ids, mask = tokenize_text_with_labels(
    text,
    labels,
    model_type,
    n_prev=n_prev,
    relative_to_prev=relative_to_prev,
    score_first_token=score_first_token,
)

# Show every returned field, then the text reconstructed from the token ids.
print("Input text:", input_text)
print("Tokenized text:", tokenized_text)
print("Tokenized labels:", tokenized_labels)
print("Token IDs:", token_ids)
print("Mask:", mask)
tokenizer.decode(token_ids)

Input text: Hello, world! This is a test.
Tokenized text: ['ĠHello', ',', 'Ġworld', '!', 'ĠThis', 'Ġis', 'Ġa', 'Ġtest', '.']
Tokenized labels: [0, -1, 1, -1, 2, 2, 2, 5, -1]
Token IDs: [18435, 11, 995, 0, 770, 318, 257, 1332, 13]
Mask: [1, 0, 1, 0, 1, 1, 1, 1, 0]


' Hello, world! This is a test.'

In [74]:
# BERT (cased): align the word-level labels with BERT word-piece tokens.
model_type = "bert-cased"
# This tokenizer is only used at the end of the cell to turn ids back into
# text; it is not passed to `tokenize_text_with_labels`.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased", add_special_tokens=True)


# NOTE(review): the recorded run of this cell raised
# "ValueError: too many values to unpack (expected 5)", i.e. the helper
# returned more than five values at that point. Confirm its current return
# signature and extend this unpacking accordingly.
(
    input_text,
    tokenized_text,
    tokenized_labels,
    token_ids,
    mask,
) = tokenize_text_with_labels(
    text,
    labels,
    model_type,
    score_first_token=score_first_token,
    relative_to_prev=relative_to_prev,
    n_prev=n_prev,
)
print("Input text:", input_text)
print("Tokenized text:", tokenized_text)
print("Tokenized labels:", tokenized_labels)
print("Token IDs:", token_ids)
print("Mask:", mask)
# `skip_special_tokens` is the decode() flag that drops special tokens such as
# [CLS]/[SEP]; the previously used `ignore_special_tokens` keyword is not a
# decode() option and had no effect.
tokenizer.decode(token_ids, skip_special_tokens=True)

ValueError: too many values to unpack (expected 5)

## Test dataset

In [83]:
# Cased BERT tokenizer, loaded here to look up the id of its [SEP] token.
# NOTE(review): `add_special_tokens` is normally an argument of the encode
# calls rather than of `from_pretrained` - confirm it has any effect here.
bert_tokenizer = BertTokenizer.from_pretrained(
    "bert-base-cased", add_special_tokens=True
)
# Displays the [SEP] id (102 in the recorded output).
bert_tokenizer.sep_token_id

102

In [70]:
from src.data.components.helsinki import HelsinkiProminenceExtractor
from src.data.components.datasets import TokenTaggingDataset
from torch.utils.data import DataLoader

In [65]:
import os

# Directory holding the Helsinki prosody corpus files. Set the
# HELSINKI_PROSODY_DATA_DIR environment variable to point at your own copy;
# the fallback is the original author's local checkout, kept so existing
# setups continue to work unchanged.
HELSINKI_DATA_DIR = os.environ.get(
    "HELSINKI_PROSODY_DATA_DIR",
    "/Users/lukas/Desktop/projects/MIT/prosody/prosody/repositories/helsinki-prosody/data",
)

# Read the "dev.txt" split and pull out the texts with their prominence values.
extractor = HelsinkiProminenceExtractor(HELSINKI_DATA_DIR, "dev.txt")
texts = extractor.get_all_texts()
prominences = extractor.get_all_real_prominence()

In [145]:
# Token-level dataset over the Helsinki dev texts and their prominence labels,
# tokenized for the "bert-cased" model type.
dataset = TokenTaggingDataset(
    texts,
    prominences,
    "bert-cased",
    n_prev=3,
    relative_to_prev=False,
    score_first_token=True,
)

In [146]:
from src.data.components.collators import collate_fn

In [147]:
from functools import partial

In [158]:
# Pre-bind the id that the collate function receives as `eos_token_id`. For
# BERT this is the [SEP] id; reading it from the tokenizer (102 for
# bert-base-cased) replaces the hard-coded magic number.
collator = partial(collate_fn, eos_token_id=bert_tokenizer.sep_token_id)

In [159]:
# Iterate the dataset in order, two samples per batch, collated by `collator`.
dataloader = DataLoader(
    dataset,
    collate_fn=collator,
    shuffle=False,
    batch_size=2,
)

In [164]:
# Print every field of the first seven batches (indices 0 through 6), then stop.
# `batch` stays bound to the last printed batch and is read by a later cell.
for i, batch in enumerate(dataloader):
    for field_name, field_value in batch.items():
        print(field_name, field_value)
    if i >= 6:
        break

input_text ["A 'JOLLY' ART CRITIC", 'There is a healthy bank holiday atmosphere about this book which is extremely pleasant']
tokenized_text [['[CLS]', 'A', "'", 'J', '##OL', '##L', '##Y', "'", 'AR', '##T', 'CR', '##IT', '##IC', '[SEP]'], ['[CLS]', 'There', 'is', 'a', 'healthy', 'bank', 'holiday', 'atmosphere', 'about', 'this', 'book', 'which', 'is', 'extremely', 'pleasant', '[SEP]']]
original_labels [[0.128, 2.454, 0.986, 0.233], [0.0, 0.164, 0.036, 2.144, 0.938, 0.091, 0.597, 0.162, 0.049, 0.669, 0.0, 0.038, 2.106, 0.076]]
tokenized_labels tensor([[-1.0000,  0.1281,  2.4531, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,
          0.9858, -1.0000,  0.2330, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000],
        [-1.0000,  0.0000,  0.1639,  0.0360,  2.1445,  0.9380,  0.0910,  0.5972,
          0.1620,  0.0490,  0.6689,  0.0000,  0.0380,  2.1055,  0.0760, -1.0000]],
       dtype=torch.float16)
input_ids tensor([[  101,   138,   112,   147, 13901,  2162,  3663,   112, 22133,  1942,
        

In [161]:
# Reloads the cased BERT tokenizer (same call as in an earlier cell of this
# section) so that the decoding below can run.
bert_tokenizer = BertTokenizer.from_pretrained(
    "bert-base-cased", add_special_tokens=True
)

In [162]:
# Turn the first row of the most recent batch back into text, dropping the
# special tokens.
decoded_text = bert_tokenizer.decode(batch["input_ids"][0].tolist(), skip_special_tokens=True)

In [163]:
# NOTE(review): the recorded result "A'JOLLY'ART CRITIC" has lost the spaces
# around the apostrophes of the input "A 'JOLLY' ART CRITIC", so decoding does
# not round-trip exactly - presumably due to the tokenizer's whitespace
# clean-up; confirm if exact text recovery matters.
decoded_text

"A'JOLLY'ART CRITIC"