In [62]:
from importlib import reload

import datasets
import tokenizers
import transformers

import src.models.components.feature_extractor_dinov2
import src.models.components.sign_language_net

In [2]:
rwth_phoenix = datasets.load_dataset('lukasbraach/rwth_phoenix_weather_2014', 'multisigner', streaming=True)
rwth_phoenix_pretrain = datasets.load_dataset('lukasbraach/rwth_phoenix_weather_2014', 'pre-training', streaming=True)

In [3]:
from itertools import chain


def string_iterator():
    it = chain.from_iterable(
        (rwth_phoenix_pretrain['train'], rwth_phoenix_pretrain['validation'], rwth_phoenix_pretrain['test']))

    for batch in it:
        yield batch['tokens']


for batch in string_iterator():
    print(batch)

['__ON__', 'LIEB', 'ZUSCHAUER', 'ABEND', 'WINTER', 'GESTERN', 'loc-NORD', 'SCHOTTLAND', 'loc-REGION', 'UEBERSCHWEMMUNG', 'AMERIKA', 'IX']
['loc-WEST', 'WARM', 'loc-WEST', 'BLEIBEN', 'KUEHL']
['KUEHL', 'KUEHL', 'IN-KOMMEND', 'NACHT', 'MINUS', 'loc-MINUS-PLUSPLUS', 'TEIL', 'MINUS', 'FUENF', 'ALPEN', 'KUESTE', 'NULL']
['MORGEN', 'TEMPERATUR', 'ACHT', 'BIS', 'DREIZEHN', 'MAXIMAL', 'DREIZEHN']
['AUCH', 'SAMSTAG', 'WEST', 'FREUNDLICH', 'IX', 'TROCKEN', 'WEST', 'REGEN', 'REGEN']
['SONNTAG', 'SUED', 'SCHNEE', 'DABEI', 'MONTAG', '__EMOTION__', 'SCHAUER', 'KOMMEN']
['DANN', 'FREUNDLICH', 'KLAR', 'MONTAG', 'MEHR', 'MEHR', 'WARM', 'AB', 'DIENSTAG', 'JETZT', 'WUENSCHEN', 'SCHOEN', 'ABEND', '__OFF__']
['__ON__', 'WETTER', 'WIE-AUSSEHEN', 'MORGEN', 'FREITAG', 'ZWEITE', 'APRIL', '__OFF__']
['HEUTE', 'NACHT', 'OST', 'REGEN', 'loc-NORD', '__EMOTION__', 'REGEN-PLUSPLUS', 'MORGEN', 'UEBERWIEGEND', 'FREUNDLICH']
['IX', 'SCHAUER', 'ABEND', 'WEST', 'loc-SUED', 'BISSCHEN', 'REGEN', 'REGEN']
['HIER', 'WIND', '

In [23]:
from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

tokenizer = Tokenizer(model=BPE(unk_token="__UNK__"))
trainer = BpeTrainer(special_tokens=["__PAD__", "__UNK__", "__ON__", "__OFF__"])

tokenizer.pre_tokenizer = Whitespace()
tokenizer.train_from_iterator(string_iterator(), trainer)
tokenizer.add_special_tokens([
    tokenizers.AddedToken("__PAD__"),
    tokenizers.AddedToken("__UNK__"),
    tokenizers.AddedToken("__ON__"),
    tokenizers.AddedToken("__OFF__"),
])

print(tokenizer.get_vocab_size())




1935


In [18]:
tokenizer.save("../src/etc/rwth_phoenix_tokenizer.json")

In [29]:
output = tokenizer.encode("__ON__ SUED VERAENDERN KAUM WIE HEUTE SONNE ODER NEBEL __OFF__")

for batch in string_iterator():
    enc = tokenizer.encode(batch, is_pretokenized=True)
    print(enc.ids)

[2, 418, 427, 194, 617, 990, 77, 5, 75, 726, 77, 5, 72, 832, 1707, 87]
[77, 5, 114, 303, 77, 5, 114, 257, 259]
[259, 259, 178, 5, 294, 131, 241, 77, 5, 241, 5, 195, 181, 241, 233, 322, 334, 376]
[95, 296, 115, 92, 416, 374, 416]
[147, 271, 114, 197, 87, 266, 114, 56, 56]
[250, 91, 130, 360, 307, 139, 141, 82]
[174, 197, 262, 307, 146, 146, 303, 438, 325, 199, 547, 330, 194, 3]
[2, 151, 160, 5, 224, 95, 264, 766, 603, 3]
[155, 131, 99, 56, 77, 5, 75, 139, 56, 5, 195, 95, 398, 197]
[87, 141, 194, 114, 77, 5, 91, 158, 56, 56]
[516, 133, 245, 230, 230, 77, 5, 114, 75, 133, 3]
[2, 155, 131, 219, 267, 241, 298, 322, 515, 387]
[95, 348, 77, 5, 231, 397, 219]
[1053, 307, 147, 56, 56, 261, 139, 130, 3]
[2, 462, 87, 261, 413, 464, 303, 163, 5, 82, 239, 109, 148, 741, 561, 296, 296, 3]
[2, 72, 236, 208, 250, 163, 5, 82, 56, 56, 148, 142, 3]
[262, 87, 72, 145, 186, 422, 102, 381, 106, 548, 87, 1681, 353, 146, 109, 3]
[2, 133, 245, 230, 75, 457, 87, 145, 366, 126, 126, 126, 3]
[2, 201, 674, 155, 19

In [66]:
reload(src.models.components.feature_extractor_dinov2)
from src.models.components.feature_extractor_dinov2 import SignLanguageFeatureExtractor

feature_extractor = SignLanguageFeatureExtractor()


def collate_fn(batch):
    labels = tokenizer.encode(batch['tokens'], is_pretokenized=True)
    feature = feature_extractor(batch['frames'], sampling_rate=25)

    return {"input_values": feature.input_values[0], "labels": labels.ids}


train = rwth_phoenix['train'].map(function=collate_fn, batched=False, remove_columns=['frames', 'tokens'])
first = next(iter(train))

In [67]:
first

{'input_values': array([[-0.84833705, -0.58704543, -2.074996  , ..., -0.26034883,
         -2.8003612 , -0.06684101],
        [ 1.2415382 , -1.3554696 ,  0.14667816, ..., -0.8437252 ,
         -2.207518  ,  0.22651023],
        [ 1.0892667 , -1.7075235 ,  0.19240695, ..., -1.3566458 ,
         -2.3756495 , -0.35655874],
        ...,
        [ 0.10560069, -1.5657716 , -0.36898893, ..., -0.51297307,
         -3.1072054 , -0.00521034],
        [ 0.02705131, -1.019656  , -0.52293456, ...,  0.08987308,
         -2.6816874 ,  0.95835376],
        [-0.42416984, -0.6026987 , -1.3009195 , ..., -0.2116571 ,
         -2.1713347 ,  0.44469047]], dtype=float32),
 'labels': [2,
  418,
  427,
  194,
  617,
  990,
  77,
  5,
  75,
  726,
  77,
  5,
  72,
  832,
  1707,
  87]}

In [69]:
reload(src.models.components.sign_language_net)
from src.models.components.sign_language_net import SignLanguageNet

transformers_tokenizer = transformers.PreTrainedTokenizerFast(
    model_input_names=['input_values'],
    pad_token="__PAD__",
    bos_token="__ON__",
    eos_token="__OFF__",
    unk_token="__UNK__",
    tokenizer_object=tokenizer
)

model = SignLanguageNet(tokenizer=transformers_tokenizer)

model

SignLanguageNet(
  (encoder): SpatiotemporalEncoder(
    (feature_extractor): SpatialFeatureEncoder()
    (feature_projection): SpatiotemporalFeatureProjection(
      (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): Wav2Vec2Encoder(
      (pos_conv_embed): Wav2Vec2PositionalConvEmbedding(
        (conv): Conv1d(768, 768, kernel_size=(128,), stride=(1,), padding=(64,), groups=16)
        (padding): Wav2Vec2SamePadLayer()
        (activation): GELUActivation()
      )
      (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (layers): ModuleList(
        (0-5): 6 x Wav2Vec2EncoderLayer(
          (attention): Wav2Vec2Attention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=7

In [70]:
import torch
from transformers import DataCollatorForSeq2Seq

with torch.no_grad():
    collator = DataCollatorForSeq2Seq(
        model=model,
        tokenizer=transformers_tokenizer,
        pad_to_multiple_of=16,
        return_tensors='pt'
    )

collated = collator([first])

collated

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_values': tensor([[[-0.8483, -0.5870, -2.0750,  ..., -0.2603, -2.8004, -0.0668],
         [ 1.2415, -1.3555,  0.1467,  ..., -0.8437, -2.2075,  0.2265],
         [ 1.0893, -1.7075,  0.1924,  ..., -1.3566, -2.3756, -0.3566],
         ...,
         [ 0.1056, -1.5658, -0.3690,  ..., -0.5130, -3.1072, -0.0052],
         [ 0.0271, -1.0197, -0.5229,  ...,  0.0899, -2.6817,  0.9584],
         [-0.4242, -0.6027, -1.3009,  ..., -0.2117, -2.1713,  0.4447]]]), 'labels': tensor([[   2,  418,  427,  194,  617,  990,   77,    5,   75,  726,   77,    5,
           72,  832, 1707,   87]]), 'decoder_input_ids': tensor([[   3,    2,  418,  427,  194,  617,  990,   77,    5,   75,  726,   77,
            5,   72,  832, 1707]])}