In [1]:
import datasets
from transformers import MT5Tokenizer
from functools import reduce
import features 

dataset = datasets.load_dataset("lecslab/usp-igt")

tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small", legacy=False)

# Gather every distinct morpheme-level gloss label across the three splits.
# Hyphens join morphemes inside a word, so they are treated like whitespace.
gloss_vocabulary = set()
for split_name in ('train', 'eval', 'test'):
    for gloss_line in dataset[split_name]['pos_glosses']:
        gloss_vocabulary.update(gloss_line.replace("-", " ").split())

# The two special labels come first, so "<sep>" is id 0 and "<pad>" is id 1;
# the real glosses follow in sorted order.
all_glosses = ["<sep>", "<pad>"] + sorted(gloss_vocabulary)
print("\n".join(all_glosses))

# Build the gloss -> feature lookup from the CSV for this vocabulary and display it.
feature_map = features.create_feature_map('../features_v1.csv', all_glosses)
feature_map

<sep>
<pad>
???
A1P
A1S
A2P
A2S
ADJ
ADV
AFE
AFI
AGT
AP
APLI
ART
CAU
CLAS
COM
COND
CONJ
DEM
DIM
DIR
E1
E1P
E1S
E2
E2P
E2S
E3
E3P
E3S
ENF
EXS
GNT
IMP
INC
INS
INT
ITR
ITS
MED
MOV
NEG
NOM
NUM
PART
PAS
PL
POS
PP
PREP
PRG
PRON
REC
RFX
S
SAB
SC
SREL
SV
TAM
TOP
TRN
VI
VOC
VT


([[0, 0, 0, 0],
  [1, 0, 0, 0],
  [2, 0, 0, 0],
  [3, 1, 1, 2],
  [3, 1, 1, 1],
  [3, 2, 1, 2],
  [3, 2, 1, 1],
  [4, 0, 0, 0],
  [5, 0, 0, 0],
  [6, 0, 0, 0],
  [7, 0, 0, 0],
  [8, 0, 0, 0],
  [9, 0, 0, 0],
  [10, 0, 0, 0],
  [11, 0, 0, 0],
  [12, 0, 0, 0],
  [13, 0, 0, 0],
  [14, 0, 0, 0],
  [15, 0, 0, 0],
  [16, 0, 0, 0],
  [17, 0, 0, 0],
  [18, 0, 0, 0],
  [19, 0, 0, 0],
  [20, 1, 2, 0],
  [20, 1, 2, 2],
  [20, 1, 2, 1],
  [20, 2, 2, 0],
  [20, 2, 2, 2],
  [20, 2, 2, 1],
  [20, 3, 2, 0],
  [20, 3, 2, 2],
  [20, 3, 2, 1],
  [21, 0, 0, 0],
  [22, 0, 0, 0],
  [23, 0, 0, 0],
  [24, 0, 0, 0],
  [25, 0, 0, 0],
  [26, 0, 0, 0],
  [27, 0, 0, 0],
  [28, 0, 0, 0],
  [29, 0, 0, 0],
  [30, 0, 0, 0],
  [31, 0, 0, 0],
  [32, 0, 0, 0],
  [33, 0, 0, 0],
  [34, 0, 0, 0],
  [35, 0, 0, 0],
  [36, 0, 0, 0],
  [37, 0, 0, 0],
  [38, 0, 0, 0],
  [39, 0, 0, 0],
  [40, 0, 0, 0],
  [41, 0, 0, 0],
  [42, 0, 0, 0],
  [43, 0, 0, 0],
  [44, 0, 0, 0],
  [45, 0, 0, 0],
  [46, 0, 0, 0],
  [47, 0, 0, 0],
  [48, 0, 

In [6]:
from collections import Counter

# Person-marking glosses whose frequency is compared across the splits.
person_marker_glosses = {'A1P', 'A1S', 'A2P', 'A2S', 'E1', 'E1P', 'E1S', 'E2', 'E2P', 'E2S', 'E3', 'E3P', 'E3S'}

for split_name in ['train', 'eval', 'test']:
    # Use a cell-local name for the counts: rebinding `all_glosses` here would
    # overwrite the gloss vocabulary that later cells look up with
    # `all_glosses.index(...)`, breaking a top-to-bottom run of the notebook.
    person_marker_counts = Counter(
        gloss
        for gloss_row in dataset[split_name]['pos_glosses']
        for gloss in gloss_row.replace("-", " ").split()  # hyphens separate morphemes
        if gloss in person_marker_glosses
    )
    print(person_marker_counts)

Counter({'E3S': 3118, 'E1P': 1370, 'E1S': 709, 'E2S': 564, 'E3': 385, 'A1S': 347, 'A2S': 127, 'A1P': 110, 'E3P': 32, 'E2P': 16, 'E2': 16, 'E1': 2, 'A2P': 2})
Counter({'E3S': 59, 'E2S': 17, 'A1S': 13, 'E1S': 10, 'A1P': 1, 'E1P': 1, 'E2P': 1, 'A2S': 1})
Counter({'E3S': 295, 'E2S': 76, 'E1S': 42, 'A1S': 22, 'E1P': 9, 'A2S': 3, 'E3': 1, 'E2P': 1})


In [7]:
# Keep only the training rows in which 'A1P' occurs as a whole morpheme gloss.
# Hyphens are replaced by spaces first so every morpheme is compared on its own
# (a plain substring test would also match longer labels containing 'A1P').
filtered_rows = dataset['train'].filter(lambda r: 'A1P' in r['pos_glosses'].replace('-', ' ').split())
# Display the filtered dataset summary.
filtered_rows

Filter:   0%|          | 0/9774 [00:00<?, ? examples/s]

Dataset({
    features: ['transcription', 'segmentation', 'pos_glosses', 'glosses', 'translation'],
    num_rows: 104
})

In [2]:

# Look up the ids of the two special labels in the gloss vocabulary.
# `all_glosses` must contain both "<sep>" and "<pad>", otherwise `.index` raises ValueError.
SEP_TOKEN_ID, PAD_TOKEN_ID = (
    all_glosses.index(special_label) for special_label in ("<sep>", "<pad>")
)
print(f"{len(all_glosses)} unique glosses")

def encode_gloss_labels(label_string: str):
    """Encodes glosses as an id sequence. Each morpheme gloss is assigned a unique id.

    Words are separated by whitespace and morphemes within a word by "-".
    Every morpheme gloss is replaced by its position in `all_glosses`;
    `SEP_TOKEN_ID` is inserted between consecutive words and `PAD_TOKEN_ID`
    is appended as the final element.

    Args:
        label_string: Gloss line such as "CONJ ADV COM-VT E3S-S".

    Returns:
        list of int ids. An empty or whitespace-only `label_string` yields
        just `[PAD_TOKEN_ID]` instead of raising.

    Raises:
        ValueError: If a gloss is not present in `all_glosses`.
    """
    encoded = []
    for word_position, word_gloss in enumerate(label_string.split()):
        # A separator goes between words, never before the first one.
        if word_position > 0:
            encoded.append(SEP_TOKEN_ID)
        # Leading/trailing/double hyphens produce empty pieces; skip them.
        encoded.extend(
            all_glosses.index(gloss) for gloss in word_gloss.split("-") if gloss != ''
        )
    encoded.append(PAD_TOKEN_ID)
    return encoded

def tokenize(batch):
    """Tokenizes a batch of transcriptions and attaches the encoded gloss labels.

    The 'transcription' entries go through the module-level `tokenizer`
    (truncated to at most 64 tokens, no padding); each 'pos_glosses' entry is
    converted with `encode_gloss_labels` and stored under 'labels'.
    """
    encoded = tokenizer(
        batch['transcription'],
        truncation=True,
        padding=False,
        max_length=64,
    )
    encoded['labels'] = list(map(encode_gloss_labels, batch['pos_glosses']))
    return encoded

# Run `tokenize` over the dataset in batches; the keys it returns
# (input_ids, attention_mask, labels) show up as additional columns.
# NOTE: `dataset` is rebound here, so re-running this cell maps the already-tokenized dataset again.
dataset = dataset.map(tokenize, batched=True)

# Show the first training example to check the new columns.
dataset['train'][0]

67 unique glosses


{'transcription': 'o sey xtok rixoqiil',
 'segmentation': "o' sea x-tok r-ixóqiil",
 'pos_glosses': 'CONJ ADV COM-VT E3S-S',
 'glosses': 'o sea COM-buscar E3S-esposa',
 'translation': 'O sea busca esposa.',
 'input_ids': [259, 268, 303, 276, 259, 329, 11207, 1418, 329, 159121, 696, 1],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [19, 0, 8, 0, 17, 66, 0, 31, 56, 1]}

In [6]:
from transformers import DataCollatorForSeq2Seq
from torch.utils.data import DataLoader
# collator(batch.select_columns(['input_ids', 'attention_mask', 'labels']).to_list()


# Batch the tokenized training split; the collator pads every batch to a common
# length (the recorded output shows labels padded with -100).
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)
model_columns = ['input_ids', 'attention_mask', 'labels']
dataloader = DataLoader(
    dataset['train'].select_columns(model_columns),
    batch_size=32,
    collate_fn=collator,
)

# Identity feature map: label id i has the single feature value i.
identity_feature_map = [[label_id] for label_id in range(len(all_glosses))]

# Only the first batch is inspected.
batch = next(iter(dataloader), None)
if batch is not None:
    batch_labels = batch['labels']
    print("Labels", batch_labels[0])
    print("Features", batch_labels.unsqueeze(-2)[0])
    f = features.map_labels_to_features(batch_labels, identity_feature_map)
    print("Features", f[0])

Labels tensor([  19,    0,    8,    0,   17,   66,    0,   31,   56,    1, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100])
Features tensor([[  19,    0,    8,    0,   17,   66,    0,   31,   56,    1, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]])
Epoch time = 0.000s / row
Features tensor([[  19,    0,    8,    0,   17,   66,    0,   31,   56,    1, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]])


In [71]:
import torch
# Wrap the encoded label ids of the first training example in a tensor for inspection.
torch.tensor(dataset['train'][0]['labels'])

tensor([[18,  0,  7,  0, 16, 65,  0, 30, 55]])

In [165]:
from typing import List

def greedy_decode(feature_map, feature_logits: List[torch.Tensor]):
    """Decodes a bundle of feature logits into gloss ID predictions.
    The output should align with the input vocabulary of the decoder.

    The first tensor in `feature_logits` is the primary feature: its argmax
    selects the candidate labels, i.e. the entries of `feature_map` whose first
    value equals the predicted primary value. If exactly one label matches it
    is chosen directly. Otherwise each candidate is scored by summing the
    log-probabilities of its remaining feature values, and the highest score
    wins (on a tie the lowest label id is kept).

    Args:
        feature_map: Sequence indexed by label id; each entry lists that label's
            feature values, primary feature first.
        feature_logits (List[torch.Tensor]): List of feature logit tensors each of size `(batch_size, seq_length, feature_size)`

    Returns:
        torch.Tensor: Predicted label ids, one per token, of size `(batch_size, seq_length)`.

    Raises:
        ValueError: If a predicted primary feature value has no label in `feature_map`.
    """
    primary_logits = feature_logits[0]
    batch_size = primary_logits.shape[0]
    seq_length = primary_logits.shape[1]

    primary_features = torch.argmax(primary_logits, -1)

    # Group label ids by primary feature value once, instead of rescanning
    # `feature_map` for every token. Ids stay in ascending order per group.
    label_ids_by_primary = {}
    for label_id, label_features in enumerate(feature_map):
        label_ids_by_primary.setdefault(int(label_features[0]), []).append(label_id)

    # Log-probabilities of every non-primary feature, computed once for the whole batch.
    secondary_log_probs = [torch.log_softmax(logits, dim=-1) for logits in feature_logits[1:]]

    def _decode_token(row_index: int, token_index: int):
        """Returns the predicted label id for one token of one row."""
        primary_value = primary_features[row_index, token_index].item()
        candidate_ids = label_ids_by_primary.get(primary_value)
        if not candidate_ids:
            raise ValueError(
                f"No label in feature_map has primary feature value {primary_value}"
            )

        if len(candidate_ids) == 1:
            return candidate_ids[0]

        best_score, best_id = None, None
        # Try each possible label, choose one with highest joint log-probability
        for candidate_id in candidate_ids:
            score = 0
            # Skip position 0: the primary feature was already used for filtering.
            for offset, feature_value in enumerate(feature_map[candidate_id][1:]):
                score = score + secondary_log_probs[offset][row_index, token_index, feature_value]
            # Strict comparison keeps the first (lowest id) candidate on ties.
            if best_score is None or score > best_score:
                best_score, best_id = score, candidate_id
        return best_id

    next_tokens = [
        [_decode_token(row_index, token_index) for token_index in range(seq_length)]
        for row_index in range(batch_size)
    ]
    return torch.tensor(next_tokens)

# Toy check of greedy_decode with one primary and one secondary feature
# (batch of 2 rows, 2 tokens each, 3 values per feature).
# Row 0 predicts primary value 1, row 1 predicts primary value 2.
primary_feature = torch.tensor([[[0.1, 0.2, 0.1], [0.1, 0.2, 0.1]],
                                [[0.05, 0.01, 10], [0.05, 0.01, 10]]])
# Row 0 favours secondary value 0, row 1 favours secondary value 2.
secondary_feature = torch.tensor([[[8, 2, 1], [8, 2, 1]],
                                  [[1, 5, 10], [1, 5, 10]]], dtype=float)
# Kept under its own name so the notebook-level `feature_map` built from
# features_v1.csv is not overwritten by this toy data.
toy_feature_map = [[0, 0], [0, 1], [1, 0], [1, 1], [2, 0], [2, 2]]
# Expected: label 2 ([1, 0]) for row 0 and label 5 ([2, 2]) for row 1.
greedy_decode(toy_feature_map, [primary_feature, secondary_feature])

tensor([[2, 2],
        [5, 5]])

In [143]:
# Log of the softmax of three example logits, taken along dim 0.
# NOTE(review): the output recorded under this cell is all positive, which a log of
# probabilities cannot be — it appears to predate the torch.log wrapper; re-run to refresh.
torch.log(torch.softmax(torch.tensor([1, 5, 10], dtype=float), 0))

tensor([1.2257e-04, 6.6920e-03, 9.9319e-01], dtype=torch.float64)

In [141]:
# Log-probabilities of three example logits via the LogSoftmax module (dim 0).
# Note: the variable is called `softmax` but holds a *log*-softmax module.
softmax = torch.nn.LogSoftmax(dim=0)
softmax(torch.tensor([1, 5, 10], dtype=float))

tensor([-9.0068e+00, -5.0068e+00, -6.8379e-03], dtype=torch.float64)

In [159]:
# Length of the longest encoded label sequence in the training split.
max(map(len, dataset['train']['labels']))

39

In [160]:
# Display the dataset summary (splits, column names, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['transcription', 'segmentation', 'pos_glosses', 'glosses', 'translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 9774
    })
    eval: Dataset({
        features: ['transcription', 'segmentation', 'pos_glosses', 'glosses', 'translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 232
    })
    test: Dataset({
        features: ['transcription', 'segmentation', 'pos_glosses', 'glosses', 'translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 633
    })
})

In [13]:
import pandas as pd 

# Load the gloss feature table (the same CSV path that is given to features.create_feature_map).
df = pd.read_csv('../features_v1.csv')
# Largest value in each numeric column, plus one.
# Presumably the number of classes per feature, assuming values are contiguous ids starting at 0 — TODO confirm.
(df.max(axis=0, numeric_only=True) + 1).values.tolist()

[56, 4, 3, 3]

In [16]:
# Feature values of the gloss 'TOP': take the first row whose 'Gloss' column matches
# and drop its first column.
# NOTE(review): assumes 'Gloss' is the first column — confirm; indexing [0] fails if no row matches.
df[df['Gloss'] == 'TOP'].values[0][1:].tolist()

[51, 0, 0, 0]