In [1]:
import torch

# Character inventory: index 0 is the space, indices 1-26 are 'a'-'z', and the
# last entry is the '<blank>' symbol.
char_list = list(' abcdefghijklmnopqrstuvwxyz') + ['<blank>']
# Lookup tables in both directions, derived from the positions in char_list.
idx2char = dict(enumerate(char_list))
char2idx = {symbol: position for position, symbol in idx2char.items()}

def text_to_int_sequence(text):
    """Map every character of ``text`` to its index in ``char2idx``.

    Returns a list with one int per character. A character that has no entry
    in ``char2idx`` raises ``KeyError``.
    """
    indices = []
    for symbol in text:
        indices.append(char2idx[symbol])
    return indices

def int_sequence_to_text(sequence):
    """Converts an integer sequence to text, skipping '<blank>' entries.

    Args:
        sequence: Iterable of indices into ``idx2char``. Elements may be plain
            ints or any int-convertible scalar, e.g. the 0-d tensors obtained
            by iterating over a 1-D ``torch.Tensor``.

    Returns:
        The decoded string; every index equal to ``char2idx['<blank>']`` is
        dropped.

    Raises:
        KeyError: If an index has no entry in ``idx2char``.
    """
    blank_idx = char2idx['<blank>']  # loop-invariant, so look it up once
    # int() normalises tensor scalars: they are hashed by identity and would
    # otherwise never match the integer keys of idx2char.
    indices = (int(idx) for idx in sequence)
    return ''.join(idx2char[idx] for idx in indices if idx != blank_idx)

# Round-trip demo: encode a sample word, then decode the indices again.
text = "hello"
sequence = text_to_int_sequence(text=text)
print("Text to sequence:", sequence)
print("Sequence to text:", int_sequence_to_text(sequence=sequence))


Text to sequence: [8, 5, 12, 12, 15]
Sequence to text: hello


In [2]:


import torch
 
def tokenize_text(text):
    """Split ``text`` into lowercase, whitespace-delimited tokens.

    Only whitespace acts as a delimiter, so punctuation stays attached to the
    neighbouring word. Returns a list of strings (empty for blank input).
    """
    lowered = text.lower()
    # split() without a separator already ignores leading and trailing
    # whitespace, so no separate strip() step is required.
    return lowered.split()
 
def build_vocab(texts):
    """Assign an integer id to every distinct token found in ``texts``.

    Ids 0 and 1 are reserved for "<pad>" and "<unk>"; all other tokens get
    consecutive ids starting at 2, in order of first appearance.
    """
    vocab = {"<pad>": 0, "<unk>": 1}
    for sentence in texts:
        for word in tokenize_text(sentence):
            # Ids are dense, so the next free id always equals the current
            # size; setdefault leaves already-known tokens untouched.
            vocab.setdefault(word, len(vocab))
    return vocab
 
# Small sample corpus used by the examples below
texts = [
    "Hello world",
    "How are you doing?",
    "This is a test text for speech-to-textSTT.",
]

# Derive the token -> id mapping from the corpus and display it
vocab = build_vocab(texts=texts)
print("Vocabulary:", vocab)
 
def text_to_sequence(text, vocab):
    """Encode ``text`` as a list of vocabulary ids, one per token.

    Tokens that are missing from ``vocab`` are encoded as ``vocab["<unk>"]``.
    """
    indices = []
    for word in tokenize_text(text):
        indices.append(vocab.get(word, vocab["<unk>"]))
    return indices
 
# Encode one sample sentence with the vocabulary built from the corpus
sample_text = "How are you doing?"
sequence = text_to_sequence(text=sample_text, vocab=vocab)
print("Sequence:", sequence)
 
def sequence_to_tensor(sequence, pad_value=None):
    """Converts a list of index sequences into one right-padded tensor.

    Args:
        sequence: The whole batch, i.e. a list of sequences (lists or tuples
            of int indices), despite the singular parameter name.
        pad_value: Index used to fill rows shorter than the longest one. When
            ``None`` (the default) the module-level ``vocab["<pad>"]`` is
            used, which is the value this function always used before.

    Returns:
        A ``torch.long`` tensor of shape ``(len(sequence), max_len)``. An
        empty batch yields a tensor of shape ``(0, 0)``.
    """
    if pad_value is None:
        pad_value = vocab["<pad>"]
    # default=0 keeps max() from raising ValueError on an empty batch.
    max_len = max((len(s) for s in sequence), default=0)
    # list(s) builds a fresh row, so the caller's sequences are never mutated
    # and tuples are accepted as well.
    padded = [list(s) + [pad_value] * (max_len - len(s)) for s in sequence]
    if not padded:
        # torch.tensor([]) would be 1-D; keep the result 2-D for consistency.
        return torch.empty((0, 0), dtype=torch.long)
    return torch.tensor(padded, dtype=torch.long)
 
# Encode every corpus entry, then pad the results into a single batch tensor
sequences = [text_to_sequence(sample, vocab) for sample in texts]
tensor = sequence_to_tensor(sequences)
print("Tensor shape:", tensor.size())


Vocabulary: {'<pad>': 0, '<unk>': 1, 'hello': 2, 'world': 3, 'how': 4, 'are': 5, 'you': 6, 'doing?': 7, 'this': 8, 'is': 9, 'a': 10, 'test': 11, 'text': 12, 'for': 13, 'speech-to-textstt.': 14}
Sequence: [4, 5, 6, 7]
Tensor shape: torch.Size([3, 7])


In [None]:

def pad_sequence(sequences, batch_first=False, padding_value=0):
    """Pad variable-length tensors to a common length and stack them.

    Args:
        sequences: Non-empty list of tensors. They may differ in size along
            dimension 0; the trailing dimensions are taken from the first
            tensor.
        batch_first: If True the result is laid out as
            ``(batch, max_len, *trailing)``, otherwise as
            ``(max_len, batch, *trailing)``.
        padding_value: Fill value for positions past the end of a sequence.

    Returns:
        A new tensor created with ``sequences[0].new_full``, holding each
        sequence followed by ``padding_value``.

    Raises:
        ValueError: If ``sequences`` is empty.
    """
    if len(sequences) == 0:
        # Fail with a clear message instead of an IndexError on sequences[0].
        raise ValueError("pad_sequence() requires at least one sequence")

    first = sequences[0]
    trailing_dims = tuple(first.shape[1:])
    max_len = max(s.size(0) for s in sequences)
    if batch_first:
        out_dims = (len(sequences), max_len) + trailing_dims
    else:
        out_dims = (max_len, len(sequences)) + trailing_dims

    # new_full can be called on the tensor directly; going through the legacy
    # `.data` attribute is not needed.
    out_tensor = first.new_full(out_dims, padding_value)
    for i, tensor in enumerate(sequences):
        length = tensor.size(0)
        # Copy the sequence into its slot; the remainder keeps padding_value.
        if batch_first:
            out_tensor[i, :length, ...] = tensor
        else:
            out_tensor[:length, i, ...] = tensor

    return out_tensor
 
# Example: Padding text data for a batch
# The strings are encoded with the character-level helpers defined earlier in
# this notebook. The '<blank>' index is used as the padding value because
# int_sequence_to_text() skips that index when decoding.
batch_texts = ["hello", "data", "world"]
batch_encoded = [torch.tensor(text_to_int_sequence(text)) for text in batch_texts]
padded_texts = pad_sequence(batch_encoded, batch_first=True, padding_value=char2idx['<blank>'])

print("Padded Text Batch:", padded_texts)

In [4]:
def sequence_to_text(sequence, vocab):
    """Converts a sequence of indices back to space-joined text.

    Args:
        sequence: Iterable of token ids. Elements may be plain ints or any
            int-convertible scalar, e.g. the 0-d tensors obtained by
            iterating over a 1-D ``torch.Tensor``.
        vocab: Mapping from token to id; must contain "<pad>" and "<unk>".

    Returns:
        The tokens joined by single spaces. Ids equal to ``vocab['<pad>']`` or
        ``vocab['<unk>']`` are left out.

    Raises:
        KeyError: If an id has no token in ``vocab``.
    """
    inv_vocab = {v: k for k, v in vocab.items()}
    # Built once rather than per element.
    skipped = {vocab['<pad>'], vocab['<unk>']}
    # int() normalises tensor scalars: they are hashed by identity and would
    # otherwise never match the integer keys of inv_vocab.
    indices = (int(idx) for idx in sequence)
    return ' '.join(inv_vocab[idx] for idx in indices if idx not in skipped)
 
# Map the encoded sample back to words to check the round trip
decoded_text = sequence_to_text(sequence=sequence, vocab=vocab)
print("Decoded Text:", decoded_text)


Decoded Text: how are you doing?


In [5]:
import torch.nn as nn
import torch.optim as optim
 
# Simple RNN model for demonstration
class SpeechModel(nn.Module):
    """GRU followed by a linear layer applied to every time step.

    Args:
        num_classes: Size of the output dimension (one score per class).
        hidden_size: Hidden state size of the GRU.
        num_layers: Number of stacked GRU layers.
        input_size: Number of features per input frame. Defaults to 40, the
            value that was previously hard-coded.
    """

    def __init__(self, num_classes, hidden_size=128, num_layers=1, input_size=40):
        super().__init__()
        self.rnn = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Map ``x`` of shape (batch, time, input_size) to unnormalised
        scores of shape (batch, time, num_classes)."""
        # The GRU's final hidden state is not needed, only per-step outputs.
        outputs, _ = self.rnn(x)
        return self.fc(outputs)

 
# Fix the seed so the random weights and the dummy data are reproducible
torch.manual_seed(0)

# Initialize the model, loss, and optimizer
model = SpeechModel(num_classes=len(char_list))
ctc_loss = nn.CTCLoss(blank=char2idx['<blank>'], zero_infinity=True)
optimizer = optim.Adam(model.parameters())

# Dummy data for demonstration (random tensors as inputs and targets)
inputs = torch.rand(10, 100, 40)  # (batch_size, sequence_length, num_features)
target_lengths = torch.full((10,), 15, dtype=torch.long)
input_lengths = torch.full((10,), 100, dtype=torch.long)
# CTC targets must never contain the blank label. '<blank>' is the last entry
# of char_list and randint's upper bound is exclusive, so this samples every
# real character index (including the space at 0) and never the blank.
targets = torch.randint(0, char2idx['<blank>'], (10, 15), dtype=torch.long)

# Training step
optimizer.zero_grad()
outputs = model(inputs)  # Shape: (batch_size, sequence_length, num_classes)
outputs = outputs.permute(1, 0, 2)  # Align output for CTC Loss: (sequence_length, batch_size, num_classes)
# CTCLoss expects log-probabilities, hence log_softmax over the class axis
loss = ctc_loss(outputs.log_softmax(2), targets, input_lengths, target_lengths)
loss.backward()
optimizer.step()

print("Training loss:", loss.item())


Training loss: 17.744062423706055
