In [32]:
import numpy as np
from collections import defaultdict
from torch.utils import data

In [5]:
# Generate Dataset
# Seed NumPy's global random number generator with a fixed value so that
# subsequent np.random draws are repeatable from a fresh kernel.
np.random.seed(42)

In [13]:
def generate_dataset(num_sequences=2**8):
    """Generate random strings shaped like ``a...ab...bEOS``.

    For each sequence a length is drawn from NumPy's global RNG via
    ``np.random.randint(1, 12)`` (upper bound exclusive, so 1 to 11). The
    sequence is that many ``a`` characters, the same number of ``b``
    characters, and finally the literal text ``EOS``.

    Note that every sequence is a plain ``str``: iterating over one yields
    single characters, so the ``EOS`` marker is seen as ``E``, ``O``, ``S``.

    Args:
        num_sequences: Number of sequences to produce (default 256).

    Returns:
        A list of ``num_sequences`` strings.
    """
    def _random_sequence():
        # One RNG draw per sequence; the same count is used for both halves.
        token_length = np.random.randint(1, 12)
        return f'{"a"*token_length}{"b"*token_length}EOS'

    return [_random_sequence() for _ in range(num_sequences)]

In [30]:
def word_encoding(sequences):
    """Build word <-> index lookup tables from a collection of sequences.

    Every item obtained by iterating over a sequence counts as one "word"
    (for ``str`` sequences that means one character). Words are indexed by
    descending frequency; words with equal frequency keep first-seen order.
    The catch-all ``'UNK'`` entry always occupies the last index.

    Args:
        sequences: Iterable of iterables of hashable words. It is consumed
            in a single pass, so a generator is accepted.

    Returns:
        A tuple ``(word_to_idx, idx_to_word, vocab_size)``:
            word_to_idx: ``defaultdict`` mapping word -> index; a word that
                is not in the vocabulary maps to the ``'UNK'`` index
                (``vocab_size - 1``).
            idx_to_word: ``defaultdict`` mapping index -> word; an index
                that is not in the vocabulary maps to ``'UNK'``.
            vocab_size: Number of vocabulary entries, including ``'UNK'``.

        Both mappings are ``defaultdict`` instances, so looking up a missing
        key also stores that key with the default value.
    """
    # Count how often each word occurs across all sequences
    word_to_count = defaultdict(int)
    for sequence in sequences:
        for word in sequence:
            word_to_count[word] += 1

    # Most frequent first; sorted() is stable, so ties keep first-seen order
    ranked = sorted(word_to_count.items(), key=lambda item: -item[1])

    # List of unique words. 'UNK' is reserved for the final slot, so drop it
    # here if it occurs in the data; otherwise it would be listed twice and
    # inflate vocab_size.
    dictionary = [word for word, _ in ranked if word != 'UNK']
    dictionary.append('UNK')

    vocab_size = len(dictionary)
    unk_idx = vocab_size - 1

    # Make word to index and index to word mappings
    word_to_idx = defaultdict(lambda: unk_idx)
    idx_to_word = defaultdict(lambda: 'UNK')
    for idx, word in enumerate(dictionary):
        word_to_idx[word] = idx
        idx_to_word[idx] = word

    return word_to_idx, idx_to_word, vocab_size

In [33]:
class Dataset(data.Dataset):
    """Dataset that pairs each input sequence with its target sequence."""

    def __init__(self, inputs, targets):
        """Keep the inputs as ``self.X`` and the targets as ``self.y``."""
        self.X, self.y = inputs, targets

    def __len__(self):
        """Return the number of samples, measured on the inputs."""
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, index):
        """Return the ``(input, target)`` pair stored at ``index``."""
        sample_input = self.X[index]
        sample_target = self.y[index]
        return sample_input, sample_target

In [34]:
def prepare_data(sequences, train_size=0.8, test_size=0.1, val_size=0.1):
    """Split sequences into train/test/validation sets of shifted pairs.

    The split is positional (no shuffling): the training part is taken from
    the start, the test part follows it directly, and the validation part is
    taken from the end. Part sizes are ``int(fraction * len(sequences))``,
    so truncation can leave a few sequences between the test and validation
    parts unused (e.g. 256 sequences -> 204 + 25 + 25).

    For every sequence the input is the sequence without its last element
    and the target is the sequence without its first element.

    Args:
        sequences: Sliceable collection of sliceable sequences.
        train_size: Fraction of the sequences used for training.
        test_size: Fraction of the sequences used for testing.
        val_size: Fraction of the sequences used for validation.

    Returns:
        ``(train_set, test_set, val_set)`` -- three ``Dataset`` objects, in
        that order.
    """
    # Split data
    total = len(sequences)
    num_train = int(train_size * total)
    num_test = int(test_size * total)
    num_val = int(val_size * total)

    train_seq = sequences[:num_train]
    test_seq = sequences[num_train:num_train + num_test]
    # Use an explicit start index: `sequences[-num_val:]` is `sequences[0:]`
    # when num_val == 0, which would put the whole dataset in the
    # validation split. Clamp at 0 so an oversized num_val still means
    # "everything" instead of wrapping around.
    val_seq = sequences[max(total - num_val, 0):]

    # prepare input & target sequences
    def prepare_sequences(sequences):
        # The target is the input shifted one position to the left
        inputs = [sequence[:-1] for sequence in sequences]
        targets = [sequence[1:] for sequence in sequences]
        return inputs, targets

    train_inputs, train_targets = prepare_sequences(train_seq)
    test_inputs, test_targets = prepare_sequences(test_seq)
    val_inputs, val_targets = prepare_sequences(val_seq)

    # create datasets
    train_set = Dataset(train_inputs, train_targets)
    test_set = Dataset(test_inputs, test_targets)
    val_set = Dataset(val_inputs, val_targets)

    return train_set, test_set, val_set