<a href="https://colab.research.google.com/github/mukeshnarendran7/Fastai2020/blob/main/RNN1_numpy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
import numpy as np

In [38]:
# Seed NumPy's global random state so the sequences generated below are reproducible.
np.random.seed(42)
def generate_datasets(num_sequences):
  '''
  Build a toy dataset of token sequences.

  Every sequence consists of n 'a' tokens, followed by n 'c' tokens and a
  single closing 'EOS' token, where n is drawn with
  np.random.randint(1, 12), i.e. 1 <= n <= 11. The draws use NumPy's global
  random state.

  Returns a list containing num_sequences such token lists.
  '''
  def build_sample(run_length):
    # Equal-length runs of 'a' and 'c', closed by the end-of-sequence marker.
    return ['a']*run_length + ['c']*run_length + ['EOS']

  # One scalar randint draw per sequence, taken in order.
  return [build_sample(np.random.randint(1, 12)) for _ in range(num_sequences)]

# Generate 15 sequences and show the third one as an example.
sequences = generate_datasets(15)
print('Sequences from the dataset', sequences[2], sep='\n')

Sequences from the dataset
['a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'EOS']


In [39]:
from collections import defaultdict

def sequences_to_dicts(sequences):
  '''
  Build the vocabulary lookup tables for a list of token sequences.

  Tokens are indexed by descending frequency (ties keep first-seen order),
  and the reserved token 'UNK' always receives the last index,
  vocab_size - 1.

  Returns a tuple (word_to_idx, idx_to_word, num_sentences, vocab_size):
    - word_to_idx: defaultdict mapping token -> index; an unknown token
      yields the index of 'UNK'.
    - idx_to_word: defaultdict mapping index -> token; an unknown index
      yields 'UNK'.
    - num_sentences: len(sequences).
    - vocab_size: number of distinct tokens, including 'UNK'.

  Note: both mappings are defaultdicts, so looking up a missing key also
  stores that key in the mapping.
  '''
  unk_token = 'UNK'

  # Count each token in a single pass. 'UNK' is reserved and appended below,
  # so occurrences in the data are skipped to keep it from being listed twice.
  word_count = defaultdict(int)
  for sequence in sequences:
    for word in sequence:
      if word != unk_token:
        word_count[word] += 1

  # Most frequent first; sorted() is stable, so ties keep first-seen order.
  ranked = sorted(word_count.items(), key = lambda item: -item[1])
  unique_words = [word for word, _ in ranked]
  unique_words.append(unk_token)

  num_sentences, vocab_size = len(sequences), len(unique_words)

  # Create dictionaries so that we can go from word to index and back.
  # If a word is not in our vocabulary, we assign it to the token 'UNK'.
  word_to_idx = defaultdict(lambda: vocab_size-1)
  idx_to_word = defaultdict(lambda: unk_token)

  for idx, word in enumerate(unique_words):
    word_to_idx[word] = idx
    idx_to_word[idx] = word

  return word_to_idx, idx_to_word, num_sentences, vocab_size

# Build the lookup tables from the generated sequences; the function returns
# (word_to_idx, idx_to_word, num_sentences, vocab_size) in that order.
word_to_idx, idx_to_word, num_sequences, vocab_size = sequences_to_dicts(sequences)

# Report the dataset size and spot-check one lookup in each direction.
print(f'We have {num_sequences} sentences and {len(word_to_idx)} unique tokens in our dataset (including UNK).\n')
print('The index of \'c\' is', word_to_idx['c'])
print(f'The word corresponding to index 1 is \'{idx_to_word[1]}\'')

# Round trip: word -> index -> word must return the original word.
assert idx_to_word[word_to_idx['c']] == 'c', \
    'Consistency error: something went wrong in the conversion.'


We have 15 sentences and 4 unique tokens in our dataset (including UNK).

The index of 'c' is 1
The word corresponding to index 1 is 'c'


In [55]:
 from torch.utils import data

 class Dataset(data.Dataset):
   '''
   Map-style dataset that pairs each input sequence with its target sequence.

   inputs and targets are stored as given and indexed in parallel, so they
   are expected to have the same length.
   '''

   def __init__(self, inputs, targets):
     # Keep references to the two parallel collections; nothing is copied.
     self.inputs, self.targets = inputs, targets

   def __len__(self):
     # The number of samples is taken from the targets collection.
     return len(self.targets)

   def __getitem__(self, index):
     # Return the (input, target) pair stored at position `index`.
     X = self.inputs[index]
     y = self.targets[index]  # attribute name must match the one set in __init__

     return X, y

def create_datasets(sequences, dataset_class, p_train = 0.8, p_val = 0.1, p_test = 0.1):
  '''
  Split sequences into training, validation and test sets of
  (input, target) pairs.

  Arguments:
    sequences: sliceable collection of token sequences.
    dataset_class: callable invoked as dataset_class(inputs, targets) to
      build each returned set.
    p_train, p_val, p_test: fraction of the sequences assigned to each split.
      Each split size is int(len(sequences) * fraction), so sizes are
      truncated and some sequences may end up in no split.

  The training split is taken from the front, the validation split follows
  it directly, and the test split is taken from the end of `sequences`.

  For every sequence the input is the sequence without its last token and
  the target is the sequence without its first token (next-token targets).

  Returns (training_set, validation_set, test_set).
  '''
  total = len(sequences)
  num_train = int(total*p_train)
  num_val = int(total*p_val)
  num_test = int(total*p_test)

  sequences_train = sequences[:num_train]
  sequences_val = sequences[num_train:num_train+num_val]
  # Take the last num_test sequences through an explicit start index:
  # sequences[-num_test:] would return the WHOLE list when num_test == 0.
  sequences_test = sequences[max(total-num_test, 0):]

  def get_inputs_targets_from_sequences(sequences):
    # Inputs drop the last token; targets drop the first (shifted by one).
    inputs = [sequence[:-1] for sequence in sequences]
    targets = [sequence[1:] for sequence in sequences]
    return inputs, targets

  inputs_train, targets_train = get_inputs_targets_from_sequences(sequences_train)
  inputs_val, targets_val = get_inputs_targets_from_sequences(sequences_val)
  inputs_test, targets_test = get_inputs_targets_from_sequences(sequences_test)

  training_set = dataset_class(inputs_train, targets_train)
  validation_set = dataset_class(inputs_val, targets_val)
  test_set = dataset_class(inputs_test, targets_test)

  return training_set, validation_set, test_set

# Split the generated sequences using the default proportions.
training_set, validation_set, test_set = create_datasets(sequences, Dataset)

# Report how many samples landed in each split.
print(
  f'We have {len(training_set)} samples in the training set.',
  f'We have {len(validation_set)} samples in the validation set.',
  f'We have {len(test_set)} samples in the test set.',
  sep='\n',
)


We have 12 samples in the training set.
We have 1 samples in the validation set.
We have 1 samples in the test set.


***One Hot encoding***

In [51]:
# Echo the full list of generated sequences as the cell output.
sequences

[['a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'EOS'],
 ['a', 'a', 'a', 'a', 'c', 'c', 'c', 'c', 'EOS'],
 ['a',
  'a',
  'a',
  'a',
  'a',
  'a',
  'a',
  'a',
  'a',
  'a',
  'a',
  'c',
  'c',
  'c',
  'c',
  'c',
  'c',
  'c',
  'c',
  'c',
  'c',
  'c',
  'EOS'],
 ['a',
  'a',
  'a',
  'a',
  'a',
  'a',
  'a',
  'a',
  'c',
  'c',
  'c',
  'c',
  'c',
  'c',
  'c',
  'c',
  'EOS'],
 ['a', 'a', 'a', 'a', 'a', 'c', 'c', 'c', 'c', 'c', 'EOS'],
 ['a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'EOS'],
 ['a',
  'a',
  'a',
  'a',
  'a',
  'a',
  'a',
  'a',
  'a',
  'a',
  'c',
  'c',
  'c',
  'c',
  'c',
  'c',
  'c',
  'c',
  'c',
  'c',
  'EOS'],
 ['a', 'a', 'a', 'c', 'c', 'c', 'EOS'],
 ['a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'EOS'],
 ['a',
  'a',
  'a',
  'a',
  'a',
  'a',
  'a',
  'a',
  'a',
  'a',
  'a',
  'c',
  'c',
  'c',
  'c',
  'c',
  'c',
  'c',
  'c',
  'c',
  'c',
  'c',
  'EOS'],
 ['a',
  '