Dataset originaly obtained from https://github.com/suarasaur/dinosaurs

In [52]:
import numpy as np
import matplotlib.pyplot as plt
import pdb

# Read and convert data

In [53]:
with open('dinosaurs.csv') as f:
    data = [x.strip() for x in f.readlines()]
data[:4]

['aachenosaurus', 'aardonyx', 'abelisaurus', 'abrictosaurus']

In [54]:
chars = list(set(''.join(data)))               # ['x', 'z', 'j', 'g', ... ]
chars.insert(0, ' ')                           # use space as not-a-char tag, used for padding
ch2i = {ch:i for i,ch in enumerate(chars)}     # {' ': 0, 'x': 1, 'z': 2, 'j': 3, 'g': 4, ... }
i2ch = {i:ch for ch,i in ch2i.items()}         # {0: ' ', 1: 'x', 2: 'z', 3: 'j', 4: 'g', ... }

In [55]:
np.random.seed(0)
np.random.shuffle(data)
data[:4]

['yongjinglong', 'eocarcharia', 'shidaisaurus', 'brasileosaurus']

In [56]:
max_len = len(max(data, key=len))  # length of longest dino name
for i, dino in enumerate(data):
    data[i] = dino.ljust(max_len)  # pad all names with spaces to same length
data[:4]

['yongjinglong           ',
 'eocarcharia            ',
 'shidaisaurus           ',
 'brasileosaurus         ']

In [77]:
vocab_size = len(chars)

In [59]:

[ch2i[x] for x in dino]

[24, 6, 10, 1, 4, 12, 10, 1, 14, 6, 10, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [83]:
indices = np.zeros(shape=[len(data), max_len], dtype=int)

In [84]:
indices

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [85]:
for i, dino_name in enumerate(data):
    indices[i] = [ch2i[x] for x in dino_name]

In [86]:
data[234]

'inosaurus              '

In [87]:
''.join([i2ch[x] for x in indices[234]])

'inosaurus              '

In [92]:
onehot = np.zeros(shape=[len(data), max_len, vocab_size], dtype=int)

In [93]:
for i in range(len(indices)):
    for j in range(max_len):
        onehot[i, j, indices[i,j]] = 1

In [94]:
indices[0, 0]

24

In [97]:
onehot[0]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0],
       [0, 0, 0, 0, 

In [98]:
''.join([i2ch[np.argmax(x)] for x in onehot[234]])

'inosaurus              '

In [99]:
onehot.shape

(1325, 23, 27)

# Neural Network

<img src="../Udacity_DL_Nanodegree/031%20RNN%20Super%20Basics/MultiMultiRNN01.png" align="left"/>

<img src="assets/rnn_diag.png"/>