<a href="https://colab.research.google.com/github/mikeypixels/recurrent_neural_network/blob/master/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load the data

In [1]:
from google.colab import files

uploaded = files.upload()

data = open('sampleFICT.txt', 'r').read()

chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print ('data has %d chars, %d unique' % (data_size, vocab_size))

Saving sampleFICT.txt to sampleFICT.txt
data has 5196229 chars, 28 unique


# Calculate vocab size

In [2]:
char_to_ix = {ch:i for i, ch in enumerate(chars)}
ix_to_char = {i:ch for i, ch in enumerate(chars)}
print(char_to_ix)
print(ix_to_char)

{'q': 0, 'o': 1, 'w': 2, 'i': 3, 'g': 4, 'l': 5, 'b': 6, 'x': 7, 'z': 8, 'r': 9, 'j': 10, 'c': 11, ' ': 12, 'd': 13, 'e': 14, 's': 15, 'm': 16, 'h': 17, 'n': 18, 't': 19, '\n': 20, 'k': 21, 'y': 22, 'p': 23, 'v': 24, 'a': 25, 'f': 26, 'u': 27}
{0: 'q', 1: 'o', 2: 'w', 3: 'i', 4: 'g', 5: 'l', 6: 'b', 7: 'x', 8: 'z', 9: 'r', 10: 'j', 11: 'c', 12: ' ', 13: 'd', 14: 'e', 15: 's', 16: 'm', 17: 'h', 18: 'n', 19: 't', 20: '\n', 21: 'k', 22: 'y', 23: 'p', 24: 'v', 25: 'a', 26: 'f', 27: 'u'}


In [3]:
import numpy as np

vector_for_char_a = np.zeros((vocab_size, 1))
vector_for_char_a[char_to_ix['a']] = 1
print(vector_for_char_a.ravel())

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 0.]


# Definition of the Network

In [0]:
#hyperparameters

hidden_size = 100
seq_length = 25
learning_rate = 1e-1

In [0]:
#model parameters
Wxh = np.random.randn(hidden_size, vocab_size) * 0.01 #input to hidden state
Whh = np.random.randn(hidden_size, hidden_size) * 0.01 #input to hidden state
Why = np.random.randn(vocab_size, hidden_size) * 0.01 #input to hidden state
bh = np.zeros((hidden_size, 1))
by = np.zeros((vocab_size, 1))

In [0]:
def lossFun(inputs, targets, hprev):
  
  xs, hs, ys, ps = {}, {}, {}, {}
  
  hs[-1] = np.copy(hprev)
  loss = 0
  
  #forward pass
  for t in range(len(inputs)):
    xs[t] = np.zeros((vocab_size, 1))
    xs[t][inputs[t]] = 1
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
    ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars
    loss += -np.log(ps[t][targets[t],0]) # softmax
  # backward pass: compute gradients going backwards
  # initialize vectors for gradient values for each set of weights
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  dhnext = np.zeros_like(hs[0])
  for t in reversed(range(len(inputs))):
    #output probabilities
    dy = np.copy(ps[t])
    #derive out first gradient
    dy[targets[t]] -= 1 # backdrop into y
    #compute output gradient - output times hidden states
    #When we apply the transpose weight matrix,
    #we can think intuitively of this as moving the error
    #through the network, giving us some sort of measure
    #output gradient
    dWhy += np.dot(dy, hs[t].T)
    #derivative of output bias
    dby += dy
    #backpropagate!
    dh = np.dot(Why.T, dy) + dhnext # backprop into h
    dhraw = (1 - hs[t] * hs[t]) * dh
    dbh += dhraw #derivative of hidden bias
    dWxh += np.dot(dhraw, xs[t].T)
    dWhh += np.dot(dhraw, hs[t-1].T)
    dhnext = np.dot(Whh.T, dhraw)
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam) # clip to mitigate
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

# Create a sentence from a model

In [7]:
#prediction one full forward pass
def sample(h, seed_ix, n):
  """
  sample a sequence of integers from the model
  h is memory state, seed_ix is seed letter for first time
  n is how many characters to predict
  """
  
  #create vector
  x = np.zeros((vocab_size, 1))
  #customize it for our seed char
  x[seed_ix] = 1
  #list to store generated chars
  ixes = []
  #for as many characters as we want to generate
  for t in range(n):
    #a hidden state at a given time step is a function
    #of the input at the same time step modified by a weight
    #added to the hidden state of the previous time step
    #multiplied by its own hidden state to hidden state matrix
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    #compute output (unnormalised)
    y = np.dot(Why, h) + by
    ## probabilities for next chars
    p = np.exp(y) / np.sum(np.exp(y))
    #pick one with the highest probability
    ix = np.random.choice(range(vocab_size), p=p.ravel())
    #create a vector
    x = np.zeros((vocab_size, 1))
    #customize it for the predicted char
    x[ix] = 1
    #add it to the list 
    ixes.append(ix)
    
  txt = ''.join(ix_to_char[ix] for ix in ixes)
  print ('----\n %s \n----' % (txt, ))
hprev = np.zeros((hidden_size, 1)) # reset RNN memory
#predict the 200 next characters given 'a'
sample(hprev, char_to_ix['a'], 200)

----
 haajzjzpzexpokrqitkrzrrzaiaal
qxtjjojcxsoaojtrxnlxvcr zbayfm
fsutclthshteywartvvmbqhvsqbmjzzthasezgs
zy
kuzujhzcqrprp
iiabeydlhvbdfqqiht vysmsyxkvyjlgovmzlkbsvh jloemwgvw
tiht
wxzbvplsneot fffdojzghtt 
----


# Training

## Feed the loss function with inputs and targets

In [8]:
p = 0
inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
print("inputs", inputs)
targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]
print("targets", targets)

inputs [2, 25, 15, 12, 1, 18, 5, 22, 12, 6, 22, 12, 9, 14, 19, 9, 1, 15, 23, 14, 11, 19, 12, 22, 1]
targets [25, 15, 12, 1, 18, 5, 22, 12, 6, 22, 12, 9, 14, 19, 9, 1, 15, 23, 14, 11, 19, 12, 22, 1, 27]


In [9]:
n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variable
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration
while n <= 1000*100:
  # prepare inputs (we're sweeping from left to right in steps)
  # check "How to feed the loss function to see how this part works"
  if p+seq_length+1 >= len(data) or n == 0:
    hprev = np.zeros((hidden_size, 1)) # reset RNN memory
    p = 0 # go from start of data
  inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
  targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]
  
  # forward seq_length characters through the net and fetch gradient
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
  smooth_loss = smooth_loss * 0.999 + loss * 0.001
  
  # sample from the model now and then
  if n % 1000 == 0:
    print('iter %d, loss: %f' % (n, smooth_loss)) # print progress
    sample(hprev, inputs[0], 200)
    
  # perform parameter update with Adagrad
  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                [dWxh, dWhh, dWhy, dbh, dby],
                                [mWxh, mWhh, mWhy, mbh, mby]):
    mem += dparam * dparam
    param += -learning_rate * dparam / np.sqrt(mem + 1e-8) 
    
  p += seq_length # move data pointer
  n += 1 # iteration counter

iter 0, loss: 83.305115
----
 siqpnrvzlmljazpczjpu
o
wjriubtkcnifprhcwnkilnbmbbfkclyrmamauesofccdrqzdxtdlmhtvwsotpfatav kwqyzbgn
c dlrgtpxkyw kxjdgdxadvdkhuuuphseaecuapunfaclszunoeorvvxucpi zcxekjutjrjxu iq
akt
ihl
bcfa
entijkyxfx 
----
iter 1000, loss: 72.632768
----
 the at fitienpeke wos hucloeg popdin teininornud voran oniy donecad arrcee honry an vee to epitactlorpe iyi at un hrocptan at ot hleitanrens avt wresrin ref held hhirealoyshnnare th uy sifo sr hiads p 
----
iter 2000, loss: 64.894428
----
 e therleas miranord asle wely ender crot an antith opran pfand ope ardsed wen tit nfaune pdirtd toirh med thel ingnythitrers bpaswmonye is honl in titd wee dyeme in beof ple sibiny ar to an pettres th 
----
iter 3000, loss: 60.427078
----
 nt bare of ar ost hicemaqunt wometce tengrithumasu an ufilise pof hasitheud the athe pceaage intich aokeg liwe inind they tiredt bot oy gasp ance cforotuuud ancuth tislende thy htibantba inceuedif tef 
----
iter 4000, loss: 57.852652
----
 s honyt of