In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the training corpus and derive its vocabulary (the distinct characters).

# Read through a context manager so the file handle is closed right away
# instead of being left for the garbage collector.
with open('bless.txt', 'r') as corpus_file:
    data = corpus_file.read()

# Sort the vocabulary: the iteration order of a set of strings can differ
# between interpreter runs, which would reshuffle every character id and make
# results impossible to reproduce.
char = sorted(set(data))

char_size, data_size = len(char), len(data)
char_size, data_size

In [None]:
# Lookup tables between vocabulary characters and their integer ids:
# char_to_int maps a character to its position in `char`,
# int_to_char maps that position back to the character.
char_to_int = dict(zip(char, range(len(char))))
int_to_char = dict(enumerate(char))

In [None]:
# One-hot encode all characters.
# Row i is the one-hot vector of the character with id i. Because
# char_to_int and int_to_char are inverse mappings, the table is exactly the
# identity matrix, so build it directly instead of filling it cell by cell.
char_vector = np.eye(char_size)

In [None]:
# Define the model architecture, weights and biases

# hyperparameters
hidden_size = 100  # number of units in the hidden layer
seq_len = 25       # number of characters the training loop feeds per step
lr = 1e-1          # learning rate used by the parameter update

# weights
# Each matrix is laid out as (output_dim, input_dim) so that np.dot(W, column)
# works on column vectors. The draws are scaled by 0.01: unit-variance weights
# push tanh into saturation and produce very large logits, which stalls
# learning and can overflow the softmax.
wx = np.random.randn(hidden_size, char_size) * 0.01     # input  -> hidden
whh = np.random.randn(hidden_size, hidden_size) * 0.01  # hidden -> hidden
who = np.random.randn(char_size, hidden_size) * 0.01    # hidden -> output

# biases
bh = np.zeros((hidden_size, 1))  # hidden bias
by = np.zeros((char_size, 1))    # output bias

In [None]:
# Define the loss function.

# It runs the forward pass over a sequence and then the backward pass to get the gradients.

def lossfun(inputs, target, prevh):
    """Run one forward and backward pass of the RNN over a character sequence.

    Parameters
    ----------
    inputs : sequence of str
        Input characters; each one must be a key of ``char_to_int``.
    target : sequence of int
        ``target[i]`` is the vocabulary index expected after ``inputs[i]``.
        It must hold at least ``len(inputs)`` entries, otherwise indexing it
        raises ``IndexError``.
    prevh : numpy.ndarray
        Hidden state carried in from the previous call; the callers in this
        notebook pass a ``(hidden_size, 1)`` column. It is copied, not mutated.

    Returns
    -------
    tuple
        ``(loss, dwx, dwhh, dwho, dbh, dby, last_h)``: the summed
        cross-entropy loss, the gradients of the five parameters (each clipped
        to ``[-5, 5]``), and the hidden state after the last input. For an
        empty ``inputs`` the loss is 0, the gradients are all zero and
        ``last_h`` is the copy of ``prevh``.
    """
    # per-time-step caches: encoded inputs, hidden states, output probabilities
    xs, hs, ps = {}, {}, {}

    # index -1 holds the incoming hidden state so hs[i-1] is valid for i == 0
    hs[-1] = np.copy(prevh)
    loss = 0

    # forward pass
    for i in range(len(inputs)):
        # one-hot column vector of the current character
        xs[i] = char_vector[char_to_int[inputs[i]]].reshape(char_size, 1)

        # new hidden state from the current input and the previous state
        hs[i] = np.tanh(np.dot(wx, xs[i]) + np.dot(whh, hs[i - 1]) + bh)

        # unnormalised log probabilities for the next character
        logits = np.dot(who, hs[i]) + by

        # Softmax with the maximum subtracted first: the result is
        # mathematically unchanged, but exp() can no longer overflow to inf
        # (which would turn the probabilities into NaN).
        shifted = logits - np.max(logits)
        exp_shifted = np.exp(shifted)
        normaliser = np.sum(exp_shifted)
        ps[i] = exp_shifted / normaliser

        # Cross-entropy, -log(p[target]), written in log-sum-exp form so a
        # probability that underflows to 0 does not produce an infinite loss.
        loss += np.log(normaliser) - shifted[target[i], 0]

    # backward pass
    # gradient accumulators, one per parameter
    dwx, dwhh, dwho = np.zeros_like(wx), np.zeros_like(whh), np.zeros_like(who)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    # gradient flowing into the hidden state from the following time step;
    # shaped after hs[-1], which exists even when `inputs` is empty
    dhnext = np.zeros_like(hs[-1], dtype=float)

    for i in reversed(range(len(inputs))):
        # gradient of the loss w.r.t. the logits: softmax output minus one-hot target
        dy = np.copy(ps[i])
        dy[target[i]] -= 1

        # output layer
        dwho += np.dot(dy, hs[i].T)
        dby += dy

        # hidden state receives gradient from the output and from the next step
        dh = np.dot(who.T, dy) + dhnext

        # backprop through the tanh nonlinearity (d tanh(z)/dz = 1 - tanh(z)^2)
        dhraw = (1 - hs[i] * hs[i]) * dh
        dbh += dhraw

        # input and recurrent weights
        dwx += np.dot(dhraw, xs[i].T)
        dwhh += np.dot(dhraw, hs[i - 1].T)
        dhnext = np.dot(whh.T, dhraw)

    for dparam in [dwx, dwhh, dwho, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam)  # clip in place to mitigate exploding gradients
    return loss, dwx, dwhh, dwho, dbh, dby, hs[len(inputs) - 1]

def sample(h, seed_ix, n):
    """Generate ``n`` characters from the model and print them.

    Parameters
    ----------
    h : numpy.ndarray
        Hidden state to start from; the callers in this notebook pass a
        ``(hidden_size, 1)`` column.
    seed_ix : str
        Seed character for the first time step. Despite the name it is a
        character, not an index: it is looked up in ``char_to_int``, so a
        character outside the vocabulary raises ``KeyError``.
    n : int
        Number of characters to generate.

    Returns
    -------
    None
        The generated text is printed between two ``----`` lines.
    """
    # one-hot column vector of the seed character
    x = char_vector[char_to_int[seed_ix]].reshape(char_size, 1)
    # indices of the generated characters
    ixes = []
    for _ in range(n):
        # new hidden state from the current input and the previous state
        h = np.tanh(np.dot(wx, x) + np.dot(whh, h) + bh)
        # unnormalised log probabilities for the next character
        y = np.dot(who, h) + by
        # Softmax with the maximum subtracted first so exp() cannot overflow;
        # an overflow yields NaN probabilities, which np.random.choice rejects.
        exp_y = np.exp(y - np.max(y))
        p = exp_y / np.sum(exp_y)

        # draw the next character at random according to p
        # (this samples from the distribution; it is not an argmax)
        ix = np.random.choice(range(char_size), p=p.ravel())
        # feed the drawn character back in as the next one-hot input
        x = np.zeros((char_size, 1))
        x[ix] = 1
        ixes.append(ix)

    txt = ''.join(int_to_char[ix] for ix in ixes)
    print('----\n %s \n----' % (txt, ))
hprev = np.zeros((hidden_size, 1))  # reset RNN memory
# Predict the next 200 characters given 'a'. sample() looks the seed up in
# char_to_int, so fall back to the first vocabulary entry when the corpus
# contains no 'a' instead of failing with a KeyError.
seed_char = 'a' if 'a' in char_to_int else int_to_char[0]
sample(hprev, seed_char, 200)


In [None]:
# Disabled manual check of lossfun. The whole snippet is wrapped in a
# triple-quoted string literal, so none of it executes; the cell only
# evaluates to that string.
# NOTE(review): `inh` and `target` hold four characters each, so the snippet
# looks like a quick shape check rather than a real training pair - confirm
# before re-enabling it.
""" inh = "Mmel"
target_list = []
target = "mmel"
for element in "mmel":
   target_list.append(char_to_int[element])

hprev = np.zeros((hidden_size,1))

lossfun(inh, target_list, hprev) """

In [None]:
# Training loop: sweep over the corpus in windows of seq_len characters and
# update the parameters with Adagrad after every window.

max_iterations = 10000  # the loop runs for n = 0 .. max_iterations inclusive
sample_every = 10       # print the loss and a generated sample every this many iterations

# One window needs seq_len input characters plus one more for the last target.
# Without this check a shorter corpus fails deep inside lossfun with an
# unhelpful indexing error.
if len(data) < seq_len + 1:
    raise ValueError(
        'training text has %d characters but at least %d (seq_len + 1) are required'
        % (len(data), seq_len + 1)
    )

n, p = 0, 0  # iteration counter and position of the current window in `data`
mWxh, mWhh, mWhy = np.zeros_like(wx), np.zeros_like(whh), np.zeros_like(who)
mbh, mby = np.zeros_like(bh), np.zeros_like(by)  # memory variables for Adagrad
smooth_loss = -np.log(1.0 / char_size) * seq_len  # loss at iteration 0 (uniform predictions)
while n <= max_iterations:
    # prepare inputs (we're sweeping from left to right in steps seq_len long)
    if p + seq_len + 1 >= len(data) or n == 0:
        hprev = np.zeros((hidden_size, 1))  # reset RNN memory
        p = 0  # go from start of data

    # inputs are characters; targets are the ids of the same window shifted by one
    inputs = list(data[p:p + seq_len])
    targets = [char_to_int[ch] for ch in data[p + 1:p + seq_len + 1]]

    # forward seq_len characters through the net and fetch the gradients
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossfun(inputs, targets, hprev)
    # exponential moving average of the loss, so the printed value is less noisy
    smooth_loss = smooth_loss * 0.999 + loss * 0.001

    # sample from the model now and then
    if n % sample_every == 0:
        print ('iter %d, loss: %f' % (n, smooth_loss)) # print progress
        sample(hprev, inputs[0], 200)

    # perform parameter update with Adagrad; the arrays are modified in place,
    # so the module-level weights and memories are updated through these aliases
    for param, dparam, mem in zip([wx, whh, who, bh, by],
                                  [dWxh, dWhh, dWhy, dbh, dby],
                                  [mWxh, mWhh, mWhy, mbh, mby]):
        mem += dparam * dparam
        param += -lr * dparam / np.sqrt(mem + 1e-8)  # adagrad update

    p += seq_len  # move data pointer
    n += 1  # iteration counter