In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [206]:
#data 
path = r'E:\Study n Work\Neural Networks\Recurrent Neural Network\Data\Names\NationalNames.csv'
data = pd.read_csv(path)
data['Name'] = data['Name']
data = np.array(data['Name'][:10000]).reshape(-1,1)
data = [x.lower() for x in data[:,0]]

data = np.array(data).reshape(-1,1)
print(data.shape)
data[1:10]

(10000, 1)


array([['anna'],
       ['emma'],
       ['elizabeth'],
       ['minnie'],
       ['margaret'],
       ['ida'],
       ['alice'],
       ['bertha'],
       ['sarah']],
      dtype='<U12')

In [207]:
transform_data = np.copy(data)
max_length = 0
for index in range(len(data)):
    max_length = max(max_length,len(data[index,0]))

for index in range(len(data)):
    length = (max_length - len(data[index,0]))
    string = '.'*length
    transform_data[index,0] = ''.join([transform_data[index,0],string])
    
print(transform_data[1:10])

[['anna........']
 ['emma........']
 ['elizabeth...']
 ['minnie......']
 ['margaret....']
 ['ida.........']
 ['alice.......']
 ['bertha......']
 ['sarah.......']]


In [208]:
vocab = list()
for name in transform_data[:,0]:
    vocab.extend(list(name))

vocab = set(vocab)
vocab_size = len(vocab)

print("Vocab size : %d"%len(vocab))
print("Vocab = {}".format(vocab))

Vocab size : 27
Vocab = {'r', 'j', 'p', 'k', 'l', 'm', 'z', 'e', 'c', 'v', 'n', 't', 'u', 'q', '.', 'x', 'b', 'g', 's', 'h', 'a', 'y', 'o', 'f', 'i', 'd', 'w'}


In [209]:
#map char to id and id to chars
char_id = dict()
id_char = dict()

for i,char in enumerate(vocab):
    char_id[char] = i
    id_char[i] = char

print('a-{}, 22-{}'.format(char_id['a'],id_char[22]))

a-20, 22-o


In [210]:
train_dataset = []
batch_size = 20
for i in range(len(transform_data)-batch_size+1):
    start = i*batch_size
    end = start+batch_size
    batch_data = transform_data[start:end]
    if(len(batch_data)!=batch_size):
        break
    char_list = []
    for k in range(len(batch_data[0][0])):
        batch_dataset = np.zeros([batch_size,len(vocab)])
        for j in range(batch_size):
            name = batch_data[j][0]
            char_index = char_id[name[k]]
            batch_dataset[j,char_index] = 1.0
            
        char_list.append(batch_dataset)
    train_dataset.append(char_list)

In [211]:
input_units = 100
hidden_units = 256
output_units = vocab_size

#hyperparameters
learning_rate = 0.001
beta1 = 0.90
beta2 = 0.99

In [29]:
def initialize_parameters():
    mean = 0
    std = 0.01
    
    #lstm cell weights
    forget_gate_weights = np.random.normal(mean,std,(input_units+hidden_units,hidden_units))
    input_gate_weights  = np.random.normal(mean,std,(input_units+hidden_units,hidden_units))
    output_gate_weights = np.random.normal(mean,std,(input_units+hidden_units,hidden_units))
    gate_gate_weights   = np.random.normal(mean,std,(input_units+hidden_units,hidden_units))
    
    #hidden to output weights
    hidden_output_weights = np.random.normal(mean,std,(hidden_units,output_units))
    
    parameters = dict()
    parameters['fgw'] = forget_gate_weights
    parameters['igw'] = input_gate_weights
    parameters['ogw'] = output_gate_weights
    parameters['ggw'] = gate_gate_weights
    parameters['how'] = hidden_output_weights
    
    return parameters

def initialize_V(parameters):
    Vfgw = np.zeros(parameters['fgw'].shape)
    Vigw = np.zeros(parameters['igw'].shape)
    Vogw = np.zeros(parameters['ogw'].shape)
    Vggw = np.zeros(parameters['ggw'].shape)
    Vhow = np.zeros(parameters['how'].shape)
    
    V = dict()
    V['vfgw'] = Vfgw
    V['vigw'] = Vigw
    V['vogw'] = Vogw
    V['vggw'] = Vggw
    V['vhow'] = Vhow
    return V

def initialize_S(parameters):
    Sfgw = np.zeros(parameters['fgw'].shape)
    Sigw = np.zeros(parameters['igw'].shape)
    Sogw = np.zeros(parameters['ogw'].shape)
    Sggw = np.zeros(parameters['ggw'].shape)
    Show = np.zeros(parameters['how'].shape)
    
    S = dict()
    S['sfgw'] = Sfgw
    S['sigw'] = Sigw
    S['sogw'] = Sogw
    S['sggw'] = Sggw
    S['show'] = Show
    return S

In [10]:
#Activation Functions
def sigmoid(X):
    return 1/(1+np.exp(-X))

def tanh_activation(X):
    return np.tanh(X)

def softmax(X):
    exp_X = np.exp(X)
    exp_X_sum = np.sum(exp_X,axis=1).reshape(-1,1)
    exp_X = exp_X/exp_X_sum
    return exp_X

def tanh_derivative(X):
    return 1-(X**2)

In [11]:
def lstm_cell(batch_dataset, prev_activation_matrix, prev_cell_matrix, parameters):
    #get parameters
    fgw = parameters['fgw']
    igw = parameters['igw']
    ogw = parameters['ogw']
    ggw = parameters['ggw']
    
    #concat batch data and prev_activation matrix
    concat_dataset = np.concatenate((batch_dataset,prev_activation_matrix),axis=1)
    
    #forget gate activation
    fa = np.matmul(concat_dataset,fgw)
    fa = sigmoid(fa)
    
    #input gate activation
    ia = np.matmul(concat_dataset,igw)
    ia = sigmoid(ia)
    
    #output gate activation
    oa = np.matmul(concat_dataset,ogw)
    oa = sigmoid(oa)
    
    #gate gate activation
    ga = np.matmul(concat_dataset,ggw)
    ga = tanh_activation(ga)
    
    #new cell memory matrix
    cell_memory_matrix = np.multiply(fa,prev_cell_matrix) + np.multiply(ia,ga)
    
    #current activation matrix
    activation_matrix = np.multiply(oa, tanh_activation(cell_memory_matrix))
    
    #lets store the activations to be used in back prop
    lstm_activations = dict()
    lstm_activations['fa'] = fa
    lstm_activations['ia'] = ia
    lstm_activations['oa'] = oa
    lstm_activations['ga'] = ga
    
    return lstm_activations,cell_memory_matrix,activation_matrix

In [12]:
def output_cell(activation_matrix,parameters):
    #get hidden to output parameters
    how = parameters['how']
    
    #get outputs 
    output_matrix = np.matmul(activation_matrix,how)
    output_matrix = softmax(output_matrix)
    
    return output_matrix

In [33]:
def get_embeddings(batch_dataset,embeddings):
    embedding_dataset = np.matmul(batch_dataset,embeddings)
    return embedding_dataset

In [72]:
#forward propagation
def forward_propagation(batches,parameters,embeddings):
    batch_size = batches[0].shape[0]
    
    #to store the activations of all the unrollings.
    lstm_cache = dict()
    activation_cache = dict()
    cell_cache = dict()
    output_cache = dict()
    embedding_cache = dict()
    
    #initial activation_matrix and cell_matrix
    a0 = np.zeros([batch_size,hidden_units],dtype=np.float32)
    c0 = np.zeros([batch_size,hidden_units],dtype=np.float32)
    
    #store the initial activations in cache
    activation_cache['a0'] = a0
    cell_cache['c0'] = c0
    
    for i in range(len(batches)-1):
        batch_dataset = batches[i]
        batch_dataset = get_embeddings(batch_dataset,embeddings)
        embedding_cache['emb'+str(i)] = batch_dataset
        
        #lstm cell
        lstm_activations,ct,at = lstm_cell(batch_dataset,a0,c0,parameters)
        
        #output cell
        ot = output_cell(at,parameters)
        
        #store the time 't' activations in cache
        lstm_cache['lstm' + str(i+1)]  = lstm_activations
        activation_cache['a'+str(i+1)] = at
        cell_cache['c' + str(i+1)] = ct
        output_cache['o'+str(i+1)] = ot
        
        #update a0 and c0 to new 'at' and 'ct' for next lstm cell
        a0 = at
        c0 = ct
        
    return embedding_cache,lstm_cache,activation_cache,cell_cache,output_cache

In [103]:
#calculate loss, perplexity and accuracy
def cal_loss_accuracy(batch_labels,output_cache):
    loss = 0
    acc  = 0
    prob = 1
    batch_size = batch_labels[0].shape[0]
    
    for i in range(1,len(output_cache)+1):
        labels = batch_labels[i]
        pred = output_cache['o'+str(i)]
        
        prob = np.multiply(prob,np.sum(np.multiply(labels,pred),axis=1).reshape(-1,1))
        loss += np.sum((np.multiply(labels,np.log(pred)) + np.multiply(1-labels,np.log(1-pred))),axis=1).reshape(-1,1)
        acc  += np.array(np.argmax(labels,1)==np.argmax(pred,1),dtype=np.float32).reshape(-1,1)
    
    perplexity = np.sum((1/prob)**(1/len(output_cache)))/batch_size
    loss = np.sum(loss)*(-1/batch_size)
    acc  = np.sum(acc)/(batch_size)
    acc = acc/len(output_cache)
    
    return perplexity,loss,acc

In [16]:
#calculate output cell errors
def calculate_output_cell_error(batch_labels,output_cache,parameters):
    output_error_cache = dict()
    activation_error_cache = dict()
    how = parameters['how']
    
    for i in range(1,len(output_cache)+1):
        labels = batch_labels[i]
        pred = output_cache['o'+str(i)]
        
        error_output = pred - labels
        error_activation = np.matmul(error_output,how.T)
        
        output_error_cache['eo'+str(i)] = error_output
        activation_error_cache['ea'+str(i)] = error_activation
    return output_error_cache,activation_error_cache

In [113]:
#calculate error for single lstm cell
def calculate_single_lstm_cell_error(activation_output_error,next_activation_error,next_cell_error,parameters,lstm_activation,cell_activation,prev_cell_activation):
    activation_error = activation_output_error + next_activation_error
    
    #output gate error
    oa = lstm_activation['oa']
    eo = np.multiply(activation_error,tanh_activation(cell_activation))
    eo = np.multiply(np.multiply(eo,oa),1-oa)
    
    #cell activation error
    cell_error = np.multiply(activation_error,oa)
    cell_error = np.multiply(cell_error,tanh_derivative(tanh_activation(cell_activation)))
    cell_error += next_cell_error
    
    #input gate error
    ia = lstm_activation['ia']
    ga = lstm_activation['ga']
    ei = np.multiply(cell_error,ga)
    ei = np.multiply(np.multiply(ei,ia),1-ia)
    
    #gate gate error
    eg = np.multiply(cell_error,ia)
    eg = np.multiply(eg,tanh_derivative(ga))
    
    #forget gate error
    fa = lstm_activation['fa']
    ef = np.multiply(cell_error,prev_cell_activation)
    ef = np.multiply(np.multiply(ef,fa),1-fa)
    
    #prev cell error
    prev_cell_error = np.multiply(cell_error,fa)
    
    #get parameters
    fgw = parameters['fgw']
    igw = parameters['igw']
    ggw = parameters['ggw']
    ogw = parameters['ogw']
    
    #embedding + hidden activation error
    embed_activation_error = np.matmul(ef,fgw.T)
    embed_activation_error += np.matmul(ei,igw.T)
    embed_activation_error += np.matmul(eo,ogw.T)
    embed_activation_error += np.matmul(eg,ggw.T)
    
    input_hidden_units = fgw.shape[0]
    hidden_units = fgw.shape[1]
    input_units = input_hidden_units - hidden_units
    
    #prev activation error
    prev_activation_error = embed_activation_error[:,input_units:]
    
    #input error (embedding error)
    embed_error = embed_activation_error[:,:input_units]
    
    #store lstm error
    lstm_error = dict()
    lstm_error['ef'] = ef
    lstm_error['ei'] = ei
    lstm_error['eo'] = eo
    lstm_error['eg'] = eg
    
    return prev_activation_error,prev_cell_error,embed_error,lstm_error

In [114]:
#calculate output cell derivatives
def calculate_output_cell_derivatives(output_error_cache,activation_cache,parameters):
    dhow = np.zeros(parameters['how'].shape)
    batch_size = activation_cache['a1'].shape[0]
    
    for i in range(1,len(output_error_cache)+1):
        output_error = output_error_cache['eo' + str(i)]
        activation = activation_cache['a'+str(i)]
        dhow += np.matmul(activation.T,output_error)/batch_size
        
    #dhow = dhow/len(output_error_cache)
    return dhow

In [115]:
#calculate derivatives for single lstm cell
def calculate_single_lstm_cell_derivatives(lstm_error,embedding_matrix,activation_matrix):
    ef = lstm_error['ef']
    ei = lstm_error['ei']
    eo = lstm_error['eo']
    eg = lstm_error['eg']
    
    concat_matrix = np.concatenate((embedding_matrix,activation_matrix),axis=1)
    batch_size = embedding_matrix.shape[0]
    
    dfgw = np.matmul(concat_matrix.T,ef)/batch_size
    digw = np.matmul(concat_matrix.T,ei)/batch_size
    dogw = np.matmul(concat_matrix.T,eo)/batch_size
    dggw = np.matmul(concat_matrix.T,eg)/batch_size
    
    derivatives = dict()
    derivatives['dfgw'] = dfgw
    derivatives['digw'] = digw
    derivatives['dogw'] = dogw
    derivatives['dggw'] = dggw
    
    return derivatives

In [116]:
#backpropagation
def backward_propagation(batch_labels,embedding_cache,lstm_cache,activation_cache,cell_cache,output_cache,parameters):
    #calculate output errors 
    output_error_cache,activation_error_cache = calculate_output_cell_error(batch_labels,output_cache,parameters)
    
    #to store error lstms
    lstm_error_cache = dict()
    
    #to store embeding errors
    embedding_error_cache = dict()
    
    # next activation error 
    # next cell error
    eat = np.zeros(activation_error_cache['ea1'].shape)
    ect = np.zeros(activation_error_cache['ea1'].shape)
    
    #calculate all lstm cell errors
    for i in range(len(lstm_cache),0,-1):
        pae,pce,ee,le = calculate_single_lstm_cell_error(activation_error_cache['ea'+str(i)],eat,ect,parameters,lstm_cache['lstm'+str(i)],cell_cache['c'+str(i)],cell_cache['c'+str(i-1)])
        lstm_error_cache['elstm'+str(i)] = le
        embedding_error_cache['eemb'+str(i-1)] = ee
        eat = pae
        ect = pce
    
    #calculate output cell derivatives
    derivatives = dict()
    derivatives['dhow'] = calculate_output_cell_derivatives(output_error_cache,activation_cache,parameters)
    
    #calculate lstm cell derivatives
    lstm_derivatives = dict()
    for i in range(1,len(lstm_error_cache)+1):
        lstm_derivatives['dlstm'+str(i)] = calculate_single_lstm_cell_derivatives(lstm_error_cache['elstm'+str(i)],embedding_cache['emb'+str(i-1)],activation_cache['a'+str(i-1)])
    
    derivatives['dfgw'] = np.zeros(parameters['fgw'].shape)
    derivatives['digw'] = np.zeros(parameters['igw'].shape)
    derivatives['dogw'] = np.zeros(parameters['ogw'].shape)
    derivatives['dggw'] = np.zeros(parameters['ggw'].shape)
    
    for i in range(1,len(lstm_error_cache)+1):
        derivatives['dfgw'] += lstm_derivatives['dlstm'+str(i)]['dfgw']
        derivatives['digw'] += lstm_derivatives['dlstm'+str(i)]['digw']
        derivatives['dogw'] += lstm_derivatives['dlstm'+str(i)]['dogw']
        derivatives['dggw'] += lstm_derivatives['dlstm'+str(i)]['dggw']
    
    #derivatives['dfgw'] /= len(lstm_cache)
    #derivatives['digw'] /= len(lstm_cache)
    #derivatives['dogw'] /= len(lstm_cache)
    #derivatives['dggw'] /= len(lstm_cache)
    
    return derivatives,embedding_error_cache

In [120]:
#adam optimization
def update_parameters(parameters,derivatives,V,S,t):
    #get derivatives
    dfgw = derivatives['dfgw']
    digw = derivatives['digw']
    dogw = derivatives['dogw']
    dggw = derivatives['dggw']
    dhow = derivatives['dhow']
    
    #get parameters
    fgw = parameters['fgw']
    igw = parameters['igw']
    ogw = parameters['ogw']
    ggw = parameters['ggw']
    how = parameters['how']
    
    #get V parameters
    vfgw = V['vfgw']
    vigw = V['vigw']
    vogw = V['vogw']
    vggw = V['vggw']
    vhow = V['vhow']
    
    #get S parameters
    sfgw = S['sfgw']
    sigw = S['sigw']
    sogw = S['sogw']
    sggw = S['sggw']
    show = S['show']
    
    vfgw = (beta1*vfgw + (1-beta1)*dfgw)#/(1-(beta1**t))
    vigw = (beta1*vigw + (1-beta1)*digw)#/(1-beta1**t)
    vogw = (beta1*vogw + (1-beta1)*dogw)#/(1-beta1**t)
    vggw = (beta1*vggw + (1-beta1)*dggw)#/(1-beta1**t)
    vhow = (beta1*vhow + (1-beta1)*dhow)#/(1-beta1**t)
    
    sfgw = (beta2*sfgw + (1-beta2)*(dfgw**2))#/(1-beta2**t)
    sigw = (beta2*sigw + (1-beta2)*(digw**2))#/(1-beta2**t)
    sogw = (beta2*sogw + (1-beta2)*(dogw**2))#/(1-beta2**t)
    sggw = (beta2*sggw + (1-beta2)*(dggw**2))#/(1-beta2**t)
    show = (beta2*show + (1-beta2)*(dhow**2))#/(1-beta2**t)
    
    fgw = fgw - learning_rate*((vfgw)/(np.sqrt(sfgw) + 1e-6))
    igw = igw - learning_rate*((vigw)/(np.sqrt(sigw) + 1e-6))
    ogw = ogw - learning_rate*((vogw)/(np.sqrt(sogw) + 1e-6))
    ggw = ggw - learning_rate*((vggw)/(np.sqrt(sggw) + 1e-6))
    how = how - learning_rate*((vhow)/(np.sqrt(show) + 1e-6))
    
    parameters['fgw'] = fgw
    parameters['igw'] = igw
    parameters['ogw'] = ogw
    parameters['ggw'] = ggw
    parameters['how'] = how
    
    V['vfgw'] = vfgw 
    V['vigw'] = vigw 
    V['vogw'] = vogw 
    V['vggw'] = vggw
    V['vhow'] = vhow
    
    S['sfgw'] = sfgw 
    S['sigw'] = sigw 
    S['sogw'] = sogw 
    S['sggw'] = sggw
    S['show'] = show
    
    return parameters,V,S    

def update_embeddings(embeddings,embedding_error_cache,batch_labels):
    embedding_derivatives = np.zeros(embeddings.shape)
    batch_size = batch_labels[0].shape[0]
    
    for i in range(len(embedding_error_cache)):
        embedding_derivatives += np.matmul(batch_labels[i].T,embedding_error_cache['eemb'+str(i)])/batch_size
    
    embeddings = embeddings - learning_rate*embedding_derivatives
    return embeddings

In [148]:
#train function
def train(train_dataset,embeddings,iters=1000,batch_size=20):
    parameters = initialize_parameters()
    V = initialize_V(parameters)
    S = initialize_S(parameters)
    embeddings = np.random.normal(0,0.01,(len(vocab),input_units))
    
    for step in range(iters):
        index = step%len(train_dataset)
        batches = train_dataset[index]
        
        embedding_cache,lstm_cache,activation_cache,cell_cache,output_cache = forward_propagation(batches,parameters,embeddings)
        perplexity,loss,acc = cal_loss_accuracy(batches,output_cache)
        derivatives,embedding_error_cache = backward_propagation(batches,embedding_cache,lstm_cache,activation_cache,cell_cache,output_cache,parameters)
        parameters,V,S = update_parameters(parameters,derivatives,V,S,step)
        embeddings = update_embeddings(embeddings,embedding_error_cache,batches)
        
        if(step%1000==0):
            print('Step = {}'.format(step))
            print('Loss = {}'.format(round(loss,2)))
            print('Perp = {}'.format(round(perplexity,2)))
            print('Accu = {}'.format(round(acc*100,2)))
            print()
    
    return embeddings, parameters

In [212]:
embeddings,parameters = train(train_dataset,embeddings,iters=20001)

Step = 0
Loss = 47.05
Perp = 27.0
Accu = 0.91

Step = 1000
Loss = 16.41
Perp = 3.31
Accu = 69.55

Step = 2000
Loss = 14.86
Perp = 2.75
Accu = 70.0

Step = 3000
Loss = 12.46
Perp = 2.17
Accu = 78.18

Step = 4000
Loss = 10.36
Perp = 1.85
Accu = 82.73

Step = 5000
Loss = 9.15
Perp = 1.71
Accu = 84.55

Step = 6000
Loss = 8.59
Perp = 1.65
Accu = 85.91

Step = 7000
Loss = 8.07
Perp = 1.59
Accu = 86.82

Step = 8000
Loss = 7.64
Perp = 1.55
Accu = 87.73

Step = 9000
Loss = 7.81
Perp = 1.56
Accu = 86.82

Step = 10000
Loss = 7.58
Perp = 1.54
Accu = 85.91

Step = 11000
Loss = 7.77
Perp = 1.56
Accu = 85.91

Step = 12000
Loss = 7.68
Perp = 1.54
Accu = 85.91

Step = 13000
Loss = 7.71
Perp = 1.55
Accu = 85.91

Step = 14000
Loss = 7.66
Perp = 1.54
Accu = 85.91

Step = 15000
Loss = 7.62
Perp = 1.54
Accu = 85.91

Step = 16000
Loss = 7.61
Perp = 1.53
Accu = 86.36

Step = 17000
Loss = 7.58
Perp = 1.53
Accu = 86.36

Step = 18000
Loss = 7.55
Perp = 1.53
Accu = 86.36

Step = 19000
Loss = 7.53
Perp = 1.53
Accu

In [256]:
#predict
def predict(parameters,embeddings,id_char,vocab):
    
    names = []
    for i in range(20):
        #initial activation_matrix and cell_matrix
        a0 = np.zeros([1,hidden_units],dtype=np.float32)
        c0 = np.zeros([1,hidden_units],dtype=np.float32)

        #store the initial activations in cache
        activation_cache['a0'] = a0
        cell_cache['c0'] = c0

        name = ''
        batch_dataset = np.zeros([1,len(vocab)])
        index = np.random.randint(0,27,1)[0]
        batch_dataset[0,index] = 1.0
        name += id_char[index]
        char = id_char[index]
        while(char!='.'):
            batch_dataset = get_embeddings(batch_dataset,embeddings)

            #lstm cell
            lstm_activations,ct,at = lstm_cell(batch_dataset,a0,c0,parameters)

            #output cell
            ot = output_cell(at,parameters)
            pred = np.random.choice(27,1,p=ot[0])[0]
            true = np.random.randint(0,1,1)[0]
            #pred = np.argmax(ot)
                
            name += id_char[pred]
            char = id_char[pred]
            batch_dataset = np.zeros([1,len(vocab)])
            batch_dataset[0,pred] = 1.0

            #update a0 and c0 to new 'at' and 'ct' for next lstm cell
            a0 = at
            c0 = ct
        names.append(name)
        
    return names

In [259]:
predict(parameters,embeddings,id_char,vocab)

['richard.',
 'natalie.',
 'fletcher.',
 'oreine.',
 'parthena.',
 'adolph.',
 'ramon.',
 'ollie.',
 'young.',
 'netta.',
 'arvella.',
 'mark.',
 'tina.',
 'truman.',
 'young.',
 'aby.',
 'jesus.',
 'jess.',
 'oscar.',
 'helena.']

In [215]:
predict(parameters,embeddings,id_char,vocab)

['donnie.',
 'george.',
 'vernie.',
 'margaret.',
 'isabella.',
 'isabella.',
 'orange.',
 'theodore.',
 'theodore.',
 'johnnie.',
 'ursula.',
 'clara.',
 'harris.',
 '.',
 '.',
 'george.',
 'katheryn.',
 'bertie.',
 'leonard.',
 'leonard.']

In [474]:
def calculate_errors(error_activation,next_error_cell,lstm_activation,cell_activation,prev_cell_activation,parameters):
    #error_cell_mem 'ct'
    ea_t = error_activation
    oa = lstm_activation['oa']
    ct_der = tanh_derivative(cell_activation)
    error_cell = np.multiply(np.multiply(ea_t,oa),ct_der) + next_error_cell
    

    #error lstm cell
    #output gate
    oa = lstm_activation['oa']
    eo = np.multiply(ea_t,tanh_activation(cell_activation))
    eo = np.multiply(np.multiply(eo,oa),1-oa)

    #gate gate
    ia = lstm_activation['ia']
    eg = np.multiply(ia,error_cell)
    eg = tanh_derivative(eg)

    #input gate
    ga = lstm_activation['ga']
    ei = np.multiply(ga,error_cell)
    ei = np.multiply(np.multiply(ei,ia),1-ia)

    #forget gate
    prev_cell = prev_cell_activation
    fa = lstm_activation['fa']
    ef = np.multiply(prev_cell,error_cell)
    ef = np.multiply(np.multiply(ef,fa),1-fa)

    #error prev cell ct-1
    prev_error_cell = np.multiply(fa,error_cell)

    #store errors in error_lstm
    error_lstm = np.zeros([4,ef.shape[1]])
    error_lstm[0,:] = ef[0]
    error_lstm[1,:] = ei[0]
    error_lstm[2,:] = eo[0]
    error_lstm[3,:] = eg[0]
    
    #error prev activation at-1
    fgw = parameters['fgw'][input_units:,:]
    igw = parameters['igw'][input_units:,:]
    ogw = parameters['ogw'][input_units:,:]
    ggw = parameters['ggw'][input_units:,:]
    
    prev_error_activation = np.matmul(ef,fgw.T)
    prev_error_activation += np.matmul(ei,igw.T)
    prev_error_activation += np.matmul(eo,ogw.T)
    prev_error_activation += np.matmul(eg,ggw.T)
    
    return error_lstm,prev_error_cell,prev_error_activation

In [475]:
def get_activation_matrix(activation_cache):
    l = len(activation_cache)-1
    activation_matrix = np.zeros([l,activation_cache['a0'].shape[1]])
    for i in range(l):
        activation_matrix[i] = activation_cache['a'+str(i+1)]
    
    activation_matrix = activation_matrix.reshape(-1,activation_cache['a0'].shape[1])
    return activation_matrix

def get_input_activation_matrix(train_name,activation_cache,char_id,char_mat):
    l = len(activation_cache)-1
    input_activation_matrix = np.zeros([l,char_mat.shape[1]+activation_cache['a0'].shape[1]])
    chars = list(train_name)
    for i in range(l):
        index = char_id[chars[i]]
        input_vec = char_mat[index].reshape(-1,char_mat.shape[1])
        activation_vec = activation_cache['a'+str(i)]
        concat_vec = np.concatenate((input_vec,activation_vec),axis=1)
        input_activation_matrix[i] = concat_vec
    
    input_activation_matrix = input_activation_matrix.reshape(-1,char_mat.shape[1]+activation_cache['a0'].shape[1])
    return input_activation_matrix

In [476]:
def backward_propagation(train_name,parameters,lstm_cache,activation_cache,cell_cache,output,char_id,char_mat):
    #lstm parameters
    fgw = parameters['fgw']
    igw = parameters['igw']
    ogw = parameters['ogw']
    ggw = parameters['ggw']
    
    #ihw = parameters['ihw']
    #hhw = parameters['hhw']
    
    #hidden to output weights
    how = parameters['how']
    
    chars = list(train_name)
    
    derivatives = dict()
    derivatives['dfgw'] = np.zeros(shape=fgw.shape)
    derivatives['digw'] = np.zeros(shape=igw.shape)
    derivatives['dogw'] = np.zeros(shape=ogw.shape)
    derivatives['dggw'] = np.zeros(shape=ggw.shape)
    
    #derivatives['dihw'] = np.zeros(shape=ihw.shape)
    #derivatives['dhhw'] = np.zeros(shape=hhw.shape)
    derivatives['dhow'] = np.zeros(shape=how.shape)
    
    error_lstm = dict()
    error_output = np.zeros([len(train_name)-1,how.shape[1]])
    
    for i in range(1,len(train_name)):
        #to store errors
        #error_activation = dict()
        #error_cell = dict()
        
        
        #get lstm activations at time t
        lstm_activation = lstm_cache['lstm'+str(i)]
        
        #get act label and output label;
        act_label  = char_id[chars[i]]
        act_vec = char_mat[act_label].reshape(1,input_units)
        out_vec = output['o'+str(i)]
        
        #output error
        error_output[i-1] = out_vec - act_vec
        
        
        #error_activation 'at'
        eat = np.matmul(error_output[i-1],how.T)
        #error_activation['ea' + str(i)] = np.matmul(error_output,how.T)
        
        #error_cell ct = zeros
        ect = np.zeros(shape=eat.shape)
        
        #initialize error_lstm
        error_lstm['elstm'+str(i)] = np.zeros([4,how.shape[0]])
        
        for j in range(i,0,-1):
            elstm,ect,eat = calculate_errors(eat,ect,lstm_activation,cell_cache['c'+str(j)],cell_cache['c'+str(j-1)],parameters)
            error_lstm['elstm'+str(j)] += elstm
            #der = derivative(cache['a'+str(j)])
            #error_activation['ea'+str(j)] = np.multiply(np.matmul(error_activation['ea'+str(j+1)],hhw.T),der)
        
        #sum the errors
        #convert into mat to get derivatives
        
    #calculate derivatives
    error_output = error_output.reshape(-1,how.shape[1])
    #print(error_output.shape)
    assert error_output.shape == (len(train_name)-1,how.shape[1])
    
    #activation matrix
    activation_matrix = get_activation_matrix(activation_cache)
    input_activation_matrix = get_input_activation_matrix(train_name,activation_cache,char_id,char_mat)
    
    
    #derivative ouput
    derivatives['dhow'] = np.matmul(activation_matrix.T,error_output)

    for i in range(len(train_name)-1):
        input_activation = input_activation_matrix[i].reshape(1,input_activation_matrix.shape[1])
        elstm = error_lstm['elstm'+str(i+1)]
        derivatives['dfgw'] += np.matmul(input_activation.T,elstm[0,:].reshape(1,elstm.shape[1]))
        derivatives['digw'] += np.matmul(input_activation.T,elstm[1,:].reshape(1,elstm.shape[1]))
        derivatives['dogw'] += np.matmul(input_activation.T,elstm[2,:].reshape(1,elstm.shape[1]))
        derivatives['dggw'] += np.matmul(input_activation.T,elstm[3,:].reshape(1,elstm.shape[1]))
        
        #derivatives['dhhw'] += np.matmul(cache['a'+str(j-1)].T,error_activation['ea'+str(j)])
        #in_vec = char_id[chars[j-1]]
        #in_vec = char_mat[in_vec].reshape(1,input_units)
        #derivatives['dihw'] += np.matmul(in_vec.T,error_activation['ea'+str(j)])
    
    derivatives['dfgw'] /= (len(train_name)-1)
    derivatives['digw'] /= (len(train_name)-1)
    derivatives['dogw'] /= (len(train_name)-1)
    derivatives['dggw'] /= (len(train_name)-1)
    derivatives['dhow'] /= (len(train_name)-1)
    
    return derivatives

In [477]:
def update_parameters(parameters,derivatives,V):
    
    fgw = parameters['fgw']
    igw = parameters['igw']
    ogw = parameters['ogw']
    ggw = parameters['ggw']
    how = parameters['how']
    
    dfgw = derivatives['dfgw']
    digw = derivatives['digw']
    dogw = derivatives['dogw']
    dggw = derivatives['dggw']
    dhow = derivatives['dhow']
    
    
    Vfgw = V['Vfgw']
    Vigw = V['Vigw']
    Vogw = V['Vogw']
    Vggw = V['Vggw']
    Vhow = V['Vhow']
    
    Vfgw1 = beta*Vfgw + (1-beta)*dfgw
    Vigw1 = beta*Vigw + (1-beta)*digw
    Vogw1 = beta*Vogw + (1-beta)*dogw
    Vggw1 = beta*Vggw + (1-beta)*dggw
    Vhow1 = beta*Vhow + (1-beta)*dhow
    
    fgw = fgw - learning_rate*Vfgw1
    igw = igw - learning_rate*Vigw1
    ogw = ogw - learning_rate*Vogw1
    ggw = ggw - learning_rate*Vggw1
    how = how - learning_rate*Vhow1
    
    V['Vfgw'] = Vfgw1
    V['Vigw'] = Vigw1
    V['Vogw'] = Vogw1
    V['Vggw'] = Vggw1
    V['Vhow'] = Vhow1
    
    parameters['fgw'] = fgw
    parameters['igw'] = igw
    parameters['ogw'] = ogw
    parameters['ggw'] = ggw
    parameters['how'] = how
    
    return parameters,V

In [478]:
def train(train_dataset,parameters,iters=1000):
    parameters = initialize_parameters()
    V = initialize_V(parameters)
    print(parameters['fgw'].shape)
    
    
    for step in range(iters):
        total_loss = 0
        total_perplexity = 0
        total_acc = 0
        for j in range(len(train_dataset)):
            lstm_cache,activation_cache,cell_cache,output = forward_propagation(train_dataset[j,0],parameters,char_id,char_mat)
            loss,perplexity,acc = cal_loss_accuracy(train_dataset[j,0],output,char_id,char_mat)
            derivatives = backward_propagation(train_dataset[j,0],parameters,lstm_cache,activation_cache,cell_cache,output,char_id,char_mat)
            total_loss += loss
            total_perplexity += perplexity
            total_acc += acc
            
        #derivatives['dfgw'] /= len(train_dataset)
        #derivatives['digw'] /= len(train_dataset)
        #derivatives['dogw'] /= len(train_dataset)
        #derivatives['dggw'] /= len(train_dataset)
        #derivatives['dhow'] /= len(train_dataset)

            parameters,V = update_parameters(parameters,derivatives,V)
        
        total_loss /= len(train_dataset)
        total_perplexity /= len(train_dataset)
        total_acc /= len(train_dataset)
        
        if(step%200==0):
            print('Step : {}'.format(step))
            print("Loss : {}".format(round(total_loss,2)))
            print("Perp : {}".format(round(total_perplexity,2)))
            print("Accu : {}".format(round(total_acc*100,2)))
            print()
    return parameters
