#### Natural Language Processing
#### Story Generation Implementation using tensorflow 

In [42]:
# importing important libraries

import time
from collections import namedtuple
import numpy as np
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')

In [43]:
# reading file for training

with open('book.txt','r') as f:
    text = f.read()
vocab = set(text) # using set we are identifying unique characters from the file
vocab_to_int = {c : i for i,c in enumerate(vocab)} # by using enumeration we are retrieving correspondig numbers with alph.
int_to_vocab = dict(enumerate(vocab))
char = np.array([vocab_to_int[c] for c in text],dtype = np.int32) # converting all letters to corresponding numerical value

In [44]:
text



In [4]:
vocab_to_int

{'P': 0,
 '-': 1,
 'Z': 2,
 'S': 3,
 'y': 4,
 '/': 5,
 ' ': 6,
 'θ': 7,
 'V': 8,
 '4': 9,
 '0': 10,
 '!': 11,
 ',': 12,
 'j': 13,
 'A': 14,
 ';': 15,
 'D': 16,
 't': 17,
 '’': 18,
 'g': 19,
 'T': 20,
 'w': 21,
 'l': 22,
 'd': 23,
 'n': 24,
 '1': 25,
 'F': 26,
 'f': 27,
 'X': 28,
 'R': 29,
 '+': 30,
 '$': 31,
 '”': 32,
 'z': 33,
 'C': 34,
 'k': 35,
 'c': 36,
 'o': 37,
 '(': 38,
 '8': 39,
 '.': 40,
 ':': 41,
 'W': 42,
 'a': 43,
 'N': 44,
 'i': 45,
 ')': 46,
 'B': 47,
 '=': 48,
 'E': 49,
 '%': 50,
 '×': 51,
 'U': 52,
 '?': 53,
 'Y': 54,
 'v': 55,
 ']': 56,
 '\n': 57,
 'Q': 58,
 'u': 59,
 '_': 60,
 'm': 61,
 'p': 62,
 '7': 63,
 '3': 64,
 '6': 65,
 'r': 66,
 'q': 67,
 'h': 68,
 '2': 69,
 'I': 70,
 'O': 71,
 '[': 72,
 '“': 73,
 'G': 74,
 'K': 75,
 'e': 76,
 '–': 77,
 's': 78,
 'x': 79,
 'H': 80,
 'J': 81,
 '"': 82,
 '—': 83,
 '9': 84,
 'M': 85,
 'L': 86,
 'b': 87,
 '5': 88}

In [5]:
char

array([42, 68, 76, ..., 76, 61, 40], dtype=int32)

In [6]:
# let's check first 100 words from file

text[:100]

'When most people hear “Machine Learning,” they picture a robot: a dependable butler or a deadly Term'

In [7]:
# let's check the corresponding numerical variable

char[:100]

array([42, 68, 76, 24,  6, 61, 37, 78, 17,  6, 62, 76, 37, 62, 22, 76,  6,
       68, 76, 43, 66,  6, 73, 85, 43, 36, 68, 45, 24, 76,  6, 86, 76, 43,
       66, 24, 45, 24, 19, 12, 32,  6, 17, 68, 76,  4,  6, 62, 45, 36, 17,
       59, 66, 76,  6, 43,  6, 66, 37, 87, 37, 17, 41,  6, 43,  6, 23, 76,
       62, 76, 24, 23, 43, 87, 22, 76,  6, 87, 59, 17, 22, 76, 66,  6, 37,
       66,  6, 43,  6, 23, 76, 43, 23, 22,  4,  6, 20, 76, 66, 61],
      dtype=int32)

In [8]:
# let's check the maximum number of class available inside file

np.max(char)+1

89

In [9]:
# creating function to split data

def split_data(char,batch_size,num_step,split_frac=0.9):
    
    slice_size = batch_size*num_step
    n_batches = int(len(char)/slice_size)
    
    
    x = char[:n_batches*slice_size]
    y = char[1:n_batches*slice_size+1]
    
    x = np.stack(np.split(x,batch_size))
    y = np.stack(np.split(y,batch_size))
    
    split_idx = int(n_batches*split_frac)
    train_x,train_y = x[:, :split_idx*num_step],y[:,:split_idx*num_step]
    test_x,test_y = x[:,split_idx*num_step:],y[:,split_idx*num_step:]
    
    return train_x,train_y,test_x,test_y

In [10]:
# splitting the dataset

train_x,train_y,test_x,test_y = split_data(char,10,50)

In [11]:
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
print(test_y.shape)

(10, 6400)
(10, 6400)
(10, 750)
(10, 750)


In [12]:
# let's check train dataset

train_x[:,:50]

array([[42, 68, 76, 24,  6, 61, 37, 78, 17,  6, 62, 76, 37, 62, 22, 76,
         6, 68, 76, 43, 66,  6, 73, 85, 43, 36, 68, 45, 24, 76,  6, 86,
        76, 43, 66, 24, 45, 24, 19, 12, 32,  6, 17, 68, 76,  4,  6, 62,
        45, 36],
       [76, 78,  6, 37, 27,  6, 17, 76, 36, 68, 24, 45, 67, 59, 76, 78,
         6, 17, 68, 43, 17,  6,  4, 37, 59,  6, 21, 37, 59, 22, 23,  6,
        59, 78, 76,  6, 27, 37, 66,  6, 76, 43, 36, 68,  6, 17, 43, 78,
        35, 40],
       [22, 76,  6, 78,  4, 78, 17, 76, 61,  6, 45, 78,  6, 27, 45, 24,
        76,  1, 17, 59, 24, 76, 23,  6, 59, 78, 45, 24, 19,  6, 78, 59,
        62, 76, 66, 55, 45, 78, 76, 23,  6, 22, 76, 43, 66, 24, 45, 24,
        19,  6],
       [68, 76,  6, 78,  4, 78, 17, 76, 61,  6, 21, 37, 59, 22, 23,  6,
        27, 22, 43, 19,  6, 43, 24,  6, 76, 61, 43, 45, 22,  6, 43, 78,
         6, 78, 62, 43, 61,  6, 45, 27,  6, 45, 17,  6, 68, 43, 78,  6,
        61, 43],
       [76, 78,  6, 78, 76, 76, 61,  6, 68, 43, 62, 62, 45, 76, 66, 

In [21]:
# creating model function

def build_rnn(num_classes,batch_size=50,num_step=50,lstm_size=128,num_layers=2,learning_rate = 0.001,
       grad_clip=5,sampling=False):
    
    if sampling==True:
        batch_size,num_step=1,1
        
    #tf.reset_default_graph()
    tf.reset_default_graph()
    #tf.disable_v2_behavior() 
    
    
    inputs = tf.placeholder(tf.int32,[batch_size,num_step],name = 'inputs')
    targets = tf.placeholder(tf.int32,[batch_size,num_step],name = 'targets')
    
    keep_prob = tf.placeholder(tf.float32,name = 'keep_prob')
    
    x_one_hot = tf.one_hot(inputs,num_classes)
    y_one_hot = tf.one_hot(targets,num_classes)
    
    # build LSTM
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    
    # dropout ratio
    drop = tf.contrib.rnn.DropoutWrapper(lstm,output_keep_prob = keep_prob)
    
    # creating multiple layer using multi layer
    cell = tf.contrib.rnn.MultiRNNCell([drop],num_layers)
    initial_state = cell.zero_state(batch_size,tf.float32)
    
    rnn_inputs = [tf.squeeze(i,squeeze_dims=[1]) for i in tf.split(x_one_hot,num_step,1)]
    
    outputs,state = tf.contrib.rnn.static_rnn(cell,rnn_inputs,initial_state = initial_state)
    final_state = state
    
    seq_output = tf.concat(outputs,axis = 1)
    output = tf.reshape(seq_output,[-1,lstm_size])
    
    with tf.variable_scope('softmax'):
        softmax_w = tf.Variable(tf.truncated_normal((lstm_size,num_classes),stddev = 0.1))
        softmax_b = tf.Variable(tf.zeros(num_classes))
        
    logits = tf.matmul(output,softmax_w) + softmax_b
    
    preds = tf.nn.softmax(logits,name = 'predictions')
    
    y_reshaped = tf.reshape(y_one_hot,[-1,num_classes])
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits,labels = y_reshaped)
    cost = tf.reduce_mean(loss)
    
    # optimizer
    tvars = tf.trainable_variables()
    grads,_ = tf.clip_by_global_norm(tf.gradients(cost,tvars),grad_clip)
    train_op = tf.train.AdamOptimizer(learning_rate)
    optimizer = train_op.apply_gradients(zip(grads,tvars))
    
    export_nodes = ['inputs','targets','initial_state','final_state','keep_prob','cost','preds','optimizer']
    Graph = namedtuple('Graph',export_nodes)
    local_dict = locals()
    graph = Graph(*[local_dict[each] for each in export_nodes])
    
    return graph

In [22]:
batch_size = 100
num_step = 100 
lstm_size = 512
num_layers = 2
learning_rate = 0.001
keep_prob = 10.5

In [23]:
!mkdir checkpoints

mkdir: checkpoints: File exists


In [31]:
epochs = 35
# Save every N iterations
save_every_n = 200
train_x, train_y, test_x, test_y = split_data(char, batch_size, num_step)

model = build_rnn(len(vocab), 
                  batch_size=batch_size,
                  num_step=num_step,
                  learning_rate=learning_rate,
                  lstm_size=lstm_size,
                  num_layers=num_layers)

saver = tf.train.Saver(max_to_keep=100)

In [32]:
def get_batch(arrs, num_step):
    batch_size, slice_size = arrs[0].shape
    
    n_batches = int(slice_size/num_step)
    for b in range(n_batches):
        yield [x[:, b*num_step: (b+1)*num_step] for x in arrs]

In [33]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    # Use the line below to load a checkpoint and resume training
    #saver.restore(sess, 'checkpoints/______.ckpt')
    
    n_batches = int(train_x.shape[1]/num_step)
    iterations = n_batches * epochs
    for e in range(epochs):
        
        # Train network
        new_state = sess.run(model.initial_state)
        loss = 0
        for b, (x, y) in enumerate(get_batch([train_x, train_y], num_step), 1):
            iteration = e*n_batches + b
            start = time.time()
            feed = {model.inputs: x,
                    model.targets: y,
                    model.keep_prob: keep_prob,
                    model.initial_state: new_state}
            batch_loss, new_state, _ = sess.run([model.cost, model.final_state, model.optimizer], 
                                                 feed_dict=feed)
            loss += batch_loss
            end = time.time()
            print('Epoch {}/{} '.format(e+1, epochs),
                  'Iteration {}/{}'.format(iteration, iterations),
                  'Training loss: {:.4f}'.format(loss/b),
                  '{:.4f} sec/batch'.format((end-start)))
        
            
            if (iteration%save_every_n == 0) or (iteration == iterations):
                # Check performance, notice dropout has been set to 1
                val_loss = []
                new_state = sess.run(model.initial_state)
                for x, y in get_batch([test_x, test_y], num_step):
                    feed = {model.inputs: x,
                            model.targets: y,
                            model.keep_prob: 1.,
                            model.initial_state: new_state}
                    batch_loss, new_state = sess.run([model.cost, model.final_state], feed_dict=feed)
                    val_loss.append(batch_loss)

                print('Validation loss:', np.mean(val_loss),
                      'Saving checkpoint!')
                saver.save(sess, "checkpoints/i{}_l{}_v{:.3f}.ckpt".format(iteration, lstm_size, np.mean(val_loss)))

Epoch 1/35  Iteration 1/210 Training loss: 4.4836 2.1586 sec/batch
Epoch 1/35  Iteration 2/210 Training loss: 4.4527 1.7658 sec/batch
Epoch 1/35  Iteration 3/210 Training loss: 4.3923 1.7703 sec/batch
Epoch 1/35  Iteration 4/210 Training loss: 4.2873 1.8190 sec/batch
Epoch 1/35  Iteration 5/210 Training loss: 4.0989 1.8583 sec/batch
Epoch 1/35  Iteration 6/210 Training loss: 3.9809 1.7911 sec/batch
Epoch 2/35  Iteration 7/210 Training loss: 3.5751 1.7600 sec/batch
Epoch 2/35  Iteration 8/210 Training loss: 3.4280 1.7937 sec/batch
Epoch 2/35  Iteration 9/210 Training loss: 3.3412 1.7612 sec/batch
Epoch 2/35  Iteration 10/210 Training loss: 3.2861 1.8183 sec/batch
Epoch 2/35  Iteration 11/210 Training loss: 3.2590 1.8558 sec/batch
Epoch 2/35  Iteration 12/210 Training loss: 3.2395 1.7824 sec/batch
Epoch 3/35  Iteration 13/210 Training loss: 3.2422 1.7786 sec/batch
Epoch 3/35  Iteration 14/210 Training loss: 3.1576 1.7723 sec/batch
Epoch 3/35  Iteration 15/210 Training loss: 3.1312 1.7708

In [34]:
def pick_top_n(preds, vocab_size, top_n=5):
    p = np.squeeze(preds)
    p[np.argsort(p)[:-top_n]] = 0
    p = p / np.sum(p)
    c = np.random.choice(vocab_size, 1, p=p)[0]
    return c

In [35]:
def sample(checkpoint, n_samples, lstm_size, vocab_size, prime="The "):
    samples = [c for c in prime]
    model = build_rnn(vocab_size, lstm_size=lstm_size, sampling=True)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, checkpoint)
        new_state = sess.run(model.initial_state)
        for c in prime:
            x = np.zeros((1, 1))
            x[0,0] = vocab_to_int[c]
            feed = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: new_state}
            preds, new_state = sess.run([model.preds, model.final_state], 
                                         feed_dict=feed)

        c = pick_top_n(preds, len(vocab))
        samples.append(int_to_vocab[c])

        for i in range(n_samples):
            x[0,0] = c
            feed = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: new_state}
            preds, new_state = sess.run([model.preds, model.final_state], 
                                         feed_dict=feed)

            c = pick_top_n(preds, len(vocab))
            samples.append(int_to_vocab[c])
        
    return ''.join(samples)

In [37]:
tf.train.get_checkpoint_state('checkpoints')

model_checkpoint_path: "checkpoints/i210_l512_v2.369.ckpt"
all_model_checkpoint_paths: "checkpoints/i200_l512_v2.381.ckpt"
all_model_checkpoint_paths: "checkpoints/i210_l512_v2.369.ckpt"

In [40]:
# Change the name of latest checkpoint accordingly
checkpoint = "checkpoints/i210_l512_v2.369.ckpt"
samp = sample(checkpoint,2000, lstm_size, len(vocab), prime="ok")
print(samp)

INFO:tensorflow:Restoring parameters from checkpoints/i210_l512_v2.369.ckpt
ok stisengentit ant estaistoonen ate tom lo tes ang inthe sot in the sotined ondith ion thandiling to the serat ing out tomprot thin te thes inserall tor ined thon sote tiat of welere the sicanenthat esterit anto th t mest in atee tes chille toretins ther int ous out it on an oo ply the tand tom leth inge tite ins thes tereste to chetarit oo ser issestis the the dat int tam te andinte ale sat on and to sere sice the d an at outing sichit are satang ar ondaly youstis ist ant in th the thes at ond ta t at te se cally th medate ta the the calil  of ing setating an tas ine there atine ane deandithendet ar on the  or ing ar inet in sing tes the  insuran in to the thoulis oususethat or in iondit and thin th ce s ine caster ce s at iou areall to s an tou ly ureangon tores alitingite areing teat aset eas ou ate th ing sat tore cat ingeas ont on wat ing itit set astiset oo lis an te the datisict iner asu te th ie anengo

#### Conclusion : Seems like prediction is not well need more training and generic one to train our model