<a href="https://colab.research.google.com/github/panimesh14/Natural-Language-Processing-NLP/blob/main/Character_Based_GRU_RNN_model_for_Text_Generation_using_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

GRU RNN based Text Generator Model

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as put
%matplotlib inline
import seaborn as sns
import tensorflow as tf

In [3]:
# Data Reading
bookfile_path="shakespeare.txt";#path to file
# Read the whole corpus inside a context manager so the file handle is closed
# as soon as the text has been loaded (the bare open(...).read() left it open).
with open(bookfile_path,'r') as _bookfile:
    booktext=_bookfile.read()
booktext[100:500]#arbitrary slice of the corpus, displayed as a sanity check

"ght never die,\n  But as the riper should by time decease,\n  His tender heir might bear his memory:\n  But thou contracted to thine own bright eyes,\n  Feed'st thy light's flame with self-substantial fuel,\n  Making a famine where abundance lies,\n  Thy self thy foe, to thy sweet self too cruel:\n  Thou that art now the world's fresh ornament,\n  And only herald to the gaudy spring,\n  Within thine own bu"

In [4]:
#Vocabulary: the sorted unique characters of the corpus (character-level, not word-level)
vocab=sorted(set(booktext));
len(vocab)

84

In [5]:

#Character <-> index lookup tables built from the sorted vocabulary
ind_to_char=dict(enumerate(vocab))#index -> character
char_to_ind={char:ind for ind,char in ind_to_char.items()}#character -> index (inverse of ind_to_char)
ind_to_char

{0: '\n',
 1: ' ',
 2: '!',
 3: '"',
 4: '&',
 5: "'",
 6: '(',
 7: ')',
 8: ',',
 9: '-',
 10: '.',
 11: '0',
 12: '1',
 13: '2',
 14: '3',
 15: '4',
 16: '5',
 17: '6',
 18: '7',
 19: '8',
 20: '9',
 21: ':',
 22: ';',
 23: '<',
 24: '>',
 25: '?',
 26: 'A',
 27: 'B',
 28: 'C',
 29: 'D',
 30: 'E',
 31: 'F',
 32: 'G',
 33: 'H',
 34: 'I',
 35: 'J',
 36: 'K',
 37: 'L',
 38: 'M',
 39: 'N',
 40: 'O',
 41: 'P',
 42: 'Q',
 43: 'R',
 44: 'S',
 45: 'T',
 46: 'U',
 47: 'V',
 48: 'W',
 49: 'X',
 50: 'Y',
 51: 'Z',
 52: '[',
 53: ']',
 54: '_',
 55: '`',
 56: 'a',
 57: 'b',
 58: 'c',
 59: 'd',
 60: 'e',
 61: 'f',
 62: 'g',
 63: 'h',
 64: 'i',
 65: 'j',
 66: 'k',
 67: 'l',
 68: 'm',
 69: 'n',
 70: 'o',
 71: 'p',
 72: 'q',
 73: 'r',
 74: 's',
 75: 't',
 76: 'u',
 77: 'v',
 78: 'w',
 79: 'x',
 80: 'y',
 81: 'z',
 82: '|',
 83: '}'}

In [6]:
char_to_ind#Character-to-index map (inverse of ind_to_char)

{'\n': 0,
 ' ': 1,
 '!': 2,
 '"': 3,
 '&': 4,
 "'": 5,
 '(': 6,
 ')': 7,
 ',': 8,
 '-': 9,
 '.': 10,
 '0': 11,
 '1': 12,
 '2': 13,
 '3': 14,
 '4': 15,
 '5': 16,
 '6': 17,
 '7': 18,
 '8': 19,
 '9': 20,
 ':': 21,
 ';': 22,
 '<': 23,
 '>': 24,
 '?': 25,
 'A': 26,
 'B': 27,
 'C': 28,
 'D': 29,
 'E': 30,
 'F': 31,
 'G': 32,
 'H': 33,
 'I': 34,
 'J': 35,
 'K': 36,
 'L': 37,
 'M': 38,
 'N': 39,
 'O': 40,
 'P': 41,
 'Q': 42,
 'R': 43,
 'S': 44,
 'T': 45,
 'U': 46,
 'V': 47,
 'W': 48,
 'X': 49,
 'Y': 50,
 'Z': 51,
 '[': 52,
 ']': 53,
 '_': 54,
 '`': 55,
 'a': 56,
 'b': 57,
 'c': 58,
 'd': 59,
 'e': 60,
 'f': 61,
 'g': 62,
 'h': 63,
 'i': 64,
 'j': 65,
 'k': 66,
 'l': 67,
 'm': 68,
 'n': 69,
 'o': 70,
 'p': 71,
 'q': 72,
 'r': 73,
 's': 74,
 't': 75,
 'u': 76,
 'v': 77,
 'w': 78,
 'x': 79,
 'y': 80,
 'z': 81,
 '|': 82,
 '}': 83}

In [7]:

#Encode the whole corpus as an array of integer character indices
encoding_text=np.array([char_to_ind[symbol] for symbol in booktext])
print(encoding_text.shape)
encoding_text
#~5.45 million encoded character tokens

(5445609,)


array([ 0,  1,  1, ..., 30, 39, 29])

In [8]:
#Sequence Length and batches
booktext[:170]
#the sequence length should be long enough to capture some structure (e.g. several lines of text) for prediction

"\n                     1\n  From fairest creatures we desire increase,\n  That thereby beauty's rose might never die,\n  But as the riper should by time decease,\n  His tender"

# Text Sequence Generation

In [9]:
seq_len=512;#input length in characters; chosen somewhat arbitrarily, but it spans several lines of the text shown above
num_seq=len(booktext)//(seq_len+1);#each chunk holds seq_len+1 characters: seq_len inputs plus one extra so the targets can be shifted by one
num_seq

10615

In [10]:

#Text Sequence Generation
#Stream the encoded characters one at a time, then group them into chunks of seq_len+1 characters
char_dataset = tf.data.Dataset.from_tensor_slices(encoding_text);
sequences=char_dataset.batch(seq_len+1,drop_remainder=True)#drop_remainder discards a final chunk shorter than seq_len+1

In [11]:

#Split one sequence chunk into a model input and its next-character target
def target_series_gen(seq):
    """Return the ``(input, target)`` pair for one sequence chunk.

    The input is ``seq`` without its last element and the target is ``seq``
    without its first element, so ``target[i]`` is the element that follows
    ``input[i]``. The target is what the loss compares the predictions against.
    """
    return seq[:-1],seq[1:]
#(X,y) tuple pair generation function: predict the next character at every position of a series

In [12]:

#Build the shuffled (input,target) training dataset from the sequence chunks
batch_size=1;
buffer_size=1000;
indexed_dataset=sequences.map(target_series_gen)#each chunk -> (input text, target text shifted by one character)
indexed_dataset=indexed_dataset.shuffle(buffer_size)#shuffle using a buffer of buffer_size chunks
indexed_dataset=indexed_dataset.batch(batch_size,drop_remainder=True)#group into batches of batch_size

In [13]:
#MODEL BUILDING
embed_dim=len(vocab);#embedding width set equal to the vocabulary size; a smaller width could be used and would be less computationally expensive
hidden_state=1024;#passed to text_model as the number of GRU units

In [14]:
#Loss wrapper passed to model.compile
#Keras calls the loss as loss(y_true,y_pred), so from_logits=True is fixed here in a small wrapper function
from tensorflow.keras.losses import sparse_categorical_crossentropy
def sparse_cat_crossentropy_loss(y_true,y_pred):
    """Sparse categorical cross-entropy that treats ``y_pred`` as raw logits.

    ``y_true`` holds integer class indices and ``y_pred`` holds unnormalised
    scores; ``from_logits=True`` is always passed to the Keras loss.
    """
    loss_values=sparse_categorical_crossentropy(y_true=y_true,y_pred=y_pred,from_logits=True)
    return loss_values

In [15]:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,GRU,Dense
def text_model(vocab_size,embed_dim,hidden_state,batch_size):
    """Build and compile the character-level GRU model.

    Args:
        vocab_size: Number of distinct characters; used as the embedding input
            size and as the width of the final Dense layer.
        embed_dim: Width of the embedding vectors.
        hidden_state: Number of GRU units.
        batch_size: Fixed batch size baked into the input shape (the GRU is
            created with ``stateful=True``).

    Returns:
        A compiled ``Sequential`` model: Embedding -> GRU -> Dense.
    """
    layer_stack=[
        Embedding(vocab_size,embed_dim,batch_input_shape=[batch_size,None]),
        GRU(hidden_state,return_sequences=True, stateful=True),
        Dense(vocab_size),#no activation, so the model outputs logits
    ]
    char_level_model=Sequential(layer_stack)
    #the custom loss wrapper is used because it applies from_logits=True
    char_level_model.compile(optimizer='adam',loss=sparse_cat_crossentropy_loss)
    return char_level_model

In [16]:
text_gen_model=text_model(len(vocab),embed_dim,hidden_state,batch_size)
text_gen_model.summary()
#~3.5 million trainable parameters

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (1, None, 84)             7056      
                                                                 
 gru (GRU)                   (1, None, 1024)           3409920   
                                                                 
 dense (Dense)               (1, None, 84)             86100     
                                                                 
Total params: 3,503,076
Trainable params: 3,503,076
Non-trainable params: 0
_________________________________________________________________


In [17]:
#Sanity check: run one batch through the untrained model and inspect the output shape
sample_size=1;
for input_sample_batch,output_sample_batch in indexed_dataset.take(sample_size):
    model_pred_sample_batch=text_gen_model(input_sample_batch);
model_pred_sample_batch.shape

TensorShape([1, 512, 84])

In [18]:
model_pred_sample_batch[0]#The model returns logits (unnormalised scores, not probabilities) for every character in vocab at every position of the sequence
#one vector of vocab_size logits per character position
#size therefore=seq_length x vocab_size

<tf.Tensor: shape=(512, 84), dtype=float32, numpy=
array([[ 0.00231476, -0.00675318,  0.00972144, ..., -0.00211171,
         0.00271326, -0.00448302],
       [ 0.00400221, -0.00912479,  0.01355366, ..., -0.00364586,
         0.00323259, -0.00636193],
       [ 0.0050957 , -0.00979108,  0.01504654, ..., -0.00455231,
         0.00311726, -0.0072171 ],
       ...,
       [-0.00297844,  0.00301474, -0.00456339, ...,  0.00336004,
         0.00949498, -0.0029708 ],
       [-0.00417688,  0.00088109, -0.01027511, ...,  0.00990327,
         0.00343948, -0.00214919],
       [ 0.00206069,  0.00189897,  0.00345223, ...,  0.00521082,
         0.00479259,  0.00538615]], dtype=float32)>

In [19]:
sample_indices=tf.random.categorical(model_pred_sample_batch[0],num_samples=1)#num_samples=1 draws one character index per position
#tf.random.categorical samples from the distribution defined by the logits; it does not take the most probable character (argmax)
sample_indices

<tf.Tensor: shape=(512, 1), dtype=int64, numpy=
array([[15],
       [18],
       [15],
       [73],
       [57],
       [ 4],
       [21],
       [79],
       [72],
       [44],
       [ 3],
       [30],
       [73],
       [60],
       [81],
       [22],
       [21],
       [45],
       [17],
       [54],
       [26],
       [35],
       [58],
       [80],
       [17],
       [ 1],
       [13],
       [69],
       [63],
       [31],
       [73],
       [11],
       [52],
       [ 3],
       [ 6],
       [49],
       [57],
       [ 3],
       [55],
       [55],
       [58],
       [30],
       [52],
       [35],
       [16],
       [29],
       [70],
       [59],
       [24],
       [ 2],
       [17],
       [80],
       [65],
       [82],
       [14],
       [78],
       [57],
       [10],
       [10],
       [61],
       [31],
       [83],
       [ 6],
       [59],
       [38],
       [46],
       [41],
       [25],
       [82],
       [ 8],
       [73],
       [37],
       [66],
   

In [20]:
sample_indices=tf.squeeze(sample_indices,axis=-1).numpy()
#Drop the trailing length-1 axis and convert to a NumPy array
#NOTE: this overwrites sample_indices, so re-run the sampling cell before re-running this one

In [21]:
sample_indices[:50]#first 50 sampled character indices

array([15, 18, 15, 73, 57,  4, 21, 79, 72, 44,  3, 30, 73, 60, 81, 22, 21,
       45, 17, 54, 26, 35, 58, 80, 17,  1, 13, 69, 63, 31, 73, 11, 52,  3,
        6, 49, 57,  3, 55, 55, 58, 30, 52, 35, 16, 29, 70, 59, 24,  2])

In [22]:
sample_character=[ind_to_char[ind] for ind in sample_indices]
#convert each sampled index back to its character
#the model is still untrained at this point, so the joined text is essentially random
"".join(sample_character)
#as stated, the untrained model gives random output

'474rb&:xqS"Erez;:T6_AJcy6 2nhFr0["(Xb"``cE[J5Dod>!6yj|3wb..fF}(dMUP?|,rLkZGRdOv"z-7\nTis_fu]h(!2u22&m68hjC6VG&6\n ":6B;>H>o.R.<|`iq_o?!>6>7[-(owr:X4BpVp-W-WBq)b!Gqw2T!_;t`G.wy7z};p[UlC45:kE(c,_UlfPy97>j"TP03IcCXPiEU(4rIQpH(NZagPsb.HW,_jXJ69ejCZS3knl;_Go>WuIen_J?R(cYlC;ET\n2QpK`TsU).yqF.f\n,T|Nc"LW\'qIO1`q_<9.iwBCx![0}H\'4PJUeixtS4rylgcI tTi(xyOSi5|>kX[CiiHwP;1>H\nW\'[}RKkubXaD|lXl\'P,500Lq>1;Y8&W6>\n7S5s;No;u]GoKn(?k3b:SeU<foc-,\'PvQ>Td<kKF,]RmO1!Ob"-;KFeCH?[8hdM<!PP03\nj):B1}AKY-4 \nHPIr33xb2GM\'"UqQ821ecIAePfX}Y1xRuFN]'

In [23]:
#MODEL TRAINING
text_gen_model.fit(indexed_dataset,epochs=6)#6 passes over the shuffled (input,target) dataset

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7fd76640f710>

In [24]:
#Text generation/MODEL PREDICTION
def generate_text(model,start_seed_text,generation_size=500,temp=1.0):
    """Generate text one character at a time from a trained character-level model.

    The seed is fed through the model once to prime its recurrent state; after
    that, each sampled character is fed back as the next single-character input.

    Args:
        model: Model exposing ``reset_states()`` that maps a ``(1, timesteps)``
            tensor of character indices to ``(1, timesteps, vocab)`` logits.
        start_seed_text: Non-empty string used to prime the model. Every
            character must be a key of the module-level ``char_to_ind`` map.
        generation_size: Number of characters to generate after the seed.
        temp: Sampling temperature, must be > 0. The logits are divided by it,
            so values below 1 sharpen the distribution and values above 1
            flatten it.

    Returns:
        The seed text followed by the generated characters.

    Raises:
        ValueError: If ``temp`` is not positive or ``start_seed_text`` is empty.
        KeyError: If the seed contains a character missing from ``char_to_ind``.
    """
    # Validate up front: a zero/negative temperature would divide the logits by
    # zero or invert the distribution, and an empty seed gives the model no input.
    if not temp>0:
        raise ValueError("temp must be a positive number, got {!r}".format(temp))
    if not start_seed_text:
        raise ValueError("start_seed_text must contain at least one character")

    # Encode the seed and add the batch axis -> shape (1, len(seed)).
    input_eval=tf.expand_dims([char_to_ind[c] for c in start_seed_text],0)

    text_generated=[]
    model.reset_states()#start from a clean recurrent state
    for _ in range(generation_size):
        predictions=model(input_eval)
        predictions=tf.squeeze(predictions,axis=0)#drop the batch axis -> (timesteps, vocab)
        predictions=predictions/temp#temperature scaling of the logits
        # Only the last timestep predicts the next character, so sample from that row alone.
        predicted_id=tf.random.categorical(predictions[-1:],num_samples=1)[0,0].numpy()
        input_eval=tf.expand_dims([predicted_id],0)#the sampled character becomes the next input, shape (1, 1)
        text_generated.append(ind_to_char[predicted_id])
    return start_seed_text+"".join(text_generated)

In [25]:
#text_gen_model.build(tf.TensorShape([1,None]))
#(rebuild left disabled: the model was already created with batch_size=1, matching the single seed sequence fed in by generate_text)
start_seed_text="Romeo";
print(generate_text(text_gen_model,start_seed_text,generation_size=1000,temp=1.0));

Romeogs. So speak, the veignants
    Any inso bead-norter bestrayown took like owed,
    Thristy his daughter, for attemsats
    Rosering ius'd against the replorious
      fail, I will diserse swift abrears, for validitions
    fame stop our wish! And do any can the chokerabs:
    come but me anone, that 'twent not.
  Man. A what take his but one ungrace case me lorg,
    purenetaditary, bright.
  LAUNCE. Sprill, I will not ass welcome.
                             Wence Anto
  round Atemnon
  VALENT I thereriver, to CRIS, and your suret wound, presently
       Undersplain, their exit play for a cold direction gorder, for my lord
  S. For the bed-shall do worth
    in oney-   ho, i' for timptre ha! why, so, I thank you his illy too meet.
  SHEPHERD. And would you are frt.
    And the Kench charlereted to Rome, nelder
  THERMONTERSICEA. By; with othelstand
       Enge MROM. O, Liven, sir, their alms

  ATHANGLANT


         ceptoratur; an oteens here Il, already
  APTIMACL beN COMMARDA

The text generator's weights can be improved further by training the model for more epochs. Even with only 6 epochs of training, the results show that the model is capable of learning effectively.