In [2]:
import tensorflow as tf
import numpy as np
import os
import time

In [3]:
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                              batch_input_shape=[batch_size, None]),
    tf.keras.layers.GRU(rnn_units,
                        return_sequences=True,
                        stateful=True,
                        recurrent_initializer='glorot_uniform'),
    tf.keras.layers.Dense(vocab_size)
  ])
  return model

def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

 
def generate_text(model, start_string):
  # Evaluation step (generating text using the learned model)

  # Number of characters to generate
  num_generate = 1000

  # Converting our start string to numbers (vectorizing)
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Empty string to store our results
  text_generated = []

  # Low temperatures results in more predictable text.
  # Higher temperatures results in more surprising text.
  # Experiment to find the best setting.
  temperature = 1.0

  # Here batch size == 1
  model.reset_states()
  for i in range(num_generate):
      predictions = model(input_eval)
      # remove the batch dimension
      predictions = tf.squeeze(predictions, 0)

      # using a categorical distribution to predict the character returned by the model
      predictions = predictions / temperature
      predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

      # We pass the predicted character as the next input to the model
      # along with the previous hidden state
      input_eval = tf.expand_dims([predicted_id], 0)

      text_generated.append(idx2char[predicted_id])

  return (start_string + ''.join(text_generated))

In [4]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 
                                       'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
# Read, then decode for py2 compat.
text = open(path_to_file, 'rb').read().decode(encoding='utf-8')
with open("/Users/pankaj/dev/git/smu/nlp337/sanskrit/geeta.txt") as f:
    lines = f.readlines()

text = "".join([l for l in lines if l.strip()!=''])

# length of text is the number of characters in it
print ('Length of text: {} characters'.format(len(text)))
vocab = sorted(set(text))
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)
text_as_int = np.array([char2idx[c] for c in text])
# The maximum length sentence we want for a single input in characters
seq_length = 30
examples_per_epoch = len(text)//(seq_length+1)
# Create training examples / targets
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)
dataset = sequences.map(split_input_target)

Length of text: 71239 characters


In [5]:
from nltk import word_tokenize

In [6]:
words = word_tokenize(text)

In [16]:
import pandas as pd
pd.DataFrame(Counter(words))

ValueError: If using all scalar values, you must pass an index

In [11]:
sorted(Counter(words), key = lambda x:x.)

['orलो',
 'orवियुक्तैस्तु',
 'orसंशयम्',
 'var',
 'अकर्मणश्च',
 'अकीर्तिं',
 'अक्षरं',
 'अक्षरब्रह्मयोगः',
 'अक्षरब्रह्मयोगो',
 'अक्षराणामकारोऽस्मि',
 'अग्निर्ज्योतिरहः',
 'अघायुरिन्द्रियारामो',
 'अच्छेद्योऽयमदाह्योऽयमक्लेद्योऽशोष्य',
 'अजानता',
 'अजो',
 'अजोऽपि',
 'अज्ञश्चाश्रद्दधानश्च',
 'अज्ञानं',
 'अज्ञानेनावृतं',
 'अत',
 'अतत्त्वार्थवदल्पं',
 'अतोऽस्मि',
 'अत्येति',
 'अत्र',
 'अथ',
 'अथवा',
 'अथाष्टादशोऽध्यायः',
 'अथैकादशोऽध्यायः',
 'अथैतदप्यशक्तोऽसि',
 'अदृष्टपूर्वं',
 'अदेशकाले',
 'अद्वेष्टा',
 'अधर्मं',
 'अधर्माभिभवात्कृष्ण',
 'अधश्च',
 'अधश्चोर्ध्वं',
 'अधिभूतं',
 'अधियज्ञः',
 'अधियज्ञोऽहमेवात्र',
 'अधिष्ठानं',
 'अधिष्ठाय',
 'अधो',
 'अध्यात्मज्ञाननित्यत्वं',
 'अध्यात्मनित्या',
 'अध्यात्मविद्या',
 'अध्येष्यते',
 'अनन्त',
 'अनन्तविजयं',
 'अनन्तवीर्यामितविक्रमस्त्वं',
 'अनन्तश्चास्मि',
 'अनन्यचेताः',
 'अनन्याश्चिन्तयन्तो',
 'अनन्येनैव',
 'अनपेक्षः',
 'अनात्मनस्तु',
 'अनादित्वान्निर्गुणत्वात्परमात्मायमव्ययः',
 'अनादिमत्परं',
 'अनादिमध्यान्तमनन्तवीर्य-',
 'अनार्यजुष्टमस्वर्ग्यमकीर्

In [17]:
c = Counter(words)

In [20]:
import operator
x = c
sorted_x = sorted(x.items(), key=operator.itemgetter(1), reverse=True)

In [21]:
sorted_x

[('।', 783),
 ('॥', 724),
 ('च', 186),
 ('न', 158),
 ('स', 62),
 ('मे', 52),
 ('मां', 44),
 ('ते', 40),
 ('हि', 38),
 ('पार्थ', 34),
 ('उवाच', 33),
 ('कर्म', 33),
 ('श्रीभगवानुवाच', 28),
 ('तु', 24),
 ('कौन्तेय', 24),
 ('अथ', 23),
 ('ये', 23),
 ('अर्जुन', 22),
 ('भारत', 22),
 ('ॐ', 20),
 ('चैव', 20),
 ('यः', 20),
 ('एव', 19),
 ('यो', 19),
 ('वा', 18),
 ('तत्सदिति', 18),
 ('श्रीमद्भगवद्गीतासूपनिषत्सु', 18),
 ('ब्रह्मविद्यायां', 18),
 ('योगशास्त्रे', 18),
 ('मम', 17),
 ('श्रीकृष्णार्जुनसंवादे', 17),
 ('नाम', 16),
 ('परं', 16),
 ('तथा', 15),
 ('ब्रह्म', 15),
 ('ज्ञानं', 15),
 ('महाबाहो', 14),
 ('भूतानि', 14),
 ('विद्धि', 14),
 ('योगी', 14),
 ('तं', 13),
 ('त्वां', 13),
 ('कर्माणि', 13),
 ('मयि', 13),
 ('मया', 13),
 ('इति', 12),
 ('यथा', 12),
 ('यदा', 12),
 ('किं', 11),
 ('वेत्ति', 11),
 ('ततो', 11),
 ('यान्ति', 11),
 ('परमं', 11),
 ('सञ्जय', 10),
 ('सर्वे', 10),
 ('मनः', 10),
 ('परन्तप', 10),
 ('त्वं', 10),
 ('श\u200dृणु', 10),
 ('प्रकृतिं', 10),
 ('पश्यति', 10),
 ('भवति', 10),
 ('विद्यते

In [90]:
len(words)

9026

In [91]:
len(vocab)

80

In [8]:
from collections import Counter

In [22]:
x1 = Counter(text)
sorted_x1 = sorted(x1.items(), key=operator.itemgetter(1), reverse=True)
sorted_x1

[(' ', 8774),
 ('्', 7885),
 ('ा', 4190),
 ('त', 4074),
 ('र', 2982),
 ('य', 2877),
 ('म', 2775),
 ('ि', 2694),
 ('न', 2653),
 ('व', 2348),
 ('स', 2102),
 ('\n', 1651),
 ('े', 1573),
 ('॥', 1443),
 ('ं', 1403),
 ('द', 1384),
 ('क', 1371),
 ('प', 1367),
 ('ु', 1362),
 ('ो', 1106),
 ('ः', 940),
 ('श', 892),
 ('च', 838),
 ('।', 783),
 ('ष', 766),
 ('ह', 742),
 ('भ', 719),
 ('-', 714),
 ('ज', 710),
 ('१', 699),
 ('ग', 653),
 ('ध', 584),
 ('ी', 462),
 ('ण', 454),
 ('ृ', 409),
 ('थ', 364),
 ('२', 324),
 ('ल', 323),
 ('ू', 293),
 ('ब', 271),
 ('ञ', 270),
 ('ै', 260),
 ('३', 246),
 ('ऽ', 243),
 ('अ', 217),
 ('४', 200),
 ('८', 174),
 ('६', 162),
 ('५', 148),
 ('७', 141),
 ('ख', 118),
 ('ट', 114),
 ('ङ', 109),
 ('ौ', 105),
 ('०', 105),
 ('९', 98),
 ('छ', 79),
 ('ए', 73),
 ('उ', 70),
 ('इ', 60),
 ('ठ', 49),
 ('आ', 47),
 ('फ', 35),
 ('ढ', 33),
 ('घ', 33),
 ('ड', 29),
 ('ॐ', 21),
 ('\u200d', 14),
 ('r', 5),
 ('ई', 5),
 ('o', 4),
 ('ॄ', 4),
 ('ँ', 3),
 ('ऋ', 3),
 ('ऊ', 3),
 ('v', 1),
 ('a', 1),
 ('ओ

In [93]:
Counter(text)

Counter({'श': 892,
         '्': 7885,
         'र': 2982,
         'ी': 462,
         'म': 2775,
         'द': 1384,
         'भ': 719,
         'ग': 653,
         'व': 2348,
         'त': 4074,
         'ा': 4190,
         '\n': 1651,
         '॥': 1443,
         ' ': 8774,
         'ॐ': 21,
         'प': 1367,
         'न': 2653,
         'े': 1573,
         'ः': 940,
         'अ': 217,
         'थ': 364,
         'ो': 1106,
         'ऽ': 243,
         'ध': 584,
         'य': 2877,
         '।': 783,
         'ज': 710,
         'ु': 1362,
         'ि': 2694,
         'ष': 766,
         'ृ': 409,
         'ट': 114,
         'उ': 70,
         'च': 838,
         'क': 1371,
         'स': 2102,
         'ण': 454,
         'ड': 29,
         'ै': 260,
         'ञ': 270,
         '१': 699,
         '-': 714,
         'ं': 1403,
         'ू': 293,
         'ढ': 33,
         'आ': 47,
         'ङ': 109,
         'ब': 271,
         '२': 324,
         'ह': 742,
         '३': 246,
         '४': 2

In [94]:
# Batch size
BATCH_SIZE = 64

BATCH_SIZE = 16


# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

# The embedding dimension
#embedding_dim = 16

# Number of RNN units
#rnn_units = 32


In [95]:
model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)
model.summary()
model.compile(optimizer='adam', loss=loss)


Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (16, None, 256)           20480     
_________________________________________________________________
gru_10 (GRU)                 (16, None, 1024)          3938304   
_________________________________________________________________
dense_10 (Dense)             (16, None, 80)            82000     
Total params: 4,040,784
Trainable params: 4,040,784
Non-trainable params: 0
_________________________________________________________________


In [96]:
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(16, 30, 80) # (batch_size, sequence_length, vocab_size)


In [97]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()
vexample_batch_loss  = loss(target_example_batch, example_batch_predictions)
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [98]:
EPOCHS=10
#EPOCHS=5
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])


Train for 143 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [99]:
tf.train.latest_checkpoint(checkpoint_dir)


'./training_checkpoints/ckpt_10'

In [100]:
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

model.build(tf.TensorShape([1, None]))
model.summary()


Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (1, None, 256)            20480     
_________________________________________________________________
gru_11 (GRU)                 (1, None, 1024)           3938304   
_________________________________________________________________
dense_11 (Dense)             (1, None, 80)             82000     
Total params: 4,040,784
Trainable params: 4,040,784
Non-trainable params: 0
_________________________________________________________________


In [101]:
#print(generate_text(model, start_string=u"ROMEO: "))

print(generate_text(model, start_string=u"श्रीभगवानुवाच"))


श्रीभगवानुवाच ।
संन्यस्य श्रभमः ॥ ८-१६॥
ब्राह्मसंस्पुश्वतेः कर्मणन्निवम् ।
लःभक्तुं पे च निवधामहम् ॥ ९-२८॥
यज्ञे देवार्थि पाण्डव ।मनम् ।
तेनामिभार्ति शघारि ॥ १६-२८॥
आथायतस्तपश्च लभयेन पाण्डव ॥ ११-१४॥
दैवत्यापौ नैव त्वकर्मणि ततः स्थितं तव ।
ते प्रविष्दी ॥ २-२६॥
तथैव चैर्त्रज्ञयोर्ज्ञात्वा मां प्रलाय मच ।
धर्मां शत्रु कर्मसंन्यासयोगयुक्तः स एवाहं सूत श्रीभगवानुवाच ।
क्षेत्रं यथवेदशूत्रविमानिगच्छति ॥ सामदेतवः ।
महानां बलं भर्तस्तृति भारत ।
तेऽपि मामजिष्ठोनामपि चैव प्रमुखैः पार्थ रूपसि ।
ईश्चित्य बुद्धिः सर्वं च सर्वशः षथयम् ।
वशूतात्मश्नुत्कर्षणम् ।
प्रमापव्यं बहविदश्रैताः स्तापयोतिष्ठा न पुनः सर्वं दशूमं मत्तरे ॥ १८-२७॥
गृह्णीमाद्धारं सर्वकर्माणं न विधागयोगः
        श्रदभूयाभियक्षर्षव ॥ १३-१५॥
अपार्थ नैर्गद्वेतवस्तुबस्ति ।
अहते ॥ १८-२०॥
ये त्वन्नमपिन्तमसो जनाः ॥ ८-१४॥
अथेतदेवरा परप्तरम् ॥ ११-२४॥
द्वौ भावेता भवार्जुन ।
अप्रतानां मयो बुद्धिरपैव मदर्यं हृददसःमिक्षिभिः ॥ ११-३९॥
कक्ष्ट्रयस्यदावपि ज्ञानयुञ्चति निध्यते ॥ १४-१७॥
अहं तथापरां शार्तं च भूततश्च दीयते च ॥ १४-१७॥
सर्वकर्माणि च महाबाहो

In [102]:
model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)

In [103]:
optimizer = tf.keras.optimizers.Adam()


In [104]:
@tf.function

def train_step(inp, target):
  with tf.GradientTape() as tape:
    predictions = model(inp)
    loss = tf.reduce_mean(
        tf.keras.losses.sparse_categorical_crossentropy(
            target, predictions, from_logits=True))
  grads = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(grads, model.trainable_variables))

  return loss

In [105]:
# Training step
EPOCHS = 10

for epoch in range(EPOCHS):
  start = time.time()

  # initializing the hidden state at the start of every epoch
  # initally hidden is None
  hidden = model.reset_states()

  for (batch_n, (inp, target)) in enumerate(dataset):
    loss = train_step(inp, target)

    if batch_n % 100 == 0:
      template = 'Epoch {} Batch {} Loss {}'
      print(template.format(epoch+1, batch_n, loss))

  # saving (checkpoint) the model every 5 epochs
  if (epoch + 1) % 5 == 0:
    model.save_weights(checkpoint_prefix.format(epoch=epoch))

  print ('Epoch {} Loss {:.4f}'.format(epoch+1, loss))
  print ('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

model.save_weights(checkpoint_prefix.format(epoch=epoch))

Epoch 1 Batch 0 Loss 4.381718158721924
Epoch 1 Batch 100 Loss 2.4497358798980713
Epoch 1 Loss 2.3139
Time taken for 1 epoch 54.01252794265747 sec

Epoch 2 Batch 0 Loss 2.354511022567749
Epoch 2 Batch 100 Loss 2.4325413703918457
Epoch 2 Loss 2.2089
Time taken for 1 epoch 51.287450313568115 sec

Epoch 3 Batch 0 Loss 2.1450743675231934
Epoch 3 Batch 100 Loss 1.9633461236953735
Epoch 3 Loss 2.0679
Time taken for 1 epoch 52.35612607002258 sec

Epoch 4 Batch 0 Loss 1.890282392501831
Epoch 4 Batch 100 Loss 1.9342198371887207
Epoch 4 Loss 1.9022
Time taken for 1 epoch 54.84479212760925 sec

Epoch 5 Batch 0 Loss 1.8364064693450928
Epoch 5 Batch 100 Loss 1.7047028541564941
Epoch 5 Loss 1.8721
Time taken for 1 epoch 56.429481983184814 sec

Epoch 6 Batch 0 Loss 1.6121739149093628
Epoch 6 Batch 100 Loss 1.7303071022033691
Epoch 6 Loss 1.4901
Time taken for 1 epoch 49.392452239990234 sec

Epoch 7 Batch 0 Loss 1.6311205625534058
Epoch 7 Batch 100 Loss 1.4953564405441284
Epoch 7 Loss 1.3061
Time taken

Epoch 8 Batch 0 Loss 1.2487035989761353
Epoch 8 Batch 100 Loss 1.529293179512024
Epoch 8 Loss 1.5698
Time taken for 1 epoch 49.09016799926758 sec

Epoch 9 Batch 0 Loss 1.126464605331421
Epoch 9 Batch 100 Loss 1.3225820064544678
Epoch 9 Loss 1.2992
Time taken for 1 epoch 51.88028407096863 sec

Epoch 10 Batch 0 Loss 1.0157569646835327
Epoch 10 Batch 100 Loss 1.0376957654953003
Epoch 10 Loss 1.1655
Time taken for 1 epoch 51.3341600894928 sec

