<a href="https://colab.research.google.com/github/nagaharikathota/dl-4/blob/main/dl_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

//Let's start by importing the necessary Python libraries and the dataset:

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense


In [5]:
# Read the entire corpus into memory as a single string.
with open('/content/Sherlock.txt', encoding='utf-8', mode='r') as corpus_file:
    text = corpus_file.read()


//Now let’s tokenize the text to create a sequence of words:

In [6]:
# Learn the word -> integer id vocabulary from the whole corpus at once.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# One slot more than the number of distinct words: padding below uses id 0,
# which is presumably never assigned to a real word — confirm against the
# Tokenizer documentation.
total_words = 1 + len(tokenizer.word_index)


//Now let’s create input-output pairs by splitting the text into sequences of tokens and forming n-grams from the sequences:

In [7]:
# Encode every line of the corpus, then expand each encoded line into all of its
# prefixes of length >= 2 (a one-token prefix would leave no context to learn from).
encoded_lines = tokenizer.texts_to_sequences(text.split('\n'))
input_sequences = [
    encoded[:end]
    for encoded in encoded_lines
    for end in range(2, len(encoded) + 1)
]



//Now let’s pad the input sequences to have equal length:

In [8]:
# The longest n-gram fixes the common width; shorter ones are padded on the left.
max_sequence_len = max(map(len, input_sequences))
input_sequences = np.array(
    pad_sequences(input_sequences, padding='pre', maxlen=max_sequence_len)
)


//Now let’s split the sequences into input and output:

In [9]:
# Every column except the last is the context; the last column is the word to predict.
X, y = input_sequences[:, :-1], input_sequences[:, -1]


//Now let’s convert the output to one-hot encoded vectors:

In [10]:
# One-hot encode the target word ids.
# The labels are read from the last column of the padded n-grams rather than
# from `y` itself, so re-running this cell yields the same array instead of
# one-hot encoding an already one-hot `y`.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

//Now let’s build a neural network architecture to train the model:

In [11]:
# Next-word classifier: embed each context token, run the sequence through an
# LSTM, then score every vocabulary entry with a softmax layer.
model = Sequential()
# Inputs are one token shorter than the padded n-grams because the last token
# of each n-gram is used as the label.
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(LSTM(150))
model.add(Dense(total_words, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the cell output.
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 16, 100)           846400    
                                                                 
 lstm (LSTM)                 (None, 150)               150600    
                                                                 
 dense (Dense)               (None, 8464)              1278064   
                                                                 
Total params: 2,275,064
Trainable params: 2,275,064
Non-trainable params: 0
_________________________________________________________________
None


//Now let’s compile and train the model:

In [12]:
# The targets are one-hot vectors, hence categorical (not sparse) cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Keep the returned History object: it carries the per-epoch metrics, and binding
# it to a name stops the cell from echoing a bare "<keras.callbacks.History ...>" repr.
history = model.fit(X, y, epochs=10, verbose=1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ee6bd1bd2a0>

//Now we can generate next-word predictions using our model:

In [16]:
# Greedy text generation: repeatedly feed the running text to the model and
# append the single most probable next word.
seed_text = "i am rohini"
next_words = 16

for _ in range(next_words):
    # Encode the text so far and left-pad it to the model's input width.
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # There is exactly one input row, so take the scalar class id of that row
    # instead of comparing ints against a 1-element array later on.
    # verbose=0 keeps predict() from printing a progress bar on every step.
    predicted = int(np.argmax(model.predict(token_list, verbose=0), axis=-1)[0])
    # index_word is the tokenizer's id -> word mapping, so this is a direct
    # lookup rather than a scan over the whole vocabulary. Ids without a word
    # fall back to "" exactly as the scan did when it found no match.
    output_word = tokenizer.index_word.get(predicted, "")
    seed_text += " " + output_word

print(seed_text)


i am rohini glad to give you a little good day said holmes laughing i shall not be able
