In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
tokenizer = Tokenizer()

In [7]:

with open("LSTM DATA.txt", "r", encoding="utf-8") as file:
    text = file.read()

tokenizer.fit_on_texts([text])


In [8]:
tokenizer.word_index

{'the': 1,
 'to': 2,
 'of': 3,
 'and': 4,
 'her': 5,
 'a': 6,
 'in': 7,
 'was': 8,
 'i': 9,
 '”': 10,
 'she': 11,
 'that': 12,
 'it': 13,
 'not': 14,
 'you': 15,
 'he': 16,
 'his': 17,
 'be': 18,
 'as': 19,
 'had': 20,
 'with': 21,
 'for': 22,
 'but': 23,
 'is': 24,
 'have': 25,
 'at': 26,
 'mr': 27,
 'him': 28,
 'on': 29,
 'by': 30,
 'my': 31,
 'all': 32,
 'elizabeth': 33,
 'so': 34,
 'they': 35,
 'which': 36,
 'were': 37,
 'been': 38,
 'could': 39,
 'from': 40,
 'very': 41,
 'this': 42,
 'no': 43,
 'would': 44,
 'me': 45,
 'what': 46,
 'their': 47,
 'your': 48,
 'them': 49,
 'will': 50,
 'said': 51,
 'or': 52,
 'such': 53,
 'darcy': 54,
 'are': 55,
 'an': 56,
 'when': 57,
 'there': 58,
 'do': 59,
 'if': 60,
 'mrs': 61,
 'more': 62,
 'much': 63,
 'am': 64,
 'must': 65,
 'any': 66,
 'bennet': 67,
 'miss': 68,
 'than': 69,
 '“i': 70,
 'who': 71,
 'one': 72,
 'jane': 73,
 'did': 74,
 'bingley': 75,
 'should': 76,
 'we': 77,
 'has': 78,
 'know': 79,
 'other': 80,
 'herself': 81,
 'though'

In [14]:
input_sequences = []
for sentence in text.split('\n'):
    tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0] 
    
    for i in range(1, len(tokenized_sentence)):
        n_gram_sequence = tokenized_sentence[:i+1]
        input_sequences.append(n_gram_sequence)


In [15]:
input_sequences

[[4535, 189],
 [4535, 189, 451],
 [4535, 189, 451, 1029],
 [4535, 189, 451, 1029, 3],
 [4535, 189, 451, 1029, 3, 304],
 [4535, 189, 451, 1029, 3, 304, 4],
 [4535, 189, 451, 1029, 3, 304, 4, 975],
 [42, 1029],
 [42, 1029, 24],
 [42, 1029, 24, 22],
 [42, 1029, 24, 22, 1],
 [42, 1029, 24, 22, 1, 515],
 [42, 1029, 24, 22, 1, 515, 3],
 [42, 1029, 24, 22, 1, 515, 3, 560],
 [42, 1029, 24, 22, 1, 515, 3, 560, 2720],
 [42, 1029, 24, 22, 1, 515, 3, 560, 2720, 7],
 [42, 1029, 24, 22, 1, 515, 3, 560, 2720, 7, 1],
 [42, 1029, 24, 22, 1, 515, 3, 560, 2720, 7, 1, 701],
 [42, 1029, 24, 22, 1, 515, 3, 560, 2720, 7, 1, 701, 779],
 [42, 1029, 24, 22, 1, 515, 3, 560, 2720, 7, 1, 701, 779, 4],
 [95, 80],
 [95, 80, 1453],
 [95, 80, 1453, 3],
 [95, 80, 1453, 3, 1],
 [95, 80, 1453, 3, 1, 217],
 [95, 80, 1453, 3, 1, 217, 26],
 [95, 80, 1453, 3, 1, 217, 26, 43],
 [95, 80, 1453, 3, 1, 217, 26, 43, 1613],
 [95, 80, 1453, 3, 1, 217, 26, 43, 1613, 4],
 [95, 80, 1453, 3, 1, 217, 26, 43, 1613, 4, 21],
 [95, 80, 1453,

In [19]:
max_len = max([len(x) for x in input_sequences])

In [23]:
#We take 0 in the front so that we can split the input output properly

from tensorflow.keras.preprocessing.sequence import pad_sequences

padded_sequences = pad_sequences(input_sequences,maxlen = max_len,padding='pre')
print(padded_sequences)

[[   0    0    0 ...    0 4535  189]
 [   0    0    0 ... 4535  189  451]
 [   0    0    0 ...  189  451 1029]
 ...
 [   0    0    0 ...    2  229  129]
 [   0    0    0 ...  229  129  574]
 [   0    0    0 ...  129  574 1805]]


In [24]:
X = padded_sequences[:,:-1]
Y = padded_sequences[:,-1]

In [25]:
print(X)

[[   0    0    0 ...    0    0 4535]
 [   0    0    0 ...    0 4535  189]
 [   0    0    0 ... 4535  189  451]
 ...
 [   0    0    0 ... 7562    2  229]
 [   0    0    0 ...    2  229  129]
 [   0    0    0 ...  229  129  574]]


In [26]:
print(Y)

[ 189  451 1029 ...  129  574 1805]


In [27]:
#We take this as a classification problem and not as a regression problem because if the model gives 2.1(e.g) then there is no word corresponding to 2.1 so we take multiclass classification

In [28]:
X.shape

(121111, 24)

In [29]:
Y.shape

(121111,)

In [32]:
len(tokenizer.word_index)

7562

In [37]:
import numpy as np
from tensorflow.keras.utils import to_categorical

# Assuming Y is already defined somewhere in your code
# Example: Y = np.array([...])  # Your actual data

# Check the maximum value in Y
max_value_in_Y = np.max(Y)
print(f"Maximum value in Y: {max_value_in_Y}")

# Set num_classes to the maximum value in Y + 1
num_classes = max_value_in_Y + 1
print(f"Using num_classes: {num_classes}")

# Convert Y to one-hot encoding
Y_categorical = to_categorical(Y, num_classes=num_classes)

# Print the shape to verify the conversion
print(f"Shape of one-hot encoded Y: {Y_categorical.shape}")

# Print the one-hot encoded array for verification
print(Y_categorical)


Maximum value in Y: 7562
Using num_classes: 7563
Shape of one-hot encoded Y: (121111, 7563)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [38]:
Y.shape

(121111,)

In [39]:
Y_categorical.shape

(121111, 7563)

In [40]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense

In [53]:
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=64, input_length=24))
model.add(LSTM(128))
model.add(Dense(num_classes, activation='softmax'))

In [54]:
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [55]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 24, 64)            640000    
                                                                 
 lstm_3 (LSTM)               (None, 128)               98816     
                                                                 
 dense_3 (Dense)             (None, 7563)              975627    
                                                                 
Total params: 1714443 (6.54 MB)
Trainable params: 1714443 (6.54 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
import numpy as np
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding

model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=64, input_length=24))
model.add(LSTM(128))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
     
model.fit(X, Y_categorical, epochs=50)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50