In [None]:
!pip install tensorflow==2.15.0 pandas numpy scikit-learn tensorboard matplotlib scikeras nltk

Collecting tensorflow==2.15.0
  Downloading tensorflow-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Collecting ml-dtypes~=0.2.0 (from tensorflow==2.15.0)
  Downloading ml_dtypes-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting wrapt<1.15,>=1.11.0 (from tensorflow==2.15.0)
  Downloading wrapt-1.14.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting tensorboard
  Downloading tensorboard-2.15.2-py3-none-any.whl.metadata (1.7 kB)
Collecting tensorflow-estimator<2.16,>=2.15.0 (from tensorflow==2.15.0)
  Downloading tensorflow_estimator-2.15.0-py2.py3-none-any.whl.metadata (1.3 kB)
Collecting keras<2.16,>=2.15.0 (from tensorflow==2.15.0)
  Downloading keras-2.15.0-py3-none-any.whl.metadata (2.4 kB)
INFO: pip is looking at multiple versions of scikeras

In [None]:
# Fetch the Gutenberg corpus through NLTK and keep a local copy of Hamlet.
import nltk
import pandas as pd

nltk.download('gutenberg')
from nltk.corpus import gutenberg

# Raw text of the play as a single string.
data = gutenberg.raw('shakespeare-hamlet.txt')

# Write the text to disk so later cells can read it back from a file.
with open('hamlet.txt', 'w') as out_file:
  out_file.write(data)

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [None]:
# Read the saved play back in and lower-case it before tokenizing.
with open('hamlet.txt', 'r') as file:
  text = file.read()
text = text.lower()

# Build the word -> index vocabulary from the whole text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# One more than the number of distinct words
# (presumably to leave index 0 free for padding — TODO confirm).
total_words = 1 + len(tokenizer.word_index)
print(total_words)


In [None]:
# Expand every line of the text into its growing token prefixes
# (first 2 tokens, first 3 tokens, ... up to the whole line).
# Lines with fewer than two known tokens contribute nothing.
input_sequences = [
  token_ids[:end]
  for token_ids in (
    tokenizer.texts_to_sequences([line])[0] for line in text.split('\n')
  )
  for end in range(2, len(token_ids) + 1)
]


In [None]:
# The longest prefix determines the common width of the padded matrix.
max_sequence_len = max(len(seq) for seq in input_sequences)

# Pad at the front ('pre') so the final token of every row sits in the last column.
input_sequences = np.array(
  pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)
print(input_sequences)

In [None]:
import tensorflow as tf

# Predictors are every column except the last; the label is the last column.
x = input_sequences[:, :-1]
y = input_sequences[:, -1]


In [None]:
# Display the predictor matrix (all columns of the padded sequences except the last).
x

In [None]:
# Display the label vector (last column of the padded sequences).
y

In [None]:
# One-hot encode the next-word labels over the full vocabulary.
# The labels are re-derived from the last column of input_sequences instead of
# from y itself, so re-running this cell cannot one-hot encode an already
# encoded matrix.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

In [None]:
# Display the labels again, now that the preceding cell has passed them through to_categorical.
y

In [None]:
# Hold out 20% of the samples; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
  x,
  y,
  test_size=0.2,
  random_state=42,
)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout

# Next-word classifier: embedding -> two stacked LSTMs -> softmax over the vocabulary.
model = Sequential([
  # Input rows are one token shorter than the padded sequences (the last token is the label).
  Embedding(total_words, 100, input_length=max_sequence_len - 1),
  # return_sequences=True hands the full sequence to the second LSTM.
  LSTM(150, return_sequences=True),
  Dropout(0.2),
  LSTM(100),
  Dense(total_words, activation='softmax'),
])

model.compile(
  optimizer='adam',
  loss='categorical_crossentropy',
  metrics=['accuracy'],
)
model.summary()

In [None]:
# Train for 300 epochs, passing the held-out split as validation data.
history = model.fit(
  X_train,
  y_train,
  epochs=300,
  validation_data=(X_test, y_test),
  verbose=1,
)

In [None]:
def generate_text(seed_text, model, max_sequence_len):
  """Predict the single word most likely to follow ``seed_text``.

  Uses the notebook-level ``tokenizer`` to encode the seed and to map the
  predicted index back to a word.

  Args:
    seed_text: Text whose next word should be predicted.
    model: Object with a ``predict(batch, verbose=0)`` method returning one row
      of scores per input row.
    max_sequence_len: Padded sequence length including the target word; the
      seed is padded/truncated to ``max_sequence_len - 1`` tokens.

  Returns:
    The predicted word, or ``None`` when the predicted index has no entry in
    the tokenizer's vocabulary.
  """
  token_list = tokenizer.texts_to_sequences([seed_text])[0]
  padded = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
  predicted = model.predict(padded, verbose=0)
  # argmax over axis 1 yields a one-element array; extract the scalar so the
  # lookup does not depend on the truthiness of an array comparison.
  predicted_word_index = int(np.argmax(predicted, axis=1)[0])
  # index_word is the tokenizer's index -> word map, so a direct lookup
  # replaces a scan over every vocabulary entry.
  return tokenizer.index_word.get(predicted_word_index)

In [None]:
# Try the model on a short prompt.
sample_text = 'i like to'

# Recover the padded sequence length from the model: its input is one token shorter.
max_sequence_len = 1 + model.input_shape[1]

next_word = generate_text(sample_text, model, max_sequence_len)
print(next_word)

In [None]:
# Persist the trained model to disk.
# NOTE(review): the .h5 suffix presumably selects the HDF5 format — confirm this is the intended format.
model.save('hamlet_model.h5')


In [None]:
import pickle

# Serialize the fitted tokenizer so the same vocabulary can be reloaded later.
with open('tokenizer.pickle', 'wb') as pickle_file:
  pickle.dump(tokenizer, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)