In [4]:
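# One-time dataset setup, left commented out so the notebook does not re-download on every run.
# Uncomment to fetch and extract the archive with the Kaggle CLI
# (presumably this is where human_chat.txt comes from — verify against the archive contents).
# !kaggle datasets download -d projjal1/human-conversation-training-data
# !unzip *.zip
# !rm -rf *.zip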

In [5]:
import tensorflow as tf

# Show which GPUs TensorFlow can see; an empty list means no GPU was detected.
gpu_devices = tf.config.list_physical_devices("GPU")
print(gpu_devices)

[]


In [6]:
# Read the whole chat transcript into a single string.
# The encoding is pinned so the text decodes the same way on every platform
# (open() otherwise falls back to the locale's default encoding).
# NOTE(review): assumes human_chat.txt is UTF-8 — confirm against the dataset.
with open("human_chat.txt", "r", encoding="utf-8") as f:
    corpus = f.read()

In [7]:
import contractions

# Strip the speaker labels so only the utterances themselves remain.
for speaker_tag in ("Human 1:", "Human 2:"):
    corpus = corpus.replace(speaker_tag, "")

# Lower-case the text, then pass it through contractions.fix.
corpus = contractions.fix(corpus.lower())

In [8]:
import nltk
import os
from nltk.tokenize import sent_tokenize  # public home of sent_tokenize (nltk.text only re-exports it)

cwd = os.curdir

# Download the Punkt sentence-tokenizer data into the working directory and
# register that directory on NLTK's search path; without this, NLTK only looks
# in its default data locations and may not find the freshly downloaded files.
nltk.download("punkt", download_dir=cwd)
if cwd not in nltk.data.path:
    nltk.data.path.append(cwd)

# Split the normalised corpus into sentences and preview the first few.
sentences = sent_tokenize(corpus)
sentences[:5]

[nltk_data] Downloading package punkt to ....
[nltk_data]   Package punkt is already up-to-date!


[' hi!',
 'what is your favorite holiday?',
 'one where i get to meet lots of different people.',
 'what was the most number of people you have ever met during a holiday?',
 'hard to keep a count.']

In [9]:
import re

# Remove every character that is not an ASCII letter or a space, then trim
# the leading/trailing whitespace left behind.
non_letter = re.compile(r"[^a-zA-Z ]")
sentences = [non_letter.sub("", sentence).strip() for sentence in sentences]
sentences[:5]

['hi',
 'what is your favorite holiday',
 'one where i get to meet lots of different people',
 'what was the most number of people you have ever met during a holiday',
 'hard to keep a count']

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [11]:
# Build the word -> integer-id vocabulary from the cleaned sentences.
tokenizer = Tokenizer()

tokenizer.fit_on_texts(sentences)

In [12]:
# Number of distinct words the tokenizer learned; later cells use
# unique_words + 1 as the number of classes / embedding rows.
unique_words = len(tokenizer.word_index)
unique_words

2723

In [13]:
# Length, in whitespace-separated words, of the longest cleaned sentence.
max_length = max(len(words) for words in map(str.split, sentences))
max_length

71

In [14]:
# Convert every sentence to token ids in one call, then expand each sentence
# into all of its prefixes of length >= 2 (the last id of each prefix becomes
# the prediction target in the next cell).
token_lists = tokenizer.texts_to_sequences(sentences)
n_gram_sequence = [
    token_list[:end]
    for token_list in token_lists
    for end in range(2, len(token_list) + 1)
]

In [15]:
# Split each n-gram into its context (all ids but the last) and its target
# (the final id).
contexts = [sequence[:-1] for sequence in n_gram_sequence]
y = [sequence[-1] for sequence in n_gram_sequence]

# Left-pad every context with zeros to one fixed width so they stack into a
# single 2-D array.
X = pad_sequences(contexts, maxlen=max_length + 1, padding="pre")

In [16]:
# Sanity check: show the first two padded contexts and their target token ids.
print(X[:2])
print(y[:2])

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 13]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 13  6]]
[6, 27]


In [17]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the target ids (unique_words + 1 classes, matching the
# model's output layer). This cell overwrites `y` with its own output, so the
# encoding is applied only while `y` is still the raw list of ids; re-running
# the cell then leaves the already-encoded array untouched instead of
# encoding it a second time.
if isinstance(y, list):
    y = to_categorical(y, num_classes=unique_words + 1)

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

# One extra slot on top of the learned words: the padded inputs use id 0,
# which is not one of the tokenizer's word ids.
vocab_size = unique_words + 1

# Embedding -> bidirectional LSTM -> softmax over the vocabulary.
model = Sequential(
    [
        Embedding(vocab_size, 100),
        Bidirectional(LSTM(150)),
        Dense(vocab_size, activation="softmax"),
    ]
)

model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# Train on the full set of (context, next word) pairs; no validation split is used.
history = model.fit(X, y, epochs=50, batch_size=32)

Epoch 1/50
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 108ms/step - accuracy: 0.0345 - loss: 6.5986
Epoch 2/50
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 103ms/step - accuracy: 0.1053 - loss: 5.6492
Epoch 3/50
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 114ms/step - accuracy: 0.1349 - loss: 5.2064
Epoch 4/50
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 110ms/step - accuracy: 0.1609 - loss: 4.8348
Epoch 5/50
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 113ms/step - accuracy: 0.1841 - loss: 4.5078
Epoch 6/50
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 109ms/step - accuracy: 0.2008 - loss: 4.2404
Epoch 7/50
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 109ms/step - accuracy: 0.2180 - loss: 3.9591
Epoch 8/50
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 103ms/step - accuracy: 0.2532 - loss: 3.6544
Epoch 9/50
[1m5

In [54]:
import numpy as np


def predict_next_words(input_text, top_k=3):
    """Return the most likely next words for ``input_text``.

    The text is converted to token ids with the fitted ``tokenizer``,
    left-padded to the same width used for training (``max_length + 1``) and
    passed through ``model``.

    Args:
        input_text: Prompt whose continuation should be predicted.
        top_k: Maximum number of candidate words to return (default 3, the
            number previously hard-coded).

    Returns:
        A list of up to ``top_k`` words ordered from most to least probable.
    """
    token_ids = tokenizer.texts_to_sequences([input_text])[0]
    padded = pad_sequences([token_ids], maxlen=max_length + 1, padding="pre")

    probabilities = model.predict(padded)[0]
    # Class indices ordered from highest to lowest probability.
    ranked_tokens = np.argsort(probabilities)[::-1]

    predicted_words = []
    for token in ranked_tokens:
        if len(predicted_words) >= top_k:
            break
        # Index 0 is the padding slot and has no entry in index_word; skip it
        # rather than raising KeyError when it ranks among the top classes.
        word = tokenizer.index_word.get(int(token))
        if word is not None:
            predicted_words.append(word)
    return predicted_words


In [55]:
# Quick check: candidate words to follow "i".
predict_next_words("i ")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step


['am', 'have', 'think']

In [62]:
# Greedy generation: ten times over, append the single most likely next word
# to the running text and feed the longer text back in.
out = "i have"
for step in range(10):
    next_word = predict_next_words(out)[0]
    out = out + " " + next_word

print(out)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
i have been super busy so i have not study much about


In [21]:
# Save the trained model to a .keras file; the reload cell further down reads
# this same path back with load_model.
model.save("predict_next_words.keras")

In [23]:
# Additionally write only the weights to a separate file.
# NOTE(review): nothing visible in this notebook reads model.weights.h5 back.
model.save_weights("model.weights.h5")

In [44]:
import joblib

# Serialise the fitted tokenizer so the same word <-> id mapping can be
# restored alongside the saved model.
# NOTE(review): max_length is not persisted here, yet predict_next_words needs
# it to pad inputs — a fresh session must recreate it some other way.
joblib.dump(tokenizer, "tokenizer")

['tokenizer']

In [45]:
import joblib
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# numpy and pad_sequences are imported here so this cell does not depend on
# imports made by earlier cells.
# NOTE(review): max_length is still read from the notebook's namespace — it is
# not stored with the tokenizer or the model, so it must be defined before
# predict_next_words is called in a fresh session.

# Load the tokenizer
# joblib.load unpickles the file, which can execute arbitrary code; only load
# a "tokenizer" file that this notebook itself produced.
tokenizer = joblib.load("tokenizer")

# Load the model
model = load_model("predict_next_words.keras")


# Function to predict next words
def predict_next_words(input_text, top_k=3):
    """Print ``input_text`` and return its most likely next words.

    The text is converted to token ids with the loaded ``tokenizer``,
    left-padded to ``max_length + 1`` ids and passed through the loaded
    ``model``.

    Args:
        input_text: Prompt whose continuation should be predicted.
        top_k: Maximum number of candidate words to return (default 3, the
            number previously hard-coded).

    Returns:
        A list of up to ``top_k`` words ordered from most to least probable.
    """
    token_ids = tokenizer.texts_to_sequences([input_text])[0]
    padded = pad_sequences([token_ids], maxlen=max_length + 1, padding="pre")

    probabilities = model.predict(padded)[0]
    # Class indices ordered from highest to lowest probability.
    ranked_tokens = np.argsort(probabilities)[::-1]

    predicted_words = []
    for token in ranked_tokens:
        if len(predicted_words) >= top_k:
            break
        # Index 0 is the padding slot and has no entry in index_word; skip it
        # rather than raising KeyError when it ranks among the top classes.
        word = tokenizer.index_word.get(int(token))
        if word is not None:
            predicted_words.append(word)
    print(input_text)
    return predicted_words


# Example usage
predicted_words = predict_next_words("what")
print(predicted_words)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 299ms/step
what
['is', 'are', 'about']
