In [1]:
import os

import numpy as np
import yaml
from tensorflow.keras.layers import Dense, Embedding, Input, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical


In [2]:
# Path to dataset. The CHAT_DATASET_DIR environment variable overrides the
# default so the notebook can run on machines without this exact folder.
dataset_path = os.environ.get("CHAT_DATASET_DIR", r"C:\chat dataset")

# Every utterance found in the dataset, flattened into one list of strings.
texts = []

# Loop through all YAML files in the folder. os.listdir() returns entries in
# arbitrary order, so sort them to keep the corpus order reproducible.
for file_name in sorted(os.listdir(dataset_path)):
    if not file_name.endswith((".yml", ".yaml")):
        continue
    file_path = os.path.join(dataset_path, file_name)
    with open(file_path, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f)
    # An empty YAML file parses to None and a file may hold a top-level
    # list or scalar; only mappings can carry a "conversations" key.
    if not isinstance(data, dict):
        continue
    # `or []` also covers a "conversations:" key with no value (None).
    for conv in data.get("conversations") or []:
        # A conversation written as a bare string would otherwise be
        # iterated character by character.
        lines = [conv] if isinstance(conv, str) else conv
        for line in lines:
            texts.append(str(line))


In [3]:
# Fit a word-level tokenizer on every collected utterance.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Vocabulary size is one more than the number of indexed words; this value
# is later used as the embedding input size and the softmax output size.
total_words = 1 + len(tokenizer.word_index)
print("Vocabulary size:", total_words)


Vocabulary size: 1909


In [4]:
# Build the training samples: for each tokenized utterance, every prefix of
# length >= 2 becomes one sample (all tokens but the last are the input,
# the last token is the word to predict).
input_sequences = []

# Tokenize the whole corpus in one call instead of once per line.
for token_list in tokenizer.texts_to_sequences(texts):
    for i in range(1, len(token_list)):
        input_sequences.append(token_list[:i+1])

# Fail with a clear message rather than max()'s "empty sequence" error.
if not input_sequences:
    raise ValueError(
        "No training sequences were generated; the dataset needs at least "
        "one utterance with two or more known words."
    )

# Pad sequences on the left so the target word is always the last column.
max_seq_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')

# Separate predictors and labels
X, y = input_sequences[:, :-1], input_sequences[:, -1]
# One-hot encode the target word over the whole vocabulary.
y = to_categorical(y, num_classes=total_words)

print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (7951, 57)
y shape: (7951, 1909)


In [5]:
# Next-word classifier: embed the padded prefix, summarize it with an LSTM,
# then score every vocabulary index with a softmax layer.
model = Sequential([
    # Declaring the input shape with an Input layer replaces Embedding's
    # `input_length` argument, which Keras 3 deprecates.
    Input(shape=(max_seq_len - 1,)),
    Embedding(total_words, 50),   # 50-dimensional word vectors
    LSTM(100),                    # 100 recurrent units
    Dense(total_words, activation='softmax'),
])

# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])




In [6]:
# Train for 50 epochs on the full sample set; the returned History object is
# kept so the per-epoch loss/accuracy can be inspected afterwards.
history = model.fit(x=X, y=y, epochs=50, verbose=1)
# Persist the trained network to an HDF5 (.h5) file.
model.save(filepath="next_word_model.h5")


Epoch 1/50
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 37ms/step - accuracy: 0.0376 - loss: 6.4482
Epoch 2/50
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 40ms/step - accuracy: 0.0550 - loss: 5.9481
Epoch 3/50
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 43ms/step - accuracy: 0.0848 - loss: 5.7513
Epoch 4/50
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 41ms/step - accuracy: 0.1114 - loss: 5.5345
Epoch 5/50
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 39ms/step - accuracy: 0.1261 - loss: 5.3075
Epoch 6/50
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 39ms/step - accuracy: 0.1426 - loss: 5.0972
Epoch 7/50
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 39ms/step - accuracy: 0.1580 - loss: 4.9030
Epoch 8/50
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 39ms/step - accuracy: 0.1787 - loss: 4.7258
Epoch 9/50
[1m249/249[



In [7]:
# Also save the trained model as a .keras file (the recommended format).
model.save("next_word_model.keras")



In [8]:
def predict_next_word(seed_text, tokenizer, model, max_seq_len):
    """Return the single most likely next word for ``seed_text``.

    Args:
        seed_text: Text to continue.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            the ``index_word`` reverse mapping.
        model: Trained model whose ``predict`` returns one score per
            vocabulary index for each input row.
        max_seq_len: Padded sequence length used during training; the
            model input is ``max_seq_len - 1`` tokens long.

    Returns:
        The predicted word, or ``None`` when the best-scoring index has no
        word in the vocabulary.
    """
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    # Left-pad (and left-truncate) to the input length the model was trained on.
    padded = pad_sequences([token_list], maxlen=max_seq_len-1, padding='pre')
    predicted = model.predict(padded, verbose=0)
    predicted_word_index = int(np.argmax(predicted, axis=1)[0])

    # index_word is the tokenizer's index -> word map, so a direct lookup
    # replaces a linear scan over word_index.
    return tokenizer.index_word.get(predicted_word_index)


In [9]:
# Quick check: ask the trained model for the most likely word after "how are".
seed_text = "how are"
next_word = predict_next_word(seed_text, tokenizer, model, max_seq_len)
print("Next word prediction:", next_word)


Next word prediction: you


In [10]:
def generate_text(seed_text, next_words, model, tokenizer, max_seq_len):
    """Greedily extend ``seed_text`` by ``next_words`` predicted words.

    On every step the text built so far is tokenized, left-padded to
    ``max_seq_len - 1`` tokens and fed to ``model``; the best-scoring index
    is mapped back to a word and appended after a single space. When that
    index has no word, an empty string is appended instead.

    Returns:
        The seed text followed by the generated words.
    """
    window = max_seq_len - 1
    text = seed_text
    for _ in range(next_words):
        encoded = tokenizer.texts_to_sequences([text])[0]
        model_input = pad_sequences([encoded], maxlen=window, padding='pre')
        scores = model.predict(model_input, verbose=0)
        best_index = np.argmax(scores)
        text = text + " " + tokenizer.index_word.get(best_index, '')
    return text

# Demo: extend the seed "how are" by five predicted words.
print(generate_text("how are", 5, model, tokenizer, max_seq_len))


how are you doing as i would


In [12]:
def predict_top_k(seed_text, model, tokenizer, max_seq_len, k=3):
    """Return up to ``k`` candidate next words, most likely first.

    Args:
        seed_text: Text to continue.
        model: Trained model whose ``predict`` returns one score per
            vocabulary index for each input row.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            the ``index_word`` reverse mapping.
        max_seq_len: Padded sequence length used during training; the
            model input is ``max_seq_len - 1`` tokens long.
        k: Maximum number of suggestions. Values <= 0 yield an empty list.

    Returns:
        A list of at most ``k`` words ordered by descending score. Indices
        with no entry in ``tokenizer.index_word`` are skipped and do not
        count towards ``k``.
    """
    # A slice of [-0:] would select the whole vocabulary, so reject k <= 0.
    if k <= 0:
        return []

    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    padded = pad_sequences([token_list], maxlen=max_seq_len-1, padding='pre')
    predictions = model.predict(padded, verbose=0)[0]

    # Walk the indices from best to worst and keep the first k that map to
    # a real word; filtering before the cut keeps the result at k words
    # even when an unmapped index ranks among the top scores.
    words = []
    for idx in np.argsort(predictions)[::-1]:
        word = tokenizer.index_word.get(int(idx))
        if word is None:
            continue
        words.append(word)
        if len(words) == k:
            break
    return words

# Demo: the three highest-scoring candidate words after "how are".
print(predict_top_k("how are", model, tokenizer, max_seq_len, k=3))


['you', 'your', 'not']


In [None]:
# Interactive suggestion loop: type a phrase to see the top-3 next-word
# suggestions; type "exit" or "quit" to stop.
while True:
    try:
        # Strip surrounding whitespace so "exit " is still recognized.
        seed_text = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # End of input or a kernel interrupt ends the session without a traceback.
        break
    if seed_text.lower() in ("exit", "quit"):
        break
    if not seed_text:
        # Nothing was typed; prompt again instead of predicting on empty input.
        continue
    suggestions = predict_top_k(seed_text, model, tokenizer, max_seq_len, k=3)
    print("Suggestions:", suggestions)


You:  hi 


Suggestions: ['how', 'on', 'nice']


You:  how are 


Suggestions: ['you', 'your', 'not']


You:  am 


Suggestions: ['software', 'i', 'not']


You:  am i 


Suggestions: ['am', "don't", 'can']


You:  hi 


Suggestions: ['how', 'on', 'nice']
