In [None]:
import json
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

In [None]:
import sys

# Record the interpreter and TensorFlow versions used for this run.
print(f'python version {sys.version}')
print(f'tensorflow version {tf.__version__}')


python version 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0]
tensorflow version 2.17.0


In [None]:
# Read the intents definition file into a Python dict.
# The path is written as a raw string: '\.' is not a valid escape sequence, so the
# plain literal triggers an invalid-escape warning on newer Python versions. The raw
# string yields exactly the same runtime value (backslash included).
# NOTE(review): the backslash in the file name looks unintentional - confirm whether
# the file on disk is really called 'intents\.json' or simply 'intents.json'.
with open(r'intents\.json', encoding='utf-8') as file:
    data = json.load(file)

In [None]:
#data

In [None]:
# Flatten the list stored under the 'intents' key into a DataFrame (one row per intent record).
df = pd.json_normalize(data['intents'])
# Preview the first rows.
df.head()

Unnamed: 0,tag,patterns,responses,context
0,google,"[google, search, internet]",[Redirecting to Google...],
1,greeting,"[Hi there, How are you, Is anyone there?, Hey,...","[Hello, Good to see you again, Hi there, how c...",[]
2,goodbye,"[Bye, See you later, Goodbye, Get lost, Till n...","[See you!, Have a nice day, Bye! Come back aga...",[]
3,thanks,"[Thanks, Thank you, That's helpful, Awesome, t...","[Happy to help!, Any time!, My pleasure]",[]
4,noanswer,[],"[Sorry, can't understand you, Please give me m...",[]


In [None]:
# One row per list element: first for every pattern, then for every response.
df_patterns, df_responses = (
    df.explode(list_col) for list_col in ('patterns', 'responses')
)

# Inner-join on the tag so each pattern is paired with each response of the same intent.
# Columns present on both sides receive the default '_x' / '_y' suffixes.
df_final = pd.merge(df_patterns, df_responses, on='tag')

df_final

Unnamed: 0,tag,patterns_x,responses_x,context_x,patterns_y,responses_y,context_y
0,google,google,[Redirecting to Google...],,"[google, search, internet]",Redirecting to Google...,
1,google,search,[Redirecting to Google...],,"[google, search, internet]",Redirecting to Google...,
2,google,internet,[Redirecting to Google...],,"[google, search, internet]",Redirecting to Google...,
3,greeting,Hi there,"[Hello, Good to see you again, Hi there, how c...",[],"[Hi there, How are you, Is anyone there?, Hey,...",Hello,[]
4,greeting,Hi there,"[Hello, Good to see you again, Hi there, how c...",[],"[Hi there, How are you, Is anyone there?, Hey,...",Good to see you again,[]
...,...,...,...,...,...,...,...
205,riddle,Riddle,[What two things can you never eat for breakfa...,[riddles],"[Ask me a riddle, Ask me a question, Riddle]",What 5-letter word becomes shorter when you a...,[riddles]
206,riddle,Riddle,[What two things can you never eat for breakfa...,[riddles],"[Ask me a riddle, Ask me a question, Riddle]",Why can't a bike stand on it's own?.....It is ...,[riddles]
207,age,how old are you,"[I was made in 2020, if that's what you are as...",,"[how old are you, when were you made, what is ...","I was made in 2020, if that's what you are ask...",
208,age,when were you made,"[I was made in 2020, if that's what you are as...",,"[how old are you, when were you made, what is ...","I was made in 2020, if that's what you are ask...",


In [None]:
# Keep only the columns used downstream: the tag, the exploded pattern text and the
# exploded response text. Selecting (instead of dropping in place) makes the cell safe
# to re-run: the in-place drop raised KeyError on a second execution because the
# columns were already gone. .copy() gives an independent frame for later in-place edits.
df_final = df_final.loc[:, ['tag', 'patterns_x', 'responses_y']].copy()


In [None]:
# Inspect the trimmed frame produced by the previous cell (tag, pattern and response columns).
df_final

Unnamed: 0,tag,patterns_x,responses_y
0,google,google,Redirecting to Google...
1,google,search,Redirecting to Google...
2,google,internet,Redirecting to Google...
3,greeting,Hi there,Hello
4,greeting,Hi there,Good to see you again
...,...,...,...
205,riddle,Riddle,What 5-letter word becomes shorter when you a...
206,riddle,Riddle,Why can't a bike stand on it's own?.....It is ...
207,age,how old are you,"I was made in 2020, if that's what you are ask..."
208,age,when were you made,"I was made in 2020, if that's what you are ask..."


In [None]:
# Number of distinct intent tags in the merged frame.
df_final['tag'].nunique()

31

In [None]:
# Work on a copy so the string tags in df_final stay available after label encoding.
df_encoding = df_final.copy()

In [None]:
# Map every string tag to an integer class id.
# The encoder learns its classes from df_final (string tags); the ids are written
# into the df_encoding copy, leaving df_final untouched.
lbl_encoder = LabelEncoder()
df_encoding['tag'] = lbl_encoder.fit_transform(df_final['tag'])

In [None]:
# Inspect the encoded frame: 'tag' now holds integer class ids instead of strings.
df_encoding

Unnamed: 0,tag,patterns_x,responses_y
0,10,google,Redirecting to Google...
1,10,search,Redirecting to Google...
2,10,internet,Redirecting to Google...
3,11,Hi there,Hello
4,11,Hi there,Good to see you again
...,...,...,...
205,24,Riddle,What 5-letter word becomes shorter when you a...
206,24,Riddle,Why can't a bike stand on it's own?.....It is ...
207,2,how old are you,"I was made in 2020, if that's what you are ask..."
208,2,when were you made,"I was made in 2020, if that's what you are ask..."


In [None]:
# Column dtypes and non-null counts; used here to spot rows with a missing pattern.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   tag          210 non-null    object
 1   patterns_x   207 non-null    object
 2   responses_y  210 non-null    object
dtypes: object(3)
memory usage: 5.0+ KB


In [None]:
# Remove rows containing missing values from the string-tag frame (intents without
# any example pattern end up with a missing 'patterns_x' after the explode/merge).
df_final = df_final.dropna()

# Check for missing values in 'patterns_x'.
# This is deliberately the last expression of the cell: a bare expression in the
# middle of a cell is evaluated and thrown away, so the count was never shown.
# df_encoding is cleaned in the next cell, so this reports the rows about to be removed.
df_encoding['patterns_x'].isna().sum()

In [None]:
# Remove the rows with missing values from the encoded copy as well, matching the dropna applied to df_final.
df_encoding.dropna(inplace = True)

# Tokenization process

In [None]:
# Text-processing hyper-parameters.
vocab_size, embedding_dim, max_len = 1000, 16, 20
oov_token = "<OOV>"

# Build the vocabulary from the training patterns; words unknown at inference
# time are mapped to the OOV token.
tokenizer = Tokenizer(oov_token=oov_token, num_words=vocab_size)
tokenizer.fit_on_texts(df_encoding['patterns_x'])
word_index = tokenizer.word_index

# Convert every pattern to a list of word ids, then force a fixed length of
# max_len (over-long sequences are cut at the end).
sequences = tokenizer.texts_to_sequences(df_encoding['patterns_x'])
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_len,
    truncating='post',
)


# Train The Neural Network

In [None]:
# Size of the softmax output layer: one unit per class id the label encoder can emit.
# The encoder was fitted on df_final *before* rows with missing patterns were dropped,
# so counting unique tags in the cleaned frame can under-count; the previous
# "nunique() + 1" only compensated for exactly one dropped tag. Taking the size from
# the encoder guarantees every label (and every argmax index passed to
# inverse_transform) is in range.
num_classes = len(lbl_encoder.classes_)
# Number of passes over the training data.
epochs = 500

In [None]:
# Intent classifier: word embeddings averaged over the sequence, two small hidden
# layers, and a softmax over the intent classes.
# The input shape is declared with keras.Input instead of Embedding(input_length=...):
# that argument is deprecated in the Keras version bundled with recent TensorFlow
# releases, and without a declared input the model stays unbuilt until fit(), so
# summary() cannot show layer shapes or parameter counts.
model = Sequential([
    keras.Input(shape=(max_len,)),             # padded word-id sequences
    Embedding(vocab_size, embedding_dim),      # word id -> dense vector
    GlobalAveragePooling1D(),                  # average the vectors over the sequence
    Dense(16, activation='relu'),              # hidden layer 1
    Dense(16, activation='relu'),              # hidden layer 2
    Dense(num_classes, activation='softmax'),  # probability per intent class
])



In [None]:
# Integer class ids are used as targets, hence the sparse variant of cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()


None


In [None]:
# Train on the padded pattern sequences with the integer-encoded tags as targets;
# the returned History object keeps the per-epoch metrics.
history = model.fit(
    x=padded_sequences,
    y=df_encoding['tag'],
    epochs=epochs,
)

Epoch 1/500
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.0619 - loss: 3.4311
Epoch 2/500
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.1178 - loss: 3.4161 
Epoch 3/500
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.1244 - loss: 3.3934  
Epoch 4/500
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.1299 - loss: 3.3683 
Epoch 5/500
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.1148 - loss: 3.3477 
Epoch 6/500
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.1439 - loss: 3.2866 
Epoch 7/500
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.1248 - loss: 3.2604 
Epoch 8/500
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.1380 - loss: 3.1969 
Epoch 9/500
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

In [None]:
import pickle

# Persist the trained network.
model.save("chat_model.h5")

# Persist the fitted preprocessing objects (tokenizer and label encoder) so the
# exact same text -> ids and id -> tag mappings can be restored at inference time.
for pickle_path, fitted_obj in (
    ('tokenizer.pickle', tokenizer),
    ('label_encoder.pickle', lbl_encoder),
):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(fitted_obj, handle, protocol=pickle.HIGHEST_PROTOCOL)




# Load The Model and Start Generating Texts

In [None]:
import numpy as np
from tensorflow.keras.models import load_model

In [None]:
# pickle is imported here so that this loading section can be run on its own;
# otherwise the only import lives in the cell that saves the artifacts.
import pickle

# load the saved tokenizer , label encoder and the model
# NOTE: pickle.load can execute arbitrary code while deserialising - only load
# pickle files that were produced by this notebook or another trusted source.
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

with open('label_encoder.pickle', 'rb') as ecn_file:
    lbl_encoder = pickle.load(ecn_file)

model = load_model("chat_model.h5")



In [None]:
# Sanity check: show the unpickled tokenizer object.
print(tokenizer)

<keras.src.legacy.preprocessing.text.Tokenizer object at 0x7a24cf5dd750>


In [None]:
# Sanity check: show the unpickled label encoder.
print(lbl_encoder)

LabelEncoder()


In [None]:
# Sanity check: show the model restored from disk.
print(model)

<Sequential name=sequential, built=True>


.

In [None]:
import numpy as np
import random
from tensorflow.keras.preprocessing.sequence import pad_sequences

def generate_responses(user_input, max_len=20):
    """Classify a user message and return the predicted intent tag.

    The message is tokenized with the module-level ``tokenizer``, padded or
    truncated to ``max_len`` ids, scored by the module-level ``model``, and the
    highest-probability class index is mapped back to its tag via ``lbl_encoder``.

    Note that, despite its name, this function returns the *tag*, not a response
    text. Picking a reply for the tag (for example a random choice among the
    responses stored for that tag) is left to the caller.

    Args:
        user_input: The raw message text.
        max_len: Sequence length fed to the model. Must match the length used
            when the model was trained (20 in this notebook).

    Returns:
        The predicted intent tag.
    """
    # Convert input into padded sequence using tokenizer
    sequence = tokenizer.texts_to_sequences([user_input])
    padded_sequence = pad_sequences(sequence, truncating='post', maxlen=max_len)

    # Predict the class with the highest probability.
    # verbose=0 suppresses the per-call progress bar, which otherwise gets
    # interleaved with the chat transcript on every message.
    predictions = model.predict(padded_sequence, verbose=0)
    class_index = np.argmax(predictions[0])

    # Map class index to tag
    tag = lbl_encoder.inverse_transform([class_index])[0]

    return tag


In [None]:
# df_final[df_final['tag'] == 'greeting']

In [None]:
# Smoke test: classify a gibberish message and show the tag the model assigns to it.
message_categorization = generate_responses("nijfebfubifbee")
print(message_categorization)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Unknown Message


In [None]:
import random
def chat():
    """Run an interactive console loop that classifies each typed message.

    Every non-empty line read from the user is passed to ``generate_responses``
    and the result is printed. The loop ends when the user types "quit"
    (case-insensitive, surrounding whitespace ignored), or when input ends or
    the kernel is interrupted.
    """
    print("Start Talking with the bot(type quit to stop!")
    while True:
        try:
            inp = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # End the session quietly instead of surfacing a traceback.
            print()
            break

        if inp.lower() == "quit":
            break
        if not inp:
            # Nothing to classify; ask again rather than scoring an empty message.
            continue

        results = generate_responses(inp)

        print("Bot: ", results)

In [None]:
# Start the interactive session (blocks on input() until "quit" is typed).
chat()

Start Talking with the bot(type quit to stop!
You: hi
Bot:  Hello
You: tell a joke
Bot:  I own the world's worst thesaurus. Not only is it awful, it's awful.
You: what are you doing
Bot:  Talking to you, of course!
You: quit
