# Coffee Shop Chatbot
This notebook trains a neural network model to classify user input into predefined intents.

# Imports and NLTK Setup

In [1]:
import json
import pickle

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential

# Resources

In [2]:
# Fetch the NLTK data packages this notebook needs; the recorded output shows
# the call is a no-op when a package is already up to date.
nltk.download('punkt_tab')  # tokenizer data; presumably what word_tokenize relies on - confirm
nltk.download('stopwords')  # stop-word lists backing nltk.corpus.stopwords

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Loading Data

In [3]:
# Load the intent definitions (tags plus example patterns) from disk.
# The encoding is given explicitly because JSON files are UTF-8, while open()
# otherwise falls back to the locale-dependent platform default, which can
# mis-decode non-ASCII characters (notably on Windows).
with open("./data/coffeshop_chat.json", 'r', encoding='utf-8') as f:
    intents=json.load(f)

# Preprocessing

In [4]:
# One shared Porter stemmer instance, reused by the vocabulary cleaning step
# and by bag_of_words so both reduce words to the same stems.
stemmer=PorterStemmer()

In [5]:
# Empty accumulators filled while walking the intents:
# every token seen, every intent tag, and (token list, tag) pairs.
words, tags, tokens_tag = [], [], []

- `words` - Stores all the words (tokens) from the patterns in the intents.
- `tags` - Stores the different tags (categories) from the intents.
- `tokens_tag` is a list of tuples, each containing a list of words (tokens) and the corresponding tag.
__Sample:__
```
 (['Thank', "'s", 'a', 'lot', '!'], 'thanks'),
 (['Which', 'items', 'do', 'you', 'have', '?'], 'items'),
 (['What', 'kinds', 'of', 'items', 'are', 'there', '?'], 'items'),
 (['What', 'do', 'you', 'sell', '?'], 'items'),
 (['Do', 'you', 'take', 'credit', 'cards', '?'], 'payments'),
```
`Thank's a lot!` is tokenized into `['Thank', "'s", 'a', 'lot', '!']`, and the corresponding tag is `thanks`.

In [8]:
# Start from empty accumulators so that re-running this cell does not append
# the same patterns and tags a second time (which would silently duplicate
# the training samples built from tokens_tag).
words, tags, tokens_tag = [], [], []

for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        # word_tokenize returns the list of tokens of one example sentence.
        pattern_tokens = word_tokenize(pattern)
        # extend() flattens the tokens into the global vocabulary candidates ...
        words.extend(pattern_tokens)
        # ... while the (tokens, tag) pair is kept intact as one training sample.
        tokens_tag.append((pattern_tokens, tag))

## append()
__Add a Single Element to the End of a List__
```python
my_list = [1, 2, 3]
my_list.append(4)
print(my_list)
# Output: [1, 2, 3, 4]
```
It keeps the new item as it is.
```python
# Appending another list as a single item
my_list.append([5, 6])
print(my_list)
# Output: [1, 2, 3, 4, [5, 6]]  # The entire list is added as one element
```
## extend()
__Add Multiple Elements (from an Iterable) to a List__
```python
my_list = [1, 2, 3]
my_list.extend([4, 5, 6])
print(my_list)
# Output: [1, 2, 3, 4, 5, 6]  # Each element of [4, 5, 6] is added individually
```
It treats the argument as an iterable and adds each of its elements individually.
```python
# Extending with a string
my_list.extend("abc")
print(my_list)
# Output: [1, 2, 3, 4, 5, 6, 'a', 'b', 'c']
```
## enumerate()
__Iterate Over a List (or Iterable) and Get Index Along With Element__
```python
my_list = ['apple', 'banana', 'cherry']

for idx, fruit in enumerate(my_list):
    print(idx, fruit)
```
You can also specify a different starting index:
```python
for idx, fruit in enumerate(my_list, start=1):
    print(idx, fruit)
```

# Cleaning 

In [11]:
# Punctuation and the possessive suffix carry no intent information, so they
# are dropped before the remaining tokens are reduced to their stems.
ignore_words = ['?', '!', '.', ',', '\'s']
kept_tokens = (token for token in words if token not in ignore_words)
words = [stemmer.stem(token) for token in kept_tokens]

## set()
It converts the list into a set, which automatically removes duplicate words. Without this step the vocabulary would contain repeated entries, which increases memory usage and adds redundant input features to the model.
## sorted()
It is used to order the words and tags alphabetically or numerically (depending on the content).

In [12]:
# Drop duplicates, then sort, so every word and tag gets one stable index.
words, tags = (sorted(set(items)) for items in (words, tags))

# Bag of Words Representation

In [13]:
def bag_of_words(tokens, words):
    """Encode one tokenized pattern as a binary bag-of-words vector.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single pattern. Each token is stemmed with the
        module-level ``stemmer`` before it is compared to the vocabulary.
    words : sequence of str
        Vocabulary; position ``i`` of the result corresponds to ``words[i]``.

    Returns
    -------
    numpy.ndarray
        ``float32`` vector of length ``len(words)`` holding 1.0 where the
        vocabulary word equals one of the stemmed tokens and 0.0 elsewhere.
    """
    # A set makes each membership test O(1) instead of scanning a list for
    # every vocabulary word.
    stemmed_tokens = {stemmer.stem(token) for token in tokens}

    bag = np.zeros(len(words), dtype=np.float32)
    for idx, word in enumerate(words):
        if word in stemmed_tokens:
            bag[idx] = 1.0

    return bag

- `tokens` - tokens of a specific pattern
- `words` - all the words (tokens) from the patterns in the intents.

# Training Data

In [15]:
# Vectorise every pattern with scikit-learn's CountVectorizer (imported in the
# notebook's import cell). The tokens are re-joined with spaces because the
# vectorizer expects raw strings and applies its own tokenisation.
# NOTE: the feature columns therefore follow `vectorizer`'s vocabulary, not
# the stemmed `words` list built earlier; the same fitted vectorizer is needed
# to encode user input at prediction time.
vectorizer = CountVectorizer()

patterns = [' '.join(tokens) for tokens, tag in tokens_tag]

# fit_transform learns the vocabulary and encodes the patterns in one pass.
X_train = vectorizer.fit_transform(patterns).toarray()

# Each label is the position of the pattern's tag in the sorted `tags` list.
y_train = np.array([tags.index(tag) for tokens, tag in tokens_tag])

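## Custom BoW
Alternative to the `CountVectorizer` cell above: it builds the features with `bag_of_words`, so each input column corresponds to an entry of `words`. The cell is currently commented out.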

In [168]:
# X_train=[]
# y_train=[]
# for (tokens, tag) in tokens_tag:
#     bow=bag_of_words(tokens, words)
#     X_train.append(bow)

#     y_train.append(tags.index(tag))

# X_train=np.array(X_train)
# y_train=np.array(y_train)

# Building the Neural Network Model

In [None]:
# Feed-forward intent classifier: two ReLU hidden layers, each followed by
# dropout, and a softmax output with one unit per tag.
model = Sequential()
# The input width is the number of feature columns in X_train.
model.add(Dense(128, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(tags), activation='softmax'))

# y_train holds integer tag indices (not one-hot vectors), hence the sparse
# variant of categorical cross-entropy.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=200, batch_size=8, verbose=1)
model.save("chatbot_model.keras")

with open('words.pkl', 'wb') as f:
    pickle.dump(words, f)

with open('tags.pkl', 'wb') as f:
    pickle.dump(tags, f)

# X_train was produced by the fitted CountVectorizer, so the model's input
# columns follow its vocabulary. Persist it as well; without it user input
# cannot be encoded into the same feature layout after the kernel is gone.
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

print("Model training complete!")

# Files
`.keras` and `.pkl` files are used to save trained models and associated data structures so that they can be reused later without needing to retrain the model or recreate the necessary data processing structures.
## `.keras`
It is used to save the entire trained model. This includes:
- The model architecture (layers, neurons, activation functions, etc.).
- The model weights (the learned parameters from training).
- The optimizer configuration (so the model can be reloaded and continued training if needed).
## `.pkl`
It is used to save Python objects to a file, which can be later loaded into memory.
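- `words.pkl` - Contains the sorted list of unique stemmed words built in the Cleaning step. Note that the model above is trained on `CountVectorizer` features, so this list matches the model's input columns only when the custom BoW cell is used to build `X_train`.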
- `tags.pkl` - Contains the list of unique tags (intents) that the model can classify.