# Initialization

## -- Import Dependencies

In [1]:
# ------------------- Imports ------------------- #

# Standard library imports
import json
import random
import re
import string

# Third-party library imports
import jieba
import numpy as np
import spacy
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense, Dropout, Embedding, Input, LSTM, Masking
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical




## -- Initialization

In [17]:
# ------------------- Configuration ------------------- #
EMBEDDING_DIM = 300
HIDDEN_SIZE = 512
MAX_LEN = 20

# Load Dataset

In [18]:
# Load intents
with open('intents.json') as file:
    data = json.load(file)

# Prepare data
texts = []
labels = []

for intent in data['intents']:
    for pattern in intent['patterns']:
        texts.append(pattern)
        labels.append(intent['tag'])
        
# Summary
print(f"Number of texts: {len(texts)}")
print(f"Number of labels: {len(labels)}")
print(f"Unique labels: {set(labels)}")
print(f"Sample texts: {texts[:5]}")
print(f"Sample labels: {labels[:5]}")

Number of texts: 406
Number of labels: 406
Unique labels: {'syllabus', 'greeting', 'floors', 'facilities', 'sports', 'vacation', 'sem', 'uniform', 'hours', 'name', 'hod', 'extchod', 'ithod', 'library', 'location', 'canteen', 'fees', 'scholarship', 'swear', 'course', 'computerhod', 'infrastructure', 'random', 'salutaion', 'hostel', 'principal', 'placement', 'admission', 'event', 'creator', 'document', 'ragging', 'committee', 'menu', 'task', 'number', 'college intake', 'goodbye'}
Sample texts: ['Hi', 'Hi', 'How are you?', 'Is anyone there?', 'Hello']
Sample labels: ['greeting', 'greeting', 'greeting', 'greeting', 'greeting']


# Data Preprocessing

## -- Common Preprocessing

In [19]:

def preprocess_text_per_word(text):
    words = text.split()
    processed_words = [word.lower() for word in words]
    processed_words = [word.strip() for word in processed_words]
    processed_words = [re.sub('\s+',' ', word) for word in processed_words]
    processed_words = [word for word in processed_words if not all(char in string.punctuation for char in word.replace(' ',''))]
    
    processed_words = ' '.join(processed_words)
    return processed_words

# Preprocess texts
preprocessed_texts = [preprocess_text_per_word(text) for text in texts]

filtered_texts = []
skipped_indices = []
for i, word in enumerate(preprocessed_texts):
    if word.strip() != '':
        filtered_texts.append(word)
    else:
        skipped_indices.append(i)
        
preprocessed_texts = filtered_texts
labels = [label for i, label in enumerate(labels) if i not in skipped_indices]


# summary
print("===== data summary =====")
print(f"Original number of texts: {len(texts)}")
print(f"Number of texts after preprocessing: {len(preprocessed_texts)}")
print(f"Number of skipped texts: {len(skipped_indices)}")
print("Sample before preprocessing:", texts[:5])
print("Sample after preprocessing:", preprocessed_texts[:5])
print()
print("===== label summary =====")
print(f"Original number of labels: {len(labels) + len(skipped_indices)}")
print(f"Number of labels after preprocessing: {len(labels)}")
print("Sample labels before preprocessing:", labels[:5])


===== data summary =====
Original number of texts: 406
Number of texts after preprocessing: 405
Number of skipped texts: 1
Sample before preprocessing: ['Hi', 'Hi', 'How are you?', 'Is anyone there?', 'Hello']
Sample after preprocessing: ['hi', 'hi', 'how are you?', 'is anyone there?', 'hello']

===== label summary =====
Original number of labels: 406
Number of labels after preprocessing: 405
Sample labels before preprocessing: ['greeting', 'greeting', 'greeting', 'greeting', 'greeting']


## -- Semantic Tagging

In [20]:
nlp = spacy.load("en_core_web_lg")

In [21]:
distinct_tags = set()

def apply_ner_tags(text):
    doc = nlp(text)
    
    for ent in doc.ents:
        distinct_tags.add(ent.label_)
        
    tagged_tokens = []
    for token in doc:
        replaced = False
        for ent in doc.ents:
            if token.text == ent.text:
                tagged_tokens.append(f"<{ent.label_}>")
                replaced = True
                break
        if not replaced:
            tagged_tokens.append(token.text)
    return " ".join(tagged_tokens)

tagged_texts = [apply_ner_tags(text) for text in preprocessed_texts]

# Summary of distinct tags and tagged texts
print(f"Distinct NER tags: {distinct_tags}")
print(f"Sample tagged texts: {tagged_texts[:5]}")
print(f"Length of tagged texts: {len(tagged_texts)}")
print(f"Length of distinct tags: {len(distinct_tags)}")


Distinct NER tags: {'ORG', 'TIME', 'DATE', 'ORDINAL', 'PERSON'}
Sample tagged texts: ['hi', 'hi', 'how are you ?', 'is anyone there ?', 'hello']
Length of tagged texts: 405
Length of distinct tags: 5


## -- Embedding

In [None]:
# Load GloVe embeddings
class GloVeEmbeddings:
    def __init__(self, embedding_path, tokenizer):
        self.word2vec = {}
        with open(embedding_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split()
                word = parts[0]
                vec = np.array(parts[1:], dtype=np.float32)
                self.word2vec[word] = vec

        # Create embedding matrix
        vocab_size = len(tokenizer.word_index) + 1
        self.embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
        for word, i in tokenizer.word_index.items():
            vec = self.word2vec.get(word)
            if vec is not None:
                self.embedding_matrix[i] = vec

    def get_embedding_matrix(self):
        return self.embedding_matrix

# Example usage
glove_path = "glove.6B.300d.txt"  # Path to GloVe file
tokenizer = Tokenizer(oov_token="<OOV>")  # Assuming tokenizer is already defined
tokenizer.fit_on_texts(tagged_texts)  # Fit tokenizer on preprocessed texts

glove = GloVeEmbeddings(glove_path, tokenizer)
embedding_matrix = glove.get_embedding_matrix()

# Summary
print("tokenization")
print("Word Index:")
print(tokenizer.word_index)

sequences = tokenizer.texts_to_sequences(texts)
print("\nSequences:")
print(sequences)
print()

print(f"Embedding matrix shape: {embedding_matrix.shape}")


Embedding matrix shape: (277, 300)


## -- Feature and Label Preparation

In [23]:
# Convert texts to sequences
X = tokenizer.texts_to_sequences(tagged_texts)
X = pad_sequences(X, maxlen=MAX_LEN)

# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)
y = to_categorical(y, num_classes=len(label_encoder.classes_))

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Summary
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")
print(f"Classes: {label_encoder.classes_}")
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")

Shape of X: (405, 20)
Shape of y: (405, 38)
Classes: ['admission' 'canteen' 'college intake' 'committee' 'computerhod' 'course'
 'creator' 'document' 'event' 'extchod' 'facilities' 'fees' 'floors'
 'goodbye' 'greeting' 'hod' 'hostel' 'hours' 'infrastructure' 'ithod'
 'library' 'location' 'menu' 'name' 'number' 'placement' 'principal'
 'ragging' 'random' 'salutaion' 'scholarship' 'sem' 'sports' 'swear'
 'syllabus' 'task' 'uniform' 'vacation']
Shape of X_train: (324, 20)
Shape of X_test: (81, 20)
Shape of y_train: (324, 38)
Shape of y_test: (81, 38)


# Model Development

## -- Architecture

In [None]:
model = Sequential()

# masking layer
model.add(Masking(mask_value=0., input_shape=(MAX_LEN,)))

# Embedding layer with pre-trained weights
model.add(Embedding(input_dim=embedding_matrix.shape[0],
                    output_dim=embedding_matrix.shape[1],  # EMBEDDING_DIM
                    weights=[embedding_matrix],
                    input_length=MAX_LEN,
                    trainable=False))

# LSTM layers
model.add(LSTM(512, return_sequences=True))
model.add(LSTM(256, return_sequences=True))
model.add(LSTM(128, return_sequences=False))

# Additional Layers
model.add(Dropout(0.3))
model.add(Dense(512))
model.add(Dropout(0.3))
model.add(Dense(256))
model.add(Dropout(0.3))
model.add(Dense(128))
model.add(Dense(y_train.shape[1], activation='softmax'))

# Compile
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking (Masking)           (None, 20)                0         
                                                                 
 embedding (Embedding)       (None, 20, 300)           83100     
                                                                 
 lstm (LSTM)                 (None, 20, 512)           1665024   
                                                                 
 lstm_1 (LSTM)               (None, 20, 256)           787456    
                                                                 
 lstm_2 (LSTM)               (None, 128)               197120    
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 512)              

## -- Training

In [25]:
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

Epoch 1/50


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x23a0a6414b0>

## -- Evaluation

In [26]:
import warnings
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.metrics import classification_report

# hide warning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

# evaluate
y_pred = model.predict(X_test)
y_pred_class = np.argmax(y_pred, axis=1)
print(classification_report(np.argmax(y_test, axis=1), y_pred_class))

loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       1.00      1.00      1.00         1
           2       0.67      1.00      0.80         2
           3       1.00      0.50      0.67         2
           4       1.00      1.00      1.00         1
           5       1.00      1.00      1.00         5
           6       0.62      1.00      0.77         5
           7       0.67      1.00      0.80         2
           8       1.00      1.00      1.00         3
          10       1.00      1.00      1.00         1
          11       1.00      0.80      0.89         5
          12       1.00      1.00      1.00         1
          13       0.33      1.00      0.50         1
          14       1.00      0.33      0.50         3
          15       0.00      0.00      0.00         1
          16       0.75      1.00      0.86         3
          17       0.67      0.67      0.67         3
          19       0.00    

### --- Evaluate Rouge

#### ---- Prepare Evaluation Data

In [27]:
# ===== imports ===== 
import json
import random
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm.notebook import tqdm

# ===== embedding =====
def create_padded_seq(sentence, tokenizer, max_len):
    """Convert text to padded embedding sequence"""
    sequence = tokenizer.texts_to_sequences([sentence])
    padded_seq = pad_sequences(sequence, maxlen=max_len, padding='post', truncating='post')[0]
    return padded_seq

# load data
with open("intents.json", "r", encoding="utf-8") as f:
    intents_data = json.load(f)

questions = []
answers = []
for intent in intents_data["intents"]:
    for pattern in intent.get("patterns", []):
        questions.append(pattern)
        answers.append(random.choice(intent["responses"]))

# preprocess
processed_questions = [preprocess_text_per_word(q) for q in questions]
valid_questions = []
valid_indices = []
for i, q in enumerate(processed_questions):
    if q.strip():
        valid_questions.append(apply_ner_tags(q))
        valid_indices.append(i)
valid_answers = [answers[i] for i in valid_indices]

# prepare bot_outputs
bot_outputs = []

for i, question in enumerate(tqdm(valid_questions, desc="Generating responses")):
    # Embed and predict
    embedded = create_padded_seq(question, tokenizer, MAX_LEN)
    pred = model.predict(np.expand_dims(embedded, axis=0))[0]
    tag = label_encoder.inverse_transform([np.argmax(pred)])[0]
    
    # Get response
    for intent in intents_data['intents']:
        if intent['tag'] == tag:
            bot_outputs.append(random.choice(intent['responses']))
            break
        
# %% [markdown]
# # 5. Results Verification
# %%
print("\nFinal Counts:")
print(f"- Questions: {len(valid_questions)}")
print(f"- Answers: {len(valid_answers)}")
print(f"- Bot Outputs: {len(bot_outputs)}")

if len(valid_questions) == len(bot_outputs):
    print("\nAll questions processed successfully")
else:
    print("\nMismatch in input/output counts!")

Generating responses:   0%|          | 0/405 [00:00<?, ?it/s]


Final Counts:
- Questions: 405
- Answers: 405
- Bot Outputs: 405

All questions processed successfully


In [28]:
print(bot_outputs[:20])
print("-------------------------------------------")
print(valid_answers[:20])

['Good to see you again!', 'Hello!', 'Hi there, how can I help?', 'Our university has canteen with variety of food available', 'Good to see you again!', 'Our university has canteen with variety of food available', 'Hello, $_user!', 'Good to see you again!', 'Good to see you again!', 'Hello!', 'Our university has canteen with variety of food available', 'Our university has canteen with variety of food available', 'Our university has canteen with variety of food available', 'Our university has canteen with variety of food available', 'Hello!', 'Our university has canteen with variety of food available', 'Our university has canteen with variety of food available', 'Our university has canteen with variety of food available', 'Our university has canteen with variety of food available', 'Good to see you again!']
-------------------------------------------
['Good to see you again!', 'Hello!', 'Hello!', 'Hi there, how can I help?', 'Hi there, how can I help?', 'Hello!', 'Hi there, how can I he

In [29]:
from rouge_score import rouge_scorer

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Evaluate all
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

for ref, hyp in zip(valid_answers, bot_outputs):
    scores = scorer.score(ref, hyp)
    rouge1_scores.append(scores['rouge1'].fmeasure)
    rouge2_scores.append(scores['rouge2'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)

# Average scores
avg_r1 = sum(rouge1_scores) / len(rouge1_scores)
avg_r2 = sum(rouge2_scores) / len(rouge2_scores)
avg_rL = sum(rougeL_scores) / len(rougeL_scores)

print(f"ROUGE-1: {avg_r1:.4f}")
print(f"ROUGE-2: {avg_r2:.4f}")
print(f"ROUGE-L: {avg_rL:.4f}")

ROUGE-1: 0.0608
ROUGE-2: 0.0394
ROUGE-L: 0.0605


### --- Evaluate BERT

In [30]:
from bert_score import score

# Make sure you have these lists already:
# bot_outputs = [your chatbot's answers]
# answers = [expected/gold answers]

# Run BERTScore
P, R, F1 = score(bot_outputs, valid_answers, lang="en", verbose=True)

# Average scores
avg_precision = P.mean().item()
avg_recall = R.mean().item()
avg_f1 = F1.mean().item()

print(f"\nBERTScore:")
print(f"Precision: {avg_precision:.4f}")
print(f"Recall:    {avg_recall:.4f}")
print(f"F1 Score:  {avg_f1:.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/7 [00:00<?, ?it/s]

done in 15.99 seconds, 25.33 sentences/sec

BERTScore:
Precision: 0.8428
Recall:    0.7969
F1 Score:  0.8185


## -- Save Model and Other

In [31]:
import pickle
base_name = 'chatbot_campus_lstm_original'

In [32]:
# Save Keras model
model.save(f"{base_name}.h5")

  saving_api.save_model(


In [33]:
# Save label encoder and onehot encoding
mapping = {
    "label_encoder": label_encoder,
    "classes": label_encoder.classes_  # ['cat', 'dog', 'bird']
}

with open(f"{base_name}_label_mapping.pkl", "wb") as f:
    pickle.dump(mapping, f)

# code to open the mapping
# with open("label_mapping.pkl", "rb") as f:
#     mapping = pickle.load(f)
# 
# new_label = "dog"
# integer = mapping["label_encoder"].transform([new_label])  # [1]
# one_hot = to_categorical(integer, num_classes=len(mapping["classes"]))  # [0,1,0]

In [34]:
# save tokenizer
with open(f"{base_name}_glove_tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

# code to load the tokenizer
# with open("glove_tokenizer.pkl", "rb") as f:
#     tokenizer = pickle.load(f)
# 
# # (2) Convert new text to sequences
# new_text = ["hello stranger"]
# seq = tokenizer.texts_to_sequences(new_text)  # [[1, <OOV>]]