# Initialization

In [18]:
# General imports
import json
import random
import string
import numpy as np
import torch
import re
from tqdm import tqdm


# NLP-related imports
import nltk
from nltk.stem.lancaster import LancasterStemmer
import spacy

# TensorFlow/Keras imports
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Masking
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Transformers imports
from transformers import BertTokenizer, BertModel

# Scikit-learn imports
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


# Load Dataset

In [9]:
# Load intents
with open('intents.json') as file:
    data = json.load(file)

# Prepare data
texts = []
labels = []

for intent in data['intents']:
    for pattern in intent['patterns']:
        texts.append(pattern)
        labels.append(intent['tag'])
        
# Summary
print(f"Number of texts: {len(texts)}")
print(f"Number of labels: {len(labels)}")
print(f"Unique labels: {set(labels)}")
print(f"Sample texts: {texts[:5]}")
print(f"Sample labels: {labels[:5]}")

Number of texts: 406
Number of labels: 406
Unique labels: {'committee', 'hours', 'creator', 'hostel', 'ithod', 'event', 'admission', 'name', 'sports', 'canteen', 'sem', 'salutaion', 'document', 'random', 'task', 'hod', 'infrastructure', 'course', 'menu', 'greeting', 'number', 'uniform', 'location', 'goodbye', 'scholarship', 'floors', 'vacation', 'fees', 'syllabus', 'library', 'college intake', 'swear', 'principal', 'extchod', 'computerhod', 'facilities', 'ragging', 'placement'}
Sample texts: ['Hi', 'Hi', 'How are you?', 'Is anyone there?', 'Hello']
Sample labels: ['greeting', 'greeting', 'greeting', 'greeting', 'greeting']


# Data Preprocessing

## -- Common Preprocessing

In [10]:

def preprocess_text_per_word(text):
    words = text.split()
    processed_words = [word.lower() for word in words]
    processed_words = [word.strip() for word in processed_words]
    processed_words = [re.sub('\s+',' ', word) for word in processed_words]
    processed_words = [word for word in processed_words if not all(char in string.punctuation for char in word.replace(' ',''))]
    
    processed_words = ' '.join(processed_words)
    return processed_words

# Preprocess texts
preprocessed_texts = [preprocess_text_per_word(text) for text in texts]

filtered_texts = []
skipped_indices = []
for i, word in enumerate(preprocessed_texts):
    if word.strip() != '':
        filtered_texts.append(word)
    else:
        skipped_indices.append(i)
        
preprocessed_texts = filtered_texts
labels = [label for i, label in enumerate(labels) if i not in skipped_indices]


# summary
print("===== data summary =====")
print(f"Original number of texts: {len(texts)}")
print(f"Number of texts after preprocessing: {len(preprocessed_texts)}")
print(f"Number of skipped texts: {len(skipped_indices)}")
print("Sample before preprocessing:", texts[:5])
print("Sample after preprocessing:", preprocessed_texts[:5])
print()
print("===== label summary =====")
print(f"Original number of labels: {len(labels) + len(skipped_indices)}")
print(f"Number of labels after preprocessing: {len(labels)}")
print("Sample labels before preprocessing:", labels[:5])



===== data summary =====
Original number of texts: 406
Number of texts after preprocessing: 405
Number of skipped texts: 1
Sample before preprocessing: ['Hi', 'Hi', 'How are you?', 'Is anyone there?', 'Hello']
Sample after preprocessing: ['hi', 'hi', 'how are you?', 'is anyone there?', 'hello']

===== label summary =====
Original number of labels: 406
Number of labels after preprocessing: 405
Sample labels before preprocessing: ['greeting', 'greeting', 'greeting', 'greeting', 'greeting']


## -- Semantic Tagging

In [None]:
nlp = spacy.load("en_core_web_lg")

In [12]:
distinct_tags = set()

def apply_ner_tags(text):
    doc = nlp(text)
    
    for ent in doc.ents:
        distinct_tags.add(ent.label_)
        
    tagged_tokens = []
    for token in doc:
        replaced = False
        for ent in doc.ents:
            if token.text == ent.text:
                tagged_tokens.append(f"<{ent.label_}>")
                replaced = True
                break
        if not replaced:
            tagged_tokens.append(token.text)
    return " ".join(tagged_tokens)

tagged_texts = [apply_ner_tags(text) for text in preprocessed_texts]

# Summary of distinct tags and tagged texts
print(f"Distinct NER tags: {distinct_tags}")
print(f"Sample tagged texts: {tagged_texts[:5]}")
print(f"Length of tagged texts: {len(tagged_texts)}")
print(f"Length of distinct tags: {len(distinct_tags)}")


Distinct NER tags: {'ORG', 'PERSON', 'DATE', 'ORDINAL', 'TIME'}
Sample tagged texts: ['hi', 'hi', 'how are you ?', 'is anyone there ?', 'hello']
Length of tagged texts: 405
Length of distinct tags: 5


## -- BERT Embedding

In [14]:
def get_bert_embedding(sentence, tokenizer, model, max_len=30):
    inputs = tokenizer(sentence, return_tensors="pt", padding="max_length", truncation=True, max_length=max_len)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.squeeze(0).numpy()

# add special tokens
# Add angle brackets to each token
special_tokens = [f"<{token}>" for token in distinct_tags]
special_tokens = sorted(special_tokens)
print(special_tokens)

# Load BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer.add_tokens(special_tokens)

bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model.resize_token_embeddings(len(tokenizer))
bert_model.eval()  # disable gradient updates for inference

X = np.array([get_bert_embedding(sentence, tokenizer, bert_model) for sentence in tqdm(tagged_texts)])

# Summary of BERT embeddings
print(f"Shape of BERT embeddings array: {X.shape} (Length: {len(X)})")
print(f"Data type of BERT embeddings: {X.dtype}")
print(f"Sample BERT embedding for the first text: {X[0]}")

['<DATE>', '<ORDINAL>', '<ORG>', '<PERSON>', '<TIME>']


100%|██████████| 405/405 [01:01<00:00,  6.54it/s]


Shape of BERT embeddings array: (405, 30, 768) (Length: 405)
Data type of BERT embeddings: float32
Sample BERT embedding for the first text: [[-0.12822832  0.2022242  -0.04041219 ... -0.11584646  0.14928633
   0.1449424 ]
 [-0.5613979  -0.17877388  0.23751609 ...  0.35251376  0.14657634
  -0.46803072]
 [ 0.7654204   0.08086837 -0.2742688  ...  0.21145517 -0.67852986
  -0.33935976]
 ...
 [-0.38684064  0.08147977  0.5432662  ...  0.00216477  0.11338539
   0.14539245]
 [-0.5338742  -0.10825475  0.32787177 ...  0.14547752  0.18982382
   0.14332972]
 [-0.42385688  0.00591564  0.49767217 ... -0.06936394  0.0515826
   0.06124616]]


## -- Label Encoding

In [17]:
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)
y = to_categorical(encoded_labels)  # final label array

# summary
print("y shape:", y.shape)
print("y data type:", y.dtype)

y shape: (405, 38)
y data type: float32


## -- Data Split

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Development

## -- Architecture

In [21]:
model = Sequential()

# masking layer
model.add(Masking(mask_value=0., input_shape=(X_train.shape[1], X_train.shape[2])))  # (max_len, 768)

# lstm layer
model.add(LSTM(512, return_sequences=True))
model.add(LSTM(256, return_sequences=True))
model.add(LSTM(128, return_sequences=False))

# additional layers
model.add(Dropout(0.3))
model.add(Dense(512))
model.add(Dropout(0.3))
model.add(Dense(256))
model.add(Dropout(0.3))
model.add(Dense(128))

# output layer
model.add(Dense(y_train.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_1 (Masking)         (None, 30, 768)           0         
                                                                 
 lstm_3 (LSTM)               (None, 30, 512)           2623488   
                                                                 
 lstm_4 (LSTM)               (None, 30, 256)           787456    
                                                                 
 lstm_5 (LSTM)               (None, 128)               197120    
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 512)               66048     
                                                                 
 dropout_1 (Dropout)         (None, 512)             

## -- Training

In [22]:
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

Epoch 1/50


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x249176b8ca0>

## -- Evaluation

In [23]:
import warnings
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.metrics import classification_report

# hide warning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

# evaluate
y_pred = model.predict(X_test)
y_pred_class = np.argmax(y_pred, axis=1)
print(classification_report(np.argmax(y_test, axis=1), y_pred_class))

loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       0.50      1.00      0.67         1
           2       1.00      0.50      0.67         2
           3       0.50      0.50      0.50         2
           4       0.00      0.00      0.00         1
           5       1.00      0.80      0.89         5
           6       1.00      1.00      1.00         5
           7       1.00      1.00      1.00         2
           8       1.00      0.67      0.80         3
           9       0.00      0.00      0.00         0
          10       1.00      1.00      1.00         1
          11       0.75      0.60      0.67         5
          12       1.00      1.00      1.00         1
          13       0.00      0.00      0.00         1
          14       1.00      0.33      0.50         3
          15       0.00      0.00      0.00         1
          16       0.60      1.00      0.75         3
          17       0.00    

### --- Evaluate Rouge

#### ---- Prepare Evaluation Data

In [24]:
# ===== imports ===== 
import json
import random
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm.notebook import tqdm


# load data
with open("intents.json", "r", encoding="utf-8") as f:
    intents_data = json.load(f)

questions = []
answers = []
for intent in intents_data["intents"]:
    for pattern in intent.get("patterns", []):
        questions.append(pattern)
        answers.append(random.choice(intent["responses"]))

# preprocess
processed_questions = [preprocess_text_per_word(q) for q in questions]
valid_questions = []
valid_indices = []
for i, q in enumerate(processed_questions):
    if q.strip():
        valid_questions.append(apply_ner_tags(q))
        valid_indices.append(i)
valid_answers = [answers[i] for i in valid_indices]

# populate bot outputs
bot_outputs = []
# preprocessed_questions_eval_embed = np.array([get_bert_embedding(sentence, tokenizer, bert_model) for sentence in tqdm(preprocessed_questions_eval)])
for i, question in enumerate(tqdm(valid_questions, desc="Generating responses")):
    embedded_input = np.expand_dims(get_bert_embedding(question, tokenizer, bert_model), axis=0)
    prediction = model.predict(embedded_input)[0]
    predicted_class_index = np.argmax(prediction)
    tag = label_encoder.inverse_transform([predicted_class_index])[0]

    for intent in data['intents']:
        if intent['tag'] == tag:
            response = random.choice(intent['responses'])
            bot_outputs.append(response)
            break

print("\nFinal Counts:")
print(f"- Questions: {len(valid_questions)}")
print(f"- Answers: {len(valid_answers)}")
print(f"- Bot Outputs: {len(bot_outputs)}")

if len(valid_questions) == len(bot_outputs):
    print("\nAll questions processed successfully")
else:
    print("\nMismatch in input/output counts!")


Generating responses:   0%|          | 0/405 [00:00<?, ?it/s]


Final Counts:
- Questions: 405
- Answers: 405
- Bot Outputs: 405

All questions processed successfully


In [25]:
print(bot_outputs[:20])
print("-------------------------------------------")
print(valid_answers[:20])

['Hello!', 'Hello!', 'Hello, $_user!', 'Hi there, how can I help?', 'Good to see you again!', 'Talk to you later', 'Hello, $_user!', 'Good to see you again!', 'Hello, $_user!', 'welcome, anything else i can assist you with?', 'Goodbye!', 'Sad to see you go :(', 'Sad to see you go :(', 'Sad to see you go :(', 'Goodbye!', 'You can call me Mind Reader.', 'Come back soon', 'Talk to you later', 'Sad to see you go :(', 'Sad to see you go :(']
-------------------------------------------
['Hello, $_user!', 'Good to see you again!', 'Hello!', 'Good to see you again!', 'Hi there, how can I help?', 'Hello!', 'Good to see you again!', 'Hello!', 'Good to see you again!', 'Hi there, how can I help?', 'Talk to you later', 'Goodbye!', 'Sad to see you go :(', 'Come back soon', 'Talk to you later', 'Come back soon', 'Come back soon', 'Sad to see you go :(', 'Come back soon', 'Talk to you later']


In [27]:
from rouge_score import rouge_scorer

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Evaluate all
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

for ref, hyp in zip(valid_answers, bot_outputs):
    scores = scorer.score(ref, hyp)
    rouge1_scores.append(scores['rouge1'].fmeasure)
    rouge2_scores.append(scores['rouge2'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)

# Average scores
avg_r1 = sum(rouge1_scores) / len(rouge1_scores)
avg_r2 = sum(rouge2_scores) / len(rouge2_scores)
avg_rL = sum(rougeL_scores) / len(rougeL_scores)

print(f"ROUGE-1: {avg_r1:.4f}")
print(f"ROUGE-2: {avg_r2:.4f}")
print(f"ROUGE-L: {avg_rL:.4f}")


ROUGE-1: 0.8145
ROUGE-2: 0.7876
ROUGE-L: 0.8117


### --- Evaluate BERT

In [28]:
from bert_score import score

# Make sure you have these lists already:
# bot_outputs = [your chatbot's answers]
# answers = [expected/gold answers]

# Run BERTScore
P, R, F1 = score(bot_outputs, valid_answers, lang="en", verbose=True)

# Average scores
avg_precision = P.mean().item()
avg_recall = R.mean().item()
avg_f1 = F1.mean().item()

print(f"\nBERTScore:")
print(f"Precision: {avg_precision:.4f}")
print(f"Recall:    {avg_recall:.4f}")
print(f"F1 Score:  {avg_f1:.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/7 [00:00<?, ?it/s]

done in 19.50 seconds, 20.77 sentences/sec

BERTScore:
Precision: 0.9638
Recall:    0.9644
F1 Score:  0.9640


## -- Save Model and Other

In [29]:
import pickle
base_name = 'chatbot_campus_lstm'

In [None]:
# Save Keras model
model.save(f"{base_name}.h5")

# Save label encoder
import pickle
with open(f"{base_name}_label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)

# Save tokenizer only if you modified BERT’s tokenizer
tokenizer.save_pretrained(f"{base_name}_custom_tokenizer/")




  saving_api.save_model(


('chatbot_campus_lstm_custom_tokenizer/tokenizer_config.json',
 'chatbot_campus_lstm_custom_tokenizer/special_tokens_map.json',
 'chatbot_campus_lstm_custom_tokenizer/vocab.txt',
 'chatbot_campus_lstm_custom_tokenizer/added_tokens.json')