In [None]:
"""
(3) This script saves the SBERT model and embeddings into a pickle file:

1. Initializes NLTK tools for tokenization, stemming, and lemmatization.
2. Loads a pre-trained SBERT model for generating sentence embeddings.
3. Preprocesses FAQ questions (non-word character removal, lowercasing, tokenization, stemming, and lemmatization).
4. Computes embeddings for FAQ questions.
5. Saves the model, FAQ questions, FAQ embeddings, and FAQs to a pickle file in a specified directory.
"""

In [None]:
import os
import re
import json
import pickle
import numpy as np
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load NLTK data.
# 'punkt_tab' is the tokenizer resource that word_tokenize looks up on recent
# NLTK releases (3.9 and later); 'punkt' is kept so older releases still work.
# 'wordnet' and 'omw-1.4' back the WordNetLemmatizer.
for nltk_resource in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# Initialize stemmer and lemmatizer (shared by preprocess_text)
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [None]:
# Name of the pre-trained SBERT checkpoint used for sentence embeddings
SBERT_MODEL_NAME = 'all-MiniLM-L6-v2'

# Instantiate the model once; the embedding helper and the save step below
# both use this module-level object.
model = SentenceTransformer(SBERT_MODEL_NAME)

# Preprocess text (tokenization, stemming, lemmatization, and lowercasing)
def preprocess_text(text):
    """Normalize a piece of text before it is embedded.

    Steps, in order:
      1. Replace every run of non-word characters (regex ``\\W+``) with a
         single space. Underscores count as word characters and are kept.
      2. Lowercase the result.
      3. Tokenize with NLTK's ``word_tokenize``.
      4. Stem each token with the module-level ``stemmer``, then pass the
         stem through the module-level ``lemmatizer``.

    Args:
        text: The raw string to normalize.

    Returns:
        The processed tokens joined by single spaces.
    """
    cleaned = re.sub(r'\W+', ' ', text).lower()
    # Stem first, then lemmatize the stem: same per-token result as running
    # the two passes over the whole token list one after the other.
    normalized = (
        lemmatizer.lemmatize(stemmer.stem(token))
        for token in word_tokenize(cleaned)
    )
    return ' '.join(normalized)

# Generate SBERT embeddings
def get_sbert_embedding(text):
    """Encode ``text`` with the module-level SBERT ``model``.

    Args:
        text: Input forwarded unchanged to ``model.encode``.

    Returns:
        Whatever ``model.encode(text)`` returns (the embedding).
    """
    return model.encode(text)

In [None]:
# Load the FAQs from the JSON file.
# The encoding is given explicitly so non-ASCII characters in the FAQ text are
# decoded the same way on every platform instead of using the locale default.
with open('data/keelworks_info.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# List of FAQ entries; each entry is read below via its 'question' key
faqs = data['questions_and_answers']

# Precompute embeddings for FAQ questions.
# The questions get the same preprocessing as preprocess_text applies anywhere
# else, then are encoded in one batched call rather than one call per question.
faq_questions = [preprocess_text(faq['question']) for faq in faqs]
faq_embeddings = np.asarray(model.encode(faq_questions))

In [None]:
# Define the directory and file name
model_directory = 'model'
file_name = 'keelworks_model.pkl'
file_path = os.path.join(model_directory, file_name)

# Create the output directory if it is missing: open(..., 'wb') does not
# create parent directories and would raise FileNotFoundError otherwise.
os.makedirs(model_directory, exist_ok=True)

# Bundle the model, the preprocessed questions, their embeddings, and the
# original FAQ entries into a single object for pickling.
model_data = {
    'model': model,
    'faq_questions': faq_questions,
    'faq_embeddings': faq_embeddings,
    'faqs': faqs
}

# NOTE: unpickling can execute arbitrary code, so consumers should only load
# this file from a trusted location.
with open(file_path, 'wb') as f:
    pickle.dump(model_data, f)

print(f"Model and embeddings saved to {file_path}")