#### Notebook: correct spelling issues in query

In [16]:
import json
import os
import re
import sys
from difflib import SequenceMatcher

import spacy
from gensim.models import Word2Vec
from spellchecker import SpellChecker
from transformers import BertTokenizer, RobertaTokenizer
# from fuzzywuzzy import fuzz

sys.path.append(os.path.join('..', 'src'))
from utils import clean_text, remove_non_word_chars, tokens_to_embeddings

##### Load the embedding model (only words not observed in the training data are spell-checked)

In [2]:
# Read the run configuration from disk and echo it for reference.
hyperparams_path = os.path.join("..", "vars", "hyperparameters1.json")
with open(hyperparams_path) as json_file:
    hyperparams = json.load(json_file)

print(json.dumps(hyperparams, indent=4))

{
    "TOKENIZER": "roberta",
    "input_folder": "space_based_pdfs",
    "embedding_model_type": "glove",
    "embedding_model_fname": "roberta_space_based_pdfs_glove_model.bin",
    "vector_size": 50,
    "window": 3,
    "min_count": 3,
    "sg": 0,
    "TOKENS_TPYE": "tokens_less_sw",
    "chunk_size": 450,
    "chunk_overlap": 0,
    "max_query_length": 20,
    "top_N": 10,
    "TOKENS_EMBEDDINGS": "query_search_less_sw",
    "DOCUMENT_EMBEDDING": "token_embeddings_less_sw",
    "DOCUMENT_TOKENS": "tokens_less_sw",
    "METHOD": "COMBINE_MEAN",
    "transformer_model_name": "deepset/roberta-base-squad2",
    "context_size": 500
}


In [5]:
# Load the trained word-embedding model selected in the hyperparameters.
embedding_model_fname = hyperparams["embedding_model_fname"]
embedding_model_type = hyperparams['embedding_model_type']
embedding_model_dir = os.path.join("..", "models", "word_embeddings")

# Compare case-insensitively for BOTH model kinds so that "word2vec",
# "Word2Vec", "GloVe", ... are all accepted.
embedding_model_kind = embedding_model_type.lower()

if embedding_model_kind == 'word2vec':
    model = Word2Vec.load(os.path.join(embedding_model_dir, embedding_model_fname))

elif embedding_model_kind == 'glove':
    # Load the custom spaCy model; its path is the configured file name
    # with everything from ".bin" onwards removed.
    model = spacy.load(os.path.join(embedding_model_dir, embedding_model_fname.split(".bin")[0]))

else:
    # Fail here rather than with a NameError on `model` in a later cell.
    raise ValueError(
        f"Unsupported embedding_model_type {embedding_model_type!r}; "
        "expected 'Word2Vec' or 'glove'"
    )

In [17]:
# Choose the tokenizer that matches the configured transformer family.
TOKENIZER = hyperparams["TOKENIZER"]

bert_base_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
roberta_tokenizer = RobertaTokenizer.from_pretrained(
    "deepset/roberta-base-squad2",
    add_prefix_space=True,
)

# Lookup table keyed by the value stored under "TOKENIZER" in the hyperparameters.
tokenizers = dict(bert=bert_base_tokenizer, roberta=roberta_tokenizer)
tokenizer = tokenizers[TOKENIZER]

##### Check query for unknown tokens

In [62]:
# Example query containing deliberate typos, brackets, a slash and a hyphen.
user_query = "Whatf's starlink (the big/larg full-scale one) pragram do?"

# Lower-case first, then clean the query for BERT input.
user_query = clean_text(user_query.lower())

# clean query for candidate search
# user_query_for_search = remove_non_word_chars(user_query)

# Tokenize the cleaned query for BERT input and show the resulting tokens.
tokenized_query = tokenizer.tokenize(user_query)
print(tokenized_query)

['Ġwhat', 'f', "'s", 'Ġstar', 'link', 'Ġ(', 'the', 'Ġbig', '/', 'larg', 'Ġfull', '-', 'scale', 'Ġone', ')', 'Ġpr', 'agram', 'Ġdo', '?']


In [63]:
# Embed each query token, then flag tokens whose embedding has no truthy
# (non-zero) component.
query_embeddings = tokens_to_embeddings(tokenized_query, model, RANDOM=False)
[not any(embedding) for embedding in query_embeddings]

[True,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 True,
 True,
 False,
 True,
 True,
 False,
 True,
 False,
 True,
 True,
 True]

In [45]:
# Scratch check: a closing bracket must not be appended to the word being built.
current_word = 'token'
token = ']'
if token not in [')', ']', '}']:
    current_word += token
print(current_word)

n


In [64]:
spell = SpellChecker()

def correct_spelling(word):
    """Return the spell checker's correction for ``word``.

    Falls back to ``word`` itself when ``spell.correction`` returns a falsy
    value (for example ``None``).
    """
    return spell.correction(word) or word

tokenized_query = tokenizer.tokenize(user_query)
print(tokenized_query)
query_embeddings = tokens_to_embeddings(tokenized_query, model, RANDOM=False)

# Re-assemble sub-word tokens into whole words.
opening_marks = ('(', '[', '{', '/', '\\')              # dropped when they open a word
dropped_marks = (')', ']', '}', '/', '\\', '?', ".", "!")  # never glued onto a word
separator_marks = ('/', '\\')                            # end the current word

words = []
current_word = ""
for piece in tokenized_query:
    if piece.startswith("Ġ"):
        # A "Ġ" prefix marks the start of a new word: flush the previous one.
        if current_word:
            words.append(current_word)
        head = piece[1:]
        current_word = '' if head in opening_marks else head
        continue

    if piece not in dropped_marks:
        current_word += piece
    if piece in separator_marks:
        # A slash ends the word built so far (appended even when it is empty).
        words.append(current_word)
        current_word = ''

# Flush the trailing word, if any.
if current_word:
    words.append(current_word)
print("words: ", words)

# Flag every word that has at least one token whose embedding contains no
# truthy (non-zero) component.
# NOTE(review): presumably such a vector means "token unknown to the embedding
# model" -- confirm against tokens_to_embeddings.
misspelled_words = []
for word in words:
    # Keep only alphanumerics, apostrophes and hyphens for the embedding lookup
    base_word = "".join(char for char in word if char.isalnum() or char in ["'", "-"])
    token_embeddings = tokens_to_embeddings(tokenizer.tokenize(base_word), model, RANDOM=False)
    if any(not any(embedding) for embedding in token_embeddings):
        # Add the original word to the misspelled_words list
        misspelled_words.append(word)
print(misspelled_words)

# Correct the spelling of the flagged words
corrected_words = {word: correct_spelling(word) for word in misspelled_words}

# Replace the flagged words in the original query.
# Only whole-word occurrences are substituted: a plain str.replace would also
# rewrite the flagged text inside longer words (e.g. "larg" inside "large").
corrected_query = user_query
for original, corrected in corrected_words.items():
    print(original, corrected)
    if corrected == original:
        continue  # nothing to change for this word
    whole_word = r"(?<!\w)" + re.escape(original) + r"(?!\w)"
    # A callable replacement keeps backslashes in `corrected` literal.
    corrected_query = re.sub(
        whole_word,
        lambda _match, replacement=corrected: replacement,
        corrected_query,
    )

print(corrected_query)


['Ġwhat', 'f', "'s", 'Ġstar', 'link', 'Ġ(', 'the', 'Ġbig', '/', 'larg', 'Ġfull', '-', 'scale', 'Ġone', ')', 'Ġpr', 'agram', 'Ġdo', '?']
words:  ["whatf's", 'starlink', 'the', 'big', 'larg', 'full-scale', 'one', 'pragram', 'do']
["whatf's", 'the', 'larg', 'full-scale', 'pragram', 'do']
whatf's what's
the the
larg large
full-scale full-scale
pragram program
do do
what's starlink (the big/large full-scale one) program do?


In [9]:
spell = SpellChecker()

def correct_spelling(query):
    """Spell-correct every whitespace-separated word of ``query``.

    NOTE: an earlier cell defines a word-level function with this same name;
    when the notebook is run top to bottom, this definition replaces it.

    Args:
        query: Text to correct; it is split on whitespace.

    Returns:
        The corrected words joined by single spaces. A word is kept unchanged
        when ``spell.correction`` returns a falsy value (e.g. ``None``), which
        would otherwise make ``' '.join`` raise ``TypeError``.
    """
    # `or word` keeps the original word when the checker has no suggestion
    return ' '.join(spell.correction(word) or word for word in query.split())


# Embed the tokenized query, then display the words flagged as misspelled.
# (These were two statements fused onto one line, which is a syntax error.)
query_embeddings = tokens_to_embeddings(tokenized_query, model, RANDOM=False)
misspelled_words

In [59]:
# Try the spell checker on a hyphenated, misspelled word.
correct_spelling("fulls-scale")

In [30]:
# Sanity check: `not any(...)` is True when every element is falsy (here, all zeros).
not any([0, 0, 0])

True

In [None]:
def find_similar_words(query, word_list, threshold=80):
    """Return the words in ``word_list`` that closely resemble ``query``.

    Similarity is scored on a 0-100 scale as the rounded
    ``difflib.SequenceMatcher`` ratio times 100. The standard library is used
    because the ``fuzzywuzzy`` import is commented out in this notebook, so
    the previous ``fuzz.ratio`` call raised ``NameError``.

    Args:
        query: The string to compare against.
        word_list: Iterable of candidate strings.
        threshold: A candidate is kept only when its score is strictly
            greater than this value.

    Returns:
        A list of the matching candidates, in their original order.
    """
    def similarity(candidate):
        # Scale the 0.0-1.0 ratio to an integer percentage
        return round(100 * SequenceMatcher(None, query, candidate).ratio())

    return [word for word in word_list if similarity(word) > threshold]
