### Notebook: Write query and get top N similar documents

In [1]:
import os
import sys
# Get the current working directory (notebooks directory)
current_dir = os.getcwd()

# Go up one level to the project directory
project_dir = os.path.dirname(current_dir)

# Assuming your project structure is as described before
src_path = os.path.abspath(os.path.join(project_dir, 'src'))

# Add the 'src' directory to the Python path
sys.path.append(src_path)

from question_answer_site.question_answer.mongodb import MongoDb
from question_answer_site.question_answer.utils import remove_non_word_chars, clean_text, tokens_to_embeddings, post_process_output, correct_spelling
from question_answer_site.question_answer.config import TOKENIZER, EMBEDDING_MODEL_FNAME, EMBEDDING_MODEL_TYPE, TOKENS_EMBEDDINGS, DOCUMENT_EMBEDDING, \
    DOCUMENT_TOKENS, TOP_N, TRANSFORMER_MODEL_NAME, METHOD, MAX_QUERY_LENGTH, username, password, cluster_url, \
    database_name
from transformers import BertTokenizer, BertForQuestionAnswering, RobertaTokenizer, RobertaForQuestionAnswering

from urllib.parse import quote_plus
import torch
from gensim.models import Word2Vec
import os
import re
import nltk
import spacy
import numpy as np
import time
from sklearn.metrics.pairwise import cosine_similarity
from concurrent.futures import ThreadPoolExecutor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
query = "What happened to signals intelligence during world war 1?"


In [3]:
print(username, password, cluster_url, database_name)
escaped_username = quote_plus(username)
escaped_password = quote_plus(password)

# use MongoDb class to connect to database instance and get the documents
mongo_db = MongoDb(escaped_username, escaped_password, cluster_url, database_name, "parsed_documents")

new_user_1 password33566 cluster0 question_answer


In [4]:
if mongo_db.connect():
    print(mongo_db.count_documents())

502


In [5]:
# Set the Tokenizer for your specific BERT model variant
bert_base_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
roberta_tokenizer = RobertaTokenizer.from_pretrained("deepset/roberta-base-squad2", add_prefix_space=True)
tokenizers = {'bert': bert_base_tokenizer, 'roberta': roberta_tokenizer}
tokenizer = tokenizers[TOKENIZER]

# Load your trained Word2Vec model
if EMBEDDING_MODEL_TYPE == 'Word2Vec':
    embedding_model = Word2Vec.load(
        os.path.join(os.getcwd(), "question_answer", "embedding_models", EMBEDDING_MODEL_FNAME))
elif EMBEDDING_MODEL_TYPE.lower() == 'glove':
    # Load the custom spaCy model
    embedding_model = spacy.load(os.path.join("..","src","question_answer_site", "question_answer", "embedding_models",
                                    EMBEDDING_MODEL_FNAME.split(".bin")[0]))

# Specify Candidate token embeddings option
if TOKENS_EMBEDDINGS == "query":
    TOKENS = "tokenized_query"
    EMBEDDINGS = "query_embedding"
elif TOKENS_EMBEDDINGS == "query_search":
    TOKENS = "tokenized_query_search"
    EMBEDDINGS = "query_embedding_search"
else:
    TOKENS = "tokenized_query_search_less_sw"
    EMBEDDINGS = "query_embedding_search_less_sw"

In [9]:
def spell_check(user_query):
    tokenized_query = tokenizer.tokenize(user_query)

    # Group tokens into words
    words = []
    current_word = ""
    for token in tokenized_query:
        if token.startswith("Ġ"):  # Indicates the start of a new word
            if current_word:
                words.append(current_word)
            current_word = token[1:] if token[1:] not in ['(', '[', '{', '/', '\\'] else ''
        else:
            current_word += token if token not in [')', ']', '}', '/', '\\', '?', ".", "!"] else ''
            if token in ['/', '\\']:
                words.append(current_word)
                current_word = ''
    if current_word:
        words.append(current_word)

    # Identify misspelled words not in the embeddings model
    misspelled_words = []
    for word in words:
        # Split punctuation and hyphens from the word
        base_word = "".join(char for char in word if char.isalnum() or char in ["'", "-"])
        if any(list(map(lambda x: not any(x),
                        tokens_to_embeddings(tokenizer.tokenize(base_word), embedding_model, RANDOM=False)))):
            # Add the original word to the misspelled_words list
            misspelled_words.append(word)
    # Correct the spelling of misspelled words
    corrected_words = {word: correct_spelling(word) for word in misspelled_words}

    # Replace misspelled words in the original query
    corrected_query = user_query
    for original, corrected in corrected_words.items():
        corrected_query = corrected_query.replace(original, corrected)

    return corrected_query


In [12]:
def process_query(user_query):
    user_query = user_query.lower()

    # clean query for BERT input
    user_query = clean_text(user_query)
    print("Uncorrected query: ", user_query)
    user_query = spell_check(user_query)
    print("Corrected query: ", user_query)

    # clean query for candidate search
    user_query_for_search = remove_non_word_chars(user_query)

    # Tokenize the query for BERT input
    tokenized_query = tokenizer.tokenize(user_query)

    # Tokenize the query for candidate search
    tokenized_query_for_search = tokenizer.tokenize(user_query_for_search)

    # Remove the stop words for the tokenized query for search
    nltk_stop_words = nltk.corpus.stopwords.words('english')
    nltk_stop_words.extend(["Ġ" + word for word in nltk_stop_words])  # Add the roberta modified tokens
    tokenized_query_for_search_less_sw = [token for token in tokenized_query_for_search if
                                          token not in nltk_stop_words]

    # Pad or truncate the query to a fixed length of 20 tokens (BERT input)

    if len(tokenized_query) > MAX_QUERY_LENGTH:
        tokenized_query = tokenized_query[:MAX_QUERY_LENGTH]
    else:
        padding_length = MAX_QUERY_LENGTH - len(tokenized_query)
        tokenized_query = tokenized_query + [tokenizer.pad_token] * padding_length

    # Convert the tokenized query to input IDs and attention mask
    input_ids_query = tokenizer.convert_tokens_to_ids(tokenized_query)
    attention_mask_query = [1] * len(input_ids_query)

    # Convert to tensors
    input_ids_query = torch.tensor(input_ids_query).unsqueeze(0)  # Add batch dimension
    attention_mask_query = torch.tensor(attention_mask_query).unsqueeze(0)  # Add batch dimension

    # Get the query embeddings for the candidate document search
    query_embeddings = tokens_to_embeddings(tokenized_query, embedding_model, RANDOM=False)
    query_embeddings_search = tokens_to_embeddings(tokenized_query_for_search, embedding_model, RANDOM=False)
    query_embeddings_less_sw = tokens_to_embeddings(tokenized_query_for_search_less_sw, embedding_model, RANDOM=False)

    query_data = {
        "query": user_query,
        "input_ids_query": input_ids_query.tolist(),
        "attention_mask_query": attention_mask_query.tolist(),
        "query_search": user_query_for_search,
        "tokenized_query": tokenized_query,
        "tokenized_query_search": tokenized_query_for_search,
        "tokenized_query_search_less_sw": tokenized_query_for_search_less_sw,
        "query_embedding": query_embeddings, #.tolist(),  # Just used for the candidate search
        "query_embedding_search": query_embeddings_search, #.tolist(),  # Just used for the candidate search, cleaned
        "query_embedding_search_less_sw": query_embeddings_less_sw # .tolist()
        # Just used for the candidate search, cleaned more
    }
    # return json.dumps(query_data['query'], indent=2)
    return query_data

In [13]:
query_data = process_query(user_query=query)
print(query_data[ "tokenized_query"])

Uncorrected query:  what happened to signals intelligence during world war 1?
Corrected query:  what happened to signals intelligence during world war 1?
['Ġwhat', 'Ġhappened', 'Ġto', 'Ġsignals', 'Ġintelligence', 'Ġduring', 'Ġworld', 'Ġwar', 'Ġ1', '?', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [14]:
def get_documents_from_mongo():
    if mongo_db.connect():
        documents = mongo_db.get_documents(query={}, inclusion={"tokens": 1, "tokens_less_sw": 1, "counter": 1,
                                                                "Document": 1, "_id": 0})
        documents = list(documents)
        # documents = [document for document in mongo_db.iterate_documents()]
        print(f"Total documents: {mongo_db.count_documents()}")
        mongo_db.disconnect()
        return documents
    return []

In [15]:
def get_doc_sim_scores(document, query_data):
    """

    :param document: (dict) Mongo queried document from parsed_documents
    :param query_data: Processed query
    :return: (float, dict) Similarity score and original document data
    """
    query_embedding = np.array(query_data[EMBEDDINGS])
    query_tokens = np.array(query_data[TOKENS])

    # remove the paddings from the query
    query_embedding = np.array([emb for emb, token in zip(query_embedding, query_tokens) if token != '[PAD]'])

    # List to store cosine similarity scores and corresponding document filenames
    # chunk_embeddings = np.array(document[DOCUMENT_EMBEDDING])
    chunk_tokens = np.array(document[DOCUMENT_TOKENS])
    chunk_embeddings = tokens_to_embeddings(document[DOCUMENT_TOKENS], embedding_model)

    # remove the paddings and unknown tokens from the query
    chunk_embeddings = np.array(
        [emb for emb, token in zip(chunk_embeddings, chunk_tokens) if token not in ['[PAD]', '[UNK]']])

    # Calculate cosine similarity between query_embedding and chunk_embeddings METHOD = 'MEAN_MAX'
    if METHOD == 'MEAN_MAX':
        similarity = cosine_similarity(query_embedding, chunk_embeddings)
        similarity = np.mean(np.max(similarity, axis=1))
    elif METHOD == 'MEAN_MEAN':
        similarity = cosine_similarity(query_embedding, chunk_embeddings)
        similarity = np.mean(similarity)
    elif METHOD == 'COMBINE_MEAN':  # 'COMBINE_MEAN'
        similarity = cosine_similarity(np.mean(query_embedding, axis=0).reshape(1, -1),
                                       np.mean(chunk_embeddings, axis=0).reshape(1, -1))
        similarity = np.mean(similarity)  # Get the single value out of the array
    else:
        mean_max_similarity = cosine_similarity(query_embedding, chunk_embeddings)
        mean_max_similarity = np.mean(np.max(mean_max_similarity, axis=1))
        combine_mean_similarity = cosine_similarity(np.mean(query_embedding, axis=0).reshape(1, -1),
                                       np.mean(chunk_embeddings, axis=0).reshape(1, -1))
        combine_mean_similarity = np.mean(combine_mean_similarity) 
        similarity = .5*mean_max_similarity + .5*combine_mean_similarity

    return (similarity, document)

In [16]:
def get_candidate_docs(query_data):
    """
    Get similarity score between query embeddings and all document embeddings, sort by score and return top N
    :param query_data: (dict) Processed query
    :return: [(float, dict)] sorted list of tuples continaing similarity score and data from Mongo
    """
    documents = get_documents_from_mongo()
    with ThreadPoolExecutor() as executor:
        # Submit each document for processing concurrently
        futures = [executor.submit(get_doc_sim_scores, doc, query_data) for doc in documents]

        # Wait for all tasks to complete
        sim_scores = [future.result() for future in futures]
    sim_scores.sort(key=lambda x: x[0], reverse=True)

    return sim_scores[:TOP_N]

In [23]:
# METHOD = 'COMBINE_MEAN'
# METHOD = 'MEAN_MAX'
METHOD = 'both'

# Get the candidate documents, top_n_documents: (similarity_score, document dictionary)
start_time = time.time()

top_n_documents = get_candidate_docs(query_data)
top_n_documents.sort(key=lambda x: x[1]['counter'])

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken to find top {TOP_N} documents: {elapsed_time} seconds")

Total documents: 502
Time taken to find top 20 documents: 6.634253978729248 seconds


In [24]:
for sim, doc in top_n_documents:
    if len(doc['Document']) < 23:
        print(f"{doc['Document']}\t\t\t\t\t{doc['counter']} \t {sim}")
    elif len(doc['Document']) <= 30:
        print(f"{doc['Document']} \t\t\t\t{doc['counter']} \t {sim}")
    elif len(doc['Document']) <= 36:
        print(f"{doc['Document']} \t\t\t{doc['counter']} \t {sim}")
    elif len(doc['Document']) <= 42:
        print(f"{doc['Document']}\t\t{doc['counter']} \t {sim}")
    elif len(doc['Document']) <= 46:
        print(f"{doc['Document']}\t\t{doc['counter']} \t {sim}")
    else:
        print(f"{doc['Document']}\t{doc['counter']} \t {sim}")

Signals intelligence - Wikipedia.pdf 			482 	 0.7348566965447785
Signals intelligence - Wikipedia.pdf 			494 	 0.6805135708096614
Signals intelligence - Wikipedia.pdf 			478 	 0.6712810532022575
Signals intelligence - Wikipedia.pdf 			491 	 0.63252902506634
Signals intelligence - Wikipedia.pdf 			493 	 0.6080013898031059
Signals intelligence - Wikipedia.pdf 			498 	 0.6005332291422623
Signals intelligence - Wikipedia.pdf 			492 	 0.5663472163544991
Signals intelligence - Wikipedia.pdf 			488 	 0.5641943121699271
Signals intelligence - Wikipedia.pdf 			486 	 0.5230971415599588
Signals intelligence - Wikipedia.pdf 			495 	 0.5179160102216983
Signals intelligence - Wikipedia.pdf 			489 	 0.5158054953403957
Communications satellite - Wikipedia.pdf		226 	 0.510401767130889
Communications satellite - Wikipedia.pdf		222 	 0.5073603642888664
Communications satellite - Wikipedia.pdf		221 	 0.505918751209149
Reconnaissance satellite - Wikipedia.pdf		10 	 0.5033843736526733
Communications satelli

In [50]:
import pandas as pd
disp_dict = {
    "Document":[],
    "Counter":[],  
    "Simularity":[], 
}
for sim, doc in top_n_documents:
    disp_dict["Document"].append(doc['Document'])
    disp_dict["Counter"].append(doc['counter'])
    disp_dict["Simularity"].append(sim)
    
disp_df = pd.DataFrame(disp_dict)
# disp_df[disp_df['Counter']==39]
# disp_df[disp_df['Document']=="Starlink - Wikipedia.pdf"]
disp_df

Unnamed: 0,Document,Counter,Simularity
0,Signals intelligence - Wikipedia.pdf,480,0.803169
1,Signals intelligence - Wikipedia.pdf,496,0.803169
2,Signals intelligence - Wikipedia.pdf,495,0.781953
3,Signals intelligence - Wikipedia.pdf,479,0.767552
4,Signals intelligence - Wikipedia.pdf,486,0.767552
5,Signals intelligence - Wikipedia.pdf,489,0.767552
6,Signals intelligence - Wikipedia.pdf,491,0.767552
7,Signals intelligence - Wikipedia.pdf,492,0.767552
8,Signals intelligence - Wikipedia.pdf,497,0.76576
9,Signals intelligence - Wikipedia.pdf,498,0.76576


#### Get specific chunk from Mongodb

In [40]:
def print_view(TYPE, query_info, chunk, counter):
    print(f"Inspecting {TYPE} METHOD...")
    chunk_embeddings = chunk['token_embeddings_less_sw']
    chunk_tokens = chunk['tokens_less_sw']
    query_embedding = query_info["query_embedding_search_less_sw"]
    query_tokens = query_info["tokenized_query_search_less_sw"]
    
    if TYPE == "MEAN_MAX":
        print("Finding the most simlar words in the chunk for each query word...\n")

        sim = cosine_similarity(query_embedding, chunk_embeddings)

        print("Position\tQuery\t\t\tChunk\t\tsim_score")
        for i, s, qt in zip(np.argmax(sim, axis=1), np.max(sim, axis=1), query_tokens):
            print(i)
            if len(chunk_tokens[i]) < 7 and len(qt) < 8:
                print(f"     {i}) \t{qt}\t\t-->\t{chunk_tokens[i]} \t\t{s}")
            elif len(chunk_tokens[i]) < 6:
                print(f"     {i}) \t{qt}\t-->\t{chunk_tokens[i]} \t\t{s}")
            elif len(chunk_tokens[i]) >= 7 and len(qt) < 8:
                print(f"     {i}) \t{qt}\t\t-->\t{chunk_tokens[i]} \t{s}")
            else:
                print(f"     {i}) \t{qt}\t-->\t{chunk_tokens[i]} \t{s}")

        print(f"\nnp.mean(np.max(sim, axis=1))\tsimilarity score between query and {counter} is {np.mean(np.max(sim, axis=1))}")
        print(f"\nnp.mean(sim)\t\t\tsimilarity score between query and {counter} is {np.mean(sim)}")
    
    elif TYPE == "COMBINE_MEAN":
        similarity = cosine_similarity(np.mean(query_embedding, axis=0).reshape(1, -1),
                                       np.mean(chunk_embeddings, axis=0).reshape(1, -1))
        similarity = np.mean(similarity) # Get the single value out of the array
        
        print(f"\nThe average query embedding and average {counter} embedding is {similarity}")
    
    elif TYPE == "MEAN_MEAN":
        sim = cosine_similarity(query_embedding, chunk_embeddings)
        print(f"\nnp.mean(sim) similarity score between query and {filename} is {np.mean(sim)}")
        
    else:
        print(f"TYPE {TYPE} not found")

In [42]:
query_data["query"]
print(query_data['tokenized_query_search_less_sw'])

['Ġhappened', 'Ġsignals', 'Ġintelligence', 'Ġworld', 'Ġwar', 'Ġ1']


In [43]:
query_data.keys()

dict_keys(['query', 'input_ids_query', 'attention_mask_query', 'query_search', 'tokenized_query', 'tokenized_query_search', 'tokenized_query_search_less_sw', 'query_embedding', 'query_embedding_search', 'query_embedding_search_less_sw'])

In [63]:
if mongo_db.connect():
    # cursor = mongo_db.get_collection().find({"Document": "Starlink - Wikipedia.pdf"})
    cursor = mongo_db.get_collection().find({"counter": 478})
    
    mongo_data = list(cursor)
    mongo_db.disconnect()

In [64]:
print(len(mongo_data))
print(mongo_data[0].keys())
print(mongo_data[0]['tokens_less_sw'])
# mongo_data[0]['token_embeddings_less_sw']

1
dict_keys(['_id', 'tokens', 'tokens_less_sw', 'token_embeddings_less_sw', 'Document', 'sha_256', 'counter'])


In [58]:
print_view(METHOD, query_data, mongo_data[0], mongo_data[0]['counter'])

Inspecting MEAN_MAX METHOD...
Finding the most simlar words in the chunk for each query word...

Position	Query			Chunk		sim_score
205
     205) 	Ġhappened	-->	mber 		0.8190155052386583
142
     142) 	Ġsignals	-->	Ġsignals 	0.9999999999999999
143
     143) 	Ġintelligence	-->	Ġintelligence 	0.9999999999999997
153
     153) 	Ġworld		-->	Ġworld 		0.9999999999999997
43
     43) 	Ġwar		-->	Ġwar 		0.9999999999999999
0
     0) 	Ġ1		-->	Ġsmall 		0.0

np.mean(np.max(sim, axis=1))	similarity score between query and 480 is 0.8031692508731095

np.mean(sim)			similarity score between query and 480 is 0.017971111829255957


#### Analysis

In [99]:
chunk_embeddings = mongo_data[0]['token_embeddings_less_sw']
chunk_tokens = mongo_data[0]['tokens_less_sw']
query_embedding = query_data["query_embedding_search_less_sw"]
query_tokens = query_data["tokenized_query_search_less_sw"]

In [100]:
chunk_tokens[0]

'Ġstar'

In [101]:
query_data["tokenized_query_search_less_sw"][0]

'Ġstar'

In [103]:
print(chunk_embeddings[0])

[0.1263670027256012, -0.18757300078868866, 0.3148829936981201, 0.6943079829216003, -1.6521389484405518, 0.7149810194969177, 1.2589759826660156, -0.9046199917793274, -1.0553289651870728, 0.6717360019683838, 0.5311689972877502, 0.3487280011177063, 0.3001149892807007, 0.967756986618042, -0.8603249788284302, -0.4125550091266632, -0.7426400184631348, 0.726872980594635, 0.4211600124835968, -1.3967779874801636, -0.3427030146121979, 0.16761000454425812, -0.2244739979505539, 0.9043030142784119, 1.0735490322113037, 0.09345600008964539, -0.20636099576950073, 0.8705009818077087, 0.3816690146923065, 0.6370450258255005, 0.009087000042200089, -0.2014629989862442, -0.7317540049552917, -0.8246780037879944, 1.1714550256729126, -0.5089390277862549, 0.24671700596809387, 0.4250909984111786, 0.38922399282455444, -0.5507810115814209, 0.5331500172615051, -0.3759540021419525, 0.6035339832305908, -0.08156999945640564, 0.05593999847769737, 0.06643900275230408, 0.18744899332523346, 0.2751159965991974, 0.068736001

In [102]:
print(query_embedding[0])

[0.1263670027256012, -0.18757300078868866, 0.3148829936981201, 0.6943079829216003, -1.6521389484405518, 0.7149810194969177, 1.2589759826660156, -0.9046199917793274, -1.0553289651870728, 0.6717360019683838, 0.5311689972877502, 0.3487280011177063, 0.3001149892807007, 0.967756986618042, -0.8603249788284302, -0.4125550091266632, -0.7426400184631348, 0.726872980594635, 0.4211600124835968, -1.3967779874801636, -0.3427030146121979, 0.16761000454425812, -0.2244739979505539, 0.9043030142784119, 1.0735490322113037, 0.09345600008964539, -0.20636099576950073, 0.8705009818077087, 0.3816690146923065, 0.6370450258255005, 0.009087000042200089, -0.2014629989862442, -0.7317540049552917, -0.8246780037879944, 1.1714550256729126, -0.5089390277862549, 0.24671700596809387, 0.4250909984111786, 0.38922399282455444, -0.5507810115814209, 0.5331500172615051, -0.3759540021419525, 0.6035339832305908, -0.08156999945640564, 0.05593999847769737, 0.06643900275230408, 0.18744899332523346, 0.2751159965991974, 0.068736001

In [31]:
import numpy as np
from numpy.linalg import norm
 
# define two lists or array
A = np.array(chunk_embeddings[0])
B = np.array(query_embedding[0])
 
print("A:", A)
print("B:", B)
 
# compute cosine similarity
cosine = np.dot(A,B)/(norm(A)*norm(B))
print("Cosine Similarity:", cosine)

A: [ 0.036363   -0.393042   -0.14290901 -0.46448901  0.761226    0.76178199
 -0.35284099  1.00311196  1.06885898 -0.42710701 -0.724011   -0.37086099
  0.25220501  1.56391394 -0.005681   -1.03142202 -1.27680898  0.82345903
 -1.13170898 -0.506347   -0.380328   -0.63763601 -0.649562   -0.353872
 -0.016324    0.44753599  0.79029101 -0.99025601  0.35526401  0.51425499
  0.206158    0.746557   -0.257566   -0.95098799  0.57719302  0.76185697
 -0.21814799  1.15517604 -0.113426   -0.96824402 -0.25894901  0.52897602
  0.166455   -0.47958601 -0.148857    0.020318   -0.91852498 -0.195447
  0.191122    1.20281506]
B: [ 0.027065    0.117158   -0.149758    0.095494    0.78823799  0.044426
  0.21710899 -1.09294701  0.17413101 -0.54461998 -1.32539797  0.66004902
  0.88568503  0.35447499 -0.79060501  0.30542299  0.43905699 -1.33964705
  0.074849   -0.38930199 -1.14452004  0.169121   -0.13893899  0.053343
 -0.076741   -1.16213596 -0.890517   -0.112144   -0.54387599  0.45776999
 -0.91843998 -0.038122    0

#### Get the Answer