### Notebook: Embeddings Before and After Adding a Document

In [142]:
import os
import sys
# Make the project's 'src' directory importable from this notebook.
# The working directory is taken to be the notebooks directory, which sits
# one level below the project root, so <project>/src is its sibling.
current_dir = os.getcwd()
project_dir = os.path.dirname(current_dir)
src_path = os.path.abspath(os.path.join(project_dir, 'src'))

# Only append once: re-running this cell must not stack duplicate entries on sys.path.
if src_path not in sys.path:
    sys.path.append(src_path)

from question_answer_site.question_answer.mongodb import MongoDb
from question_answer_site.question_answer.utils import remove_non_word_chars, clean_text, tokens_to_embeddings, \
post_process_output, correct_spelling
from question_answer_site.question_answer.config import TOKENIZER, EMBEDDING_MODEL_FNAME, EMBEDDING_MODEL_TYPE, \
TOKENS_EMBEDDINGS, DOCUMENT_EMBEDDING, DOCUMENT_TOKENS, TOP_N, TRANSFORMER_MODEL_NAME, METHOD, MAX_QUERY_LENGTH, \
username, password, cluster_url, database_name

from transformers import BertTokenizer, BertForQuestionAnswering, RobertaTokenizer, RobertaForQuestionAnswering
from urllib.parse import quote_plus
import spacy

#### Parsed Documents

In [154]:
# URL-escape both credentials in one pass before handing them to MongoDb.
escaped_username, escaped_password = map(quote_plus, (username, password))

# Handle bound to the "parsed_documents" collection; used by the cells below.
mongo_db = MongoDb(
    escaped_username,
    escaped_password,
    cluster_url,
    database_name,
    "parsed_documents",
)

##### Count

In [144]:
# Before
# Report the collection size; nothing is printed when connect() returns a falsy value.
connected = mongo_db.connect()
if connected:
    doc_cnt = mongo_db.count_documents()
    summary = f"{doc_cnt} documents in 'parsed_documents'"
    print(summary)

538 documents in 'parsed_documents'


In [145]:
# After
# Same count as the "Before" cell, meant to be run again once the new document is added.
connected = mongo_db.connect()
if connected:
    doc_cnt = mongo_db.count_documents()
    summary = f"{doc_cnt} documents in 'parsed_documents'"
    print(summary)

538 documents in 'parsed_documents'


#### Inspect One Document: Embedding

In [155]:
# Before
# Fetch the document(s) whose "counter" field is 1 and show the first stored
# token together with its embedding vector.
# All reads of `data_before` stay inside the connection guard so a failed
# connect cannot raise NameError or silently reuse a stale value from an
# earlier run of this cell.
if mongo_db.connect():
    data_before = list(mongo_db.get_collection().find({"counter": 1}))
    if data_before:
        print(data_before[0].keys())

        print(f"\nprinting the first token embedding '{data_before[0]['tokens_less_sw'][0]}':")
        print(data_before[0]['token_embeddings_less_sw'][0])
    else:
        # An empty result would otherwise fail with IndexError on data_before[0].
        print("No document found with counter == 1")
else:
    print("Could not connect to the database; 'data_before' was not refreshed")

dict_keys(['_id', 'tokens', 'tokens_less_sw', 'token_embeddings_less_sw', 'Document', 'sha_256', 'counter'])

printing the first token embedding 'Ġgalaxy':
[0.4640600085258484, 0.053341999650001526, -0.029682999476790428, -0.2906079888343811, 0.48579901456832886, -0.1054299995303154, -0.03219600021839142, -0.012400000356137753, -0.5563309788703918, -0.1653430014848709, -0.39154499769210815, 1.1606249809265137, -0.3053390085697174, 0.5109630227088928, -1.2667030096054077, -0.0066019999794662, 0.13386200368404388, -0.713021993637085, -0.37384700775146484, -1.6462559700012207, -0.2543179988861084, 0.3273789882659912, 0.3113360106945038, 0.5228739976882935, 0.49044400453567505, -1.3128349781036377, 0.11598700284957886, 0.3884899914264679, -0.23123499751091003, -0.03806300088763237, -0.7551469802856445, -0.14002199470996857, -0.5331230163574219, 0.09765300154685974, 0.0024649999104440212, 0.31720900535583496, 0.43054598569869995, -0.08259499818086624, 0.40387898683547974, 0.5639659762382507

In [156]:
# After
# Re-fetch the document(s) whose "counter" field is 1 and show the first
# stored token together with its embedding vector.
# All reads of `data_after` stay inside the connection guard so a failed
# connect cannot raise NameError or silently reuse a stale value from an
# earlier run of this cell.
if mongo_db.connect():
    data_after = list(mongo_db.get_collection().find({"counter": 1}))
    if data_after:
        print(f"printing the first token embedding '{data_after[0]['tokens_less_sw'][0]}':")
        print(data_after[0]['token_embeddings_less_sw'][0])
    else:
        # An empty result would otherwise fail with IndexError on data_after[0].
        print("No document found with counter == 1")
else:
    print("Could not connect to the database; 'data_after' was not refreshed")

printing the first token embedding 'Ġgalaxy':
[0.4640600085258484, 0.053341999650001526, -0.029682999476790428, -0.2906079888343811, 0.48579901456832886, -0.1054299995303154, -0.03219600021839142, -0.012400000356137753, -0.5563309788703918, -0.1653430014848709, -0.39154499769210815, 1.1606249809265137, -0.3053390085697174, 0.5109630227088928, -1.2667030096054077, -0.0066019999794662, 0.13386200368404388, -0.713021993637085, -0.37384700775146484, -1.6462559700012207, -0.2543179988861084, 0.3273789882659912, 0.3113360106945038, 0.5228739976882935, 0.49044400453567505, -1.3128349781036377, 0.11598700284957886, 0.3884899914264679, -0.23123499751091003, -0.03806300088763237, -0.7551469802856445, -0.14002199470996857, -0.5331230163574219, 0.09765300154685974, 0.0024649999104440212, 0.31720900535583496, 0.43054598569869995, -0.08259499818086624, 0.40387898683547974, 0.5639659762382507, 0.10513599961996078, 0.04346400126814842, -0.27699199318885803, -0.42980700731277466, 0.0933229997754097, 0.

In [148]:
# Verify new document added with correct name
# Looks the document up by its "Document" field and echoes the stored name.
new_doc_name = "Kosmos 2516 - Wikipedia.pdf"
if mongo_db.connect():
    data = list(mongo_db.get_collection().find({"Document": new_doc_name}))
    if data:
        print(data[0]["Document"])
    else:
        # Report the missing document instead of failing with IndexError on data[0].
        print(f"No document named '{new_doc_name}' was found")
else:
    # Keep the read of `data` behind the guard: a failed connect must not
    # raise NameError or show a stale result from an earlier run.
    print("Could not connect to the database; lookup skipped")

Kosmos 2516 - Wikipedia.pdf


In [149]:
# Done with the "parsed_documents" checks; disconnect before `mongo_db` is rebound below.
mongo_db.disconnect()

#### Extracted Text

In [150]:
# Rebind `mongo_db` to the "extracted_text" collection (same credentials, cluster and database).
# NOTE(review): this reuses the name previously bound to "parsed_documents"; cells that
# follow in file order operate on "extracted_text".
mongo_db = MongoDb(escaped_username, escaped_password, cluster_url, database_name, "extracted_text")

In [151]:
# Before
# Count the documents in the collection `mongo_db` was just bound to.
if mongo_db.connect():
    doc_cnt = mongo_db.count_documents()
    # Label corrected: this handle was created for "extracted_text", not
    # "parsed_documents" (the message was copied from the earlier section).
    print(f"{doc_cnt} documents in 'extracted_text'")

32 documents in 'parsed_documents'


In [152]:
# After
# Count the documents in the collection `mongo_db` is bound to, after the new document is added.
if mongo_db.connect():
    doc_cnt = mongo_db.count_documents()
    # Label corrected: this handle was created for "extracted_text", not
    # "parsed_documents" (the message was copied from the earlier section).
    print(f"{doc_cnt} documents in 'extracted_text'")

32 documents in 'parsed_documents'


In [157]:
# Done with the database checks; the remaining cells only use the local embedding model.
mongo_db.disconnect()

#### Individual Words

In [128]:
# Show which tokenizer key the project config selects (used below to index `tokenizers`).
TOKENIZER

'roberta'

In [158]:
# Load the spaCy embedding model from the app's embedding_models folder.
# The folder name is EMBEDDING_MODEL_FNAME with everything from ".bin" onward dropped.
model_dir = os.path.join(
    "..", "src", "question_answer_site", "question_answer", "embedding_models",
    EMBEDDING_MODEL_FNAME.partition(".bin")[0],
)
model = spacy.load(model_dir)

# Instantiate both tokenizer variants, then select the one named by TOKENIZER.
bert_base_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
roberta_tokenizer = RobertaTokenizer.from_pretrained(
    "deepset/roberta-base-squad2",
    add_prefix_space=True,
)
tokenizers = {
    'bert': bert_base_tokenizer,
    'roberta': roberta_tokenizer,
}
tokenizer = tokenizers[TOKENIZER]

# Short sample input; a longer alternative sentence is kept below for reference.
words = "russian kosmos earth"
# words = "Kosmos 2499[3] was a Russian satellite orbiting the Earth, before breaking up on January 4, 2023.".lower()
tokens = tokenizer.tokenize(words)

In [159]:
# Display the tokens the selected tokenizer produced for `words`.
tokens

['Ġr', 'ussian', 'Ġk', 'os', 'mos', 'Ġearth']

In [160]:
# Map each token to its embedding vector (as a plain list) from the currently loaded `model`.
# NOTE(review): this code is identical to the `embeddings_new` cell, so the two dicts
# can only differ if `model` pointed at a different model when each cell was run.
# Confirm the intended execution order.
embeddings_old = {token: model(token).vector.tolist() for token in tokens}

In [139]:
# Map each token to its embedding vector (as a plain list) from the currently loaded `model`.
# NOTE(review): this code is identical to the `embeddings_old` cell, so the two dicts
# can only differ if `model` pointed at a different model when each cell was run.
# Confirm the intended execution order.
embeddings_new = {token: model(token).vector.tolist() for token in tokens}

In [161]:
# Compare old vs. new embeddings token by token, printing True/False per token.
# Iterating over the dict replaces six copy-pasted lines with hard-coded keys,
# so the check keeps working when `words` (and therefore the token set) changes.
# Equality is exact list equality, so any difference in any component prints False.
for token in embeddings_old:
    print(embeddings_new[token] == embeddings_old[token])

False
False
False
False
False
False


In [163]:
# Print the two vectors side by side, one component pair per line.
# NOTE(review): this pairs the *new* vector for 'ussian' with the *old* vector for "mos",
# i.e. two different tokens. Confirm that is intended rather than a same-token comparison.
for pair in zip(embeddings_new['ussian'], embeddings_old["mos"]):
    print(*pair)

-0.19162699580192566 -0.8270270228385925
0.2061620056629181 0.48517701029777527
0.9097089767456055 -0.946977972984314
0.6142230033874512 0.16698899865150452
0.3647550046443939 -0.27535098791122437
0.8044229745864868 0.16251100599765778
-0.130171999335289 0.5564730167388916
0.14237099885940552 0.2210559993982315
0.9826949834823608 -0.28651300072669983
0.12576499581336975 -0.2730660140514374
-0.7192649841308594 0.759548008441925
0.6930869817733765 -0.05993900075554848
-0.37209099531173706 -0.5526189804077148
0.20426400005817413 0.124719999730587
1.106840968132019 -0.06790599972009659
-0.09339500218629837 0.12285099923610687
0.3197210133075714 -0.7005069851875305
0.588325023651123 0.5602779984474182
0.1696310043334961 -0.2020609974861145
0.47122201323509216 0.17398600280284882
-0.018806999549269676 0.28056100010871887
-0.9990469813346863 -0.8363159894943237
-0.2814750075340271 -0.7087500095367432
0.7270900011062622 -0.5468050241470337
0.9770429730415344 -0.3934760093688965
-0.216937005519