#### Notebook: Write data to the MongoDB collections

In [42]:
from urllib.parse import quote_plus
import os
import json
import sys
# Locate the project's 'src' directory relative to where the kernel is running.
# The working directory is expected to be the project's notebooks directory.
current_dir = os.getcwd()

# The project root is the parent of the working directory.
project_dir = os.path.dirname(current_dir)

# Absolute path to <project>/src, so the entry stays valid if the working
# directory changes later in the session.
src_path = os.path.abspath(os.path.join(project_dir, 'src'))

# Make the packages under 'src' importable. The membership check keeps this
# cell idempotent: re-running it no longer appends duplicate sys.path entries.
if src_path not in sys.path:
    sys.path.append(src_path)
from question_answer_site.question_answer.parse_document import parse_document, update_collection
from question_answer_site.question_answer.mongodb import MongoDb
from question_answer_site.question_answer.config import TOKENIZER, EMBEDDING_MODEL_FNAME, EMBEDDING_MODEL_TYPE, TOKENS_EMBEDDINGS, DOCUMENT_EMBEDDING, \
    DOCUMENT_TOKENS, TOP_N, TRANSFORMER_MODEL_NAME, METHOD, MAX_QUERY_LENGTH, username, password, cluster_url, INPUT_FOLDER, \
    database_name, special_characters, CHUNK_SIZE, CHUNK_OVERLAP
from transformers import BertTokenizer, BertForQuestionAnswering, RobertaTokenizer, RobertaForQuestionAnswering

from gensim.models import Word2Vec
import spacy
import copy

In [43]:
# Instantiate each supported tokenizer from its pretrained checkpoint.
bert_base_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
roberta_tokenizer = RobertaTokenizer.from_pretrained(
    "deepset/roberta-base-squad2",
    add_prefix_space=True,
)

# Registry keyed by the short names that the TOKENIZER config value may take.
tokenizers = dict(bert=bert_base_tokenizer, roberta=roberta_tokenizer)

# Select the tokenizer named in the project config.
tokenizer = tokenizers[TOKENIZER]

In [44]:
# Load the embedding model selected by EMBEDDING_MODEL_TYPE in the project config.
#
# Both model types are now read from the same directory, addressed relative to
# the notebook's working directory. Previously the Word2Vec branch built its
# path from os.getcwd()/question_answer/..., which does not match the location
# used by the glove branch.
# NOTE(review): assumes Word2Vec models live next to the spaCy models - confirm.
embedding_models_dir = os.path.join(
    "..", "src", "question_answer_site", "question_answer", "embedding_models")

# Compare case-insensitively for every model type, not just glove.
embedding_model_type = EMBEDDING_MODEL_TYPE.lower()

if embedding_model_type == 'word2vec':
    # Trained gensim Word2Vec model stored under EMBEDDING_MODEL_FNAME.
    embedding_model = Word2Vec.load(
        os.path.join(embedding_models_dir, EMBEDDING_MODEL_FNAME))
elif embedding_model_type == 'glove':
    # Custom spaCy model; its directory name is the file name up to ".bin".
    embedding_model = spacy.load(
        os.path.join(embedding_models_dir, EMBEDDING_MODEL_FNAME.split(".bin")[0]))
else:
    # Fail here rather than leaving `embedding_model` undefined (or stale from
    # an earlier run of this cell) and breaking a later cell.
    raise ValueError(
        f"Unsupported EMBEDDING_MODEL_TYPE {EMBEDDING_MODEL_TYPE!r}; "
        "expected 'Word2Vec' or 'glove'")

In [45]:
# Text file to ingest, addressed relative to the notebook's working directory.
document_name = "Starlink Explained- What You Need to Know About Elon Musk's Satellite Internet Service.txt"
document_path = os.path.join("..", "data", "space_based_txts", document_name)

In [46]:
# Clean, tokenize and chunk the document with the configured embedding model,
# tokenizer, chunk size/overlap and special-character list.
data = parse_document(
    document_path,
    embedding_model,
    tokenizer,
    CHUNK_SIZE,
    CHUNK_OVERLAP,
    special_characters,
)

../data/space_based_txts/Starlink Explained- What You Need to Know About Elon Musk's Satellite Internet Service.txt Starlink Explained- What You Need to Know About Elon Musk's Satellite Internet Service.txt
processing text...
making lower-case...
Removing non-text elements (extra whitespaces)...
Removing unnecessary whitespace and special characters...
Removing line breaks...
Removing gibberish...
Removing unicode...
remove single letters or super large words (so big they don't make sense)...
done cleaning.

tokenize the processed text...
Chunking the tokenized text...

printing the shape of chunked dataframe
(7, 13)


In [48]:
# Write the parsed chunks to the 'parsed_documents' collection. A deep copy is
# handed over so `data` can be reused for the next collection unchanged
# (presumably update_collection modifies its argument - TODO confirm).
parsed_documents_payload = copy.deepcopy(data)
update_collection("parsed_documents", parsed_documents_payload)

Updating the 'parsed_documents' collection
502 documents in 'parsed_documents' before adding
509 documents in 'parsed_documents' after adding


In [49]:
# One source document was parsed, so the 'extracted_text' collection is
# expected to grow by exactly one entry.
extracted_text_payload = copy.deepcopy(data)
update_collection("extracted_text", extracted_text_payload)

Updating the 'extracted_text' collection
27 documents in 'extracted_text' before adding
28 documents in 'extracted_text' after adding
