#### Notebook: Write data to the MongoDB collections

In [9]:
import platform
import datetime
import os

# On Windows hosts, route HTTP and HTTPS traffic through the proxy below.
# NOTE(review): the proxy user-info is embedded in source; consider supplying it
# from the environment instead — confirm with the owner before changing.
if platform.system() == "Windows":
    print("Platform is Windows, setting proxy")
    # The first three letters of today's full weekday name go into the URL.
    now = datetime.datetime.now()
    day = now.strftime("%A")
    proxy_url = f"http://33566:{day[0:3]}@proxy-west.aero.org:8080"

    # Both schemes share the same proxy endpoint.
    for proxy_var in ('HTTP_PROXY', 'HTTPS_PROXY'):
        os.environ[proxy_var] = proxy_url

In [10]:
from urllib.parse import quote_plus
import json
import sys
# Make the project's `src` directory importable from this notebook.
# The working directory is expected to be the notebooks directory, one level
# below the project root.
current_dir = os.getcwd()

# The project root is the parent of the working directory.
project_dir = os.path.dirname(current_dir)

# Absolute path of the `src` directory that holds the project packages.
src_path = os.path.abspath(os.path.join(project_dir, 'src'))

# Append only once so re-running this cell does not pile up duplicate
# sys.path entries in a long-lived kernel.
if src_path not in sys.path:
    sys.path.append(src_path)

from question_answer_site.question_answer.parse_document import parse_document, update_collection
from question_answer_site.question_answer.mongodb import MongoDb
from question_answer_site.question_answer.config import TOKENIZER, EMBEDDING_MODEL_FNAME, EMBEDDING_MODEL_TYPE, TOKENS_EMBEDDINGS, DOCUMENT_EMBEDDING, \
    DOCUMENT_TOKENS, TOP_N, TRANSFORMER_MODEL_NAME, METHOD, MAX_QUERY_LENGTH, username, password, cluster_url, INPUT_FOLDER, \
    database_name, special_characters, CHUNK_SIZE, CHUNK_OVERLAP
from transformers import BertTokenizer, BertForQuestionAnswering, RobertaTokenizer, RobertaForQuestionAnswering

from gensim.models import Word2Vec
import spacy
import copy

In [11]:
# Instantiate every supported tokenizer (BERT first, then RoBERTa), keyed by the
# name that the TOKENIZER config value is looked up with.
tokenizers = dict(
    bert=BertTokenizer.from_pretrained('bert-base-uncased'),
    roberta=RobertaTokenizer.from_pretrained("deepset/roberta-base-squad2", add_prefix_space=True),
)

# Keep the individual tokenizers reachable under their own names as well.
bert_base_tokenizer = tokenizers['bert']
roberta_tokenizer = tokenizers['roberta']

# The tokenizer used by the rest of the notebook is the one named in config.
tokenizer = tokenizers[TOKENIZER]

In [12]:
# Load the embedding model selected by EMBEDDING_MODEL_TYPE / EMBEDDING_MODEL_FNAME.
# Both model families are resolved from the same embedding_models folder, addressed
# relative to the working directory like the other "../" paths in this notebook.
# NOTE(review): the Word2Vec branch previously looked under
# os.getcwd()/question_answer/embedding_models — confirm no model lives there.
embedding_models_dir = os.path.join("..", "src", "question_answer_site", "question_answer", "embedding_models")

# Compare case-insensitively for both model types.
embedding_model_type = EMBEDDING_MODEL_TYPE.lower()

if embedding_model_type == 'word2vec':
    # gensim Word2Vec model, stored under its configured file name.
    embedding_model = Word2Vec.load(os.path.join(embedding_models_dir, EMBEDDING_MODEL_FNAME))
elif embedding_model_type == 'glove':
    # Custom spaCy model; its path is the configured file name up to the first ".bin".
    embedding_model = spacy.load(os.path.join(embedding_models_dir, EMBEDDING_MODEL_FNAME.split(".bin")[0]))
else:
    # Fail here rather than with a NameError (or a stale model) in a later cell.
    raise ValueError(f"Unsupported EMBEDDING_MODEL_TYPE: {EMBEDDING_MODEL_TYPE!r}")

In [13]:
# Folder holding the PDFs to ingest, addressed relative to the working directory.
document_path_parts = ("..", "data", "space_based_pdfs")
document_path = os.path.join(*document_path_parts)

In [14]:
# Run parse_document over document_path with the tokenizer and embedding model
# selected above and the chunking / stop-word settings imported from config.
parse_options = {
    "directory": document_path,
    "embedding_layer_model": embedding_model,
    "tokenizer": tokenizer,
    "chunk_size": CHUNK_SIZE,
    "chunk_overlap": CHUNK_OVERLAP,
    "additional_stopwords": special_characters,
}
data = parse_document(**parse_options)

file path: ../data/space_based_pdfs/Galaxy 15 - Wikipedia.pdf,
file name: Galaxy 15 - Wikipedia.pdf
file path: ../data/space_based_pdfs/Reconnaissance satellite - Wikipedia.pdf,
file name: Reconnaissance satellite - Wikipedia.pdf
file path: ../data/space_based_pdfs/Wideband Global SATCOM - Wikipedia.pdf,
file name: Wideband Global SATCOM - Wikipedia.pdf
file path: ../data/space_based_pdfs/.DS_Store,
file name: .DS_Store
File not a recognized format
	language detection error!
file path: ../data/space_based_pdfs/Swarm Technologies - Wikipedia.pdf,
file name: Swarm Technologies - Wikipedia.pdf
file path: ../data/space_based_pdfs/Fengyun - Wikipedia.pdf,
file name: Fengyun - Wikipedia.pdf
file path: ../data/space_based_pdfs/Advanced Extremely High Frequency - Wikipedia.pdf,
file name: Advanced Extremely High Frequency - Wikipedia.pdf
file path: ../data/space_based_pdfs/Falcon 9 - Wikipedia.pdf,
file name: Falcon 9 - Wikipedia.pdf
file path: ../data/space_based_pdfs/Rocket Lab Electron - Wi

In [15]:
# Hand update_collection its own deep copy so `data` stays available for reuse.
# NOTE(review): presumably update_collection mutates its argument — verify.
parsed_documents_payload = copy.deepcopy(data)
update_collection("parsed_documents", parsed_documents_payload)

Pinged your deployment. You successfully connected to MongoDB!
Updating the 'parsed_documents' collection...
parsed_documents
0 documents in 'parsed_documents' before adding.
Galaxy 15 - Wikipedia.pdf set()
Galaxy 15 - Wikipedia.pdf set()
Galaxy 15 - Wikipedia.pdf set()
Galaxy 15 - Wikipedia.pdf set()
Galaxy 15 - Wikipedia.pdf set()
Galaxy 15 - Wikipedia.pdf set()
Galaxy 15 - Wikipedia.pdf set()
Galaxy 15 - Wikipedia.pdf set()
Galaxy 15 - Wikipedia.pdf set()
Galaxy 15 - Wikipedia.pdf set()
Galaxy 15 - Wikipedia.pdf set()
Galaxy 15 - Wikipedia.pdf set()
Galaxy 15 - Wikipedia.pdf set()
Galaxy 15 - Wikipedia.pdf set()
Galaxy 15 - Wikipedia.pdf set()
Galaxy 15 - Wikipedia.pdf set()
Galaxy 15 - Wikipedia.pdf set()
Reconnaissance satellite - Wikipedia.pdf set()
Reconnaissance satellite - Wikipedia.pdf set()
Reconnaissance satellite - Wikipedia.pdf set()
Reconnaissance satellite - Wikipedia.pdf set()
Reconnaissance satellite - Wikipedia.pdf set()
Reconnaissance satellite - Wikipedia.pdf set()

In [16]:
# Author's expectation: the collection's document count should go up by one
# for each document added.
# A separate deep copy is passed so `data` itself is left as parsed.
extracted_text_payload = copy.deepcopy(data)
update_collection("extracted_text", extracted_text_payload)

Pinged your deployment. You successfully connected to MongoDB!
Updating the 'extracted_text' collection...
extracted_text
0 documents in 'extracted_text' before adding.
Galaxy 15 - Wikipedia.pdf set()
Galaxy 15 - Wikipedia.pdf
Galaxy 15 - Wikipedia.pdf {'Galaxy 15 - Wikipedia.pdf'}
Galaxy 15 - Wikipedia.pdf {'Galaxy 15 - Wikipedia.pdf'}
Galaxy 15 - Wikipedia.pdf {'Galaxy 15 - Wikipedia.pdf'}
Galaxy 15 - Wikipedia.pdf {'Galaxy 15 - Wikipedia.pdf'}
Galaxy 15 - Wikipedia.pdf {'Galaxy 15 - Wikipedia.pdf'}
Galaxy 15 - Wikipedia.pdf {'Galaxy 15 - Wikipedia.pdf'}
Galaxy 15 - Wikipedia.pdf {'Galaxy 15 - Wikipedia.pdf'}
Galaxy 15 - Wikipedia.pdf {'Galaxy 15 - Wikipedia.pdf'}
Galaxy 15 - Wikipedia.pdf {'Galaxy 15 - Wikipedia.pdf'}
Galaxy 15 - Wikipedia.pdf {'Galaxy 15 - Wikipedia.pdf'}
Galaxy 15 - Wikipedia.pdf {'Galaxy 15 - Wikipedia.pdf'}
Galaxy 15 - Wikipedia.pdf {'Galaxy 15 - Wikipedia.pdf'}
Galaxy 15 - Wikipedia.pdf {'Galaxy 15 - Wikipedia.pdf'}
Galaxy 15 - Wikipedia.pdf {'Galaxy 15 - Wikip