#### Notebook: Write a single document to MongoDB

In [92]:
# Standard library
import copy
import json
import os
import sys  # required by the sys.path.append call below
from urllib.parse import quote_plus

# Third-party
from gensim.models import Word2Vec
from transformers import BertTokenizer, RobertaTokenizer

# Extend the module search path before the imports below, which are presumably
# resolved from this project directory (utils, mongo) -- keep this line ahead of them.
sys.path.append(os.path.join('..', 'src', 'question_answer_site', 'question_answer'))

# Project-local helpers
from mongo import MongoDb
from utils import parse_pdf_to_chunks

##### Set hyperparameters
- Tokenizer, embedding model, chunk size and overlap

In [101]:
# Read the settings (tokenizer, embedding model, chunk size and overlap, ...) used below.
hyperparams_path = os.path.join("..", "vars", "hyperparameters1.json")
with open(hyperparams_path) as json_file:
    hyperparams = json.load(json_file)

# Echo the loaded configuration so the notebook output records what this run used.
print(json.dumps(hyperparams, indent=4))

{
    "TOKENIZER": "roberta",
    "input_folder": "space_based_pdfs",
    "embedding_model_type": "glove",
    "embedding_model_fname": "roberta_space_based_pdfs_glove_model.bin",
    "vector_size": 50,
    "window": 3,
    "min_count": 3,
    "sg": 0,
    "TOKENS_TPYE": "tokens_less_sw",
    "chunk_size": 450,
    "chunk_overlap": 0,
    "max_query_length": 20,
    "top_N": 10,
    "TOKENS_EMBEDDINGS": "query_search_less_sw",
    "DOCUMENT_EMBEDDING": "token_embeddings_less_sw",
    "DOCUMENT_TOKENS": "tokens_less_sw",
    "METHOD": "COMBINE_MEAN",
    "transformer_model_name": "deepset/roberta-base-squad2",
    "context_size": 500
}


In [102]:
# Build the RoBERTa tokenizer from the checkpoint named in the hyperparameters
# ("transformer_model_name") instead of repeating the name here, so the tokenizer
# and the configuration cannot drift apart.
# add_prefix_space=True is forwarded to the tokenizer unchanged.
tokenizer = RobertaTokenizer.from_pretrained(
    hyperparams["transformer_model_name"],
    add_prefix_space=True,
)

In [103]:
# Load the word-embedding model selected by the hyperparameters.
embedding_model_fname = hyperparams["embedding_model_fname"]
embedding_model_type = hyperparams['embedding_model_type']
embedding_model_dir = os.path.join("..", "models", "word_embeddings")

# Match the configured type case-insensitively for every backend
# (previously only the glove branch ignored case).
embedding_backend = embedding_model_type.lower()

if embedding_backend == 'word2vec':
    # gensim Word2Vec model saved under the configured file name
    model = Word2Vec.load(os.path.join(embedding_model_dir, embedding_model_fname))

elif embedding_backend == 'glove':
    # spaCy is only needed for this branch and is not imported elsewhere in the notebook.
    import spacy

    # Load the custom spaCy model; its path is the configured file name with the
    # ".bin" part (and anything after it) stripped.
    model = spacy.load(os.path.join(embedding_model_dir, embedding_model_fname.split(".bin")[0]))

else:
    # Fail here with a clear message rather than leaving `model` undefined,
    # which would only surface later as a NameError.
    raise ValueError(
        f"Unsupported embedding_model_type {embedding_model_type!r}; expected 'Word2Vec' or 'glove'"
    )

In [104]:
# Extra stopwords removed from the cleaned chunk used for the candidate document search:
# punctuation, URL fragments and (further down) plain numbers.
special_characters = [
    "!", "\"", "#", "$", "%", "&", "'", "(", ")", "*", "+", ",", "-", ".", "/",
    "://", "https", '"', '"...', "/)", "www",
    ":", ";", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "_", "`", "{", "|", "}", "~",
    ".[", ",[", "-,", "][", "com",
    "),", ',"', ').',
]

# Add a "Ġ"-prefixed twin of every entry
# (presumably the tokenizer's marker for a token preceded by a space -- TODO confirm).
special_characters += ["Ġ" + token for token in special_characters]
print(special_characters)

# Numbers 0..99999, plain and "Ġ"-prefixed. They are appended after the print,
# so they do not appear in the printed list.
special_characters += [str(number) for number in range(100000)]
special_characters += ["Ġ" + str(number) for number in range(100000)]

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '://', 'https', '"', '"...', '/)', 'www', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '.[', ',[', '-,', '][', 'com', '),', ',"', 'Ġ!', 'Ġ"', 'Ġ#', 'Ġ$', 'Ġ%', 'Ġ&', "Ġ'", 'Ġ(', 'Ġ)', 'Ġ*', 'Ġ+', 'Ġ,', 'Ġ-', 'Ġ.', 'Ġ/', 'Ġ://', 'Ġhttps', 'Ġ"', 'Ġ"...', 'Ġ/)', 'Ġwww', 'Ġ:', 'Ġ;', 'Ġ<', 'Ġ=', 'Ġ>', 'Ġ?', 'Ġ@', 'Ġ[', 'Ġ\\', 'Ġ]', 'Ġ^', 'Ġ_', 'Ġ`', 'Ġ{', 'Ġ|', 'Ġ}', 'Ġ~', 'Ġ.[', 'Ġ,[', 'Ġ-,', 'Ġ][', 'Ġcom', 'Ġ),', 'Ġ,"']


In [105]:
# Relative path of the single PDF that this notebook parses and uploads.
input_folder = "one_pdf"
pdf_filename = 'Falcon Heavy - Wikipedia.pdf'
single_file = os.path.join("..", "data", input_folder, pdf_filename)
print(single_file)

../data/one_pdf/Falcon Heavy - Wikipedia.pdf


##### Read data from file into list of dictionaries representing chunks of text

In [106]:
# Parse the PDF into chunk records using the tokenizer, embedding model and
# chunking settings configured above. The later cells iterate over the result
# and treat each item as a dict (they pop keys and read 'Document').
parsed_data = parse_pdf_to_chunks(
    single_file,
    tokenizer=tokenizer,
    embedding_layer_model=model,
    chunk_size=hyperparams['chunk_size'],
    chunk_overlap=hyperparams['chunk_overlap'],
    additional_stopwords=special_characters,
)

../data/one_pdf/Falcon Heavy - Wikipedia.pdf
processing text...
making lower-case...
Removing non-text elements (extra whitespaces)...
Removing unnecessary whitespace and special characters...
Removing line breaks...
Removing gibberish...
Removing unicode...
remove single letters or super large words (so big they don't make sense)...
done cleaning.

tokenize the processed text...
Chunking the tokenized text...

printing the shape of chunked dataframe
(47, 14)


##### Connect to MongoDB and write the data to the database

In [107]:
# MongoDB connection settings.
# SECURITY: credentials should not live in the notebook. Supply them through the
# MONGODB_USERNAME / MONGODB_PASSWORD environment variables. The literal fallbacks
# only keep existing runs working; rotate this password and delete the fallbacks.
username = os.environ.get("MONGODB_USERNAME", "new_user_1")
password = os.environ.get("MONGODB_PASSWORD", "password33566")

# Escape the username and password (URL-style, via quote_plus) before they are
# handed to MongoDb -- presumably they end up inside a connection URI.
escaped_username = quote_plus(username)
escaped_password = quote_plus(password)

cluster_url = "cluster0"
database_name = "question_answer"

In [108]:
collection_name = "parsed_documents"

# Keys removed from every chunk record before it is written to this collection.
fields_to_drop = (
    # stored in the 'extracted_text' collection instead
    'Original_Text',
    'Text',
    # not stored in either collection
    'language',
    'language_probability',
    'Path',
    'token_embeddings',
    'chunk_text',
    'chunk_text_less_sw',
)

# Create a MongoClient and connect to the server
mongodb = MongoDb(escaped_username, escaped_password, cluster_url, database_name, collection_name)
mongodb.connect()

try:
    doc_cnt = mongodb.count_documents()
    print(f"{doc_cnt} documents in {collection_name} before adding")

    # NOTE: nothing here checks for existing records, so re-running this cell
    # inserts the same chunks again.
    # Work on a deep copy so parsed_data keeps all of its keys for the other cells.
    for data_obj in copy.deepcopy(parsed_data):
        for field in fields_to_drop:
            data_obj.pop(field)  # raises KeyError if the parser did not emit the key

        mongodb.insert_document(data_obj)

    print("Data inserted successfully!")

    doc_cnt = mongodb.count_documents()
    print(f"{doc_cnt} documents in {collection_name} after adding")
finally:
    # Close the MongoDB client even if counting or inserting fails
    mongodb.disconnect()

322 documents in parsed_documents before adding
Data inserted successfully!
369 documents in parsed_documents after adding


In [109]:
collection_name = "extracted_text"

# Keys removed from a chunk record before it is written to this collection.
fields_to_drop = (
    # not stored in either collection
    'language',
    'language_probability',
    'Path',
    'token_embeddings',
    'chunk_text',
    'chunk_text_less_sw',
    # stored in the 'parsed_documents' collection instead
    'counter',
    'token_embeddings_less_sw',
    'tokens_less_sw',
    'tokens',
)

# Create a MongoClient and connect to the server
mongodb = MongoDb(escaped_username, escaped_password, cluster_url, database_name, collection_name)
mongodb.connect()

try:
    doc_cnt = mongodb.count_documents()
    print(f"{doc_cnt} documents in {collection_name} before adding")

    # Only the first chunk seen for each 'Document' value is inserted.
    document_tracker = set()
    for chunk in parsed_data:
        document_name = chunk['Document']
        if document_name in document_tracker:
            continue  # this document was already written; skip before copying
        document_tracker.add(document_name)

        # Copy only the record that will be inserted, so parsed_data stays intact.
        data_obj = copy.deepcopy(chunk)
        for field in fields_to_drop:
            data_obj.pop(field)  # raises KeyError if the parser did not emit the key

        # Insert the JSON data as a document into the collection
        print(data_obj['Document'])
        mongodb.insert_document(data_obj)

    print("Data inserted successfully!")

    doc_cnt = mongodb.count_documents()
    print(f"{doc_cnt} documents in {collection_name} after adding")
finally:
    # Close the MongoDB client even if counting or inserting fails
    mongodb.disconnect()

14 documents in extracted_text before adding
Falcon Heavy - Wikipedia.pdf
Data inserted successfully!
15 documents in extracted_text after adding


##### Once the database is updated, the embedding model needs to be updated and all chunk tokens need to be modified