### Notebook: Process a directory of PDFs into MongoDB

In [10]:
import sys
import json
import shutil

import os
import sys
# Locate the project's 'src' directory relative to the notebook's working
# directory so the project-local package imported below can be found.
# NOTE: this relies on the kernel's working directory being one level below
# the project root (e.g. the notebooks directory).
current_dir = os.getcwd()

# The project directory is the parent of the current working directory
project_dir = os.path.dirname(current_dir)

# Absolute path of <project>/src
src_path = os.path.abspath(os.path.join(project_dir, 'src'))

# Register 'src' on the import path only once, so re-running this cell does
# not pile up duplicate entries in sys.path
if src_path not in sys.path:
    sys.path.append(src_path)
from question_answer_site.question_answer.parse_document import parse_document, update_collection
from question_answer_site.question_answer.mongodb import MongoDb
from question_answer_site.question_answer.utils import remove_non_word_chars, clean_text, tokens_to_embeddings, post_process_output, correct_spelling
from question_answer_site.question_answer.config import TOKENIZER, EMBEDDING_MODEL_FNAME, EMBEDDING_MODEL_TYPE, TOKENS_EMBEDDINGS, DOCUMENT_EMBEDDING, \
    DOCUMENT_TOKENS, TOP_N, TRANSFORMER_MODEL_NAME, METHOD, MAX_QUERY_LENGTH, username, password, cluster_url, INPUT_FOLDER, \
    database_name, special_characters, CHUNK_SIZE, CHUNK_OVERLAP
from transformers import BertTokenizer, BertForQuestionAnswering, RobertaTokenizer, RobertaForQuestionAnswering

from gensim.models import Word2Vec
from urllib.parse import quote_plus
import spacy
import copy

#### Parse PDFs in directory for text, tokenize, chunk and save to JSON files
- Specify PDF directory (directory)
- Specify Storage directory for the JSON files (storage_dir)
- Specify word embedding model, consistent with the query for getting candidate documents (model_fname)
- Specify chunk overlap: the number of tokens consecutive chunks overlap by (chunk_overlap)
- Specify the tokenizer (TOKENIZER)

##### Parse documents in test_pdfs directory

In [2]:
# Build the lookup of supported tokenizers, keyed by the name used in the config
tokenizers = {
    'bert': BertTokenizer.from_pretrained('bert-base-uncased'),
    'roberta': RobertaTokenizer.from_pretrained("deepset/roberta-base-squad2", add_prefix_space=True),
}

# Keep the individual tokenizers available under their own names as well
bert_base_tokenizer = tokenizers['bert']
roberta_tokenizer = tokenizers['roberta']

# Pick the tokenizer selected by the TOKENIZER config value
tokenizer = tokenizers[TOKENIZER]

In [3]:
# Load the embedding model selected in the config.
#
# Both model types are read from the package's `embedding_models` folder,
# addressed relative to the notebook's working directory. The Word2Vec branch
# previously built its path from os.getcwd()/question_answer/..., which does not
# match the location the spaCy branch reads from when run from this notebook.
embedding_models_dir = os.path.join("..", "src", "question_answer_site", "question_answer",
                                    "embedding_models")

# Compare case-insensitively so 'Word2Vec' / 'word2vec' / 'GloVe' / 'glove' all work
embedding_model_type = EMBEDDING_MODEL_TYPE.lower()

if embedding_model_type == 'word2vec':
    # Trained gensim Word2Vec model
    model = Word2Vec.load(os.path.join(embedding_models_dir, EMBEDDING_MODEL_FNAME))
elif embedding_model_type == 'glove':
    # Custom spaCy model; its location is the configured file name up to ".bin"
    model = spacy.load(os.path.join(embedding_models_dir, EMBEDDING_MODEL_FNAME.split(".bin")[0]))
else:
    # Fail here rather than with a NameError on `model` in a later cell
    raise ValueError(
        f"Unsupported EMBEDDING_MODEL_TYPE {EMBEDDING_MODEL_TYPE!r}; expected 'Word2Vec' or 'glove'")

In [4]:
# Folder holding the input documents, one level up from the working directory
directory = os.path.join(os.pardir, "data", INPUT_FOLDER)
print(f"input data location: {directory}")

input data location: ../data/space_based_pdfs


In [5]:
# Input folder (repeated here so this cell can run on its own)
directory = os.path.join("..", "data", INPUT_FOLDER)

# The output folder name combines the input folder and the tokenizer name
output_folder = f"{INPUT_FOLDER}_{TOKENIZER}_parsed"
storage_dir = os.path.join("..", "data", output_folder)

if not os.path.exists(storage_dir):
    # First run for this input/tokenizer combination: create the folder
    os.makedirs(storage_dir)
    print(f"Directory '{storage_dir}' created.")
else:
    # WARNING: destructive - every file, link and sub-directory inside
    # storage_dir is removed so the folder starts out empty.
    failed_paths = []
    for filename in os.listdir(storage_dir):
        file_path = os.path.join(storage_dir, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except OSError as e:
            # Best effort: remember the failure and keep clearing the rest
            failed_paths.append(file_path)
            print(f"Failed to delete {file_path}. Reason: {e}")

    # Only claim success when every entry was actually removed
    if failed_paths:
        print(f"Directory '{storage_dir}' only partially cleared; "
              f"{len(failed_paths)} item(s) could not be deleted.")
    else:
        print(f"Contents of directory '{storage_dir}' deleted.")

Contents of directory '../data/space_based_pdfs_roberta_parsed' deleted.


In [6]:
# Run the parser over the input folder using the tokenizer, embedding model and
# chunking settings chosen above; the result is kept for the MongoDB writes below.
parsed_data = parse_document(
    directory=directory,
    embedding_layer_model=model,
    tokenizer=tokenizer,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    additional_stopwords=special_characters,
)

file path: ../data/space_based_pdfs/Galaxy 15 - Wikipedia.pdf,
file name: Galaxy 15 - Wikipedia.pdf
file path: ../data/space_based_pdfs/Reconnaissance satellite - Wikipedia.pdf,
file name: Reconnaissance satellite - Wikipedia.pdf
file path: ../data/space_based_pdfs/Wideband Global SATCOM - Wikipedia.pdf,
file name: Wideband Global SATCOM - Wikipedia.pdf
file path: ../data/space_based_pdfs/.DS_Store,
file name: .DS_Store
File not a recognized format
	language detection error!
file path: ../data/space_based_pdfs/Swarm Technologies - Wikipedia.pdf,
file name: Swarm Technologies - Wikipedia.pdf
file path: ../data/space_based_pdfs/Fengyun - Wikipedia.pdf,
file name: Fengyun - Wikipedia.pdf
file path: ../data/space_based_pdfs/Advanced Extremely High Frequency - Wikipedia.pdf,
file name: Advanced Extremely High Frequency - Wikipedia.pdf
file path: ../data/space_based_pdfs/Falcon 9 - Wikipedia.pdf,
file name: Falcon 9 - Wikipedia.pdf
file path: ../data/space_based_pdfs/Rocket Lab Electron - Wi

In [7]:
# Show which fields the first parsed record carries.
# Assumes parse_document returned at least one record; indexing [0] fails otherwise.
parsed_data[0].keys()

dict_keys(['chunk_text', 'chunk_text_less_sw', 'tokens', 'tokens_less_sw', 'token_embeddings', 'token_embeddings_less_sw', 'Document', 'Path', 'Text', 'Original_Text', 'sha_256', 'language', 'language_probability', 'counter'])

#### Write to MongoDB

##### Extracted Text

In [13]:
# Write the parsed records to the 'extracted_text' collection.
# A deep copy is passed so `parsed_data` itself is left untouched - presumably
# update_collection modifies the records it receives; TODO confirm.
update_collection("extracted_text", copy.deepcopy(parsed_data))

Updating the 'extracted_text' collection
0 documents in 'extracted_text' before adding
27 documents in 'extracted_text' after adding


##### Parsed Documents

In [12]:
# Write the parsed records to the 'parsed_documents' collection.
# A deep copy is passed so `parsed_data` itself is left untouched - presumably
# update_collection modifies the records it receives; TODO confirm.
update_collection("parsed_documents", copy.deepcopy(parsed_data))

Updating the 'parsed_documents' collection
0 documents in 'parsed_documents' before adding
502 documents in 'parsed_documents' after adding
