In [15]:
import sys
import json
import shutil

import os
sys.path.append(os.path.join('..', 'src'))

from transformers import BertTokenizer, RobertaTokenizer
from utils import parsed_pdf_to_json
from gensim.models import Word2Vec

#### Get Hyperparameters

In [2]:
# Load the run configuration from ../vars/hyperparameters1.json.
with open(os.path.join("..", "vars", "hyperparameters1.json")) as json_file:
    hyperparams = json.load(json_file)

# Echo the settings (pretty-printed) so the notebook records what this run used.
print(json.dumps(hyperparams, indent=4))

{
    "TOKENIZER": "roberta",
    "input_folder": "space_based_pdfs",
    "embedding_model_type": "Word2Vec",
    "embedding_model_fname": "roberta_space_based_pdfs_Word2Vec_model.bin",
    "vector_size": 100,
    "window": 5,
    "min_count": 1,
    "sg": 0,
    "TOKENS_TPYE": "tokens_less_sw",
    "chunk_size": 100,
    "chunk_overlap": 0,
    "max_query_length": 20,
    "top_N": 10,
    "TOKENS_EMBEDDINGS": "query_search_less_sw",
    "DOCUMENT_EMBEDDING": "token_embeddings_less_sw",
    "METHOD": "MEAN_MAX",
    "transformer_model_name": "deepset/roberta-base-squad2",
    "context_size": 350
}


#### Parse pdfs in directory for text, tokenize, chunk and save to JSON files
- Specify PDF directory (directory)
- Specify Storage directory for the JSON files (storage_dir)
- Specify the word embedding model, consistent with the one used by the query for getting candidate documents (embedding_model_fname)
- Specify chunk overlap: the number of tokens consecutive chunks overlap by (chunk_overlap)
- Specify the tokenizer (TOKENIZER)

##### Parse documents in the directory named by `input_folder` (here `space_based_pdfs`)

In [19]:
# Pick the tokenizer named by the "TOKENIZER" hyperparameter.
# Both supported tokenizers are instantiated up front and indexed by their
# short name; a name other than 'bert' / 'roberta' raises KeyError on the lookup.
TOKENIZER = hyperparams["TOKENIZER"]

bert_base_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
roberta_tokenizer = RobertaTokenizer.from_pretrained("deepset/roberta-base-squad2")

tokenizers = dict(bert=bert_base_tokenizer, roberta=roberta_tokenizer)
tokenizer = tokenizers[TOKENIZER]

In [20]:
# Load the trained word-embedding model named in the hyperparameters.
embedding_model_fname = hyperparams["embedding_model_fname"]
embedding_model_type = hyperparams['embedding_model_type']

if embedding_model_type == 'Word2Vec':
    model = Word2Vec.load(os.path.join("..", "models", "word_embeddings", embedding_model_fname))
else:
    # Fail here instead of leaving `model` undefined (or stale from an earlier
    # run of the kernel), which would only surface later when `model` is used.
    raise ValueError(
        f"Unsupported embedding_model_type {embedding_model_type!r}; only 'Word2Vec' is handled."
    )

In [22]:
# Resolve the input PDF folder and the matching output folder for parsed JSON.
input_folder = hyperparams["input_folder"]
directory = os.path.join("..", "data", input_folder)

# The output folder name encodes both the input folder and the tokenizer.
output_folder = f"{input_folder}_{TOKENIZER}_parsed"
storage_dir = os.path.join("..", "data", output_folder)

print(f"input data location: {directory}\n")

if not os.path.exists(storage_dir):
    # First run for this input/tokenizer pair: create the output folder.
    os.makedirs(storage_dir)
    print(f"Directory '{storage_dir}' created.")
else:
    # Re-run: empty the folder so results from a previous run cannot mix with
    # the new ones. Deletion is best-effort; failures are collected and reported.
    failed_paths = []
    for filename in os.listdir(storage_dir):
        file_path = os.path.join(storage_dir, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            failed_paths.append(file_path)
            print(f"Failed to delete {file_path}. Reason: {e}")

    # Only claim success when every entry was actually removed.
    if failed_paths:
        print(f"Could not delete {len(failed_paths)} item(s) from '{storage_dir}'; stale files may remain.")
    else:
        print(f"Contents of directory '{storage_dir}' deleted.")

input data location: ../data/space_based_pdfs

Contents of directory '../data/space_based_pdfs_roberta_parsed' deleted.


In [23]:
# Extra stopwords to strip from the cleaned chunks used for the candidate document search:
# punctuation, URL fragments and (below) plain numbers.
special_characters = [
    "!", "\"", "#", "$", "%", "&", "'", "(", ")", "*", "+", ",", "-", ".", "/",
    "://", "https", '"', '"...', "/)", "www",
    ":", ";", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "_", "`", "{", "|", "}", "~",
    ".[", ",[", "-,", "][", "com",
    "),", ',"',
]

# Also list every entry with a leading "Ġ" prefix.
# The comprehension is materialised as a list before extending, so the list
# is never iterated while it grows.
special_characters.extend(["Ġ" + symbol for symbol in special_characters])
print(special_characters)

# Add numbers to remove: 0..99999, plain and "Ġ"-prefixed.
special_characters.extend([str(number) for number in range(100000)])
special_characters.extend([f"Ġ{number}" for number in range(100000)])

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '://', 'https', '"', '"...', '/)', 'www', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '.[', ',[', '-,', '][', 'com', 'Ġ!', 'Ġ"', 'Ġ#', 'Ġ$', 'Ġ%', 'Ġ&', "Ġ'", 'Ġ(', 'Ġ)', 'Ġ*', 'Ġ+', 'Ġ,', 'Ġ-', 'Ġ.', 'Ġ/', 'Ġ://', 'Ġhttps', 'Ġ"', 'Ġ"...', 'Ġ/)', 'Ġwww', 'Ġ:', 'Ġ;', 'Ġ<', 'Ġ=', 'Ġ>', 'Ġ?', 'Ġ@', 'Ġ[', 'Ġ\\', 'Ġ]', 'Ġ^', 'Ġ_', 'Ġ`', 'Ġ{', 'Ġ|', 'Ġ}', 'Ġ~', 'Ġ.[', 'Ġ,[', 'Ġ-,', 'Ġ][', 'Ġcom']


In [24]:
# Parse the PDFs under `directory` and save the chunked results into `storage_dir`,
# using the embedding model, tokenizer and stopword list prepared above.
parsed_pdf_to_json(
    directory,
    storage_dir,
    embedding_layer_model=model,
    tokenizer=tokenizer,
    chunk_size=hyperparams['chunk_size'],
    chunk_overlap=hyperparams['chunk_overlap'],
    additional_stopwords=special_characters,
)

../data/space_based_pdfs/Galaxy 15 - Wikipedia.pdf
../data/space_based_pdfs/Swarm Technologies - Wikipedia.pdf
../data/space_based_pdfs/Fengyun - Wikipedia.pdf
../data/space_based_pdfs/Falcon 9 - Wikipedia.pdf
../data/space_based_pdfs/Cygnus NG-19 - Wikipedia.pdf
../data/space_based_pdfs/Atlas V - Wikipedia.pdf
../data/space_based_pdfs/Inmarsat - Wikipedia.pdf
../data/space_based_pdfs/Kepler-11 - Wikipedia.pdf
../data/space_based_pdfs/James Webb Space Telescope - Wikipedia.pdf
../data/space_based_pdfs/Space-Based Infrared System - Wikipedia.pdf
../data/space_based_pdfs/Yaogan - Wikipedia.pdf
../data/space_based_pdfs/Starlink - Wikipedia.pdf
../data/space_based_pdfs/Atlas (rocket family) - Wikipedia.pdf
processing text...
making lower-case...
Removing non-text elements (extra whitespaces)...
Removing unnecessary whitespace and special characters...
Removing line breaks...
Removing gibberish...
Removing unicode...
remove single letters or super large words (so big they don't make sense).

##### Test on one_pdf directory containing one pdf file

In [17]:
# Point the pipeline at the single-PDF test folder and its matching output folder.
input_folder = "one_pdf"
directory = os.path.join("..", "data", input_folder)

# The output folder name encodes both the input folder and the tokenizer.
output_folder = f"{input_folder}_{TOKENIZER}_parsed"
storage_dir = os.path.join("..", "data", output_folder)

print(f"input data location: {directory}\n")

if not os.path.exists(storage_dir):
    # First run for this input/tokenizer pair: create the output folder.
    os.makedirs(storage_dir)
    print(f"Directory '{storage_dir}' created.")
else:
    # Re-run: empty the folder so results from a previous run cannot mix with
    # the new ones. Deletion is best-effort; failures are collected and reported.
    failed_paths = []
    for filename in os.listdir(storage_dir):
        file_path = os.path.join(storage_dir, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            failed_paths.append(file_path)
            print(f"Failed to delete {file_path}. Reason: {e}")

    # Only claim success when every entry was actually removed.
    if failed_paths:
        print(f"Could not delete {len(failed_paths)} item(s) from '{storage_dir}'; stale files may remain.")
    else:
        print(f"Contents of directory '{storage_dir}' deleted.")

input data location: ../data/one_pdf

Contents of directory '../data/one_pdf_roberta_parsed' deleted.


In [18]:
# Run the same parse / tokenize / chunk pipeline on the single-PDF test folder.
parsed_pdf_to_json(
    directory,
    storage_dir,
    embedding_layer_model=model,
    tokenizer=tokenizer,
    chunk_size=hyperparams['chunk_size'],
    chunk_overlap=hyperparams['chunk_overlap'],
    additional_stopwords=special_characters,
)

../data/one_pdf/Starlink - Wikipedia.pdf
processing text...
making lower-case...
Removing non-text elements (extra whitespaces)...
Removing unnecessary whitespace and special characters...
Removing line breaks...
Removing gibberish...
Removing unicode...
remove single letters or super large words (so big they don't make sense)...
done cleaning.

tokenize the processed text...
Chunking the tokenized text...

printing the shape of chunked dataframe
(333, 13)


#### Testing some code

In [17]:
# BELOW CODE: Adding a spell checker to the clean_text function in utils.py

import re
import unicodedata
from spellchecker import SpellChecker

def clean_text(text, spell=None):
    """Normalize ``text`` to ASCII, collapse whitespace and optionally spell-correct it.

    Args:
        text: The string to clean.
        spell: Optional spell checker. Any object exposing ``correction(word)``
            works (e.g. a pyspellchecker ``SpellChecker`` instance). When it is
            ``None`` (the default) no spell checking is performed.

    Returns:
        The cleaned string: NFKD-normalized, with characters that have no ASCII
        form dropped, every whitespace run replaced by a single space, and
        leading/trailing whitespace removed. If ``spell`` is given, each
        whitespace-separated word is replaced by ``spell.correction(word)``
        unless that call returns ``None``, in which case the word is kept.
    """
    # Normalize Unicode characters; anything without an ASCII equivalent
    # (e.g. curly quotes) is silently dropped by the 'ignore' error handler.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

    # Replace every run of whitespace (spaces, tabs, line breaks) with one space.
    text = re.sub(r'\s+', ' ', text)

    # Optional spell correction, one lookup per word.
    if spell is not None:
        corrected_words = []
        for word in text.split():
            suggestion = spell.correction(word)
            # Keep the original word when the checker has no suggestion.
            corrected_words.append(word if suggestion is None else suggestion)
        text = ' '.join(corrected_words)

    return text.strip()  # Remove leading and trailing whitespace

# Example usage: the sample contains a misspelling ("therea") and curly quotes.
raw_text = "therea are some known \u201celementary\u201d building blocks for lagrangian cobordisms..."
cleaned_text = clean_text(text=raw_text)
print(cleaned_text)





therea are some known elementary building blocks for lagrangian cobordisms...
