In [1]:
import sys
import os
sys.path.append(os.path.join('..', 'src'))

from utils import parsed_pdf_to_json
from gensim.models import Word2Vec

  from .autonotebook import tqdm as notebook_tqdm


#### Parse PDFs in a directory: extract text, tokenize, chunk, and save to JSON files
- Specify PDF directory (directory)
- Specify Storage directory for the JSON files (storage_dir)
- Specify word embedding model, consistent with the query for getting candidate documents (model_fname)
- Specify chunk overlap: the number of tokens consecutive chunks overlap by (chunk_overlap)
- Specify the tokenizer (TOKENIZER)

In [None]:
# BertTokenizer / RobertaTokenizer are not imported by the import cell above,
# so bring them into scope here; without this the cell raises NameError on a
# fresh kernel.
from transformers import BertTokenizer, RobertaTokenizer

# Set the Tokenizer for your specific BERT model variant
# TOKENIZER = 'bert'
TOKENIZER = 'roberta'

# Pretrained tokenizers, one per supported model variant.
bert_base_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
roberta_tokenizer = RobertaTokenizer.from_pretrained("deepset/roberta-base-squad2")

# Lookup table from the TOKENIZER setting to the tokenizer instance.
tokenizers = {'bert': bert_base_tokenizer, 'roberta': roberta_tokenizer}

# The tokenizer selected by TOKENIZER (KeyError if the setting is not a key above).
tokenizer = tokenizers[TOKENIZER]

##### Test on one_pdf directory containing one pdf file

In [2]:
# Single-PDF run: input PDFs live in data/one_pdf, results go to data/one_pdf_parsed.
data_root = os.path.join("..", "data")
directory = os.path.join(data_root, "one_pdf")
storage_dir = os.path.join(data_root, "one_pdf_parsed")

# Load the trained Word2Vec model from models/word_embeddings.
model_fname = os.path.join(
    "..", "models", "word_embeddings", "roberta_word2vec_model.bin"
)
model = Word2Vec.load(model_fname)

In [3]:
# Parse the single-PDF directory with no overlap between consecutive chunks.
parsed_pdf_to_json(
    directory,
    storage_dir,
    embedding_layer_model=model,
    chunk_overlap=0,
)

../data/one_pdf/2101.00031.pdf
processing text...
making lower-case...
Removing non-text elements (extra whitespaces)...
Removing unnecessary whitespace and special characters...
Removing line breaks...
Removing gibberish...
Removing unicode...
remove single letters or super large words (so big they don't make sense)...
done cleaning.

tokenize the processed text...
Chunking the tokenized text

printing the shape of chunked dataframe
(124, 17)


##### Parse documents in test_pdfs directory

In [4]:
# Multi-PDF run: input PDFs live in data/test_pdfs, results go to
# data/parsed_cleaned_pdfs/roberta.
data_root = os.path.join("..", "data")
directory = os.path.join(data_root, "test_pdfs")
storage_dir = os.path.join(data_root, "parsed_cleaned_pdfs", "roberta")

# Load the trained Word2Vec model from models/word_embeddings.
model_fname = os.path.join(
    "..", "models", "word_embeddings", "roberta_word2vec_model.bin"
)
model = Word2Vec.load(model_fname)

In [5]:
# The original call had a dangling `tokenizer=` with no value (SyntaxError).
# Pass the tokenizer selected in the tokenizer configuration cell.
# NOTE(review): assumes parsed_pdf_to_json accepts a `tokenizer` keyword —
# confirm against utils.py.
parsed_pdf_to_json(
    directory,
    storage_dir,
    embedding_layer_model=model,
    tokenizer=tokenizer,
    chunk_overlap=0,
)

../data/test_pdfs/2101.00031.pdf
../data/test_pdfs/2101.01089.pdf
../data/test_pdfs/2101.00182.pdf
../data/test_pdfs/2101.00525.pdf
../data/test_pdfs/2101.01017.pdf
../data/test_pdfs/2101.00005.pdf
../data/test_pdfs/2101.00763.pdf
../data/test_pdfs/2101.01291.pdf
../data/test_pdfs/2101.00831.pdf
../data/test_pdfs/2101.01094.pdf
../data/test_pdfs/2101.00572.pdf
processing text...
making lower-case...
Removing non-text elements (extra whitespaces)...
Removing unnecessary whitespace and special characters...
Removing line breaks...
Removing gibberish...
Removing unicode...
remove single letters or super large words (so big they don't make sense)...
done cleaning.

tokenize the processed text...
Chunking the tokenized text

printing the shape of chunked dataframe
(906, 17)


#### Testing some code

In [17]:
# BELOW CODE: Adding a spell checker to the clean_text function in utils.py

import re
import unicodedata
from spellchecker import SpellChecker

def clean_text(text):
    """Return an ASCII-only, whitespace-normalized copy of ``text``.

    Steps, in order:
      1. NFKD-normalize, then drop every character that cannot be encoded
         as ASCII (e.g. combining accents left over from decomposition,
         curly quotes).
      2. Collapse each run of whitespace (spaces, tabs, newlines) into a
         single space.
      3. Strip leading and trailing whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Decompose first so that e.g. an accented letter becomes base letter +
    # combining mark; the ASCII round-trip then discards only the mark.
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('utf-8')

    # \s+ matches any whitespace run, not just repeated spaces.
    single_spaced = re.sub(r'\s+', ' ', ascii_only)

    # Spell checking with pyspellchecker (SpellChecker().correction per word,
    # keeping the original word when no correction is returned) was tried at
    # this point and is currently disabled.

    return single_spaced.strip()

# Example usage: the input contains curly quotes (U+201C / U+201D) and the
# misspelling "therea". The printed result shows the quotes removed and the
# misspelling left unchanged (the spell-check step is commented out).
raw_text = "therea are some known \u201celementary\u201d building blocks for lagrangian cobordisms..."
cleaned_text = clean_text(raw_text)
print(cleaned_text)





therea are some known elementary building blocks for lagrangian cobordisms...
