In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from IPython.display import  clear_output
import time
import PyPDF2
from pathlib import Path
from tqdm.auto import tqdm
from typing import Optional

device = 'cuda' if torch.cuda.is_available() else 'cpu'

DEFAULT_MODEL = "meta-llama/Llama-3.2-3B-Instruct"


model = AutoModelForCausalLM.from_pretrained(
    DEFAULT_MODEL,
    torch_dtype=torch.bfloat16,
    use_safetensors=True,
    device_map=device,
)

tokenizer = AutoTokenizer.from_pretrained(DEFAULT_MODEL, use_safetensors=True)
tokenizer.pad_token_id = tokenizer.eos_token_id

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [2]:
pdf_path = r'C:\Users\User\Desktop\NLP\1706.03762v7.pdf'

with open(pdf_path, 'rb') as file:
    # Create PDF reader object
    pdf_reader = PyPDF2.PdfReader(file)
    
    # Get total number of pages
    num_pages = len(pdf_reader.pages)
    print(f"Processing PDF with {num_pages} pages...")
    
    extracted_text = []
    total_chars = 0
    max_chars = 100000
    
    # Iterate through all pages
    for page_num in range(num_pages):
        # Extract text from page
        page = pdf_reader.pages[page_num]
        text = page.extract_text()
        
        # Check if adding this page's text would exceed the limit
        if total_chars + len(text) > max_chars:
            # Only add text up to the limit
            remaining_chars = max_chars - total_chars
            extracted_text.append(text[:remaining_chars])
            print(f"Reached {max_chars} character limit at page {page_num + 1}")
            break
        
        extracted_text.append(text)
        total_chars += len(text)
        print(f"Processed page {page_num + 1}/{num_pages}")
    
    final_text = '\n'.join(extracted_text)
    print(f"\nExtraction complete! Total characters: {len(final_text)}")


Processing PDF with 15 pages...
Processed page 1/15
Processed page 2/15
Processed page 3/15
Processed page 4/15
Processed page 5/15
Processed page 6/15
Processed page 7/15
Processed page 8/15
Processed page 9/15
Processed page 10/15
Processed page 11/15
Processed page 12/15
Processed page 13/15
Processed page 14/15
Processed page 15/15

Extraction complete! Total characters: 39486


In [3]:
if final_text:
    output_file = 'extracted_text.txt'
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(final_text)
    print(f"\nExtracted text has been saved to {output_file}")


Extracted text has been saved to extracted_text.txt


# Llama Pre-Processing
Now let's proceed to justify our distaste for writing regex and use that as a justification for a LLM instead:

At this point, have a text file extracted from a PDF of a paper. Generally PDF extracts can be messy due to characters, formatting, Latex, Tables, etc.

One way to handle this would be using regex, instead we can also prompt the feather light Llama models to clean up our text for us.

Please try changing the SYS_PROMPT below to see what improvements you can make:

In [4]:
SYS_PROMPT = """
You are a world class text pre-processor, here is the raw data from a PDF, please parse and return it in a way that is crispy and usable to send to a podcast writer.

The raw data is messed up with new lines, Latex math and you will see fluff that we can remove completely. Basically take away any details that you think might be useless in a podcast author's transcript.

Remember, the podcast could be on any topic whatsoever so the issues listed above are not exhaustive

Please be smart with what you remove and be creative ok?

Remember DO NOT START SUMMARIZING THIS, YOU ARE ONLY CLEANING UP THE TEXT AND RE-WRITING WHEN NEEDED

Be very smart and aggressive with removing details, you will get a running portion of the text and keep returning the processed text.

PLEASE DO NOT ADD MARKDOWN FORMATTING, STOP ADDING SPECIAL CHARACTERS THAT MARKDOWN CAPATILISATION ETC LIKES

ALWAYS start your response directly with processed text and NO ACKNOWLEDGEMENTS about my questions ok?
Here is the text:
"""

In [5]:
def create_word_bounded_chunks(text, target_chunk_size):
    """
    Split text into chunks at word boundaries close to the target chunk size.
    """
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    
    for word in words:
        word_length = len(word) + 1  # +1 for the space
        if current_length + word_length > target_chunk_size and current_chunk:
            # Join the current chunk and add it to chunks
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = word_length
        else:
            current_chunk.append(word)
            current_length += word_length
    
    # Add the last chunk if it exists
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    
    return chunks

In [6]:
CHUNK_SIZE = 1000  # Adjust chunk size if needed

chunks = create_word_bounded_chunks(final_text, CHUNK_SIZE)
print(len(chunks))

40


In [7]:
text_chunk = chunks[22]


conversation = [
    {"role": "system", "content": SYS_PROMPT},
    {"role": "user", "content": text_chunk},
]

prompt = tokenizer.apply_chat_template(conversation, tokenize=False)
inputs = tokenizer(prompt, return_tensors="pt").to(device)

with torch.no_grad():
    output = model.generate(
        **inputs,
        temperature=0.7,
        top_p=0.9,
        max_new_tokens=512
    )

processed_text = tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt):].strip()

# Print chunk information for monitoring
#print(f"\n{'='*40} Chunk {chunk_num} {'='*40}")
print(f"INPUT TEXT:\n{text_chunk[:500]}...")  # Show first 500 chars of input
print(f"\nPROCESSED TEXT:\n{processed_text[:500]}...")  # Show first 500 chars of output
print(f"{'='*90}\n")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
7.7·10191.2·1021 Transformer (base model) 27.3 38.1 3.3·1018 Transformer (big) 28.4 41.8 2.3·1019 Residual Dropout We apply dropout [ 33] to the output of each sub-layer, before it is added to the sub-layer input and normalized. In addition, we apply dropout to the sums of the embeddings and the positional encodings in both the encoder and decoder stacks. For the base model, we use a rate of Pdrop= 0.1. Label Smoothing During training, we employed label smoothing of value ϵls= 0.1[36]. This hurt...

PROCESSED TEXT:
t to the output of each sub-layer, before it is added to the sub-layer input and normalized. 
We apply dropout to the sums of the embeddings and the positional encodings in both the encoder and decoder stacks. 
For the base model, we use a rate of Pdrop= 0.1. 
Label Smoothing 
During training, we employed label smoothing of value ϵls= 0.1. 
This hurts perplexity, as the model learns to be more unsure, but improves accuracy and BLEU score. 
6 Results 
6.1 Machine 

In [8]:
def process_chunk(text_chunk, chunk_num):
    """Process a chunk of text and return both input and output for verification"""
    conversation = [
        {"role": "system", "content": SYS_PROMPT},
        {"role": "user", "content": text_chunk},
    ]
    
    prompt = tokenizer.apply_chat_template(conversation, tokenize=False)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    
    with torch.no_grad():
        output = model.generate(
            **inputs,
            temperature=0.7,
            top_p=0.9,
            max_new_tokens=512
        )
    
    processed_text = tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt):].strip()
    
    return processed_text

In [9]:
# Read the file
import os
INPUT_FILE = 'extracted_text.txt'
with open(INPUT_FILE, 'r', encoding='utf-8') as file:
    text = file.read()

# Calculate number of chunks
num_chunks = (len(text) + CHUNK_SIZE - 1) // CHUNK_SIZE

# Cell 6: Process the file with ordered output
# Create output file name
output_file = f"clean_{os.path.basename(INPUT_FILE)}"

with open(output_file, 'w', encoding='utf-8') as out_file:
    for chunk_num, chunk in enumerate(tqdm(chunks, desc="Processing chunks")):
        # Process chunk and append to complete text
        processed_chunk = process_chunk(chunk, chunk_num)
        processed_text += processed_chunk + "\n"
        
        # Write chunk immediately to file
        out_file.write(processed_chunk + "\n")
        out_file.flush()

Processing chunks:   0%|          | 0/40 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin