In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from IPython.display import  clear_output
import time
import PyPDF2
from pathlib import Path
from tqdm.auto import tqdm
from typing import Optional

device = 'cuda' if torch.cuda.is_available() else 'cpu'

DEFAULT_MODEL = "meta-llama/Llama-3.2-3B-Instruct"


model = AutoModelForCausalLM.from_pretrained(
    DEFAULT_MODEL,
    torch_dtype=torch.bfloat16,
    use_safetensors=True,
    device_map=device,
)

tokenizer = AutoTokenizer.from_pretrained(DEFAULT_MODEL, use_safetensors=True)
tokenizer.pad_token_id = tokenizer.eos_token_id

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
conversation = [
    # {"role": "system", "content": '''respind to the user as if you are stewie from family guy'''},
    {"role": "user", "content": '''who is mohan dash ?'''},
]

In [3]:
prompt = tokenizer.apply_chat_template(conversation, tokenize=False)
inputs = tokenizer(prompt, return_tensors="pt").to(device)
# print(prompt)

with torch.no_grad():
    output = model.generate(
        **inputs,
        do_sample=True,
        max_new_tokens=100
    )

processed_text = tokenizer.decode(output[0], skip_special_tokens=False)

print(processed_text)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 07 Mar 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

who is mohan dash?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Mohan Das is a popular Indian YouTuber and content creator known for his educational and entertaining content, particularly in the fields of science, history, and philosophy. His real name is not widely known, and he prefers to keep his personal life private.

Mohan Das is particularly famous for his channel "Mohan Das", where he creates in-depth, engaging, and informative videos on a wide range of topics, including:

1. Science and technology: He explains complex scientific concepts


In [4]:
pdf_path = '/home/mohan.dash/sand particle counting/NLP/1506.02640v5.pdf'

with open(pdf_path, 'rb') as file:
    # Create PDF reader object
    pdf_reader = PyPDF2.PdfReader(file)
    
    # Get total number of pages
    num_pages = len(pdf_reader.pages)
    print(f"Processing PDF with {num_pages} pages...")
    
    extracted_text = []
    total_chars = 0
    max_chars = 100000
    
    # Iterate through all pages
    for page_num in range(num_pages):
        # Extract text from page
        page = pdf_reader.pages[page_num]
        text = page.extract_text()
        
        # Check if adding this page's text would exceed the limit
        if total_chars + len(text) > max_chars:
            # Only add text up to the limit
            remaining_chars = max_chars - total_chars
            extracted_text.append(text[:remaining_chars])
            print(f"Reached {max_chars} character limit at page {page_num + 1}")
            break
        
        extracted_text.append(text)
        total_chars += len(text)
        print(f"Processed page {page_num + 1}/{num_pages}")
    
    final_text = '\n'.join(extracted_text)
    print(f"\nExtraction complete! Total characters: {len(final_text)}")


Processing PDF with 10 pages...
Processed page 1/10
Processed page 2/10
Processed page 3/10
Processed page 4/10
Processed page 5/10
Processed page 6/10
Processed page 7/10
Processed page 8/10
Processed page 9/10
Processed page 10/10

Extraction complete! Total characters: 42093


In [5]:
if final_text:
    output_file = 'extracted_text.txt'
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(final_text)
    print(f"\nExtracted text has been saved to {output_file}")


Extracted text has been saved to extracted_text.txt


# Llama Pre-Processing
Now let's proceed to justify our distaste for writing regex and use that as a justification for a LLM instead:

At this point, have a text file extracted from a PDF of a paper. Generally PDF extracts can be messy due to characters, formatting, Latex, Tables, etc.

One way to handle this would be using regex, instead we can also prompt the feather light Llama models to clean up our text for us.

Please try changing the SYS_PROMPT below to see what improvements you can make:

In [6]:
SYS_PROMPT = """
You are a world class text pre-processor, here is the raw data from a PDF, please parse and return it in a way that is crispy and usable to send to a podcast writer.

The raw data is messed up with new lines, Latex math and you will see fluff that we can remove completely. Basically take away any details that you think might be useless in a podcast author's transcript.

Remember, the podcast could be on any topic whatsoever so the issues listed above are not exhaustive

Please be smart with what you remove and be creative ok?

Remember DO NOT START SUMMARIZING THIS, YOU ARE ONLY CLEANING UP THE TEXT AND RE-WRITING WHEN NEEDED

Be very smart and aggressive with removing details, you will get a running portion of the text and keep returning the processed text.

PLEASE DO NOT ADD MARKDOWN FORMATTING, STOP ADDING SPECIAL CHARACTERS THAT MARKDOWN CAPATILISATION ETC LIKES

ALWAYS start your response directly with processed text and NO ACKNOWLEDGEMENTS about my questions ok?
Here is the text:
"""

In [7]:
def create_word_bounded_chunks(text, target_chunk_size):
    """
    Split text into chunks at word boundaries close to the target chunk size.
    """
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    
    for word in words:
        word_length = len(word) + 1  # +1 for the space
        if current_length + word_length > target_chunk_size and current_chunk:
            # Join the current chunk and add it to chunks
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = word_length
        else:
            current_chunk.append(word)
            current_length += word_length
    
    # Add the last chunk if it exists
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    
    return chunks

In [8]:
CHUNK_SIZE = 1000  # Adjust chunk size if needed

chunks = create_word_bounded_chunks(final_text, CHUNK_SIZE)
print(len(chunks))

43


In [9]:
text_chunk = chunks[22]


conversation = [
    {"role": "system", "content": SYS_PROMPT},
    {"role": "user", "content": text_chunk},
]

prompt = tokenizer.apply_chat_template(conversation, tokenize=False)
inputs = tokenizer(prompt, return_tensors="pt").to(device)

with torch.no_grad():
    output = model.generate(
        **inputs,
        temperature=0.7,
        top_p=0.9,
        max_new_tokens=512
    )

processed_text = tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt):].strip()

# Print chunk information for monitoring
#print(f"\n{'='*40} Chunk {chunk_num} {'='*40}")
print(f"INPUT TEXT:\n{text_chunk[:500]}...")  # Show first 500 chars of input
print(f"\nPROCESSED TEXT:\n{processed_text[:500]}...")  # Show first 500 chars of output
print(f"{'='*90}\n")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


INPUT TEXT:
to current state-of-the-art methods. Finally, we show that YOLO generalizes to new domains better than other detectors on two artwork datasets. 4.1. Comparison to Other Real-Time Systems Many research efforts in object detection focus on mak- ing standard detection pipelines fast. [5] [38] [31] [14] [17] [28] However, only Sadeghi et al. actually produce a de- tection system that runs in real-time (30 frames per second or better) [31]. We compare YOLO to their GPU imple- mentation of DPM which r...

PROCESSED TEXT:
ors on two artwork datasets. Our system is the fastest object detection method on PASCAL, with a mAP of 52.7% and is more than twice as accurate as prior work on real-time detection. YOLO achieves a mAP of 63.4% while maintaining real-time performance....



In [10]:
def process_chunk(text_chunk, chunk_num):
    """Process a chunk of text and return both input and output for verification"""
    conversation = [
        {"role": "system", "content": SYS_PROMPT},
        {"role": "user", "content": text_chunk},
    ]
    
    prompt = tokenizer.apply_chat_template(conversation, tokenize=False)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    
    with torch.no_grad():
        output = model.generate(
            **inputs,
            temperature=0.7,
            top_p=0.9,
            max_new_tokens=512
        )
    
    processed_text = tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt):].strip()
    
    return processed_text

In [11]:
# Read the file
import os
INPUT_FILE = 'extracted_text.txt'
with open(INPUT_FILE, 'r', encoding='utf-8') as file:
    text = file.read()

# Calculate number of chunks
num_chunks = (len(text) + CHUNK_SIZE - 1) // CHUNK_SIZE

# Cell 6: Process the file with ordered output
# Create output file name
output_file = f"clean_{os.path.basename(INPUT_FILE)}"

with open(output_file, 'w', encoding='utf-8') as out_file:
    for chunk_num, chunk in enumerate(tqdm(chunks, desc="Processing chunks")):
        # Process chunk and append to complete text
        processed_chunk = process_chunk(chunk, chunk_num)
        processed_text += processed_chunk + "\n"
        
        # Write chunk immediately to file
        out_file.write(processed_chunk + "\n")
        out_file.flush()

Processing chunks:   0%|          | 0/43 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for