# Chetan - LlamaParse PDF Text Extraction

In [None]:
import os
import sys
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv
from llama_parse import LlamaParse

# Put the parent of the current working directory on the import path so that
# the project-local `src.config` module below can be imported.
sys.path.append(str(Path.cwd().parent))
from src.config import DATA_DIR

# Load environment variables via python-dotenv before reading the API key.
load_dotenv()

# Report only whether the key is set; the key itself is never printed.
api_key_status = "Yes" if os.getenv("LLAMA_CLOUD_API_KEY") else "No"

print(f"Data directory: {DATA_DIR}")
print(f"API Key loaded: {api_key_status}")

Data directory: /Users/cgoenka/Documents/cs189/ML289-PRAP/data/records
API Key loaded: Yes


In [4]:
# Initialize parser
# Options for the LlamaParse client used by the single-file test:
# markdown result type, verbose logging, English language.
parser_options = {
    "result_type": "markdown",
    "verbose": True,
    "language": "en",
}
parser = LlamaParse(api_key=os.getenv("LLAMA_CLOUD_API_KEY"), **parser_options)

In [5]:
# TESTING ON SINGLE PDF
# Smoke test: run the parser on one record before attempting a batch.

test_pdf = DATA_DIR / "record0.pdf"
print(f"Parsing: {test_pdf.name}")

documents = parser.load_data(str(test_pdf))

# Join the text of every returned document, separated by a blank line.
page_texts = [doc.text for doc in documents]
content = "\n\n".join(page_texts)

# The count printed as "page(s)" is the number of returned documents.
print(
    f"\nParsed {len(documents)} page(s)",
    f"Total content length: {len(content)} characters",
    sep="\n",
)

Parsing: record0.pdf
Started parsing the file under job_id 34b4a339-bf42-48d9-8487-9281894e1edf

Parsed 1 page(s)
Total content length: 1520 characters


In [6]:
# Display parsed content: print the joined document text held in `content`
# so the extraction quality can be inspected by eye.
print(content)

User: PITKIN
# Napa Police Department

04/05/2023 17:34:42

# Case Management Tracking

| Time                | Action     | Description                                                                                                                                                    | Officer          | Hours Spent |
| ------------------- | ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------- | ----------- |
| 06/02/2022 07:35:11 | OFFICER    | (320140) PIERSIG, PETER assigned (320126) UPCHURCH, KYLE to Case as LEAD INVESTIGATOR                                                                          | Piersig, Peter   | 0.00        |
| 06/02/2022 11:08:20 | CLEARED BY | Closed by arrest made by (320494) BARRERA, ADAM                                                                                                                | Barrera, Adam    | 0.

In [9]:
# PARSE FIRST n PDFs
#
# Parses record0.pdf .. record{num_records - 1}.pdf from DATA_DIR and stores
# the joined text of each file in `results`, keyed by file name. A file that
# yields no text (or raises while parsing) is stored as "" so the batch keeps
# going and the success count below reflects it. Missing files are skipped and
# get no entry in `results`.

num_records = 3
results = {}

for i in range(num_records):
    pdf_path = DATA_DIR / f"record{i}.pdf"

    if not pdf_path.exists():
        print(f"{pdf_path.name} not found")
        continue

    print(f"Parsing: {pdf_path.name}")

    try:
        # NOTE(review): a fresh parser is built for every file, as in the
        # original cell; presumably deliberate -- confirm whether the shared
        # `parser` instance could be reused instead.
        parser_temp = LlamaParse(
            api_key=os.getenv("LLAMA_CLOUD_API_KEY"),
            result_type="markdown",
            verbose=True,
            language="en",
        )
        # Loop-local names: do not rebind the `documents` / `content`
        # produced by the single-PDF test cell.
        record_documents = parser_temp.load_data(str(pdf_path))
    except Exception as exc:
        # One bad file must not abort the whole batch; record it as empty.
        print(f"Parsing failed: {type(exc).__name__}: {exc} \n")
        results[pdf_path.name] = ""
        continue

    record_text = "\n\n".join(doc.text for doc in record_documents)
    results[pdf_path.name] = record_text

    if record_text:
        print(f"Parsing complete: {len(record_text)} characters, {len(record_documents)} page(s) \n")
    else:
        # Presumably load_data can return no documents when the service fails
        # to parse a file -- confirm against the installed llama_parse version.
        print("Parsing failed: no content returned \n")

# Only non-empty results count as successfully parsed.
parsed_count = sum(1 for text in results.values() if text)
print(f"\nParsed {parsed_count} out of {num_records} records successfully")

Parsing: record0.pdf
Started parsing the file under job_id 8fa8cd3c-cc1b-4a49-b875-02ec8289bbbc
Parsing complete: 1520 characters, 1 page(s) 

Parsing: record1.pdf
Started parsing the file under job_id 6c4ddd08-3ace-4170-8bfb-b2c4596fd89b
Parsing complete: 4072 characters, 2 page(s) 

Parsing: record2.pdf
Started parsing the file under job_id 55b423f2-4db7-4cdd-8dfb-040fa19d54c4
Parsing complete: 13037 characters, 4 page(s) 


Parsed 3 out of 3 records successfully


In [10]:
# SUMMARY OF PARSED DATA
#
# Builds one row of size statistics per non-empty entry in `results`.
# `pd` comes from the import cell at the top of the notebook.

# Fixed column order, so the frame has the same columns even when nothing parsed.
SUMMARY_COLUMNS = ["filename", "length", "lines", "words"]

# Comprehension variables stay local, so the notebook-level `content`
# from the single-PDF test cell is not overwritten here.
summary = [
    {
        'filename': name,
        'length': len(text),          # characters
        'lines': text.count('\n'),    # newline count; a final unterminated line is not counted
        'words': len(text.split()),   # whitespace-separated tokens
    }
    for name, text in results.items()
    if text  # skip files whose parse produced no text
]

df_summary = pd.DataFrame(summary, columns=SUMMARY_COLUMNS)
df_summary

Unnamed: 0,filename,length,lines,words
0,record0.pdf,1520,14,156
1,record1.pdf,4072,93,609
2,record2.pdf,13037,222,1904
