# Chetan - LlamaParse PDF Text Extraction

In [2]:
# SETUP

import os
from pathlib import Path
from llama_parse import LlamaParse
from dotenv import load_dotenv
import sys

# Make the `src` package importable by adding the parent of the current
# working directory to the import path (presumably the project root when the
# notebook is launched from its own folder). The membership check keeps the
# cell idempotent: re-running it does not append duplicate entries.
project_root = str(Path.cwd().parent)
if project_root not in sys.path:
    sys.path.append(project_root)

from src.config import DATA_DIR

# Load variables from a .env file into the process environment.
load_dotenv()

# Report the configuration without echoing the key itself.
print(f"Data directory: {DATA_DIR}")
print(f"API Key loaded: {'Yes' if os.getenv('LLAMA_CLOUD_API_KEY') else 'No'}")

Data directory: /Users/cgoenka/Documents/cs189/ML289-PRAP/data/records
API Key loaded: Yes


In [3]:
# Initialize parser

# Shared LlamaParse client used by the single-PDF test cell.
# The API key is read from the environment rather than hardcoded.
parser = LlamaParse(
    language="en",
    verbose=True,
    result_type="markdown",  # request markdown-formatted text
    api_key=os.environ.get("LLAMA_CLOUD_API_KEY"),
)

In [4]:
# TESTING ON SINGLE PDF

# Parse one record end-to-end before running the batch loop.
test_pdf = DATA_DIR / "record10.pdf"
print(f"Parsing: {test_pdf.name}")

documents = parser.load_data(str(test_pdf))

# Concatenate the text of every returned document, separated by a blank line.
content = "\n\n".join(doc.text for doc in documents)

print(
    f"\nParsed {len(documents)} page(s)\n"
    f"Total content length: {len(content)} characters"
)

Parsing: record10.pdf
Started parsing the file under job_id c776e0df-6f6f-4ec4-b8ec-dde1dd753bf6

Parsed 24 page(s)
Total content length: 66226 characters


In [5]:
# Display parsed content
# Show only a leading preview. The full text runs to tens of thousands of
# characters, and printing all of it bloats the saved notebook and exposes
# more of the underlying record than is needed for a sanity check.
PREVIEW_CHARS = 2000  # raise, or set to None, to print the whole document
print(content[:PREVIEW_CHARS])
if PREVIEW_CHARS is not None and len(content) > PREVIEW_CHARS:
    print(f"\n... [{len(content) - PREVIEW_CHARS} more characters not shown]")


Sallnas Pollce Department

# Report # 23-100611 - Incident Report Report Cover Sheet

| REPORT DATE / TIME | BEAT / REPORTING DISTRICT | EVENT START DATE / TIME | EVENT END DATE / TIME |
| ------------------ | ------------------------- | ----------------------- | --------------------- |
| Oct 20, 2023 20:01 | SN9                       | Oct 20, 2023 16:56      | 16:56                 |

# OFFENSE-1

| OFFENSE CODE                                                                                 | OFFENSE LOCATION                            | OFFENSE START DATE | OFFENSE END DATE   |
| -------------------------------------------------------------------------------------------- | ------------------------------------------- | ------------------ | ------------------ |
| VC 2800.2(A) EVADE PEACE OFFICER WITH WANTON DISREGARD FOR SAFETY F - 90Z AIl Other Offenses | ALAMEDA AVE & E ROMIE LN, SALINAS, CA 93901 | Oct 20, 2023 17:00 | Oct 20, 2023 20:10 |

# OFFENSE-2

| OFFENSE CODE             

In [9]:
# PARSE FIRST n PDFs

num_records = 3
results = {}  # maps PDF file name -> parsed text ("" when nothing came back)


def parse_record(pdf_path):
    """Parse one PDF with LlamaParse and return ``(text, page_count)``.

    ``text`` is the text of every returned document joined by blank lines;
    ``page_count`` is the number of documents returned.

    A new parser is built for each call rather than reusing a shared instance.
    NOTE(review): presumably this avoids carrying client state across
    requests -- confirm before switching to the shared `parser`.
    """
    record_parser = LlamaParse(
        api_key=os.getenv("LLAMA_CLOUD_API_KEY"),
        result_type="markdown",
        verbose=True,
        language="en",
    )
    pages = record_parser.load_data(str(pdf_path))
    return "\n\n".join(page.text for page in pages), len(pages)


for i in range(num_records):
    pdf_path = DATA_DIR / f"record{i}.pdf"

    if not pdf_path.exists():
        print(f"{pdf_path.name} not found")
        continue

    print(f"Parsing: {pdf_path.name}")

    # Distinct names keep this loop from overwriting `documents` / `content`
    # produced by the single-PDF test cell.
    record_text, page_count = parse_record(pdf_path)
    results[pdf_path.name] = record_text

    if record_text:
        print(f"Parsing complete: {len(record_text)} characters, {page_count} page(s) \n")
    else:
        # Empty text is what the final tally counts as a failure, so report
        # it as one instead of announcing completion.
        print(f"Parsing returned no content for {pdf_path.name} \n")

print(f"\nParsed {sum(1 for text in results.values() if text)} out of {num_records} records successfully")

Parsing: record0.pdf
Started parsing the file under job_id 8fa8cd3c-cc1b-4a49-b875-02ec8289bbbc
Parsing complete: 1520 characters, 1 page(s) 

Parsing: record1.pdf
Started parsing the file under job_id 6c4ddd08-3ace-4170-8bfb-b2c4596fd89b
Parsing complete: 4072 characters, 2 page(s) 

Parsing: record2.pdf
Started parsing the file under job_id 55b423f2-4db7-4cdd-8dfb-040fa19d54c4
Parsing complete: 13037 characters, 4 page(s) 


Parsed 3 out of 3 records successfully


In [10]:
# SUMMARY OF PARSED DATA

import pandas as pd

SUMMARY_COLUMNS = ['filename', 'length', 'lines', 'words']

# One row per record that produced text; records with empty text are skipped.
# A comprehension is used so no loop variable leaks into the notebook
# namespace and overwrites `content` from the single-PDF test cell.
summary = [
    {
        'filename': pdf_name,
        'length': len(parsed_text),            # characters
        'lines': parsed_text.count('\n'),      # number of newline characters
        'words': len(parsed_text.split()),     # whitespace-separated tokens
    }
    for pdf_name, parsed_text in results.items()
    if parsed_text
]

# Passing the column list keeps the frame's schema stable even when no
# record produced any text.
df_summary = pd.DataFrame(summary, columns=SUMMARY_COLUMNS)
df_summary

Unnamed: 0,filename,length,lines,words
0,record0.pdf,1520,14,156
1,record1.pdf,4072,93,609
2,record2.pdf,13037,222,1904
