# Reading PDF Files in Python

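This notebook demonstrates three ways to extract text from PDF files in Python (pypdf, pdfplumber, and PyMuPDF) and compares the pypdf and pdfplumber output. The final section exercises the project's SearchAgent.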


In [None]:
# Method 1: pypdf (simple and lightweight)
# Install with: uv add pypdf

from pypdf import PdfReader

# Document to read (swap in "CV/cover_letter.pdf" to try the cover letter instead)
pdf_path = "CV/CV.pdf"
reader = PdfReader(pdf_path)

# Page count is reused in the summary header further down
page_count = len(reader.pages)
print(f"Number of pages: {page_count}")

# Text of the first page only
first_page = reader.pages[0]
text = first_page.extract_text()
print("\n--- First Page Text ---")
print(text)

# Text of every page, in page order, joined with newlines
all_text = [page.extract_text() for page in reader.pages]
full_text = "\n".join(all_text)

print(f"\n--- Full Document ({page_count} pages) ---")
# Show a 500-character preview; the ellipsis is added only when text was cut off
if len(full_text) > 500:
    print(full_text[:500] + "...")
else:
    print(full_text)

In [None]:
# Print the complete extracted text (the previous cell shows at most a 500-character preview)
print(full_text)

In [None]:
# Keep the pypdf result under its own name: the pdfplumber cell below rebinds `full_text`,
# and the comparison cells need both versions.
full_text_simp = full_text

In [None]:
# Method 2: Using pdfplumber (better for tables and formatted text)
# Install: uv add pdfplumber
#
# Prints the page count, the first page's text and tables, then the text of the
# whole document. Leaves the joined text in `full_text` for the comparison cells.

import pdfplumber

pdf_path = "CV/CV.pdf"

# The `with` block closes the PDF when the cell finishes, even if extraction raises
with pdfplumber.open(pdf_path) as pdf:
    print(f"Number of pages: {len(pdf.pages)}")

    # Extract text from first page
    first_page = pdf.pages[0]
    # `or ""`: extract_text() may return None for a page without text
    # (older pdfplumber releases did), which would print "None" here
    text = first_page.extract_text() or ""
    print("\n--- First Page Text ---")
    print(text)

    # Extract tables (useful for structured data); each table is a list of rows
    print("\n--- Tables found on first page ---")
    tables = first_page.extract_tables()
    for i, table in enumerate(tables):
        print(f"Table {i+1}:")
        for row in table:
            print(row)

    # Extract text from all pages. Normalising None to "" keeps the join below
    # from raising TypeError on a text-less (e.g. scanned/image-only) page.
    all_text = [page.extract_text() or "" for page in pdf.pages]

    full_text = "\n".join(all_text)
    print("\n--- Full Document Text ---")
    print(full_text)


In [None]:
# Compare full_text_simp (pypdf) and full_text (pdfplumber)
#
# Requires `full_text_simp` (the saved pypdf output) and `full_text` (the
# pdfplumber output) from the cells above. Run this before the PyMuPDF cell:
# that cell rebinds `full_text`, which would make the labels below misleading.

print("=" * 80)
print("COMPARISON: full_text_simp (pypdf) vs full_text (pdfplumber)")
print("=" * 80)

print("\nLength comparison:")
print(f"  full_text_simp: {len(full_text_simp)} characters")
print(f"  full_text:      {len(full_text)} characters")
print(f"  Difference:     {abs(len(full_text_simp) - len(full_text))} characters")

# Show first 500 characters of each
print("\n" + "=" * 80)
print("First 500 characters - full_text_simp (pypdf):")
print("=" * 80)
print(full_text_simp[:500])

print("\n" + "=" * 80)
print("First 500 characters - full_text (pdfplumber):")
print("=" * 80)
print(full_text[:500])

# Check if they're identical
if full_text_simp == full_text:
    print("\n✓ The texts are IDENTICAL")
else:
    print("\n✗ The texts are DIFFERENT")

    # Index of the first mismatching character within the common length,
    # or None when the shorter text is a prefix of the longer one.
    min_len = min(len(full_text_simp), len(full_text))
    first_diff = next(
        (pos for pos in range(min_len) if full_text_simp[pos] != full_text[pos]),
        None,
    )

    if first_diff is None:
        # The strings are unequal but agree on every shared position, so they
        # differ only in length. Report the tail instead of printing nothing.
        if len(full_text_simp) > len(full_text):
            longer_name, extra = "full_text_simp", full_text_simp[min_len:]
        else:
            longer_name, extra = "full_text", full_text[min_len:]
        print(f"\nTexts agree for the first {min_len} characters; "
              f"{longer_name} has {len(extra)} extra characters.")
        print(f"  Extra text starts with: {extra[:50]!r}")
    else:
        i = first_diff
        print(f"\nFirst difference at position {i}:")
        # !r keeps whitespace/control characters visible (e.g. '\n' rather than a
        # literal line break); ordinary characters still print as 'x'.
        print(f"  full_text_simp[{i}]: {full_text_simp[i]!r} (ord: {ord(full_text_simp[i])})")
        print(f"  full_text[{i}]:      {full_text[i]!r} (ord: {ord(full_text[i])})")
        print("\nContext around difference:")
        # Up to 50 characters either side, clamped to the range both texts share
        start = max(0, i - 50)
        end = min(min_len, i + 50)
        print(f"  full_text_simp: ...{full_text_simp[start:end]}...")
        print(f"  full_text:      ...{full_text[start:end]}...")


In [None]:
# More detailed comparison using difflib
#
# Line-level unified diff of the pypdf text (`full_text_simp`) against the
# pdfplumber text (`full_text`), followed by line-count statistics and a
# character-level similarity ratio. Both variables come from earlier cells.

import difflib

# Split into lines for better comparison
simp_lines = full_text_simp.splitlines()
pdf_lines = full_text.splitlines()

# Show unified diff
print("=" * 80)
print("UNIFIED DIFF (showing differences line by line)")
print("=" * 80)
print("Lines prefixed with '-' are in full_text_simp (pypdf) but not in full_text (pdfplumber)")
print("Lines prefixed with '+' are in full_text (pdfplumber) but not in full_text_simp (pypdf)")
# unified_diff never emits '?' hint lines (those come from difflib.ndiff), so the
# legend describes the '@@' hunk headers that actually appear in the output.
print("Lines prefixed with '@@' mark the position of each changed hunk")
print("=" * 80)

diff = difflib.unified_diff(
    simp_lines,
    pdf_lines,
    fromfile='full_text_simp (pypdf)',
    tofile='full_text (pdfplumber)',
    lineterm='',  # input lines carry no trailing newline, so emit none either
    n=3  # context lines
)

# Show first 50 lines of diff
diff_list = list(diff)
if diff_list:
    for line in diff_list[:50]:
        print(line)
    if len(diff_list) > 50:
        print(f"\n... ({len(diff_list) - 50} more diff lines)")
else:
    print("No differences found!")

# Count differences. The set operations count *distinct* line contents, so a
# line repeated several times in one text is counted once.
simp_unique = set(simp_lines)
pdf_unique = set(pdf_lines)
print("\n" + "=" * 80)
print("STATISTICS")
print("=" * 80)
print(f"Total lines in full_text_simp: {len(simp_lines)}")
print(f"Total lines in full_text: {len(pdf_lines)}")
print(f"Lines only in full_text_simp: {len(simp_unique - pdf_unique)}")
print(f"Lines only in full_text: {len(pdf_unique - simp_unique)}")
print(f"Common lines: {len(simp_unique & pdf_unique)}")

# Similarity ratio.
# autojunk=False: with the default heuristic, any character making up more than
# 1% of a 200+ character sequence is treated as junk, which for ordinary text
# means most letters and spaces, and that distorts a character-level ratio.
# Disabling it is slower, which is acceptable for a document of CV size.
similarity = difflib.SequenceMatcher(
    None, full_text_simp, full_text, autojunk=False
).ratio()
print(f"\nOverall similarity: {similarity:.2%}")


In [None]:
# Method 3: Using PyMuPDF (fitz) - Fast and powerful
# Install: uv add pymupdf
#
# Prints the page count, the first page's text, and a preview of the whole
# document. Note that this cell rebinds `full_text`.

import fitz  # PyMuPDF

pdf_path = "CV/CV.pdf"

# Open the document as a context manager so it is closed even when extraction
# raises; a trailing doc.close() would be skipped in that case.
with fitz.open(pdf_path) as doc:
    print(f"Number of pages: {doc.page_count}")

    # Extract text from first page
    first_page = doc[0]
    text = first_page.get_text()
    print("\n--- First Page Text ---")
    print(text)

    # Extract all text; iterating a Document yields its pages in order
    all_text = [page.get_text() for page in doc]

full_text = "\n".join(all_text)
print("\n--- Full Document Text ---")
# 500-character preview; the ellipsis is added only when the text was truncated
print(full_text[:500] + "..." if len(full_text) > 500 else full_text)


## Comparison of Libraries

- **pypdf**: Simple, lightweight, good for basic text extraction
- **pdfplumber**: Best for extracting tables and maintaining text formatting
- **PyMuPDF (fitz)**: Fastest, can also extract images and handle complex PDFs

For most use cases, **pypdf** or **pdfplumber** are recommended.


In [None]:
# Helper function to read PDF (using pypdf as default)
def read_pdf(file_path: str) -> str:
    """
    Extract the text of every page of a PDF and return it as one string.

    Pages are processed in document order and their texts are joined with a
    single newline between consecutive pages.

    Args:
        file_path: Path to the PDF file

    Returns:
        Full text content of the PDF (an empty string when it has no pages)
    """
    # Imported here so the helper is self-contained when copied elsewhere
    from pypdf import PdfReader

    pages = PdfReader(file_path).pages
    return "\n".join(page.extract_text() for page in pages)

# Example usage: load the CV, then report its length and a short preview
cv_text = read_pdf("CV/CV.pdf")
print(f"CV text length: {len(cv_text)} characters")
# One call, newline-separated: header line first, then the first 300 characters
print("\nFirst 300 characters:", cv_text[:300], sep="\n")


# Testing SearchAgent

This section tests the SearchAgent functionality for searching company information.


In [1]:
# Import the SearchAgent
import sys
import os

# Put the current working directory (not its parent) at the front of sys.path
# so the local `agents` package can be imported.
# NOTE(review): assumes the kernel's working directory is the one that contains
# `agents/` - confirm for your setup.
current_dir = os.getcwd()
if current_dir not in sys.path:
    sys.path.insert(0, current_dir)

from agents.search import SearchAgent

# Initialize the search agent; `search_agent` is reused by every test cell below
search_agent = SearchAgent()
print("SearchAgent initialized successfully!")


SearchAgent initialized successfully!


In [None]:
# Test 1: Simple search with a direct query string
#
# Calls search_agent.search_tavily() with a plain string and prints a summary
# of the first hit. Leaves the raw response in `results` for later cells.
# NOTE(review): assumes the response is a dict with a 'results' list of dicts,
# as the lookups below imply - confirm against agents/search.py.
test_query = "Adspert, Berlin, Germany, Digital Marketing"
print(f"Testing search with query: '{test_query}'")
print("-" * 80)

try:
    results = search_agent.search_tavily(test_query)
    print("✓ Search completed successfully!")
    # `or []` covers both a missing key and an explicit None value
    hits = results.get('results') or []
    print(f"Number of results: {len(hits)}")
    if hits:
        # Print the header only when there is something to show under it
        print("\nFirst result:")
        first_result = hits[0]
        print(f"  Title: {first_result.get('title', 'N/A')}")
        print(f"  URL: {first_result.get('url', 'N/A')}")
        # Fall back to 'N/A' for a None content as well, so slicing cannot fail
        content = first_result.get('content') or 'N/A'
        print(f"  Content preview: {content[:200]}...")
    else:
        print("\nNo results returned.")
except Exception as e:
    # Deliberately broad: this is an exploratory cell, so report the failure
    # with its traceback rather than aborting the notebook run.
    print(f"✗ Error: {e}")
    import traceback
    traceback.print_exc()


In [None]:
# Display the raw response stored in `results` by the search cell above
results

In [None]:
# Test 2: Search using a query built from a company-information dictionary
# Note: the run method currently has an issue - it passes a dict to search_tavily,
# which expects a string. This cell shows how it should work with a formatted query.

company_info = dict(
    name="Adspert",
    industry="Digital Marketing",
    location="Berlin, Germany",
)

# Fill the module's query template with a one-line company summary
from agents.search import query as query_template

company_summary = (
    f"Company: {company_info['name']}, "
    f"Industry: {company_info['industry']}, "
    f"Location: {company_info['location']}"
)
formatted_query = query_template.format(company_information=company_summary)

print("Testing search with formatted query:")
print(f"Query: {formatted_query[:100]}...")
print("-" * 80)

try:
    results = search_agent.search_tavily(formatted_query)
    print("✓ Search completed successfully!")
    print(f"Number of results: {len(results.get('results', []))}")

    # Despite the heading, at most the first five results are listed, numbered from 1
    print("\nAll results:")
    top_hits = results.get('results', [])[:5]
    for rank, hit in enumerate(top_hits, start=1):
        print(f"\n{rank}. {hit.get('title', 'N/A')}")
        print(f"   URL: {hit.get('url', 'N/A')}")
        print(f"   Content: {hit.get('content', 'N/A')[:150]}...")

except Exception as e:
    # Report any failure together with its traceback instead of stopping the run
    print(f"✗ Error: {e}")
    import traceback
    traceback.print_exc()


In [None]:
# Test 3: Look at the shape of a raw search response
test_query = "Adspert company"
results = search_agent.search_tavily(test_query)

# Top-level overview of the response
print("Full results structure:")
print("=" * 80)
print(f"Keys in results: {list(results.keys())}")
print(f"\nQuery used: {results.get('query', 'N/A')}")
print(f"Response time: {results.get('response_time', 'N/A')} seconds")
hits = results.get('results', [])
print(f"Number of results: {len(hits)}")

# Field-by-field dump of the first hit, if there is one
if hits:
    print("\nFirst result structure:")
    first_result = hits[0]
    print(f"  Keys: {list(first_result.keys())}")
    for key, value in first_result.items():
        # Long strings are cut to 200 characters; everything else prints as-is
        is_long_text = isinstance(value, str) and len(value) > 200
        shown = f"{value[:200]}..." if is_long_text else value
        print(f"  {key}: {shown}")


In [4]:
# Test 4: Test the run method (note: this may need fixing in search.py)
#
# Calls search_agent.run() and reports the type and value of whatever it returns.
# Dict form of the input, kept for reference:
# company_info = {
#     "name": "Adspert",
#     "industry": "Digital Marketing",
#     "description": "A digital marketing company"
# }
# `company_info` is a plain string here, so the dict-handling issue mentioned in
# the printed note below is not exercised by this call.
company_info = "Adspert, Berlin, Germany, A digital marketing company"


print(f"Testing run method with company_info: {company_info}")
print("-" * 80)
print("Note: The run method currently has an issue - it passes dict directly to search_tavily")
print("This test will show the current behavior")
print("-" * 80)

try:
    result = search_agent.run(company_info)
    print(f"Result type: {type(result)}")
    # Compare against None explicitly: a plain truthiness check would report
    # "Result is None" for any falsy return value such as {}, [] or "".
    if result is not None:
        print(f"Result: {result}")
    else:
        print("Result is None (method doesn't return anything)")
except Exception as e:
    # Deliberately broad: show the failure and its traceback, keep the notebook going
    print(f"✗ Error: {e}")
    import traceback
    traceback.print_exc()


Testing run method with company_info: Adspert, Berlin, Germany, A digital marketing company
--------------------------------------------------------------------------------
Note: The run method currently has an issue - it passes dict directly to search_tavily
This test will show the current behavior
--------------------------------------------------------------------------------
Result type: <class 'NoneType'>
Result is None (method doesn't return anything)


In [3]:
# Display the most recent search response. One of the search cells above must
# have run in this kernel session first; otherwise `results` is undefined and
# this cell raises NameError.
results

NameError: name 'results' is not defined

In [None]:
# Dictionary form of the company information.
# Note: this rebinds `company_info`, which Test 4 sets to a plain string.
company_info = {
    "name": "Adspert",
    "industry": "Digital Marketing",
    "description": "A digital marketing company"
}

In [None]:
# String form of the company information (same text Test 4 passes to run()).
# A bare expression: the cell only displays the value and assigns nothing.
"Adspert, Berlin, Germany, A digital marketing company"