# PDF Text Extraction and Processing

This notebook demonstrates how to extract text from PDF files, process the content, and prepare it for further analysis.

## Setup and Imports

In [11]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from tqdm.notebook import tqdm
import pypdf
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Set paths
# Root directory that is searched recursively for PDF files; the path is
# relative, so it resolves against the notebook's working directory.
DATA_DIR = 'data/'

## PDF Text Extraction Function

# Function to extract text from PDFs
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path to the PDF (str or path-like); opened in binary mode.

    Returns:
        A dict with the keys 'filename' (base name of ``pdf_path``), 'text'
        (each page's text followed by a blank line, in page order) and
        'pages' (number of pages), or None if the file could not be opened
        or parsed.

    Errors are reported with print() instead of being raised so that a batch
    run over many files can continue past a single bad PDF.
    """
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = pypdf.PdfReader(file)
            # Treat a page that yields no text (None) as empty rather than
            # letting the concatenation fail and discard the whole document.
            # Collect the pieces and join once instead of repeated `+=`.
            page_texts = [
                (page.extract_text() or "") + "\n\n"
                for page in pdf_reader.pages
            ]

            return {
                'filename': os.path.basename(pdf_path),
                'text': "".join(page_texts),
                'pages': len(page_texts)
            }
    except Exception as e:
        # Deliberately broad: any read/parse failure skips this file only.
        print(f"Error processing {pdf_path}: {e}")
        return None

## Loading PDF Files

# Load all PDFs from directory (recursively). glob() yields paths in an
# unspecified order, so sort them to keep the document order stable
# between runs and machines.
pdf_files = sorted(Path(DATA_DIR).glob('**/*.pdf'))
print(f"Found {len(pdf_files)} PDF files")

# Process PDFs. extract_text_from_pdf returns None when a file cannot be
# read, so failed files are skipped instead of aborting the whole run.
documents = []
for pdf_path in tqdm(pdf_files):
    doc = extract_text_from_pdf(pdf_path)
    if doc:
        documents.append(doc)

## Basic Analysis

# Create a DataFrame. The columns are named explicitly so that they exist
# even when no document was extracted and `documents` is empty.
df = pd.DataFrame(documents, columns=['filename', 'text', 'pages'])

# Display basic statistics
print(f"Total documents: {len(df)}")
print("\nDocument page distribution:")
print(df['pages'].describe())

# Calculate text length (characters per document) with the vectorized
# string accessor.
df['text_length'] = df['text'].str.len()
print("\nText length statistics:")
print(df['text_length'].describe())

## Visualization


In [None]:
# Histogram of the page count per document, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['pages'], ax=ax)
ax.set_title('Number of Pages per Document')
ax.set_xlabel('Pages')
ax.set_ylabel('Count')
plt.show()

## Text Chunking for Processing

In [None]:
# Splitter producing chunks of up to 1000 characters with a 100-character
# overlap between neighbouring chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

# Build one record per chunk; the chunk id is "<filename>_<position>".
# NOTE(review): 'filename' is only the base name, so PDFs with the same name
# in different subfolders would produce identical chunk ids. Confirm whether
# downstream ingestion needs them to be unique.
all_chunks = []
for _, doc_row in tqdm(df.iterrows(), total=len(df)):
    doc_name = doc_row['filename']
    pieces = text_splitter.split_text(doc_row['text'])
    all_chunks.extend(
        {
            'chunk_id': f"{doc_name}_{position}",
            'document': doc_name,
            'text': piece,
            'chunk_length': len(piece),
        }
        for position, piece in enumerate(pieces)
    )

## Analyzing Chunks

In [None]:
# Create chunks DataFrame
chunks_df = pd.DataFrame(all_chunks)

print(f"\nTotal chunks created: {len(chunks_df)}")

if chunks_df.empty:
    # Nothing to average or sample: dividing by len(df) or calling sample(1)
    # on an empty frame would raise.
    print("No chunks were created; skipping per-document average and sample.")
else:
    # A non-empty chunk table implies at least one document, so len(df) > 0.
    print(f"Average chunks per document: {len(chunks_df) / len(df):.2f}")

    # Sample a chunk (fixed seed so a re-run shows the same preview)
    print("\nSample chunk:")
    sample = chunks_df.sample(1, random_state=42).iloc[0]
    print(f"Document: {sample['document']}")
    print(f"Chunk ID: {sample['chunk_id']}")
    print(f"Preview: {sample['text'][:200]}...")

## Saving Results


In [None]:
# Write the chunk table to CSV (row index omitted) for the ingestion step,
# then report how many rows were written.
n_saved = len(chunks_df)
chunks_df.to_csv('processed_chunks.csv', index=False)
print(f"Saved {n_saved} chunks to processed_chunks.csv")