# EPUB Text Extraction Script

This notebook extracts text content and metadata from EPUB files in the Nordic travel literature collection.

**Purpose:**
- Batch process multiple EPUB files from the `literature epub` folder
- Extract element content and metadata (source file, element structure) from each EPUB
- Save extracted data as CSV files for further analysis

**Workflow:**
1. Load EPUB files using LangChain's UnstructuredEPubLoader
2. Extract text content and metadata from each element (chapters, sections, paragraphs)
3. Organize data into pandas DataFrames
4. Export to CSV files in `literature csv_raw` folder

**Input:** EPUB files in `.\literature epub\`

**Output:** CSV files (one per EPUB) in `.\literature csv_raw\` containing:
- `source`: Path of the source EPUB file, as passed to the loader
- `page_content`: Extracted text content from each element
- Additional metadata fields depending on EPUB structure

**Dependencies:** langchain_community, langchain, unstructured (with EPUB support, which requires pandoc), pandas, tqdm

**Note:** EPUB files are processed as elements (structural components) rather than pages, as EPUBs are reflowable documents without fixed pagination.

In [1]:
# Import required libraries
print ('import libraries')

# LangChain document loaders for EPUB and PDF processing
from langchain_community.document_loaders import UnstructuredEPubLoader  # Main EPUB loader
from langchain.document_loaders import PyPDFLoader  # For PDF files (if needed)

# Data manipulation and analysis
import pandas as pd  # For creating and managing DataFrames

# Text processing and file operations
import re  # Regular expressions for text cleaning (if needed)
import os  # For file and directory operations
import csv  # CSV file handling
import json  # JSON data handling (if needed)

# Progress tracking
from tqdm.notebook import tqdm  # Display progress bars in Jupyter notebooks
tqdm.pandas()  # Enable progress bars for pandas operations

# Time tracking
import time  # For measuring execution time (if needed)

import libraries


In [2]:
# Function to extract text from EPUB and return data in a DataFrame
def epub_text_extract_from_file(epub_file):
    """
    Extract text content and metadata from an EPUB file.

    The EPUB is loaded with UnstructuredEPubLoader in ``mode="elements"``,
    so the loader returns one document per element instead of a single
    document for the whole book.

    Args:
        epub_file (str): Path to the EPUB file to process

    Returns:
        pd.DataFrame: One row per loaded element, indexed 0..n-1. Columns are
                      the keys of each element's metadata dictionary
                      (varies by EPUB) plus:
                      - page_content: Extracted text of the element
                      An empty DataFrame is returned when the loader yields
                      no elements.
    """
    # mode="elements" splits the book into its individual elements
    loader = UnstructuredEPubLoader(epub_file, mode="elements")
    docs = loader.load()

    # Nothing extracted: hand back an empty frame so callers can test .empty
    if not docs:
        return pd.DataFrame()

    # Collect plain dict rows and build the DataFrame once. Creating a
    # one-row DataFrame per element and concatenating them is far slower
    # for books with thousands of elements and yields the same table.
    # 'page_content' is written last so it overrides a metadata key of the
    # same name, matching the previous column-assignment behaviour.
    rows = [{**doc.metadata, 'page_content': doc.page_content} for doc in docs]

    return pd.DataFrame(rows)

In [None]:
# ============================================
# BATCH PROCESSING: Extract all EPUB files
# ============================================

# Define input and output directories.
# os.path.join produces the same relative paths as before on Windows and
# valid paths on other platforms, where a hard-coded backslash would be
# treated as part of the directory name.
departure_folder = os.path.join('.', 'literature epub')     # Source folder containing EPUB files
arrival_folder = os.path.join('.', 'literature csv_raw')    # Destination folder for CSV output

# Create the output directory if it doesn't exist
# exist_ok=True prevents errors if the directory already exists
os.makedirs(arrival_folder, exist_ok=True)

# Get a list of all entries in the EPUB folder
files_in_folder = os.listdir(departure_folder)

# Keep only EPUB files before starting the loop so the progress bar total
# reflects the books that are actually processed. Sorting makes the
# processing order deterministic (os.listdir order is arbitrary).
epub_files = sorted(
    name for name in files_in_folder if name.lower().endswith('.epub')
)

# Process each EPUB with a progress bar
for file in tqdm(epub_files, desc='Extracting data from multiple epub files', colour='blue'):

    try:
        # Construct the full path to the EPUB file
        file_path = os.path.join(departure_folder, file)

        # Extract text and metadata; returns a DataFrame with all elements
        # from this book
        book = epub_text_extract_from_file(file_path)

        # Skip books that produced no content
        if book.empty:
            print(f"Warning: No content extracted from {file}")
            continue

        # Swap the extension for .csv; splitext avoids relying on the
        # extension being exactly five characters long
        csv_file_name = os.path.splitext(file)[0] + '.csv'

        # Construct the full path for the output CSV file
        csv_file_path = os.path.join(arrival_folder, csv_file_name)

        # Save the DataFrame to CSV
        # index=False prevents pandas from writing row numbers as a column
        book.to_csv(csv_file_path, index=False)

    except Exception as e:
        # Per-file boundary: report the failure and move on so one bad
        # file does not stop the entire batch
        print(f"Error processing {file}: {str(e)}")

Extracting data from multiple epub files:   0%|          | 0/134 [00:00<?, ?it/s]