In [4]:
import pdfplumber
import re
import pandas as pd
from datasets import Dataset

def is_heading(line):
    """Identify if a line is a heading based on simple heuristics.

    A line is treated as a heading when it is entirely upper-case or when it
    contains fewer than five whitespace-separated words.

    Args:
        line: A single line of text.

    Returns:
        True if the line looks like a heading, False otherwise. Blank or
        whitespace-only lines are never headings.
    """
    # A blank line has zero words and would otherwise pass the "< 5 words"
    # test, producing a section with an empty title.
    if not line.strip():
        return False
    return line.isupper() or len(line.split()) < 5

def extract_sections_from_pdf(pdf_path):
    """Split a PDF's text into sections keyed by detected heading lines.

    Every page's extracted text is walked line by line. A line accepted by
    ``is_heading`` closes the section in progress (if any) and opens a new
    one; any other line is appended, followed by a single space, to the
    content of the open section. Lines seen before the first heading are
    discarded.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A list of dicts with keys ``'title'`` (the stripped heading line) and
        ``'content'`` (the accumulated body text), in document order.
    """
    collected = []
    with pdfplumber.open(pdf_path) as document:
        active = None
        for page in document.pages:
            page_text = page.extract_text()
            if not page_text:
                # Nothing extractable on this page.
                continue
            for raw_line in page_text.split('\n'):
                if not is_heading(raw_line):
                    # Body text only counts once a section has been opened.
                    if active:
                        active['content'] += raw_line + ' '
                    continue
                # Heading: flush the section in progress and start a new one.
                if active:
                    collected.append(active)
                active = {'title': raw_line.strip(), 'content': ''}
        # The last section has no following heading to flush it.
        if active:
            collected.append(active)
    return collected

def get_first_paragraph(content, max_sentences=3):
    """Extract the first few sentences from the content.

    Sentences are delimited by whitespace that follows a ``.``, ``!`` or
    ``?``. The terminating punctuation stays attached to its sentence.

    Args:
        content: The text to take sentences from.
        max_sentences: Maximum number of leading sentences to keep.

    Returns:
        The selected sentences joined by single spaces, with surrounding
        whitespace removed. An empty string if ``content`` is blank.
    """
    # Split on the whitespace *after* a terminator (lookbehind) so the
    # punctuation is not consumed; splitting on r'[.!?]\s+' would drop it
    # and run the sentences together.
    sentences = re.split(r'(?<=[.!?])\s+', content.strip())
    return ' '.join(sentences[:max_sentences]).strip()

def format_book_sample(title, first_paragraph):
    """Render a section as a two-turn User/Assistant exchange.

    The user turn asks about the lower-cased title; the assistant turn
    answers with the given paragraph prefixed by "Sure! ".

    Args:
        title: Section title to ask about.
        first_paragraph: Text used as the assistant's answer.

    Returns:
        A string with the user turn and the assistant turn on separate lines.
    """
    turns = (
        f"User: Tell me about {title.lower()}.",
        f"Assistant: Sure! {first_paragraph}",
    )
    return "\n".join(turns)

def preprocess_pdf_book(pdf_path, output_file='formatted_book_data.csv'):
    """Process a PDF book into conversational samples and save to a dataset.

    Sections are extracted from the PDF, the first sentences of each section
    are formatted as a User/Assistant sample, and sections without any
    content are skipped. All samples are written to ``output_file`` as CSV
    with columns ``text`` and ``title``.

    Args:
        pdf_path: Path of the PDF file to process.
        output_file: Destination path of the CSV file.

    Returns:
        A tuple ``(dataset, sample_count)`` where ``dataset`` is a
        ``Dataset`` built from the ``text`` column only and ``sample_count``
        is the number of samples produced.
    """
    samples = []
    for section in extract_sections_from_pdf(pdf_path):
        title = section['title']
        first_paragraph = get_first_paragraph(section['content'])
        if first_paragraph:
            samples.append({
                'text': format_book_sample(title, first_paragraph),
                'title': title,
            })

    # Fix the column set explicitly: with zero samples the frame would
    # otherwise have no columns and selecting 'text' below would raise
    # KeyError.
    df = pd.DataFrame(samples, columns=['text', 'title'])
    df.to_csv(output_file, index=False)

    # Only the formatted text goes into the dataset; titles stay in the CSV.
    dataset = Dataset.from_pandas(df[['text']])
    return dataset, len(samples)

# Example usage
if __name__ == "__main__":
    # Input book; replace with your PDF file path.
    PDF_PATH = "D:/Python/dating coach/book/Make Her Chase You_ The Guide To Attracting Girls Who Are ''Out Of Your League'' Even If You'Re Not Rich Or Handsome'' ( PDFDrive ).pdf"
    # Destination of the generated CSV.
    OUTPUT_FILE = "D:/Python/dating coach/formatted_book_data.csv"

    result = preprocess_pdf_book(PDF_PATH, output_file=OUTPUT_FILE)
    dataset, sample_count = result
    print(f"Processed {sample_count} samples. Saved to {OUTPUT_FILE}")



Processed 442 samples. Saved to D:/Python/dating coach/formatted_book_data.csv
