In [1]:
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import os

try:
    # Get the poppler path
    if os.path.exists('/opt/homebrew/bin'):  # For Apple Silicon Macs
        poppler_path = '/opt/homebrew/bin'
        pytesseract.pytesseract.tesseract_cmd = '/opt/homebrew/bin/tesseract'
    elif os.path.exists('/usr/local/bin'):   # For Intel Macs
        poppler_path = '/usr/local/bin'
        pytesseract.pytesseract.tesseract_cmd = '/usr/local/bin/tesseract'
    else:
        raise Exception("Couldn't find required paths")
        
    print(f"Using poppler path: {poppler_path}")
    print(f"Using tesseract path: {pytesseract.pytesseract.tesseract_cmd}")
    
    # Convert PDF to images
    pages = convert_from_path(
        'Atomic habits.pdf',
        first_page=10,
        last_page=10,
        poppler_path=poppler_path
    )
    
    # Get first page as image
    first_page = pages[0]
    
    # Extract text using OCR
    text = pytesseract.image_to_string(first_page)
    
    # Debug info
    print(f"Length of extracted text: {len(text)}")
    
    # Split and print first 10 lines
    lines = text.split('\n')
    print(f"Number of lines: {len(lines)}")
    
    for line in lines[:100]:
        print(f"Line: '{line}'")

except FileNotFoundError:
    print("Error: Could not find 'Atomic habits.pdf'")
except Exception as e:
    print(f"An error occurred: {str(e)}")

Using poppler path: /opt/homebrew/bin
Using tesseract path: /opt/homebrew/bin/tesseract
Length of extracted text: 2876
Number of lines: 46
Line: 'Mercifully, by the next morning my breathing had rebounded to the point where'
Line: 'the doctors felt comfortable releasing me from the coma. When I finally'
Line: 'regained consciousness, I discovered that I had lost my ability to smell. As a test,'
Line: 'a nurse asked me to blow my nose and sniff an apple juice box. My sense of'
Line: 'smell returned, but—to everyone’s surprise—the act of blowing my nose forced'
Line: 'air through the fractures in my eye socket and pushed my left eye outward. My'
Line: 'eyeball bulged out of the socket, held precariously in place by my eyelid and the'
Line: 'optic nerve attaching my eye to my brain.'
Line: ''
Line: 'The ophthalmologist said my eye would gradually slide back into place as the'
Line: 'air seeped out, but it was hard to tell how long this would take. I was scheduled'
Line: 'for surgery one w

In [2]:
try:
    # Convert all pages of PDF to images
    pages = convert_from_path(
        'Atomic habits.pdf',
        poppler_path=poppler_path
    )
    
    print(f"Total pages found: {len(pages)}")
    
    # Extract text from all pages
    full_text = ""
    for i, page in enumerate(pages):
        text = pytesseract.image_to_string(page)
        full_text += text + "\n\n"  # Add extra newlines between pages
        print(f"Processed page {i+1}/{len(pages)}")
        
    # Save to text file
    with open('atomic_habits.txt', 'w', encoding='utf-8') as f:
        f.write(full_text)
        
    print(f"Successfully saved text to atomic_habits.txt")
    print(f"Total characters extracted: {len(full_text)}")

except Exception as e:
    print(f"An error occurred: {str(e)}")


Total pages found: 256
Processed page 1/256
Processed page 2/256
Processed page 3/256
Processed page 4/256
Processed page 5/256
Processed page 6/256
Processed page 7/256
Processed page 8/256
Processed page 9/256
Processed page 10/256
Processed page 11/256
Processed page 12/256
Processed page 13/256
Processed page 14/256
Processed page 15/256
Processed page 16/256
Processed page 17/256
Processed page 18/256
Processed page 19/256
Processed page 20/256
Processed page 21/256
Processed page 22/256
Processed page 23/256
Processed page 24/256
Processed page 25/256
Processed page 26/256
Processed page 27/256
Processed page 28/256
Processed page 29/256
Processed page 30/256
Processed page 31/256
Processed page 32/256
Processed page 33/256
Processed page 34/256
Processed page 35/256
Processed page 36/256
Processed page 37/256
Processed page 38/256
Processed page 39/256
Processed page 40/256
Processed page 41/256
Processed page 42/256
Processed page 43/256
Processed page 44/256
Processed page 45/

In [4]:
import os
import openai
from tqdm import tqdm

try:

    client = openai.OpenAI(
        # This is the default and can be omitted
        api_key=os.getenv('OPENAI_TOKEN'),
    )
    
    # Read the original text file
    with open('atomic_habits.txt', 'r', encoding='utf-8') as f:
        text = f.read()
    
    # Split into chunks of ~2000 characters to stay within API limits
    chunk_size = 2000
    chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
    
    print(f"Split text into {len(chunks)} chunks")
    
    # Process each chunk with ChatGPT
    cleaned_chunks = []
    for chunk in tqdm(chunks, desc="Processing chunks"):
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that cleans up and reformats text. Fix any OCR errors, weird line breaks, and formatting issues to make the text clean and readable."},
                {"role": "user", "content": f"Please clean up and reformat this text:\n\n{chunk}"}
            ]
        )
        cleaned_chunks.append(response.choices[0].message.content)
        
    # Combine chunks and save to new file
    cleaned_text = "\n".join(cleaned_chunks)
    
    with open('atomic_habits_cleaned.txt', 'w', encoding='utf-8') as f:
        f.write(cleaned_text)
        
    print(f"Successfully saved cleaned text to atomic_habits_cleaned.txt")
    print(f"Original length: {len(text)}")
    print(f"Cleaned length: {len(cleaned_text)}")

except Exception as e:
    print(f"An error occurred: {str(e)}")


Split text into 154 chunks


Processing chunks: 100%|██████████| 154/154 [12:31<00:00,  4.88s/it]

Successfully saved cleaned text to atomic_habits_cleaned.txt
Original length: 306614
Cleaned length: 305269





In [5]:
# Read the context file
with open('my context.txt', 'r', encoding='utf-8') as f:
    context = f.read()


def get_prompt_chunk_rewriting(chunk):
    return f"""
    I will give a part of the Atomic Habits book. Please rewrite it in a way that is tailored to my own needs and my own examples. You don't need to change the overall gist, you just need to showcase relevant examples from my life.

    Here is the part of the book:
    {chunk}

    Here is my context:
    {context}

    Please rewrite the text in a way that is tailored to my own needs and my own examples. You don't need to change the overall gist, you just need to showcase relevant examples from my life. And don't add anything else.
    """

In [6]:
import time

# Read the original cleaned text
with open('atomic_habits_cleaned.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Split into chunks of ~2000 characters
chunk_size = 2000
chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

# Process each chunk with ChatGPT
rewritten_chunks = []
for chunk in tqdm(chunks, desc="Processing chunks"):
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that rewrites text to be more personalized."},
                {"role": "user", "content": get_prompt_chunk_rewriting(chunk)}
            ]
        )
        rewritten_chunks.append(response.choices[0].message.content)
        time.sleep(1) # Rate limiting
    except Exception as e:
        print(f"Error processing chunk: {str(e)}")
        rewritten_chunks.append(chunk) # Keep original on error
        
# Combine chunks and save to new file
rewritten_text = "\n".join(rewritten_chunks)

with open('atomic_habits_rewritten_v0.txt', 'w', encoding='utf-8') as f:
    f.write(rewritten_text)
    
print(f"Successfully saved rewritten text to atomic_habits_rewritten_v0.txt")
print(f"Original length: {len(text)}")
print(f"Rewritten length: {len(rewritten_text)}")


Processing chunks: 100%|██████████| 153/153 [13:10<00:00,  5.16s/it]

Successfully saved rewritten text to atomic_habits_rewritten_v0.txt
Original length: 305757
Rewritten length: 299304



