In [200]:
#Basic imports
from datetime import datetime
from dotenv import load_dotenv
import os
import base64
import hashlib

#Parse text
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET

#Organize 
import openai
from openai import OpenAI

In [201]:
#Read the OpenAI API key from the environment (load_dotenv() is called first so a .env file can supply it)
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

In [202]:
#Function for extracting images from enex file

#Find every resource block in the Evernote note, decode its base64 data, and save it to disk
def extract_resources(note_elem, output_dir="resources"):
    """Decode every <resource> child of a note and write it into ``output_dir``.

    Args:
        note_elem: Element whose ``<resource>`` children are extracted. Each
            resource is expected to carry a base64 ``<data>`` child and,
            optionally, a ``<mime>`` child.
        output_dir: Directory the decoded files are written to; it is created
            when it does not exist yet.

    Returns:
        dict mapping the MD5 hex digest of each resource's decoded bytes to the
        path ``"{output_dir}/{digest}.{ext}"`` of the file that was written.
        Resources without data, or whose data cannot be decoded, are skipped
        with a printed warning.
    """
    # MIME type -> file extension; anything not listed falls back to "bin"
    mime_to_ext = {
        "image/png": "png",
        "image/jpeg": "jpg",
        "image/gif": "gif",
        "image/webp": "webp",
        "image/svg+xml": "svg",
        "application/pdf": "pdf",
    }

    os.makedirs(output_dir, exist_ok=True)
    resources_map = {}

    for res in note_elem.findall("resource"):
        data_elem = res.find("data")
        # Normalise the MIME text so stray whitespace or upper case still matches
        mime = (res.findtext("mime") or "").strip().lower()

        if data_elem is None or not data_elem.text:
            print("⚠️ Skipping resource with no data")
            continue

        try:
            # Decode the binary data
            binary_data = base64.b64decode(data_elem.text)
        except (ValueError, TypeError) as e:
            # binascii.Error (bad padding / invalid base64) is a ValueError subclass
            print("⚠️ Error decoding resource:", e)
            continue

        # MD5 hex digest of the decoded bytes; used as file name and as map key
        hash_val = hashlib.md5(binary_data).hexdigest()

        # File extension from MIME
        ext = mime_to_ext.get(mime, "bin")
        filename = f"{hash_val}.{ext}"
        filepath = os.path.join(output_dir, filename)

        # Save file
        with open(filepath, "wb") as f:
            f.write(binary_data)

        # The returned path always uses "/" regardless of platform
        resources_map[hash_val] = f"{output_dir}/{filename}"

    print("Extracted resources:", list(resources_map.keys())[:5])
    return resources_map

In [203]:
#Function to replace each <en-media> tag in the content with an <img> tag pointing at the extracted file

def replace_en_media(content_html, resources_map):
    """Swap <en-media> tags for <img> tags that point at extracted files.

    Args:
        content_html: Markup to scan for ``<en-media>`` tags.
        resources_map: Mapping from a tag's ``hash`` attribute value to the
            path used as the ``src`` of the replacement ``<img>``.

    Returns:
        The markup as a string; ``<en-media>`` tags whose hash is not a key of
        ``resources_map`` are left in place.
    """
    document = BeautifulSoup(content_html, "html.parser")
    for media_tag in document.find_all("en-media"):
        digest = media_tag.get("hash")
        if digest not in resources_map:
            continue
        image_tag = document.new_tag("img", src=resources_map[digest])
        media_tag.replace_with(image_tag)
    return str(document)

In [204]:
# Parse the export with the XML parser and grab the first <note> element
tree = ET.parse("test.enex")
root = tree.getroot()
note_elem = root.find("note")
if note_elem is None:
    raise ValueError("test.enex contains no <note> element")

# Preview each child of the note. Element.text is None for elements without
# text, so fall back to "" instead of failing on the slice.
for child in note_elem:
    print(child.tag, (child.text or "")[:50])

title Abdominoplasty and Belt Lipectomy
created 20180311T201956Z
updated 20241022T190033Z
note-attributes 
      
content 
      <?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE en-note SYSTEM "http://xml.e
resource 
      
resource 
      
resource 
      
resource 
      
resource 
      
resource 
      
resource 
      
resource 
      
resource 
      
resource 
      
resource 
      
resource 
      
resource 
      
resource 
      
resource 
      
resource 
      
resource 
      
resource 
      
resource 
      
resource 
      
resource 
      
resource 
      
resource 
      
resource 
      


In [205]:
#Quick look at the note's title and its created/updated timestamps
for field_name in ("title", "created", "updated"):
    print(note_elem.find(field_name).text)

Abdominoplasty and Belt Lipectomy
20180311T201956Z
20241022T190033Z


In [206]:
#Collect the metadata fields that go into the YAML file
title = note_elem.find("title").text
tags = [tag_elem.text for tag_elem in note_elem.findall("tag")]

#Timestamps are reformatted as ISO strings for cleaner formatting.
#The trailing "Z" is matched as a literal character, so the parsed datetimes carry no timezone.
enex_time_format = "%Y%m%dT%H%M%SZ"
created = datetime.strptime(note_elem.find("created").text, enex_time_format).isoformat()
updated = datetime.strptime(note_elem.find("updated").text, enex_time_format).isoformat()

In [207]:
#Create YAML string
def _yaml_quote(value):
    """Return ``value`` as a YAML double-quoted scalar, escaping backslashes and double quotes."""
    escaped = str(value).replace("\\", "\\\\").replace('"', '\\"')
    return f'"{escaped}"'


lines = []
lines.append("---")
#The title is quoted AND escaped so a title containing " or \ still yields valid YAML
lines.append(f"title: {_yaml_quote(title)}")
lines.append(f"created: {created}")
lines.append(f"updated: {updated}")
if tags:
    lines.append("tags:")
    for tag in tags:
        #Quote each tag so characters such as ":" or "#" are kept as plain text
        lines.append(f"  - {_yaml_quote(tag)}")
lines.append("---")

yaml_metadata = "\n".join(lines)

In [208]:
#The note body lives in the <content> element; preview its first 300 characters before parsing it with BeautifulSoup
content_elem = note_elem.find("content")
content_elem.text[:300]

'\n      <?xml version="1.0" encoding="UTF-8" standalone="no"?>\n<!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd"><en-note><div><br/></div><div>Abdominoplasty</div><ul><li><div>Anatomy</div></li><ul><li><div>Ab wall</div></li><ul><li><div>Skin</div></li><li><div>SubQ fat (superficial)</'

In [209]:
#Keep the full text of the note's <content> element
content_elem = note_elem.find("content")
raw_content = content_elem.text

In [210]:
#Clean it up: keep only the <en-note>...</en-note> element, dropping the XML declaration and DOCTYPE in front of it
#Search for "<en-note" without the closing ">" so an opening tag that carries attributes is still found
open_marker = "<en-note"
close_marker = "</en-note>"
start = raw_content.find(open_marker)
close_at = raw_content.find(close_marker)

#str.find returns -1 when nothing matches; slicing with that would silently produce garbage
if start == -1 or close_at == -1:
    raise ValueError("Could not locate the <en-note>...</en-note> element in the note content")
end = close_at + len(close_marker)

cleaned_content = raw_content[start:end]

#Strip whitespace
cleaned_content = cleaned_content.strip()

In [211]:
#Write the note's resources to disk, then point the matching <en-media> tags at the saved files
resources_map = extract_resources(note_elem, "resources")
cleaned_content = replace_en_media(content_html=cleaned_content, resources_map=resources_map)

Extracted resources: ['f606a594cbf16b873d12662f98560ca1', '39b3fad5caeb34d57c093985ab4853df', 'a44e7a7f85ae5beb6eb7b7d3942ffada', '1fb1141589b3c98e327622d637d282ce', '6eae52ad273d203e37f9af59f2f2a7a0']


In [212]:
#Parse the cleaned HTML with BeautifulSoup and grab the <en-note> element
soup = BeautifulSoup(cleaned_content, features="html.parser")
en_note = soup.find(name="en-note")

<en-note>
 <div>
  <br/>
 </div>
 <div>
  Abdominoplasty
 </div>
 <ul>
  <li>
   <div>
    Anatomy
   </div>
  </li>
  <ul>
   <li>
    <div>
     Ab wall
    </div>
   </li>
   <ul>
    <li>
     <div>
      Skin
     </div>
    </li>
    <li>
     <div>
      SubQ fat (superficial)
     </div>
    </li>
    <li>
     <div>
      Scarpa
     </div>
    </li>
    <li>
     <div>
      Subscarpal fat
     </div>
    </li>
    <li>
     <div>
      Anterior rectus sheath
     </div>
    </li>
    <li>
     <div>
      Muscle
     </div>
    </li>
    <li>
     <div>
      Posterior rectus sheath
     </div>
    </li>
   </ul>
   <li>
    <div>
     Four paired muscles
    </div>
   </li>
   <ul>
    <li>
     <div>
      Rectus abdominis
     </div>
    </li>
    <li>
     <div>
      External oblique
     </div>
    </li>
    <li>
     <div>
      Internal oblique
     </div>
    </li>
    <li>
     <div>
      Transverse abdominis
     </div>
    </li>
   </ul>
   <li>
    <div>
     V

In [214]:
#This is the clean content for the OpenAI model

#Clean parsed note content
note_content = cleaned_content

#Pull in the prompt (read explicitly as UTF-8, the same encoding used for the output file)
with open("system_prompt.txt", "r", encoding="utf-8") as f:
    system_msg = f.read()

#Make the call
response = client.chat.completions.create(
    model="gpt-5",
    messages=[
        {"role": "system", "content": system_msg},
        {"role": "user", "content": note_content}
    ]
    #temperature=0 #GPT-5 models don't support temperature setting
)

markdown_output = response.choices[0].message.content

#The message content may be None; stop here rather than writing the text "None" into the note
if markdown_output is None:
    raise ValueError("OpenAI response contained no message content")

#Final note combines YAML metadata and cleaned markdown content
final_note = f"{yaml_metadata}\n\n{markdown_output}"

#Save to file
with open("cleaned_note.md", "w", encoding="utf-8") as f:
    f.write(final_note)