microsoft · ultmaster · Jun 5, 2025 · Jun 5, 2025 · Jun 5, 2025 · Jun 5, 2025
diff --git a/docs/components.md b/docs/components.md
@@ -748,6 +748,7 @@ To display a Word document without including the real multimedia:
 
 - **src**: The source file to read the data from. This must be provided if records is not provided.
 - **buffer**: Buffer. Document data buffer. Recommended to use `src` instead unless you want to use a string.
+- **base64**: Base64 encoded string of the document data. Mutually exclusive with `src` and `buffer`.
 - **parser**: Can be one of: auto, pdf, docx, txt. The parser to use for reading the data. If not provided, it will be inferred from the file extension.
 - **multimedia**: Boolean. If true, the multimedias will be displayed. If false, the alt strings will be displayed at best effort. Default is `true`.
 - **selectedPages**: The pages to be selected. This is only available **for PDF documents**. If not provided, all pages will be selected.
@@ -998,6 +999,7 @@ Convert HTML to structured POML components:
 - **url**: The URL of the webpage to fetch and display.
 - **src**: Local file path to an HTML file to display.
 - **buffer**: Buffer. HTML content as string or buffer.
+- **base64**: Base64 encoded HTML content.
 - **extractText**: Boolean. Whether to extract plain text content (true) or convert HTML to structured POML (false). Default is false.
 - **selector**: CSS selector to extract specific content from the page (e.g., "article", ".content", "#main"). Default is "body".
 - **syntax**: Can be one of: markdown, html, json, yaml, xml, text. The syntax of the content.

diff --git a/packages/poml-vscode/lsp/parseComments.ts b/packages/poml-vscode/lsp/parseComments.ts
@@ -1,9 +1,9 @@
-import "poml";
-import { ComponentSpec, Parameter } from "poml/base";
+import 'poml';
+import { ComponentSpec, Parameter } from 'poml/base';
 
-import { readFileSync, readdirSync, writeFileSync } from "fs";
-import { join } from "path";
-import { formatComponentDocumentation } from "./documentFormatter";
+import { readFileSync, readdirSync, writeFile, writeFileSync } from 'fs';
+import { join } from 'path';
+import { formatComponentDocumentation } from './documentFormatter';
 
 const basicComponents: string[] = [];
 const intentions: string[] = [];
@@ -18,13 +18,14 @@ function tsCommentToMarkdown(comment: string): ComponentSpec {
     .replace(/^\/\*\*?/, '')
     .replace(/\*\/$/, '')
     .split('\n')
-    .map((line) => line.replace(/^\s*\*( )?/, ''))
-    .map((line) => line.replace(/\s+$/, ''))
+    .map(line => line.replace(/^\s*\*( )?/, ''))
+    .map(line => line.replace(/\s+$/, ''))
     .join('\n');
 
   // Recognize description, @param and @example in the comment.
   const descriptionRegex = /([\s\S]*?)(?=@param|@example|@see|$)/;
-  const paramRegex = /@param\s+(\{([\S'"\|]+?)\}\s+)?(\w+)\s+-\s+([\s\S]*?)(?=@param|@example|@see|$)/g;
+  const paramRegex =
+    /@param\s+(\{([\S'"\|]+?)\}\s+)?(\w+)\s+-\s+([\s\S]*?)(?=@param|@example|@see|$)/g;
   const exampleRegex = /@example\s+([\s\S]*?)(?=@param|@example|@see|$)/;
   const seeRegex = /@see\s+([\s\S]*?)(?=@param|@example|@see|$)/g;
 
@@ -50,7 +51,7 @@ function tsCommentToMarkdown(comment: string): ComponentSpec {
       fallbackType = 'string';
     } else if (paramMatch[2] && paramMatch[2].includes('|')) {
       type = 'string';
-      choices = paramMatch[2].split('|').map((choice) => choice.replace(/['"\s]/g, '').trim());
+      choices = paramMatch[2].split('|').map(choice => choice.replace(/['"\s]/g, '').trim());
     } else if (paramMatch[2]) {
       type = paramMatch[2];
     }
@@ -80,7 +81,7 @@ function tsCommentToMarkdown(comment: string): ComponentSpec {
     params,
     example,
     baseComponents
-  }
+  };
 }
 
 function extractTsComments(text: string) {
@@ -95,7 +96,8 @@ function extractTsComments(text: string) {
 
 function extractComponentComments(text: string) {
   const comments: ComponentSpec[] = [];
-  const commentRegex = /(\/\*\*([\s\S]*?)\*\/)\nexport const [\w]+ = component\(['"](\w+)['"](,[\S\s]*?)?\)/g;
+  const commentRegex =
+    /(\/\*\*([\s\S]*?)\*\/)\nexport const [\w]+ = component\(['"](\w+)['"](,[\S\s]*?)?\)/g;
   let match;
   while ((match = commentRegex.exec(text)) !== null) {
     const doc = { name: match[3], ...tsCommentToMarkdown(match[2]) };
@@ -104,7 +106,6 @@ function extractComponentComments(text: string) {
   return comments;
 }
 
-
 function* walk(folderPath: string): IterableIterator<string> {
   for (const entry of readdirSync(folderPath, { withFileTypes: true })) {
     if (entry.isFile() && (entry.name.endsWith('.tsx') || entry.name.endsWith('.ts'))) {
@@ -135,7 +136,7 @@ function scanComponentDocs(folderPath: string) {
     } else {
       utilities.push(...names);
     }
-  };
+  }
   return allComments;
 }
 
@@ -159,6 +160,134 @@ function docsToMarkdown(docs: ComponentSpec[]) {
   return parts.join('\n\n');
 }
 
+/**
+ * Turns a camelCase or PascalCase identifier into snake_case.
+ *
+ * An underscore is inserted where an acronym run meets the next capitalised
+ * word ("XMLFile" -> "XML_File") and at every lower-case/digit to upper-case
+ * transition ("camelCase" -> "camel_Case"); the result is then lower-cased.
+ *
+ * @param str - Identifier to convert.
+ * @returns The snake_case form, e.g. "xml_file".
+ */
+function camelToSnake(str: string): string {
+  // Separate a run of capitals from the capitalised word that follows it.
+  const acronymsSplit = str.replace(/([A-Z]+)([A-Z][a-z])/g, '$1_$2');
+  // Separate a lower-case letter or digit from the capital that follows it.
+  const wordsSplit = acronymsSplit.replace(/([a-z\d])([A-Z])/g, '$1_$2');
+  return wordsSplit.toLowerCase();
+}
+
+/**
+ * Maps a documented parameter type to the Python type used in generated stubs.
+ *
+ * Type names are matched case-insensitively. Array types (`T[]`) become
+ * `List[...]` of whatever the element type maps to, so `string[]` yields
+ * `List[str]` while an unrecognised element type still yields `List[Any]`.
+ *
+ * @param jsonType - Type name from the component documentation, e.g. "string" or "TreeItemData[]".
+ * @param paramName - Parameter name; only consulted to guess int vs float for numbers.
+ * @returns A Python type expression such as "str", "int" or "List[Any]".
+ */
+function getPythonType(jsonType: string, paramName: string): string {
+  if (jsonType.endsWith('[]')) {
+    // Keep the element type when it is known instead of collapsing to List[Any].
+    return `List[${getPythonType(jsonType.slice(0, -2), paramName)}]`;
+  }
+  switch (jsonType.toLowerCase()) {
+    case 'string':
+      return 'str';
+    case 'boolean':
+      return 'bool';
+    case 'buffer':
+      return 'bytes';
+    case 'number': {
+      // Heuristic for int vs float based on common parameter names. Parameter
+      // names are camelCase, so compare on the lower-cased name; otherwise
+      // "wordCount", "treeDepth" or a bare "index" would be typed as float.
+      const lcParamName = paramName.toLowerCase();
+      const looksIntegral =
+        ['max', 'count', 'depth'].some(hint => lcParamName.includes(hint)) ||
+        lcParamName.endsWith('index');
+      return looksIntegral ? 'int' : 'float';
+    }
+    case 'object':
+      return 'Any'; // Could be Dict[str, Any]
+    case 'regexp':
+      return 'str'; // Python uses strings for regex patterns
+    default:
+      // For unknown or complex types (e.g., a specific object schema name)
+      return 'Any';
+  }
+}
+
+/**
+ * Escapes text for embedding in a regular (non-raw) Python triple-quoted string.
+ *
+ * Backslashes are doubled so Python does not read them as escape sequences, and
+ * any `"""` is escaped so it cannot terminate the docstring early.
+ *
+ * @param text - Raw documentation text.
+ * @returns The text, safe to place between `"""` delimiters.
+ */
+function escapePythonDocstring(text: string): string {
+  return text.replace(/\\/g, '\\\\').replace(/"""/g, '\\"\\"\\"');
+}
+
+/**
+ * Renders one component spec as the source of a Python method for the tag library class.
+ *
+ * The method is named after the snake_case form of the tag name, accepts every
+ * documented parameter as an optional keyword argument plus `**kwargs`, and
+ * forwards them all to `self.tag(...)`. Its docstring is assembled from the
+ * component description, the parameter descriptions (with default values and
+ * choices appended) and the example, all escaped for a Python string literal.
+ *
+ * @param tag - Component spec to render; `tag.name` must be set by the caller.
+ * @returns Python source for the method, indented for a class body; it begins
+ *   and ends with a newline so methods can be concatenated directly.
+ */
+function generatePythonMethod(tag: ComponentSpec): string {
+  const methodName = camelToSnake(tag.name!);
+  const paramsSignatureList: string[] = ['        self'];
+  let argsDocstring = '';
+  const callArgsList: string[] = [`tag_name="${tag.name}"`];
+
+  tag.params.forEach(param => {
+    const paramName = param.name; // Use original JSON name for Python parameter
+    const pythonType = getPythonType(param.type, paramName);
+    const typeHint = `Optional[${pythonType}]`;
+
+    paramsSignatureList.push(`        ${paramName}: ${typeHint} = None`);
+    callArgsList.push(`${paramName}=${paramName}`);
+
+    let paramDesc = param.description;
+    if (param.defaultValue !== undefined) {
+      const defValStr =
+        typeof param.defaultValue === 'string' ? `"${param.defaultValue}"` : param.defaultValue;
+      paramDesc += ` Default is \`${defValStr}\`.`;
+    }
+    if (param.choices && param.choices.length > 0) {
+      paramDesc += ` Choices: ${param.choices.map(c => `\`${JSON.stringify(c)}\``).join(', ')}.`;
+    }
+    // Escape the whole entry (description, default and choices) before
+    // indenting continuation lines under the parameter name.
+    paramDesc = escapePythonDocstring(paramDesc).replace(/\n/g, '\n            ');
+    argsDocstring += `            ${paramName} (${typeHint}): ${paramDesc}\n`;
+  });
+
+  paramsSignatureList.push('        **kwargs: Any');
+
+  const paramsString = paramsSignatureList.join(',\n');
+
+  // The description gets the same escaping as the example; previously only the
+  // example was escaped, so a backslash or `"""` here corrupted the output.
+  const descriptionIndented = escapePythonDocstring(tag.description).replace(/\n/g, '\n        ');
+  let docstring = `"""${descriptionIndented}\n\n`;
+  if (argsDocstring) {
+    docstring += `        Args:\n${argsDocstring}`;
+  }
+  if (tag.example) {
+    const exampleIndented = escapePythonDocstring(tag.example).replace(/\n/g, '\n            ');
+    docstring += `\n        Example:\n            ${exampleIndented}\n`;
+  }
+  docstring += `        """`;
+
+  const methodBody = `return self.tag(
+            ${callArgsList.join(',\n            ')},
+            **kwargs,
+        )`;
+
+  // End with a bare newline: the next method's leading newline supplies the
+  // blank separator line, so no whitespace-only line is emitted.
+  return `
+    def ${methodName}(
+${paramsString},
+    ):
+        ${docstring}
+        ${methodBody}
+`;
+}
+
+/**
+ * Builds the full source text of the generated Python tag library module.
+ *
+ * The output is a fixed header (generated-file notice, typing imports and the
+ * `_TagLib` class with its abstract `tag` helper) followed by one generated
+ * method per named component. Specs without a name are skipped with a warning.
+ *
+ * @param jsonData - Component specs collected from the documentation comments.
+ * @returns The Python source code as a single string.
+ */
+function generatePythonFile(jsonData: ComponentSpec[]): string {
+  const header = `# This file is auto-generated from component documentation.
+# Do not edit manually. Run \`npm run build-comment\` to regenerate.
+
+from typing import Optional, Any, Union, List, Dict
+# from numbers import Number # For more specific number types if needed
+
+class _TagLib:
+
+    def tag(self, tag_name: str, **kwargs: Any) -> Any:
+        """Helper method to create a tag with the given name and attributes.
+        Implemented by subclasses.
+        """
+        raise NotImplementedError("This method should be implemented by subclasses.")
+`;
+
+  const methodSources: string[] = [];
+  for (const tag of jsonData) {
+    if (!tag.name) {
+      // A method cannot be generated without a tag name to derive it from.
+      console.warn('Skipping tag with no name:', tag);
+      continue;
+    }
+    methodSources.push(generatePythonMethod(tag));
+  }
+
+  return header + methodSources.join('');
+}
+
 const allDocs = scanComponentDocs('packages/poml');
+const pythonCode = generatePythonFile(allDocs);
 writeFileSync('packages/poml/assets/componentDocs.json', JSON.stringify(allDocs, null, 2));
 writeFileSync('docs/components.md', docsToMarkdown(allDocs));
+writeFileSync('python/poml/_tags.py', pythonCode);
+console.log('Component documentation generated successfully!');
diff --git a/packages/poml/assets/componentDocs.json b/packages/poml/assets/componentDocs.json
@@ -18,6 +18,13 @@
         "description": "Document data buffer. Recommended to use `src` instead unless you want to use a string.",
         "required": false
       },
+      {
+        "name": "base64",
+        "type": "string",
+        "choices": [],
+        "description": "Base64 encoded string of the document data. Mutually exclusive with `src` and `buffer`.",
+        "required": false
+      },
       {
         "name": "parser",
         "type": "string",
@@ -1106,6 +1113,13 @@
         "description": "HTML content as string or buffer.",
         "required": false
       },
+      {
+        "name": "base64",
+        "type": "string",
+        "choices": [],
+        "description": "Base64 encoded HTML content.",
+        "required": false
+      },
       {
         "name": "extractText",
         "type": "boolean",

diff --git a/packages/poml/components/document.tsx b/packages/poml/components/document.tsx
@@ -188,6 +188,7 @@ interface DocumentProps extends PropsSyntaxBase {
   src?: string;
   parser?: DocumentParser;
   buffer?: string | Buffer;
+  base64?: string;
   multimedia?: boolean;
   selectedPages?: string;
 }
@@ -238,6 +239,7 @@ async function autoParseDocument(
  *
  * @param {string} src - The source file to read the data from. This must be provided if records is not provided.
  * @param {Buffer|string} buffer - Document data buffer. Recommended to use `src` instead unless you want to use a string.
+ * @param {string} base64 - Base64 encoded string of the document data. Mutually exclusive with `src` and `buffer`.
  * @param {'auto'|'pdf'|'docx'|'txt'} parser - The parser to use for reading the data. If not provided, it will be inferred from the file extension.
  * @param {boolean} multimedia - If true, the multimedias will be displayed. If false, the alt strings will be displayed at best effort. Default is `true`.
  * @param {string} selectedPages - The pages to be selected. This is only available **for PDF documents**. If not provided, all pages will be selected.
@@ -255,15 +257,22 @@ async function autoParseDocument(
 export const Document = component('Document', { aliases: ['doc'], asynchorous: true })((
   props: DocumentProps
 ) => {
-  let { buffer, parser, ...others } = props;
+  let { buffer, parser, base64, ...others } = props;
   let parsedBuffer: Buffer | undefined;
-  if (typeof buffer === 'string') {
-    parsedBuffer = Buffer.from(buffer, 'utf-8');
-    if (parser === undefined || parser === 'auto') {
-      parser = 'txt';
+  if (base64) {
+    if (buffer !== undefined) {
+      throw new Error('Either buffer or base64 should be provided, not both.');
     }
+    parsedBuffer = Buffer.from(base64, 'base64');
   } else {
-    parsedBuffer = buffer;
+    if (typeof buffer === 'string') {
+      parsedBuffer = Buffer.from(buffer, 'utf-8');
+      if (parser === undefined || parser === 'auto') {
+        parser = 'txt';
+      }
+    } else {
+      parsedBuffer = buffer;
+    }
   }
   const document = useWithCatch(
     autoParseDocument({ buffer: parsedBuffer, parser, ...others }),

diff --git a/packages/poml/components/tree.tsx b/packages/poml/components/tree.tsx
@@ -261,8 +261,12 @@ function readDirectoryToTreeItems(
     const children: TreeItemData[] = [];
     const entries = fs.readdirSync(dirPath, { withFileTypes: true }).sort((a, b) => {
       // Directories first, then files
-      if (a.isDirectory() && !b.isDirectory()) return -1;
-      if (!a.isDirectory() && b.isDirectory()) return 1;
+      if (a.isDirectory() && !b.isDirectory()) {
+        return -1;
+      }
+      if (!a.isDirectory() && b.isDirectory()) {
+        return 1;
+      }
       return a.name.localeCompare(b.name);
     });
 

diff --git a/packages/poml/components/webpage.tsx b/packages/poml/components/webpage.tsx
@@ -10,6 +10,7 @@ export interface WebpageProps extends PropsSyntaxBase {
   src?: string;
   url?: string;
   buffer?: string | Buffer;
+  base64?: string;
   extractText?: boolean;
   selector?: string;
 }
@@ -102,6 +103,7 @@ async function processWebpage(props: WebpageProps): Promise<React.ReactElement>
  * @param {string} url - The URL of the webpage to fetch and display.
  * @param {string} src - Local file path to an HTML file to display.
  * @param {string|Buffer} buffer - HTML content as string or buffer.
+ * @param {string} base64 - Base64 encoded HTML content.
  * @param {boolean} extractText - Whether to extract plain text content (true) or convert HTML to structured POML (false). Default is false.
  * @param {string} selector - CSS selector to extract specific content from the page (e.g., "article", ".content", "#main"). Default is "body".
  *
@@ -126,7 +128,13 @@ async function processWebpage(props: WebpageProps): Promise<React.ReactElement>
 export const Webpage = component('Webpage', { asynchorous: true })((
   props: WebpageProps
 ) => {
-  const { src, url, buffer, extractText, selector, ...others } = props;
-  const content = useWithCatch(processWebpage(props), others);
+  let { src, url, buffer, base64, extractText, selector, ...others } = props;
+  if (base64) {
+    if (buffer !== undefined) {
+      throw new Error('Either buffer or base64 should be provided, not both.');
+    }
+    buffer = Buffer.from(base64, 'base64');
+  }
+  const content = useWithCatch(processWebpage({ ...props, buffer: buffer }), others);
   return <Text {...others}>{content ?? null}</Text>;
 });
diff --git a/packages/poml/tests/components.test.tsx b/packages/poml/tests/components.test.tsx
@@ -50,6 +50,15 @@ describe('document', () => {
       /without any merged cells:\n\n\| Screen Reader \| Responses \| Share \|\n/g
     );
   });
+
+  test('docx from base64', async () => {
+    const buffer = readFileSync(__dirname + '/assets/sampleWord.docx');
+    const base64 = buffer.toString('base64');
+    const result = await poml(<Document base64={base64} parser="docx" />);
+    expect(result[4]).toMatch(
+      /without any merged cells:\n\n\| Screen Reader \| Responses \| Share \|\n/g
+    );
+  });
 });
 
 describe('message', () => {
@@ -481,4 +490,13 @@ Finally, link to another page in your own Web site.
 
     expect(result).toContain('<h1>Enter the main heading, usually the same as the title.</h1>');
   });
+
+  test('loading HTML from base64', async () => {
+    const htmlContent = readFileSync(webpagePath, 'utf-8');
+    const base64Content = Buffer.from(htmlContent).toString('base64');
+    const markup = <Webpage base64={base64Content} selector="h1" syntax="html" />;
+    const result = await poml(markup);
+
+    expect(result).toContain('<h1>Enter the main heading, usually the same as the title.</h1>');
+  });
 });
diff --git a/pyproject.toml b/pyproject.toml
@@ -3,7 +3,7 @@ name = "poml"
 version = "0.0.5"
 description = "Prompt Orchestration Markup Language"
 readme = "README.md"
-requires-python = ">=3.8"
+requires-python = ">=3.9"
 license = {file = "LICENSE"}
 dependencies = [
   "nodejs-wheel"

diff --git a/python/poml/__init__.py b/python/poml/__init__.py
@@ -2,3 +2,4 @@
 
 from .api import poml
 from .cli import entrypoint, run
+from .prompt import Prompt
Original file line number	Diff line number	Diff line change
Expand Up		@@ -2,3 +2,4 @@

		from .api import poml
		from .cli import entrypoint, run
		from .prompt import Prompt