Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/components.md
Original file line number Diff line number Diff line change
Expand Up @@ -748,6 +748,7 @@ To display a Word document without including the real multimedia:

- **src**: The source file to read the data from. This must be provided if neither `buffer` nor `base64` is provided.
- **buffer**: Buffer. Document data buffer. Recommended to use `src` instead unless you want to use a string.
- **base64**: Base64 encoded string of the document data. Mutually exclusive with `src` and `buffer`.
- **parser**: Can be one of: auto, pdf, docx, txt. The parser to use for reading the data. If not provided, it will be inferred from the file extension.
- **multimedia**: Boolean. If true, multimedia content will be displayed. If false, its alt text will be displayed on a best-effort basis. Default is `true`.
- **selectedPages**: The pages to be selected. This is only available **for PDF documents**. If not provided, all pages will be selected.
Expand Down Expand Up @@ -998,6 +999,7 @@ Convert HTML to structured POML components:
- **url**: The URL of the webpage to fetch and display.
- **src**: Local file path to an HTML file to display.
- **buffer**: Buffer. HTML content as string or buffer.
- **base64**: Base64 encoded HTML content.
- **extractText**: Boolean. Whether to extract plain text content (true) or convert HTML to structured POML (false). Default is false.
- **selector**: CSS selector to extract specific content from the page (e.g., "article", ".content", "#main"). Default is "body".
- **syntax**: Can be one of: markdown, html, json, yaml, xml, text. The syntax of the content.
Expand Down
155 changes: 142 additions & 13 deletions packages/poml-vscode/lsp/parseComments.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import "poml";
import { ComponentSpec, Parameter } from "poml/base";
import 'poml';
import { ComponentSpec, Parameter } from 'poml/base';

import { readFileSync, readdirSync, writeFileSync } from "fs";
import { join } from "path";
import { formatComponentDocumentation } from "./documentFormatter";
import { readFileSync, readdirSync, writeFile, writeFileSync } from 'fs';
import { join } from 'path';
import { formatComponentDocumentation } from './documentFormatter';

const basicComponents: string[] = [];
const intentions: string[] = [];
Expand All @@ -18,13 +18,14 @@ function tsCommentToMarkdown(comment: string): ComponentSpec {
.replace(/^\/\*\*?/, '')
.replace(/\*\/$/, '')
.split('\n')
.map((line) => line.replace(/^\s*\*( )?/, ''))
.map((line) => line.replace(/\s+$/, ''))
.map(line => line.replace(/^\s*\*( )?/, ''))
.map(line => line.replace(/\s+$/, ''))
.join('\n');

// Recognize description, @param and @example in the comment.
const descriptionRegex = /([\s\S]*?)(?=@param|@example|@see|$)/;
const paramRegex = /@param\s+(\{([\S'"\|]+?)\}\s+)?(\w+)\s+-\s+([\s\S]*?)(?=@param|@example|@see|$)/g;
const paramRegex =
/@param\s+(\{([\S'"\|]+?)\}\s+)?(\w+)\s+-\s+([\s\S]*?)(?=@param|@example|@see|$)/g;
const exampleRegex = /@example\s+([\s\S]*?)(?=@param|@example|@see|$)/;
const seeRegex = /@see\s+([\s\S]*?)(?=@param|@example|@see|$)/g;

Expand All @@ -50,7 +51,7 @@ function tsCommentToMarkdown(comment: string): ComponentSpec {
fallbackType = 'string';
} else if (paramMatch[2] && paramMatch[2].includes('|')) {
type = 'string';
choices = paramMatch[2].split('|').map((choice) => choice.replace(/['"\s]/g, '').trim());
choices = paramMatch[2].split('|').map(choice => choice.replace(/['"\s]/g, '').trim());
} else if (paramMatch[2]) {
type = paramMatch[2];
}
Expand Down Expand Up @@ -80,7 +81,7 @@ function tsCommentToMarkdown(comment: string): ComponentSpec {
params,
example,
baseComponents
}
};
}

function extractTsComments(text: string) {
Expand All @@ -95,7 +96,8 @@ function extractTsComments(text: string) {

function extractComponentComments(text: string) {
const comments: ComponentSpec[] = [];
const commentRegex = /(\/\*\*([\s\S]*?)\*\/)\nexport const [\w]+ = component\(['"](\w+)['"](,[\S\s]*?)?\)/g;
const commentRegex =
/(\/\*\*([\s\S]*?)\*\/)\nexport const [\w]+ = component\(['"](\w+)['"](,[\S\s]*?)?\)/g;
let match;
while ((match = commentRegex.exec(text)) !== null) {
const doc = { name: match[3], ...tsCommentToMarkdown(match[2]) };
Expand All @@ -104,7 +106,6 @@ function extractComponentComments(text: string) {
return comments;
}


function* walk(folderPath: string): IterableIterator<string> {
for (const entry of readdirSync(folderPath, { withFileTypes: true })) {
if (entry.isFile() && (entry.name.endsWith('.tsx') || entry.name.endsWith('.ts'))) {
Expand Down Expand Up @@ -135,7 +136,7 @@ function scanComponentDocs(folderPath: string) {
} else {
utilities.push(...names);
}
};
}
return allComments;
}

Expand All @@ -159,6 +160,134 @@ function docsToMarkdown(docs: ComponentSpec[]) {
return parts.join('\n\n');
}

/**
 * Converts a camelCase / PascalCase identifier into snake_case.
 *
 * The conversion runs in three steps:
 *  1. separate a run of capitals from a following capitalized word
 *     ("XMLFile" -> "XML_File"),
 *  2. separate a lowercase letter or digit from the capital that follows it
 *     ("camelCase" -> "camel_Case"),
 *  3. lowercase the whole result ("XML_File" -> "xml_file").
 *
 * @param str - The identifier to convert.
 * @returns The snake_case form of `str`.
 */
function camelToSnake(str: string): string {
  const acronymsSeparated = str.replace(/([A-Z]+)([A-Z][a-z])/g, '$1_$2');
  const wordsSeparated = acronymsSeparated.replace(/([a-z\d])([A-Z])/g, '$1_$2');
  return wordsSeparated.toLowerCase();
}

/**
 * Maps a documented component parameter type to a Python type-hint string.
 *
 * The type name is matched case-insensitively. `number` has no direct Python
 * equivalent, so the parameter name is used as a hint: names that mention
 * "max", "count" or "depth", or that end in "index", are treated as integers;
 * everything else is a float. The name is compared case-insensitively so that
 * camelCase names such as `lineCount` or `treeDepth` are recognized too.
 *
 * @param jsonType - The type as written in the component docs (e.g. "string", "Buffer", "TreeItemData[]").
 * @param paramName - The parameter name, used only for the int-vs-float heuristic.
 * @returns A Python type expression: "str", "bool", "bytes", "int", "float", "List[Any]" or "Any".
 */
function getPythonType(jsonType: string, paramName: string): string {
  const lcJsonType = jsonType.toLowerCase();
  switch (lcJsonType) {
    case 'string':
      return 'str';
    case 'boolean':
      return 'bool';
    case 'buffer':
      return 'bytes';
    case 'number': {
      // Heuristic for int vs float based on common parameter names.
      // Lowercase first: in camelCase the hint word is usually capitalized
      // ("lineCount", "treeDepth"), which a case-sensitive search would miss.
      const lcParamName = paramName.toLowerCase();
      const integerHints = ['max', 'count', 'depth'];
      const looksIntegral =
        integerHints.some(hint => lcParamName.includes(hint)) || lcParamName.endsWith('index');
      return looksIntegral ? 'int' : 'float';
    }
    case 'object':
      return 'Any'; // Could be Dict[str, Any]
    case 'regexp':
      return 'str'; // Python uses strings for regex patterns
    default:
      if (jsonType.endsWith('[]')) {
        // Handles array types like TreeItemData[]
        return 'List[Any]'; // Generic list type
      }
      // For unknown or complex non-array types (e.g., a specific object schema name)
      return 'Any';
  }
}

/**
 * Builds the Python source text of one method for a single component spec.
 *
 * The method is named after the snake_cased component name. Every documented
 * parameter becomes an `Optional[...] = None` keyword parameter that is
 * forwarded by name to `self.tag(...)`, together with `tag_name` and `**kwargs`.
 * The docstring is assembled from the component description, an `Args:` section
 * (one entry per parameter, with its default and choices when present) and an
 * `Example:` section when the spec has an example.
 *
 * NOTE(review): only the example text is escaped for backslashes and triple
 * quotes; the component description and parameter descriptions are inserted
 * as-is. Presumably a description containing `"""` or backslash sequences would
 * alter the generated docstring — confirm whether that can occur.
 * NOTE(review): parameter names are used verbatim as Python identifiers; names
 * that collide with Python keywords would presumably need handling — verify.
 *
 * @param tag - The component spec; `tag.name` is asserted non-null here.
 * @returns The generated Python method as a string.
 */
function generatePythonMethod(tag: ComponentSpec): string {
const methodName = camelToSnake(tag.name!);
// Three parallel accumulators: signature lines, docstring "Args" text, and the
// keyword arguments forwarded to self.tag().
let paramsSignatureList: string[] = [' self'];
let argsDocstring = '';
const callArgsList: string[] = [`tag_name="${tag.name}"`];

tag.params.forEach(param => {
const paramName = param.name; // Use original JSON name for Python parameter
const pythonType = getPythonType(param.type, paramName);
// Every generated parameter is optional and defaults to None.
const typeHint = `Optional[${pythonType}]`;

paramsSignatureList.push(` ${paramName}: ${typeHint} = None`);
callArgsList.push(`${paramName}=${paramName}`);

// Continuation lines of a multi-line description get a leading space.
let paramDesc = param.description.replace(/\n/g, '\n ');
if (param.defaultValue !== undefined) {
// String defaults are shown in double quotes; other values are interpolated as-is.
const defValStr =
typeof param.defaultValue === 'string' ? `"${param.defaultValue}"` : param.defaultValue;
paramDesc += ` Default is \`${defValStr}\`.`;
}
if (param.choices && param.choices.length > 0) {
// Each choice is JSON-stringified and wrapped in backticks.
paramDesc += ` Choices: ${param.choices.map(c => `\`${JSON.stringify(c)}\``).join(', ')}.`;
}
argsDocstring += ` ${paramName} (${typeHint}): ${paramDesc}\n`;
});

// Catch-all so callers can pass attributes that are not documented parameters.
paramsSignatureList.push(' **kwargs: Any');

const paramsString = paramsSignatureList.join(',\n');

let docstring = `"""${tag.description.replace(/\n/g, '\n ')}\n\n`;
// The Args section is omitted entirely when the component has no parameters.
if (argsDocstring) {
docstring += ` Args:\n${argsDocstring}`;
}
if (tag.example) {
const exampleIndented = tag.example
.replace(/\\/g, '\\\\') // Escape backslashes for string literal
.replace(/"""/g, '\\"\\"\\"') // Escape triple quotes if any in example
.replace(/\n/g, '\n ');
docstring += `\n Example:\n ${exampleIndented}\n`;
}
docstring += ` """`;

// The body simply delegates to self.tag() with every parameter passed by keyword.
const methodBody = `return self.tag(
${callArgsList.join(',\n ')},
**kwargs,
)`;

// Final assembly: signature, docstring, then the delegating body.
return `
def ${methodName}(
${paramsString},
):
${docstring}
${methodBody}
`;
}

/**
 * Produces the full text of the generated Python module.
 *
 * The output starts with a fixed header (auto-generation notice, `typing`
 * imports and the `_TagLib` class with its abstract `tag` helper) followed by
 * one generated method per component spec, in input order. Specs without a
 * name are reported with `console.warn` and left out.
 *
 * @param jsonData - The component specs to turn into Python methods.
 * @returns The Python source code as a single string.
 */
function generatePythonFile(jsonData: ComponentSpec[]): string {
  const fileHeader = `# This file is auto-generated from component documentation.
# Do not edit manually. Run \`npm run build-comment\` to regenerate.

from typing import Optional, Any, Union, List, Dict
# from numbers import Number # For more specific number types if needed

class _TagLib:

def tag(self, tag_name: str, **kwargs: Any) -> Any:
"""Helper method to create a tag with the given name and attributes.
Implemented by subclasses.
"""
raise NotImplementedError("This method should be implemented by subclasses.")
`;

  const methodSources: string[] = [];
  for (const spec of jsonData) {
    if (spec.name) {
      methodSources.push(generatePythonMethod(spec));
    } else {
      // A method cannot be named without a component name.
      console.warn('Skipping tag with no name:', spec);
    }
  }

  return fileHeader + methodSources.join('');
}

// Script entry point: scan the component sources once, then write every
// artifact derived from the collected specs.
// NOTE(review): all paths are relative, so this presumably expects to be run
// from the repository root — confirm against the npm script that invokes it.
const allDocs = scanComponentDocs('packages/poml');
// Python helper source generated from the same specs.
const pythonCode = generatePythonFile(allDocs);
// Machine-readable specs, pretty-printed with 2-space indentation.
writeFileSync('packages/poml/assets/componentDocs.json', JSON.stringify(allDocs, null, 2));
// Markdown component reference.
writeFileSync('docs/components.md', docsToMarkdown(allDocs));
// Generated Python module.
writeFileSync('python/poml/_tags.py', pythonCode);
console.log('Component documentation generated successfully!');
14 changes: 14 additions & 0 deletions packages/poml/assets/componentDocs.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,13 @@
"description": "Document data buffer. Recommended to use `src` instead unless you want to use a string.",
"required": false
},
{
"name": "base64",
"type": "string",
"choices": [],
"description": "Base64 encoded string of the document data. Mutually exclusive with `src` and `buffer`.",
"required": false
},
{
"name": "parser",
"type": "string",
Expand Down Expand Up @@ -1106,6 +1113,13 @@
"description": "HTML content as string or buffer.",
"required": false
},
{
"name": "base64",
"type": "string",
"choices": [],
"description": "Base64 encoded HTML content.",
"required": false
},
{
"name": "extractText",
"type": "boolean",
Expand Down
21 changes: 15 additions & 6 deletions packages/poml/components/document.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ interface DocumentProps extends PropsSyntaxBase {
src?: string;
parser?: DocumentParser;
buffer?: string | Buffer;
base64?: string;
multimedia?: boolean;
selectedPages?: string;
}
Expand Down Expand Up @@ -238,6 +239,7 @@ async function autoParseDocument(
*
* @param {string} src - The source file to read the data from. This must be provided if neither `buffer` nor `base64` is provided.
* @param {Buffer|string} buffer - Document data buffer. Recommended to use `src` instead unless you want to use a string.
* @param {string} base64 - Base64 encoded string of the document data. Mutually exclusive with `src` and `buffer`.
* @param {'auto'|'pdf'|'docx'|'txt'} parser - The parser to use for reading the data. If not provided, it will be inferred from the file extension.
* @param {boolean} multimedia - If true, multimedia content will be displayed. If false, its alt text will be displayed on a best-effort basis. Default is `true`.
* @param {string} selectedPages - The pages to be selected. This is only available **for PDF documents**. If not provided, all pages will be selected.
Expand All @@ -255,15 +257,22 @@ async function autoParseDocument(
export const Document = component('Document', { aliases: ['doc'], asynchorous: true })((
props: DocumentProps
) => {
let { buffer, parser, ...others } = props;
let { buffer, parser, base64, ...others } = props;
let parsedBuffer: Buffer | undefined;
if (typeof buffer === 'string') {
parsedBuffer = Buffer.from(buffer, 'utf-8');
if (parser === undefined || parser === 'auto') {
parser = 'txt';
if (base64) {
if (buffer !== undefined) {
throw new Error('Either buffer or base64 should be provided, not both.');
}
parsedBuffer = Buffer.from(base64, 'base64');
} else {
parsedBuffer = buffer;
if (typeof buffer === 'string') {
parsedBuffer = Buffer.from(buffer, 'utf-8');
if (parser === undefined || parser === 'auto') {
parser = 'txt';
}
} else {
parsedBuffer = buffer;
}
}
const document = useWithCatch(
autoParseDocument({ buffer: parsedBuffer, parser, ...others }),
Expand Down
8 changes: 6 additions & 2 deletions packages/poml/components/tree.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -261,8 +261,12 @@ function readDirectoryToTreeItems(
const children: TreeItemData[] = [];
const entries = fs.readdirSync(dirPath, { withFileTypes: true }).sort((a, b) => {
// Directories first, then files
if (a.isDirectory() && !b.isDirectory()) return -1;
if (!a.isDirectory() && b.isDirectory()) return 1;
if (a.isDirectory() && !b.isDirectory()) {
return -1;
}
if (!a.isDirectory() && b.isDirectory()) {
return 1;
}
return a.name.localeCompare(b.name);
});

Expand Down
12 changes: 10 additions & 2 deletions packages/poml/components/webpage.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ export interface WebpageProps extends PropsSyntaxBase {
src?: string;
url?: string;
buffer?: string | Buffer;
base64?: string;
extractText?: boolean;
selector?: string;
}
Expand Down Expand Up @@ -102,6 +103,7 @@ async function processWebpage(props: WebpageProps): Promise<React.ReactElement>
* @param {string} url - The URL of the webpage to fetch and display.
* @param {string} src - Local file path to an HTML file to display.
* @param {string|Buffer} buffer - HTML content as string or buffer.
* @param {string} base64 - Base64 encoded HTML content.
* @param {boolean} extractText - Whether to extract plain text content (true) or convert HTML to structured POML (false). Default is false.
* @param {string} selector - CSS selector to extract specific content from the page (e.g., "article", ".content", "#main"). Default is "body".
*
Expand All @@ -126,7 +128,13 @@ async function processWebpage(props: WebpageProps): Promise<React.ReactElement>
export const Webpage = component('Webpage', { asynchorous: true })((
props: WebpageProps
) => {
const { src, url, buffer, extractText, selector, ...others } = props;
const content = useWithCatch(processWebpage(props), others);
let { src, url, buffer, base64, extractText, selector, ...others } = props;
if (base64) {
if (buffer !== undefined) {
throw new Error('Either buffer or base64 should be provided, not both.');
}
buffer = Buffer.from(base64, 'base64');
}
const content = useWithCatch(processWebpage({ ...props, buffer: buffer }), others);
return <Text {...others}>{content ?? null}</Text>;
});
18 changes: 18 additions & 0 deletions packages/poml/tests/components.test.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,15 @@ describe('document', () => {
/without any merged cells:\n\n\| Screen Reader \| Responses \| Share \|\n/g
);
});

test('docx from base64', async () => {
const buffer = readFileSync(__dirname + '/assets/sampleWord.docx');
const base64 = buffer.toString('base64');
const result = await poml(<Document base64={base64} parser="docx" />);
expect(result[4]).toMatch(
/without any merged cells:\n\n\| Screen Reader \| Responses \| Share \|\n/g
);
});
});

describe('message', () => {
Expand Down Expand Up @@ -481,4 +490,13 @@ Finally, link to another page in your own Web site.

expect(result).toContain('<h1>Enter the main heading, usually the same as the title.</h1>');
});

test('loading HTML from base64', async () => {
const htmlContent = readFileSync(webpagePath, 'utf-8');
const base64Content = Buffer.from(htmlContent).toString('base64');
const markup = <Webpage base64={base64Content} selector="h1" syntax="html" />;
const result = await poml(markup);

expect(result).toContain('<h1>Enter the main heading, usually the same as the title.</h1>');
});
});
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name = "poml"
version = "0.0.5"
description = "Prompt Orchestration Markup Language"
readme = "README.md"
requires-python = ">=3.8"
requires-python = ">=3.9"
license = {file = "LICENSE"}
dependencies = [
"nodejs-wheel"
Expand Down
1 change: 1 addition & 0 deletions python/poml/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@

from .api import poml
from .cli import entrypoint, run
from .prompt import Prompt
Loading
Loading