add RAW HTML parsing
timothycarambat committed Apr 26, 2024
1 parent c0ab395 commit ed45843
Showing 2 changed files with 111 additions and 0 deletions.
49 changes: 49 additions & 0 deletions examples/raw-html.ts
@@ -0,0 +1,49 @@
// This example grabs today's HN front page and works a lot like examples/hn.ts,
// but parses a raw HTML string instead of driving chromium/playwright
import { z } from 'zod'
import OpenAI from 'openai'
import LLMScraper from '../src'

// Initialize LLM provider
const llm = new OpenAI()

// Create a new LLMScraper - no browser instance is needed for raw HTML
const scraper = new LLMScraper(null, llm)

// Define schema to extract contents into
const schema = z.object({
  top: z
    .array(
      z.object({
        title: z.string(),
        points: z.number(),
        by: z.string(),
        commentsURL: z.string(),
      })
    )
    .length(5) // How many results to parse for this specific instance.
    .describe('Top 5 stories on Hacker News'),
})
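Because the schema is a zod object, it also pins down the result type. A small sketch of what that buys you: z.infer (standard zod) recovers the parsed shape without any extra type declarations.

// The schema above infers to this TypeScript shape via z.infer:
type HNTop = z.infer<typeof schema>
// => { top: { title: string; points: number; by: string; commentsURL: string }[] }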

// Grab today's HN front page to run the example
const htmlString = await fetch('https://news.ycombinator.com/')
  .then((res) => res.text())
  .catch((e) => {
    console.error('Failed to fetch content from Hacker News', e)
    return null
  })

// Run the scraper
const pages = await scraper.rawHTML([htmlString], {
  model: 'gpt-4-turbo',
  schema,
  mode: 'html',
  closeOnFinish: true,
})

// Stream the results from the LLM
for await (const page of pages) {
  console.log(page.data)
}
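For comparison, the same scrape can run with Markdown preprocessing: loadRawHTML converts the page <body> to Markdown via Turndown before the content reaches the model. A minimal sketch, reusing the scraper, schema, and htmlString defined above; only the mode option changes:

// Sketch: identical to the run above, but with mode set to 'markdown'.
// The HTML-to-Markdown conversion happens inside loadRawHTML.
const markdownPages = await scraper.rawHTML([htmlString], {
  model: 'gpt-4-turbo',
  schema,
  mode: 'markdown',
})

for await (const page of markdownPages) {
  console.log(page.data)
}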
62 changes: 62 additions & 0 deletions src/index.ts
@@ -152,6 +152,56 @@ export default class LLMScraper
    return pages
  }

  private async loadRawHTML(
    htmlString: string | string[],
    options: ScraperLoadOptions = { mode: 'html' }
  ): Promise<Promise<ScraperLoadResult>[]> {
    const roots = Array.isArray(htmlString) ? htmlString : [htmlString]
    const pages = []

    for (const htmlRoot of roots) {
      if (!htmlRoot) {
        this.logger(`HTML content is empty - skipping`)
        continue
      }

      // Parse the raw string into a queryable document
      const page = parse(htmlRoot)
      let content

      if (options.mode === 'html') {
        content = page.toString()
      }

      if (options.mode === 'markdown') {
        // Fall back to the full document when no <body> element is present
        const body = page.querySelector('body')?.innerHTML ?? page.toString()
        content = new Turndown().turndown(body)
      }

      if (options.mode === 'text') {
        const readability = await import(
          // @ts-ignore
          'https://cdn.skypack.dev/@mozilla/readability'
        )
        // Caveat: Readability expects a DOM Document, so passing a
        // serialized HTML string may require wrapping it in a DOM
        // (e.g. via jsdom) before parsing
        const readable = new readability.Readability(page.toString()).parse()

        content = `Page Title: ${readable.title}\n${readable.textContent}`
      }

      if (options.mode === 'image') {
        this.logger(`'image' mode is not supported for raw HTML input - skipping`)
        continue
      }

      pages.push({
        htmlRoot, // maybe don't want to return entire HTML string back?
        content,
        mode: options.mode,
      })
    }

    return pages
  }

  // Generate completion using OpenAI
  private generateCompletions<T extends z.ZodSchema<any>>(
    pages: Promise<ScraperLoadResult>[],
@@ -191,6 +241,10 @@ export default class LLMScraper {
    return loader
  }

  // TODO: Simplify this implementation.
  // Each new input type currently needs its own entry method;
  // it probably makes more sense to have a single entry point
  // that delegates to the proper loader (see the sketch after this diff).
  // Load pages and generate completion
  async run<T extends z.ZodSchema<any>>(
    url: string | string[],
@@ -208,6 +262,14 @@
    return this.generateCompletions<T>(pages, options)
  }

  // Load raw HTML strings and generate completion
  async rawHTML<T extends z.ZodSchema<any>>(
    htmlString: string | string[],
    options: ScraperRunOptions<T>
  ) {
    const pages = await this.loadRawHTML(htmlString, options)
    return this.generateCompletions<T>(pages, options)
  }

  // Close the current context and the browser
  async close() {
    await this.context?.close()
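A possible shape for the single-entry refactor the TODO above suggests. This is only a sketch: the kind discriminator is hypothetical, and loadPages is assumed to be the existing URL loader that run() already uses (it is not shown in this diff).

  // Hypothetical single entry point that delegates to the proper loader.
  // `kind` is an invented parameter; `loadPages` is assumed from the
  // surrounding file and is not part of this commit.
  async scrape<T extends z.ZodSchema<any>>(
    source: string | string[],
    options: ScraperRunOptions<T>,
    kind: 'url' | 'html' = 'url'
  ) {
    const pages =
      kind === 'html'
        ? await this.loadRawHTML(source, options)
        : await this.loadPages(source, options)
    return this.generateCompletions<T>(pages, options)
  }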
