add RAW HTML parsing
timothycarambat committed Apr 26, 2024
1 parent c0ab395 commit ed45843
Showing 2 changed files with 111 additions and 0 deletions.
49 changes: 49 additions & 0 deletions examples/raw-html.ts
@@ -0,0 +1,49 @@
// This example grabs today's HN front page and works a lot like examples/hn.ts,
// but parses a raw HTML string instead of driving chromium/playwright
import { z } from 'zod'
import OpenAI from 'openai'
import LLMScraper from '../src'

// Initialize LLM provider
const llm = new OpenAI()

// Create a new LLMScraper - no browser instance is needed for raw HTML
const scraper = new LLMScraper(null, llm)

// Define schema to extract contents into
const schema = z.object({
  top: z
    .array(
      z.object({
        title: z.string(),
        points: z.number(),
        by: z.string(),
        commentsURL: z.string(),
      })
    )
    .length(5) // How many results to parse for this specific instance.
    .describe('Top 5 stories on Hacker News'),
})
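Because the schema is a zod object, it also pins down the result type. A small sketch of what that buys you: z.infer (standard zod) recovers the parsed shape without any extra type declarations.

// The schema above infers to this TypeScript shape via z.infer:
type HNTop = z.infer<typeof schema>
// => { top: { title: string; points: number; by: string; commentsURL: string }[] }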

// Grab today's HN front page to run the example
const htmlString = await fetch('https://news.ycombinator.com/')
  .then((res) => res.text())
  .catch((e) => {
    console.error('Failed to fetch content from Hacker News', e)
    return null
  })

// Run the scraper
const pages = await scraper.rawHTML([htmlString], {
  model: 'gpt-4-turbo',
  schema,
  mode: 'html',
  closeOnFinish: true,
})

// Stream the results from the LLM
for await (const page of pages) {
  console.log(page.data)
}
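For comparison, the same scrape can run with Markdown preprocessing: loadRawHTML converts the page <body> to Markdown via Turndown before the content reaches the model. A minimal sketch, reusing the scraper, schema, and htmlString defined above; only the mode option changes:

// Sketch: identical to the run above, but with mode set to 'markdown'.
// The HTML-to-Markdown conversion happens inside loadRawHTML.
const markdownPages = await scraper.rawHTML([htmlString], {
  model: 'gpt-4-turbo',
  schema,
  mode: 'markdown',
})

for await (const page of markdownPages) {
  console.log(page.data)
}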
62 changes: 62 additions & 0 deletions src/index.ts
@@ -152,6 +152,56 @@ export default class LLMScraper
    return pages
  }

  private async loadRawHTML(
    htmlString: string | string[],
    options: ScraperLoadOptions = { mode: 'html' }
  ): Promise<Promise<ScraperLoadResult>[]> {
    const roots = Array.isArray(htmlString) ? htmlString : [htmlString]
    const pages = []

    for (const htmlRoot of roots) {
      if (!htmlRoot) {
        this.logger(`HTML content is empty - skipping`)
        continue
      }

      // Parse the raw string into a queryable document
      const page = parse(htmlRoot)
      let content

      if (options.mode === 'html') {
        content = page.toString()
      }

      if (options.mode === 'markdown') {
        // Fall back to the full document when no <body> element is present
        const body = page.querySelector('body')?.innerHTML ?? page.toString()
        content = new Turndown().turndown(body)
      }

      if (options.mode === 'text') {
        const readability = await import(
          // @ts-ignore
          'https://cdn.skypack.dev/@mozilla/readability'
        )
        // Caveat: Readability expects a DOM Document, so passing a
        // serialized HTML string may require wrapping it in a DOM
        // (e.g. via jsdom) before parsing
        const readable = new readability.Readability(page.toString()).parse()

        content = `Page Title: ${readable.title}\n${readable.textContent}`
      }

      if (options.mode === 'image') {
        this.logger(`'image' mode is not supported for raw HTML input - skipping`)
        continue
      }

      pages.push({
        htmlRoot, // maybe don't want to return entire HTML string back?
        content,
        mode: options.mode,
      })
    }

    return pages
  }

  // Generate completion using OpenAI
  private generateCompletions<T extends z.ZodSchema<any>>(
    pages: Promise<ScraperLoadResult>[],
@@ -191,6 +241,10 @@ export default class LLMScraper {
    return loader
  }

  // TODO: Simplify this implementation.
  // Each new input type currently needs its own entry method;
  // it probably makes more sense to have a single entry point
  // that delegates to the proper loader (see the sketch after this diff).
  // Load pages and generate completion
  async run<T extends z.ZodSchema<any>>(
    url: string | string[],
@@ -208,6 +262,14 @@
    return this.generateCompletions<T>(pages, options)
  }

  // Load raw HTML strings and generate completion
  async rawHTML<T extends z.ZodSchema<any>>(
    htmlString: string | string[],
    options: ScraperRunOptions<T>
  ) {
    const pages = await this.loadRawHTML(htmlString, options)
    return this.generateCompletions<T>(pages, options)
  }

  // Close the current context and the browser
  async close() {
    await this.context?.close()
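A possible shape for the single-entry refactor the TODO above suggests. This is only a sketch: the kind discriminator is hypothetical, and loadPages is assumed to be the existing URL loader that run() already uses (it is not shown in this diff).

  // Hypothetical single entry point that delegates to the proper loader.
  // `kind` is an invented parameter; `loadPages` is assumed from the
  // surrounding file and is not part of this commit.
  async scrape<T extends z.ZodSchema<any>>(
    source: string | string[],
    options: ScraperRunOptions<T>,
    kind: 'url' | 'html' = 'url'
  ) {
    const pages =
      kind === 'html'
        ? await this.loadRawHTML(source, options)
        : await this.loadPages(source, options)
    return this.generateCompletions<T>(pages, options)
  }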
