From f0e18c009351f44b7475b3df0ec5c6a0513d2402 Mon Sep 17 00:00:00 2001
From: Kiko Beats
Date: Mon, 11 Jul 2022 17:13:59 +0200
Subject: [PATCH] docs: improve user walkthrough

closes #565
---
 README.md | 113 ++++++++++++++++--------------------------------------
 1 file changed, 34 insertions(+), 79 deletions(-)

diff --git a/README.md b/README.md
index c81ade1695..3e69713b3f 100644
--- a/README.md
+++ b/README.md
@@ -21,87 +21,38 @@ It follows a few principles:
 - Make it simple to add new rules or override existing ones.
 - Don't restrict rules to CSS selectors or text accessors.
 
-## Installation
-
-```bash
-$ npm install metascraper --save
-```
-
 ## Usage
 
-Let's extract accurate information from the following article:
-
-[![](https://raw.githubusercontent.com/microlinkhq/metascraper/add-comparison/support/screenshot.png)](http://www.bloomberg.com/news/articles/2016-05-24/as-zenefits-stumbles-gusto-goes-head-on-by-selling-insurance)
-
-Then call **metascraper** with the rules bundle you want to apply for extracting content:
-
-```js
-const metascraper = require('metascraper')([
-  require('metascraper-author')(),
-  require('metascraper-date')(),
-  require('metascraper-description')(),
-  require('metascraper-image')(),
-  require('metascraper-logo')(),
-  require('metascraper-clearbit')(),
-  require('metascraper-publisher')(),
-  require('metascraper-title')(),
-  require('metascraper-url')()
-])
-
-const { fetch } = require('undici')
+Let's extract accurate information from the following website:
 
-const siteUrl = 'http://www.bloomberg.com/news/articles/2016-05-24/as-zenefits-stumbles-gusto-goes-head-on-by-selling-insurance'
+![](https://i.imgur.com/cFDIRUz.png)
 
-;(async () => {
-  const { html, url } = await fetch(siteUrl).then(async res => ({
-    url: res.url,
-    html: await res.text()
-  }))
+First, **metascraper** expects you to provide the HTML markup behind the target URL.
 
-  const metadata = await metascraper({ html, url })
-  console.log(metadata)
-})()
-```
-
-
-The output will be something like:
-
-```json
-{
-  "author": "Ellen Huet",
-  "date": "2016-05-24T18:00:03.894Z",
-  "description": "The HR startups go to war.",
-  "image": "https://assets.bwbx.io/images/users/iqjWHBFdfxIU/ioh_yWEn8gHo/v1/-1x-1.jpg",
-  "publisher": "Bloomberg.com",
-  "title": "As Zenefits Stumbles, Gusto Goes Head-On by Selling Insurance",
-  "url": "http://www.bloomberg.com/news/articles/2016-05-24/as-zenefits-stumbles-gusto-goes-head-on-by-selling-insurance"
-}
-```
-
-As you can see, metascraper needs to be feed with regular HTML.
-
-Although you can use any HTTP client for getting the markup behind any URL, we recommend you to use [html-get](https://github.com/microlinkhq/html-get) that uses Headless chrome if needed:
+There are multiple ways to get the HTML markup. In our case, we are going to run a programmatic headless browser to simulate real user navigation, so the data obtained will be close to a real-world example.
 
 ```js
-const createBrowserless = require('browserless')
 const getHTML = require('html-get')
 
-// Spawn Chromium process once
-const browserlessFactory = createBrowserless()
-
-// Kill the process when Node.js exit
-process.on('exit', browserlessFactory.close)
+/**
+ * `browserless` will be passed to `html-get` as the driver
+ * for getting the rendered HTML.
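+ * Note that `browserless` spawns a headless Chromium process under
+ * the hood, so remember to close it when you are done (see the
+ * `browserless.close()` call in the main logic below).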
+ */
+const browserless = require('browserless')()
 
 const getContent = async url => {
   // create a browser context inside Chromium process
-  const browserContext = browserlessFactory.createContext()
-  const getBrowserless = () => browserContext
-  const result = await getHTML(url, { getBrowserless })
+  const browserContext = browserless.createContext()
+  const promise = getHTML(url, { getBrowserless: () => browserContext })
   // close the browser context after it's used
-  await getBrowserless(browser => browser.destroyContext())
-  return result
+  promise.then(() => browserContext).then(browser => browser.destroyContext())
+  return promise
 }
 
+/**
+ * `metascraper` is split into tiny packages,
+ * so you can just use what you need.
+ */
 const metascraper = require('metascraper')([
   require('metascraper-author')(),
   require('metascraper-date')(),
@@ -114,30 +65,34 @@ const metascraper = require('metascraper')([
   require('metascraper-url')()
 ])
 
-getContent('https://twitter.com/BytesAndHumans/status/1532772903523065858')
+/**
+ * The main logic
+ */
+getContent('https://microlink.io')
   .then(metascraper)
+  .then(metadata => browserless.close().then(() => metadata))
   .then(metadata => {
     console.log(metadata)
-    process.exit()
+    process.exit(0)
   })
 ```
 
-being the output:
+The output will be something like:
 
-```
+```json
 {
-  author: null,
-  date: '2022-06-07T21:42:24.000Z',
-  description: '“What a week 🐣❤️📈”',
-  image: 'https://pbs.twimg.com/media/FUWAUW7XoAAxuP_.jpg:large',
-  logo: 'https://logo.clearbit.com/twitter.com',
-  publisher: 'Twitter',
-  title: 'Elena on Twitter',
-  url: 'https://twitter.com/BytesAndHumans/status/1532772903523065858'
+  "author": "Microlink HQ",
+  "date": "2022-07-10T22:53:04.856Z",
+  "description": "Enter a URL, receive information. Normalize metadata. Get HTML markup. Take a screenshot. Identify tech stack. Generate a PDF. Automate web scraping. Run Lighthouse",
+  "image": "https://cdn.microlink.io/logo/banner.jpeg",
+  "logo": "https://cdn.microlink.io/logo/trim.png",
+  "publisher": "Microlink",
+  "title": "Turns websites into data — Microlink",
+  "url": "https://microlink.io/"
 }
 ```
 
-## Metadata
+## Metadata Detection
 
 ?> Other metadata can be defined using a custom [rule bundle](#rules-bundles).
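+
+For example, a rule bundle is just a function that returns an object: keys are the metadata properties to detect, and values are one or more rules receiving `{ htmlDom, url }` (`htmlDom` is the parsed HTML as a [cheerio](https://cheerio.js.org) instance). The `lang` property below is only illustrative:
+
+```js
+// Illustrative custom rule bundle: it detects the page language
+// from the `<html lang>` attribute. The property name and the
+// selector are just an example.
+const metascraperLang = () => ({
+  lang: [({ htmlDom: $ }) => $('html').attr('lang')]
+})
+
+// Use it next to the official bundles.
+const metascraper = require('metascraper')([
+  metascraperLang(),
+  require('metascraper-title')()
+])
+```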