
Commit f0e18c0

docs: improve user walkthrough
closes #565
Kikobeats committed Jul 11, 2022
1 parent e58e9a4 commit f0e18c0
Showing 1 changed file: README.md (34 additions, 79 deletions).

It follows a few principles:
- Make it simple to add new rules or override existing ones.
- Don't restrict rules to CSS selectors or text accessors.

## Installation

```bash
$ npm install metascraper --save
```

## Usage

Let's extract accurate information from the following article:

[![](https://raw.githubusercontent.com/microlinkhq/metascraper/add-comparison/support/screenshot.png)](http://www.bloomberg.com/news/articles/2016-05-24/as-zenefits-stumbles-gusto-goes-head-on-by-selling-insurance)

Then call **metascraper** with the rules bundle you want to apply for extracting content:

```js
const metascraper = require('metascraper')([
require('metascraper-author')(),
require('metascraper-date')(),
require('metascraper-description')(),
require('metascraper-image')(),
require('metascraper-logo')(),
require('metascraper-clearbit')(),
require('metascraper-publisher')(),
require('metascraper-title')(),
require('metascraper-url')()
])

const { fetch } = require('undici')

const siteUrl = 'http://www.bloomberg.com/news/articles/2016-05-24/as-zenefits-stumbles-gusto-goes-head-on-by-selling-insurance'

;(async () => {
  const { html, url } = await fetch(siteUrl).then(async res => ({
    url: res.url,
    html: await res.text()
  }))

  const metadata = await metascraper({ html, url })
  console.log(metadata)
})()
```


The output will be something like:

```json
{
"author": "Ellen Huet",
"date": "2016-05-24T18:00:03.894Z",
"description": "The HR startups go to war.",
"image": "https://assets.bwbx.io/images/users/iqjWHBFdfxIU/ioh_yWEn8gHo/v1/-1x-1.jpg",
"publisher": "Bloomberg.com",
"title": "As Zenefits Stumbles, Gusto Goes Head-On by Selling Insurance",
"url": "http://www.bloomberg.com/news/articles/2016-05-24/as-zenefits-stumbles-gusto-goes-head-on-by-selling-insurance"
}
```

As you can see, **metascraper** needs to be fed with regular HTML: it expects you to provide the markup behind the target URL.
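
If you already have that markup on hand (for instance, from a cache or a previous crawl), you can pass it straight to the instance defined above. A minimal sketch, using hypothetical inline HTML and the `siteUrl` from the previous snippet:

```js
const html = `<html>
  <head>
    <title>As Zenefits Stumbles, Gusto Goes Head-On by Selling Insurance</title>
    <meta name="author" content="Ellen Huet">
  </head>
</html>`

// `url` is still required: some rules use it to resolve relative links or as a fallback value
metascraper({ html, url: siteUrl }).then(console.log)
```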

There are multiple ways to get that markup: although any HTTP client will do, we recommend [html-get](https://github.com/microlinkhq/html-get), which drives a programmatic headless browser (Chromium) when needed to simulate real user navigation, so the data obtained is close to a real-world example:

```js
const getHTML = require('html-get')

/**
 * `browserless` will be passed to `html-get`
 * as the driver for getting the rendered HTML.
 */
const browserless = require('browserless')()

const getContent = async url => {
  // create a browser context inside the Chromium process
  const browserContext = browserless.createContext()
  const promise = getHTML(url, { getBrowserless: () => browserContext })
  // close the browser context after it's used
  promise.then(() => browserContext).then(browser => browser.destroyContext())
  return promise
}

/**
 * `metascraper` is split into tiny packages,
 * so you can just use what you need.
 */
const metascraper = require('metascraper')([
  require('metascraper-author')(),
  require('metascraper-date')(),
  require('metascraper-description')(),
  require('metascraper-image')(),
  require('metascraper-logo')(),
  require('metascraper-clearbit')(),
  require('metascraper-publisher')(),
  require('metascraper-title')(),
  require('metascraper-url')()
])

/**
 * The main logic
 */
getContent('https://microlink.io')
  .then(metascraper)
  .then(metadata => browserless.close().then(() => metadata))
  .then(metadata => {
    console.log(metadata)
    process.exit(0)
  })
```

The output will be something like:

```json
{
  "author": "Microlink HQ",
  "date": "2022-07-10T22:53:04.856Z",
  "description": "Enter a URL, receive information. Normalize metadata. Get HTML markup. Take a screenshot. Identify tech stack. Generate a PDF. Automate web scraping. Run Lighthouse",
  "image": "https://cdn.microlink.io/logo/banner.jpeg",
  "logo": "https://cdn.microlink.io/logo/trim.png",
  "publisher": "Microlink",
  "title": "Turns websites into data — Microlink",
  "url": "https://microlink.io/"
}
```
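
Because each call to `getContent` opens its own browser context inside the single Chromium process, you can reuse it to scrape several URLs cheaply. A hedged sketch, assuming the `getContent`, `metascraper`, and `browserless` bindings from the snippet above and a couple of example URLs:

```js
const urls = [
  'https://microlink.io',
  'https://github.com/microlinkhq/metascraper'
]

Promise.all(urls.map(url => getContent(url).then(metascraper)))
  .then(allMetadata => {
    console.log(allMetadata)
    // shut down the shared Chromium process once every URL has been processed
    return browserless.close()
  })
  .then(() => process.exit(0))
```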

## Metadata Detection

?> Other metadata can be defined using a custom [rule bundle](#rules-bundles).
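
For illustration, a rules bundle is just a function that returns rules keyed by the property they resolve. A minimal sketch, assuming the `({ htmlDom: $, url }) => value` rule signature used by the official bundles and a hypothetical `keywords` property:

```js
// hypothetical bundle: resolves a `keywords` property from a meta tag
const metascraperKeywords = () => ({
  keywords: [
    // `htmlDom` is a cheerio instance of the page markup;
    // rules run in order until one of them returns a value
    ({ htmlDom: $ }) => $('meta[name="keywords"]').attr('content')
  ]
})

const metascraper = require('metascraper')([
  metascraperKeywords(),
  require('metascraper-title')(),
  require('metascraper-url')()
])
```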
