diff --git a/packages/metascraper-readability/src/index.js b/packages/metascraper-readability/src/index.js index 3531a9686..9e3aea199 100644 --- a/packages/metascraper-readability/src/index.js +++ b/packages/metascraper-readability/src/index.js @@ -1,22 +1,44 @@ 'use strict' const { memoizeOne, composeRule } = require('@metascraper/helpers') +const { Readability } = require('@mozilla/readability') const asyncMemoizeOne = require('async-memoize-one') -const { Worker } = require('worker_threads') -const path = require('path') +const { Browser } = require('happy-dom') -const SCRIPT_PATH = path.resolve(__dirname, 'worker.js') +const parseReader = reader => { + const parsed = reader.parse() + return parsed || {} +} -const readability = asyncMemoizeOne((url, html, readabilityOpts) => { - const worker = new Worker(SCRIPT_PATH, { - workerData: { url, html, readabilityOpts }, - stdout: true, - stderr: true +const getDocument = ({ url, html }) => { + const browser = new Browser({ + settings: { + disableComputedStyleRendering: true, + disableCSSFileLoading: true, + disableIframePageLoading: true, + disableJavaScriptEvaluation: true, + disableJavaScriptFileLoading: true + } }) - const { promise, resolve, reject } = Promise.withResolvers() - worker.on('message', message => resolve(JSON.parse(message))) - worker.on('error', reject) - return promise + + const page = browser.newPage() + page.url = url + page.content = html + return { + document: page.mainFrame.document, + teardown: () => browser.close() + } +} + +const readability = asyncMemoizeOne(async (url, html, readabilityOpts) => { + const { document, teardown } = getDocument({ url, html }) + try { + const reader = new Readability(document, readabilityOpts) + const result = parseReader(reader) + return result + } finally { + await teardown() + } }, memoizeOne.EqualityFirstArgument) module.exports = ({ readabilityOpts } = {}) => { diff --git a/packages/metascraper-readability/src/worker.js b/packages/metascraper-readability/src/worker.js deleted file mode 100644 index d54a88913..000000000 --- a/packages/metascraper-readability/src/worker.js +++ /dev/null @@ -1,34 +0,0 @@ -'use strict' - -const { workerData, parentPort } = require('node:worker_threads') -const { Readability } = require('@mozilla/readability') - -const parseReader = reader => { - try { - return reader.parse() - } catch (_) { - return {} - } -} - -const errorCapture = - process.env.NODE_ENV === 'test' ? 'tryAndCatch' : 'processLevel' - -const getDocument = ({ url, html }) => { - const { Window } = require('happy-dom') - const window = new Window({ - url, - settings: { errorCapture } - }) - const document = window.document - document.write(html) - return document -} - -const main = async ({ url, html, readabilityOpts } = {}) => { - const document = getDocument({ url, html }) - const reader = new Readability(document, readabilityOpts) - return parseReader(reader) -} - -main(workerData).then(result => parentPort.postMessage(JSON.stringify(result))) diff --git a/packages/metascraper/test/integration/atlasobscura/index.js b/packages/metascraper/test/integration/atlasobscura/index.js index aaf02da57..6b554ed12 100644 --- a/packages/metascraper/test/integration/atlasobscura/index.js +++ b/packages/metascraper/test/integration/atlasobscura/index.js @@ -21,7 +21,7 @@ const metascraper = require('../../..')([ require('metascraper-readability')() ]) -const url = 'http://www.atlasobscura.com/articles/ikea-bowl-blanda-blank-fire' +const url = 'https://www.atlasobscura.com/articles/ikea-bowl-blanda-blank-fire' test('atlasobscura', async t => { const html = await readFile(resolve(__dirname, 'input.html')) diff --git a/packages/metascraper/test/integration/atlasobscura/input.html b/packages/metascraper/test/integration/atlasobscura/input.html index 07d8a74c7..aeb1bf265 100644 --- a/packages/metascraper/test/integration/atlasobscura/input.html +++ b/packages/metascraper/test/integration/atlasobscura/input.html @@ -1,2741 +1,2798 @@ - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - + + + + This Ikea Bowl Has Been Setting Things on Fire - Atlas Obscura + + + - - - - - - - - - - - - - - - - - - - - - @media (min-width: 668px) { - #start-itinerary .htlunit-Infeed_Medrec { - margin: -70px auto 40px auto; - } - } - - @media (max-width: 667px) { - #start-itinerary .htlunit-Infeed_Medrec { - margin-top: 30px; - margin-bottom: -50px; - } - } - - .htlunit-Mobile_Leaderboard { - padding-top: 15px; - padding-bottom: 5px; - background-color: rgba(56,44,20,0.05); - width: 100%; - } - - @media (min-width: 668px) { - .htlunit-Mobile_Leaderboard { - padding: 0; - margin: 0; - display: none; + + - @media (max-width: 667px) { - .htlunit-Infeed_Medrec { - margin-top: 24px !important; - margin-bottom: 24px !important; - } - } - - .htlunit-Desktop_Leaderboard { - margin-bottom: 30px; - } - - #htlad-6-destination-desktop_leaderboard-1 { - margin-top: 30px !important; - margin-bottom: -25px !important; - } - - #htlad-8-destination-desktop_leaderboard-2 { - padding-top: 18px; - } - - @media (max-width: 667px) { - .nav-toggle-container { - margin-top: -10px; - height: 100% !important; - } - } - - .ad-background.htl-empty { - display:none; - } - - .htlunit-Place_page_inline { - margin-bottom: 22px; - } - - div.CardWrapper + div.htl-ad, - section.ao-container div.grid div + div.htl-ad { - grid-column-start: 1; - grid-column-end: -1; - } + - header div.htlunit-Rotational_Top_Slot iframe, - main div.htlunit-Feature_Inline iframe { - margin: 0 auto; - } - This Ikea Bowl Has Been Setting Things on Fire - Atlas Obscura - - - - - - - - +googletag.cmd.push(function() { + googletag.pubads().setTargeting('is_home', 'NO'); + googletag.pubads().setTargeting("page", "article") + googletag.pubads().setTargeting("post_id", "article-8818"); + googletag.pubads().setTargeting("pid", "article-8818"); + googletag.pubads().setTargeting("tags", ["fleeting wonders", "news", "fire", "minor threats"]); +}); + + + - + - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + - - - -
- - - - - -
+ + + + + + + + +
+ + + + + + + + +
+
+
+ +
+ +
-
-
-
+ + + + + + + + + + + + + + + + + + +
+ +
-