Skip to content

Commit

Permalink
add scripts to simple scraper.
Browse files Browse the repository at this point in the history
  • Loading branch information
Jason Schwarzenberger committed Dec 4, 2020
1 parent 006be62 commit d50481b
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 5 deletions.
4 changes: 2 additions & 2 deletions scraper/headless/get-details.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ module.exports.getDetails = async (url) => {
});

await tab.addInitScript({ path: "vendor/bypass-paywalls-chrome/src/js/contentScript.js" });
await tab.addInitScript({ path: "scraper/headless/scripts/cosmetic-filter.js" });
await tab.addInitScript({ path: "scraper/headless/scripts/fix-relative-links.js" });
await tab.addInitScript({ path: "scripts/cosmetic-filter.js" });
await tab.addInitScript({ path: "scripts/fix-relative-links.js" });
await tab.goto(url, { timeout: 90000, waitUntil: "domcontentloaded" });
await tab.waitForTimeout(2000);

Expand Down
38 changes: 37 additions & 1 deletion scraper/simple/get-details.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,30 @@
const fetch = require('node-fetch');
const { JSDOM } = require('jsdom');
const { Script, createContext } = require("vm");
const { readFile } = require('fs');


const { getUserAgent } = require('../../utils/user-agent');
const { extractReadable } = require('../../utils/extract-metadata');

/**
 * Read a script file from disk and evaluate it inside an existing vm context.
 *
 * This is best-effort: any failure (unreadable file, syntax error, or an
 * exception thrown while the script runs) is logged with `console.error`
 * and the context is still returned, so callers can keep chaining scripts.
 *
 * @param {string} filename - Path of the script file to load.
 * @param {object} context - A contextified object (from `vm.createContext`)
 *   that the script is executed against.
 * @returns {Promise<object>} The same `context` that was passed in.
 */
async function runScript(filename, context) {
  try {
    await new Promise((resolve, reject) => {
      readFile(filename, 'utf8', (e, content) => {
        if (e) {
          // Stop here: without the return we would compile and run
          // `undefined` as if it were the script source.
          reject(e);
          return;
        }
        // Compile/run errors are thrown inside this callback, outside the
        // promise executor, so they must be converted to a rejection by
        // hand; otherwise they surface as an uncaught exception.
        try {
          const script = new Script(content, { filename: `(internal):${filename}` });
          script.runInContext(context);
          resolve();
        } catch (scriptError) {
          reject(scriptError);
        }
      });
    });
  } catch (e) {
    console.error(e);
  }
  return context;
}

module.exports.getDetails = async (url) => {
try {
const { userAgent, headers } = getUserAgent(url);
Expand All @@ -15,7 +37,21 @@ module.exports.getDetails = async (url) => {
if (!response.ok) {
throw response.statusText;
}
const html = await response.text();

const { window } = new JSDOM(await response.text(), { url });
window.window = window;
window.setTimeout = cb => cb();
window.setInterval = cb => cb();
const context = createContext(window);

await runScript('vendor/bypass-paywalls-chrome/src/js/contentScript.js', context);
await runScript('scripts/cosmetic-filter.js', context);
await runScript('scripts/fix-relative-links.js', context);

const script = new Script(`window.dispatchEvent(new window.Event('DOMContentLoaded'));`);
script.runInContext(context);
const html = context.document.querySelector('html').innerHTML;

const readable = await extractReadable(html, url);
return readable;
} catch (e) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
window.addEventListener('DOMContentLoaded', function (event) {
window.addEventListener('DOMContentLoaded', function () {
removeHiddenElements();

if (matchDomain("stuff.co.nz")) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
window.addEventListener('DOMContentLoaded', function (event) {
window.addEventListener('DOMContentLoaded', function () {
const { host, protocol } = window.location;
const url = `${protocol}//${host}`;
[
Expand Down

0 comments on commit d50481b

Please sign in to comment.