Skip to content

Commit 5213c35

Browse files
authored
Speed up wiki scraping (#42)
* Scrape 20 pages at a time. This significantly speeds up wiki scraping. * Wipe output folder by default. Is there a reason not to do this?
1 parent 4356d7c commit 5213c35

File tree

2 files changed

+9
-2
lines changed

2 files changed

+9
-2
lines changed

package.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@
2020
"scripts": {
2121
"clear-output": "rm -rf ./output/",
2222
"wiki-check-changed": "tsx ./src/cli-change-checker.ts",
23-
"scrape-wiki": "tsx ./src/cli-scraper.ts --output ./output/ --customOverrides ./custom/",
23+
"scrape-wiki": "tsx ./src/cli-scraper.ts --output ./output/ --customOverrides ./custom/ --wipe",
2424
"pack-release": "tsx ./src/cli-release-packer.ts --input ./output/ --output ./dist/release/",
2525
"publish-library": "tsx ./src/cli-library-publisher.ts --input ./output/ --output ./dist/libraries/garrysmod",
2626
"stylua-custom": "npx --yes @johnnymorganz/stylua-bin@0.17.1 ./custom",

src/cli-scraper.ts

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -82,6 +82,7 @@ async function startScrape() {
8282

8383
const pageIndexes = await scrapeAndCollect(pageListScraper);
8484

85+
let queue: Promise<any>[] = [];
8586
for (const pageIndex of pageIndexes) {
8687
const pageMarkupScraper = new WikiPageMarkupScraper(`${baseUrl}/${pageIndex.address}?format=text`);
8788

@@ -119,7 +120,13 @@ async function startScrape() {
119120
fs.writeFileSync(path.join(baseDirectory, moduleName, `${fileName}.json`), json);
120121
});
121122

122-
await pageMarkupScraper.scrape();
123+
queue.push(pageMarkupScraper.scrape());
124+
125+
if (queue.length > 20)
126+
{
127+
await Promise.allSettled(queue);
128+
queue = [];
129+
}
123130
}
124131

125132
console.log(`Done with scraping! You can find the output in ${baseDirectory}`);

0 commit comments

Comments (0)