crawler.ts · 89 lines (69 loc) · 2.5 KB
import cheerio from 'cheerio';
import { NodeHtmlMarkdown } from 'node-html-markdown';

interface Page {
  url: string;
  content: string;
}

class Crawler {
  private seen = new Set<string>();
  private pages: Page[] = [];
  private queue: { url: string; depth: number }[] = [];

  constructor(private maxDepth = 2, private maxPages = 1) {}

  async crawl(startUrl: string): Promise<Page[]> {
    // Add the start URL to the queue
    this.addToQueue(startUrl);

    // While there are URLs in the queue and we haven't reached the maximum number of pages...
    while (this.shouldContinueCrawling()) {
      // Dequeue the next URL and depth
      const { url, depth } = this.queue.shift()!;

      // If the depth is too great or we've already seen this URL, skip it
      if (this.isTooDeep(depth) || this.isAlreadySeen(url)) continue;

      // Add the URL to the set of seen URLs
      this.seen.add(url);

      // Fetch the page HTML
      const html = await this.fetchPage(url);

      // Parse the HTML and add the page to the list of crawled pages
      this.pages.push({ url, content: this.parseHtml(html) });

      // Extract new URLs from the page HTML and add them to the queue
      this.addNewUrlsToQueue(this.extractUrls(html, url), depth);
    }

    // Return the list of crawled pages
    return this.pages;
  }

  private isTooDeep(depth: number) {
    return depth > this.maxDepth;
  }

  private isAlreadySeen(url: string) {
    return this.seen.has(url);
  }

  private shouldContinueCrawling() {
    return this.queue.length > 0 && this.pages.length < this.maxPages;
  }

  private addToQueue(url: string, depth = 0) {
    this.queue.push({ url, depth });
  }

  private addNewUrlsToQueue(urls: string[], depth: number) {
    this.queue.push(...urls.map(url => ({ url, depth: depth + 1 })));
  }

  private async fetchPage(url: string): Promise<string> {
    try {
      const response = await fetch(url);
      return await response.text();
    } catch (error) {
      console.error(`Failed to fetch ${url}: ${error}`);
      return '';
    }
  }

  private parseHtml(html: string): string {
    const $ = cheerio.load(html);
    $('a').removeAttr('href');
    return NodeHtmlMarkdown.translate($.html());
  }

  private extractUrls(html: string, baseUrl: string): string[] {
    const $ = cheerio.load(html);
    const relativeUrls = $('a').map((_, link) => $(link).attr('href')).get() as string[];
    return relativeUrls.map(relativeUrl => new URL(relativeUrl, baseUrl).href);
  }
}
export { Crawler };
export type { Page };
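
// --- Usage sketch (not part of the original file) ---
// A minimal example of how the class above might be driven. The start URL and the
// maxDepth/maxPages values are hypothetical, and the snippet assumes a Node 18+
// runtime where `fetch` is available globally.
//
// const crawler = new Crawler(/* maxDepth */ 2, /* maxPages */ 5);
// const pages = await crawler.crawl('https://example.com');
// for (const { url, content } of pages) {
//   console.log(url, content.slice(0, 80)); // first 80 chars of the Markdown output
// }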