forked from mongodb/chatbot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
DevCenterDataSource.ts
134 lines (122 loc) · 3.79 KB
/
DevCenterDataSource.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import { MongoClient } from "mongodb-rag-core";
import { strict as assert } from "assert";
import { convert } from "html-to-text";
import { Page, assertEnvVars, logger } from "mongodb-rag-core";
import { INGEST_ENV_VARS } from "../IngestEnvVars";
import { removeMarkdownImagesAndLinks } from "./removeMarkdownImagesAndLinks";
import { DataSource } from "./DataSource";
import { ProjectBase } from "./ProjectBase";
/**
  Configuration for a DevCenter data source: the shared ProjectBase fields plus
  the MongoDB location to read entries from and the base URL used to build
  page URLs.
 */
export type DevCenterProjectConfig = ProjectBase & {
// Literal tag identifying this config as a DevCenter project.
type: "devcenter";
// Name of the database handed to `client.db()` when pages are fetched.
databaseName: string;
// Name of the collection (inside `databaseName`) that holds the entries.
collectionName: string;
// Base URL against which non-absolute `calculated_slug` values are resolved.
baseUrl: string;
};
/**
  Shape of a document read from the DevCenter collection.

  This type is based on what's in the DevCenter search_content_prod collection.
 */
export type DevCenterEntry = {
// Entry title; becomes the page title and the `# ` heading of the page body.
name: string;
// Short description; stored on the page as `metadata.pageDescription`.
description: string;
// Raw entry content. Entries whose content is null or empty are skipped by fetchPages().
content: string | null;
// Either an absolute http(s) URL or a path that is resolved against the project's baseUrl.
calculated_slug: string;
// Tags attached to the entry; only some tag types are kept (see extractTags).
tags: DevCenterEntryTag[];
// Kind of content; stored on the page as `metadata.contentType`.
type: string;
};
/**
  A single tag on a DevCenter entry.
 */
export interface DevCenterEntryTag {
// Human-readable tag value; this is what ends up in the page metadata.
name: string;
// Tag category. extractTags() keeps "L1Product", "Technology" and "ProgrammingLanguage".
type: string;
}
/**
  Create a DataSource that reads DevCenter entries from MongoDB and turns them
  into pages.

  The connection string is taken from the DEVCENTER_CONNECTION_URI environment
  variable, which is checked once, when the data source is created. Each call
  to fetchPages() opens its own client and always closes it again, even if
  reading fails part-way through.
 */
export const makeDevCenterDataSource = async ({
  name,
  databaseName,
  collectionName,
  baseUrl,
}: DevCenterProjectConfig): Promise<DataSource> => {
  const { DEVCENTER_CONNECTION_URI } = assertEnvVars(INGEST_ENV_VARS);

  return {
    name,
    async fetchPages() {
      const mongoClient = await new MongoClient(
        DEVCENTER_CONNECTION_URI
      ).connect();
      try {
        const entries = mongoClient
          .db(databaseName)
          .collection<DevCenterEntry>(collectionName)
          .find();

        const result: Page[] = [];
        for await (const entry of entries) {
          if (entry.content) {
            result.push(makeDevCenterPage(entry, name, baseUrl));
          } else {
            // Nothing to ingest for this entry; note it and move on.
            logger.warn(
              `Discarding empty content document: ${entry.calculated_slug}`
            );
          }
        }
        return result;
      } finally {
        await mongoClient.close();
      }
    },
  };
};
/**
  Convert one DevCenter entry into a Page.

  @param document - The entry to convert. Its content must be non-empty.
  @param name - Data source name, recorded as the page's sourceName.
  @param baseUrl - Base URL used when the entry's slug is not already an
    absolute http(s) URL.
  @returns The page, in markdown format.
  @throws An assertion error if the entry has no content.
 */
export function makeDevCenterPage(
  document: DevCenterEntry,
  name: string,
  baseUrl: string
): Page {
  const { content, calculated_slug: slug } = document;
  assert(content, "document.content must be defined");

  const body = makeDevCenterPageBody({ title: document.name, content });
  const tags = extractTags(document.tags);

  let url: string;
  if (/^https?:\/\//.test(slug)) {
    // Slug is already a full URL; use it untouched.
    url = slug;
  } else {
    // Strip leading slash (if present) to not clobber baseUrl path
    const relativePath = slug.replace(/^\/?/, "");
    // Add trailing slash to not lose last segment of baseUrl
    const baseWithSlash = baseUrl.replace(/\/?$/, "/");
    url = new URL(relativePath, baseWithSlash).toString();
  }

  return {
    title: document.name,
    body,
    format: "md",
    sourceName: name,
    metadata: {
      tags,
      pageDescription: document.description,
      contentType: document.type,
    },
    url,
  };
}
/**
  Pick the tag names worth keeping from a dev center entry's tags.

  Only tags whose type is "L1Product", "Technology" or "ProgrammingLanguage"
  are kept; the result preserves the input order.
 */
export function extractTags(tags: DevCenterEntryTag[]) {
  const keptTypes = ["L1Product", "Technology", "ProgrammingLanguage"];
  return tags.flatMap(({ name, type }) =>
    keptTypes.includes(type) ? [name] : []
  );
}
/**
  Build the markdown body of a DevCenter page.

  Steps, in order:
  1. Prepend the title (if any) as a level-1 markdown heading.
  2. Run the text through html-to-text's convert(), skipping `<img>` elements.
  3. Remove markdown images and links.
  4. Remove YouTube markdown directives (e.g. `:youtube[]{vid=abc123}`).
  5. Collapse runs of three or more newlines into a single blank line.

  @param title - Optional page title.
  @param content - Raw entry content (markdown that may contain HTML).
  @returns The cleaned-up markdown body.
 */
export function makeDevCenterPageBody({
  title,
  content,
}: {
  title?: string;
  content: string;
}): string {
  const mdTitle = title ? `# ${title}\n\n` : "";
  const withTitle = mdTitle + content;

  // Spaces are swapped for an uncommon placeholder character before convert()
  // and swapped back afterwards, because (per the original author's note)
  // convert() otherwise drops whitespace at the beginning of lines.
  // NOTE: a literal 🎃 already present in the content also comes back as a space.
  const spacePlaceholder = "🎃";
  const plainText = convert(withTitle.replaceAll(" ", spacePlaceholder), {
    preserveNewlines: true,
    selectors: [{ selector: "img", format: "skip" }],
  }).replaceAll(spacePlaceholder, " ");

  return (
    removeMarkdownImagesAndLinks(plainText)
      // Remove YouTube markdown directives (e.g. `:youtube[]{some content}`).
      // The quantifier is lazy so the match stops at the directive's own
      // closing brace; a greedy `.*` would run to the last `}` on the line and
      // delete any text (or further directives) in between.
      .replaceAll(/:youtube\[\]\{.*?\}/g, "")
      // Remove unnecessary newlines.
      .replaceAll(/\n{3,}/g, "\n\n")
  );
}