forked from mongodb/chatbot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
CodeOnGithubTextDataSource.ts
90 lines (86 loc) · 2.29 KB
/
CodeOnGithubTextDataSource.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import { Page, PageMetadata } from "mongodb-rag-core";
import {
MakeGitHubDataSourceArgs,
makeGitHubDataSource,
} from "./GitHubDataSource";
import path from "path";
import { pageFormat } from "mongodb-rag-core/build/PageFormat";
/**
Parameters accepted by `makeCodeOnGithubTextDataSource`.

Every field of `MakeGitHubDataSourceArgs` except `handleDocumentInRepo`,
plus an optional `metadata` object.
*/
export type MakeCodeOnGithubTextDataSourceParams =
// `handleDocumentInRepo` is omitted from the inherited args because
// `makeCodeOnGithubTextDataSource` supplies its own document handler.
Omit<MakeGitHubDataSourceArgs, "handleDocumentInRepo"> & {
/**
Metadata to include with all Pages in DB.

`makeCodeOnGithubTextDataSource` spreads this into each page's `metadata`
and then sets `programmingLanguage`, so a `programmingLanguage` key given
here is overwritten.
*/
metadata?: PageMetadata;
};
/**
  Creates a GitHub data source that loads source code files from a repo
  and converts each loaded document into a `Page`.

  - A built-in list of ignore patterns (license, contributing, git and editor
    config files) is always applied; patterns passed in
    `repoLoaderOptions.ignoreFiles` are appended to it.
  - Each page's `format` and `metadata.programmingLanguage` are both derived
    from the file extension of the document's source path.
  - Page URLs point at the file's blob view on `repoLoaderOptions.branch`,
    falling back to `"master"` when no branch is given.
 */
export const makeCodeOnGithubTextDataSource = async ({
  name,
  repoUrl,
  repoLoaderOptions,
  filter,
  metadata,
}: MakeCodeOnGithubTextDataSourceParams) => {
  // Built-in exclusions first, then whatever the caller asked to ignore.
  const ignoreFiles = [
    /LICENSE/,
    /CONTRIBUTING/,
    /\.git/, // Ignores .git/, .gitignore, .github/, etc.
    /\.dockerignore/,
    /\.gcloudignore/,
    /\.editorconfig/,
    /\.vscode/,
    ...(repoLoaderOptions?.ignoreFiles ?? []),
  ];

  // The merged ignore list replaces any `ignoreFiles` carried by the spread.
  const mergedLoaderOptions = {
    ...(repoLoaderOptions ?? {}),
    ignoreFiles,
  };

  return makeGitHubDataSource({
    name,
    repoUrl,
    filter,
    repoLoaderOptions: mergedLoaderOptions,
    handleDocumentInRepo: async (document) => {
      const sourcePath = document.metadata.source;
      const format = pageFormat(getFileExtension(sourcePath));
      const url = pageBlobUrl({
        repoUrl,
        branch: repoLoaderOptions?.branch ?? "master",
        filePath: sourcePath,
      });

      const page: Page = {
        body: document.pageContent,
        format,
        sourceName: name,
        url,
        metadata: {
          ...(metadata ?? {}),
          // Set last so it wins over a same-named key in `metadata`.
          programmingLanguage: format,
        },
      };
      return page;
    },
  });
};
/**
  Returns the extension of the last segment of `filePath`, without the
  leading dot.

  The extension is the text after the final `.` of the file name. It may not
  contain a path separator, so a dot that belongs to a directory name
  (e.g. `docs/v1.2/Makefile`) is never mistaken for an extension.

  @param filePath - Path of the file, e.g. `src/index.ts`.
  @returns The extension (e.g. `"ts"`), or `"txt"` when the file name has none.
 */
function getFileExtension(filePath: string): string {
  // Capture what follows the last dot, refusing to cross `/` or `\` so the
  // match stays inside the final path segment.
  const match = filePath.match(/\.([^./\\]+)$/);
  // No extension found: default to "txt".
  return match?.[1] ?? "txt";
}
/**
  Builds the URL of a file's "blob" page in a hosted Git repository:
  `<repoUrl>/blob/<branch>/<filePath>`.

  Literal `?` and `#` characters in `branch` and `filePath` are
  percent-encoded so the URL parser does not read them as the start of a
  query string or fragment (e.g. a `C#/` directory). `%` is left untouched,
  so input that is already percent-encoded passes through unchanged.

  @param args.repoUrl - Absolute URL of the repository. Only its origin and
    path are used.
  @param args.branch - Branch name placed after the `blob` segment.
  @param args.filePath - Path of the file within the repo, as a single string
    or as a list of path segments. When omitted, the URL ends at the branch.
  @returns The absolute URL as a string.
  @throws TypeError if `repoUrl` is not a valid absolute URL.
 */
export function pageBlobUrl(args: {
  repoUrl: string;
  branch: string;
  filePath?: string | string[];
}): string {
  const { origin, pathname: repoUrlPath } = new URL(args.repoUrl);

  // `?` and `#` are legal in branch and file names but delimit the query and
  // fragment in a URL, so they must be escaped before parsing.
  const escapeUrlDelimiters = (segment: string): string =>
    segment.replace(/[?#]/g, (delimiter) => encodeURIComponent(delimiter));

  const filePathSegments =
    args.filePath === undefined
      ? []
      : Array.isArray(args.filePath)
      ? args.filePath
      : [args.filePath];

  // POSIX join keeps forward slashes on every platform and collapses
  // duplicate or trailing slashes.
  const urlPath = path.posix.join(
    repoUrlPath,
    "blob",
    escapeUrlDelimiters(args.branch),
    ...filePathSegments.map((segment) => escapeUrlDelimiters(segment))
  );
  return new URL(urlPath, origin).toString();
}