In [147]:
import { Document } from "langchain/document";

// Build a Document by hand: `pageContent` holds the text and `metadata`
// holds free-form fields describing where that text came from.
const test = new Document({
  pageContent: "test text",
  metadata: { source: "ABC Title" },
});

// Bare trailing expression so the notebook renders the value.
test

Document {
  pageContent: "test text",
  metadata: { source: "ABC Title" }
}

In [148]:
// TextLoader
import { TextLoader } from "langchain/document_loaders/fs/text";
// Read a plain-text file from disk. In the rendered result the Document's
// `metadata.source` is the same path that was passed to the loader.
const loader = new TextLoader("data/cafe.txt");
const docs = await loader.load();

// Bare trailing expression so the notebook renders the value.
docs

[
  Document {
    pageContent: "咖啡（英语：coffee）是指咖啡植物的种子即咖啡豆在经过烘焙磨粉后通过冲泡制成的饮料，咖啡亦是世界上流行范围最为广泛的饮料之一。未经烘焙的咖啡生豆作为世界上最大的农产品出口物，以及世界上交易量为广泛"... 454 more characters,
    metadata: { source: "data/cafe.txt" }
  }
]

In [149]:
// PDFLoader
import * as pdfParse from "pdf-parse";
import { PDFLoader } from "langchain/document_loaders/fs/pdf";
// With `splitPages: false` the rendered result is a single Document for the
// whole file (its metadata reports totalPages: 14), rather than one per page.
const loader = new PDFLoader("data/github-copliot.pdf", {
  splitPages: false,
});
const pdfs = await loader.load();

// Bare trailing expression so the notebook renders the value.
pdfs

[
  Document {
    pageContent: "2024/3/24 20:59\n" +
      "如何使用 github copilot 完成 50% 的日常工作\n" +
      "https://kaiyi.cool/blog/github-copilot1/14\n" +
      "如何使用 git"... 6530 more characters,
    metadata: {
      source: "data/github-copliot.pdf",
      pdf: {
        version: "1.10.100",
        info: {
          PDFFormatVersion: "1.4",
          IsAcroFormPresent: false,
          IsXFAPresent: false,
          Title: "如何使用 github copilot 完成 50% 的日常工作",
          Creator: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0."... 17 more characters,
          Producer: "Skia/PDF m123",
          CreationDate: "D:20240324125917+00'00'",
          ModDate: "D:20240324125917+00'00'"
        },
        metadata: null,
        totalPages: 14
      }
    }
  }
]

In [150]:
// DirectoryLoader
import { DirectoryLoader } from "langchain/document_loaders/fs/directory";

// Load the files under ./data, choosing a loader by file extension.
// Each map entry is a factory that receives a file path and returns the
// loader for that file. PDFLoader and TextLoader are not imported in this
// cell; they are expected to be in scope from earlier cells.
const loader = new DirectoryLoader("./data", {
  ".pdf": (filePath) => new PDFLoader(filePath, { splitPages: false }),
  ".txt": (filePath) => new TextLoader(filePath),
});
const docs = await loader.load();

// Bare trailing expression so the notebook renders the value.
docs

[
  Document {
    pageContent: [32m"2024/3/24 20:59\n"[39m +
      [32m"如何使用 github copilot 完成 50% 的日常工作\n"[39m +
      [32m"https://kaiyi.cool/blog/github-copilot1/14\n"[39m +
      [32m"如何使用 git"[39m... 6530 more characters,
    metadata: {
      source: [32m"/Users/yyl/Documents/yyl/study/yyl-ai/jupyter-ai/js/data/github-copliot.pdf"[39m,
      pdf: {
        version: [32m"1.10.100"[39m,
        info: {
          PDFFormatVersion: [32m"1.4"[39m,
          IsAcroFormPresent: [33mfalse[39m,
          IsXFAPresent: [33mfalse[39m,
          Title: [32m"如何使用 github copilot 完成 50% 的日常工作"[39m,
          Creator: [32m"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0."[39m... 17 more characters,
          Producer: [32m"Skia/PDF m123"[39m,
          CreationDate: [32m"D:20240324125917+00'00'"[39m,
          ModDate: [32m"D:20240324125917+00'00'"[39m
        },
        metadata: [1mnull[22m,
        totalPages: 

In [151]:
// Github loader
import { GithubRepoLoader } from "langchain/document_loaders/web/github";
import ignore from "ignore";

// Configure a loader for the qwerty-learner repository. This cell only
// constructs the loader and displays its configuration; `load()` is not
// called here.
const loader = new GithubRepoLoader("https://github.com/RealKai42/qwerty-learner", {
  branch: "master",
  // NOTE(review): presumably limits loading to the top-level directory; confirm against the LangChain docs.
  recursive: false,
  // NOTE(review): presumably logs a warning for unrecognised entries instead of failing; confirm.
  unknown: "warn",
  // Patterns excluded from loading; they are turned into ignore rules on the loader.
  ignorePaths: ["*.md", "yarn.lock", "*.json"],
  // Token is read from the Deno environment. The rendered loader shows
  // `accessToken: undefined`, i.e. GITHUB_TOKEN was not set when this ran.
  accessToken: Deno.env.get("GITHUB_TOKEN"),
});

// Bare trailing expression so the notebook renders the value.
loader

GithubRepoLoader {
  baseUrl: [32m"https://github.com"[39m,
  apiUrl: [32m"https://api.github.com"[39m,
  owner: [32m"RealKai42"[39m,
  repo: [32m"qwerty-learner"[39m,
  initialPath: [32m""[39m,
  headers: { [32m"User-Agent"[39m: [32m"langchain"[39m },
  branch: [32m"master"[39m,
  recursive: [33mfalse[39m,
  processSubmodules: [33mfalse[39m,
  unknown: [32m"warn"[39m,
  accessToken: [90mundefined[39m,
  ignoreFiles: [],
  ignore: Ignore {
    _rules: [
      IgnoreRule {
        origin: [32m"*.md"[39m,
        pattern: [32m"*.md"[39m,
        negative: [33mfalse[39m,
        regex: [31m/(?:^|\/)[^\/]*\.md(?=$|\/$)/i[39m
      },
      IgnoreRule {
        origin: [32m"yarn.lock"[39m,
        pattern: [32m"yarn.lock"[39m,
        negative: [33mfalse[39m,
        regex: [31m/(?:^|\/)yarn\.lock(?=$|\/$)/i[39m
      },
      IgnoreRule {
        origin: [32m"*.json"[39m,
        pattern: [32m"*.json"[39m,
        negative: [33mfalse[39m,
     

In [152]:
// WebLoader
import "cheerio";
import { CheerioWebBaseLoader } from "langchain/document_loaders/web/cheerio";

// Load the blog post with the Cheerio-based web loader, using its default options.
const loader = new CheerioWebBaseLoader("https://kaiyi.cool/blog/github-copilot");
const docs = await loader.load();

// Bare trailing expression so the notebook renders the value.
docs

In [None]:
// Same page as before, but with a `selector` option.
// NOTE(review): presumably restricts extraction to text inside <h3> elements; confirm against the loader docs.
const loader = new CheerioWebBaseLoader("https://kaiyi.cool/blog/github-copilot", {
  selector: "h3",
});

const docs = await loader.load();
// Print only the text of the first returned Document.
console.log(docs[0].pageContent)
  

In [None]:
// Search API
import { SerpAPILoader } from "langchain/document_loaders/web/serpapi";
// Run a web search through SerpAPI and load the results as Documents.
// The API key is read from the SERP_KEY environment variable via Deno.
const apiKey = Deno.env.get("SERP_KEY");
// Search query. The product name is spelled "copilot" here (it was
// previously misspelled in both the identifier and the query text).
const question = "什么是 github copilot";
const loader = new SerpAPILoader({ q: question, apiKey });
const docs = await loader.load();

// Bare trailing expression so the notebook renders the value.
docs

In [156]:
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
import { TextLoader } from "langchain/document_loaders/fs/text";
// Load the text file, then cut it into overlapping chunks.
const loader = new TextLoader("data/cafe.txt");
const docs = await loader.load();

// chunkSize caps each chunk and chunkOverlap repeats the tail of one chunk
// at the head of the next; the rendered chunks share text at their boundaries.
const splitter = new RecursiveCharacterTextSplitter({ chunkSize: 64, chunkOverlap: 16 });
const splitDocs = await splitter.splitDocuments(docs);

// Bare trailing expression so the notebook renders the value.
splitDocs


[
  Document {
    pageContent: [32m"咖啡（英语：coffee）是指咖啡植物的种子即咖啡豆在经过烘焙磨粉后通过冲泡制成的饮料，咖啡亦是世界上流行范围最为广泛的饮料之一"[39m,
    metadata: { source: [32m"data/cafe.txt"[39m, loc: { lines: { from: [33m1[39m, to: [33m1[39m } } }
  },
  Document {
    pageContent: [32m"世界上流行范围最为广泛的饮料之一。未经烘焙的咖啡生豆作为世界上最大的农产品出口物，以及世界上交易量为广泛的热带农产品之一，也是发"[39m,
    metadata: { source: [32m"data/cafe.txt"[39m, loc: { lines: { from: [33m1[39m, to: [33m1[39m } } }
  },
  Document {
    pageContent: [32m"量为广泛的热带农产品之一，也是发展中国家出口中最有价值的商品之一。[3][4][5]咖啡原产于非洲东岸的埃塞俄比亚，[6]15-"[39m,
    metadata: { source: [32m"data/cafe.txt"[39m, loc: { lines: { from: [33m1[39m, to: [33m1[39m } } }
  },
  Document {
    pageContent: [32m"洲东岸的埃塞俄比亚，[6]15-16世纪咖啡从也门被传播至穆斯林世界，[1][7][8]16世纪的威尼斯商人将咖啡引入意大利，["[39m,
    metadata: { source: [32m"data/cafe.txt"[39m, loc: { lines: { from: [33m1[39m, to: [33m1[39m } } }
  },
  Document {
    pageContent: [32m"的威尼斯商人将咖啡引入意大利，[9]随后17-18世纪由于欧洲对咖啡的需求，促使殖民者将咖啡树传播并栽种到美洲、东南亚和印度等热"[39m,
    me

In [157]:
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";

// Sample JavaScript source used as splitter input. It is kept verbatim
// (tabs and spaces included) because its exact text determines the chunks.
const js = `
function myFunction(name,job){
	console.log("Welcome " + name + ", the " + job);
}

myFunction('Harry Potter','Wizard')

function forFunction(){
	for (let i=0; i<5; i++){
        console.log("这个数字是" + i)
	}
}

forFunction()
`;

// Build a splitter preconfigured for the "js" language, with no overlap between chunks.
// NOTE(review): presumably this selects JavaScript-specific separators; confirm in the LangChain docs.
const splitter = RecursiveCharacterTextSplitter.fromLanguage("js", { chunkSize: 64, chunkOverlap: 0 });

// createDocuments is given an array of raw strings; here it holds just the sample.
const jsOutput = await splitter.createDocuments([js]);

// Bare trailing expression so the notebook renders the value.
jsOutput
 

[
  Document {
    pageContent: "function myFunction(name,job){",
    metadata: { loc: { lines: { from: 2, to: 2 } } }
  },
  Document {
    pageContent: 'console.log("Welcome " + name + ", the " + job);\n}',
    metadata: { loc: { lines: { from: 3, to: 4 } } }
  },
  Document {
    pageContent: "myFunction('Harry Potter','Wizard')",
    metadata: { loc: { lines: { from: 6, to: 6 } } }
  },
  Document {
    pageContent: "function forFunction(){\n\tfor (let i=0; i<5; i++){",
    metadata: { loc: { lines: { from: 8, to: 9 } } }
  },
  Document {
    pageContent: 'console.log("这个数字是" + i)\n\t}\n}',
    metadata: { loc: { lines: { from: 10, to: 12 } } }
  },
  Document {
    pageContent: "forFunction()",
    metadata: { loc: { lines: { from: 14, to: 14 } } }
  }
]

In [158]:
import { TokenTextSplitter } from "langchain/text_splitter";

// Input sentence to be split.
const text = "I stand before you today the representative of a family in grief, in a country in mourning before a world in shock.";

// Token-based splitter with no overlap between chunks.
// NOTE(review): chunkSize is presumably counted in tokens, not characters; confirm in the LangChain docs.
const splitter = new TokenTextSplitter({ chunkSize: 10, chunkOverlap: 0 });

// createDocuments is given an array of raw strings; here it holds just the one sentence.
const docs = await splitter.createDocuments([text]);

// Bare trailing expression so the notebook renders the value.
docs

[
  Document {
    pageContent: "I stand before you today the representative of a family",
    metadata: { loc: { lines: { from: 1, to: 1 } } }
  },
  Document {
    pageContent: " in grief, in a country in mourning before a",
    metadata: { loc: { lines: { from: 1, to: 1 } } }
  },
  Document {
    pageContent: " world in shock.",
    metadata: { loc: { lines: { from: 1, to: 1 } } }
  }
]