<a href="https://colab.research.google.com/github/nagababumo/LangChain.js/blob/main/vectorstores%20and%20embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lesson 3: Vectorstores and embeddings

# Vectorstore ingestion

In [None]:
import "dotenv/config";

[Module: null prototype] { default: {} }

In [None]:
import { OpenAIEmbeddings } from "@langchain/openai";

// Embedding client constructed without any options.
// NOTE(review): presumably picks up its API credentials from the environment
// populated by the "dotenv/config" import in the first cell — confirm.
const embeddings = new OpenAIEmbeddings();

// Embed a single sample sentence. As the last expression of the cell, the
// resulting numeric vector is what the notebook displays.
await embeddings.embedQuery("This is some sample text");

[
   [33m-0.010454769[39m,   [33m0.0023495338[39m, [33m-0.0008236494[39m,  [33m-0.010953553[39m,    [33m-0.01150484[39m,
    [33m0.022891548[39m,   [33m-0.014687868[39m,  [33m0.0017490244[39m,  [33m-0.017601814[39m,   [33m-0.019386934[39m,
    [33m0.005135503[39m,    [33m0.034127306[39m,  [33m-0.012298956[39m,  [33m0.0019163794[39m,    [33m0.004646564[39m,
    [33m0.013079946[39m,    [33m0.024689794[39m,   [33m0.001944272[39m,  [33m0.0044529573[39m,  [33m-0.0063594924[39m,
  [33m-0.0051158145[39m, [33m-0.00077852915[39m,  [33m-0.008078983[39m,   [33m0.014005321[39m,   [33m-0.009076551[39m,
   [33m-0.003957455[39m,  [33m-0.0008121643[39m,  [33m-0.019701956[39m,    [33m0.00401324[39m,   [33m-0.001540651[39m,
    [33m0.015934827[39m,   [33m-0.021972734[39m,  [33m-0.000726846[39m,  [33m-0.022261504[39m,    [33m0.006261048[39m,
    [33m0.007081416[39m,   [33m-0.011340766[39m, [33m-0.0135459155[39m,    [33m0.0287194

In [None]:
import { similarity } from "ml-distance";

// Embed a question and a deliberately unrelated sentence so the two vectors can
// be compared with cosine similarity. The two requests are independent of each
// other, so they are issued together rather than waiting for the first to
// finish before starting the second. Promise.all preserves input order, so the
// destructured names line up with the sentences below.
const [vector1, unrelatedVector] = await Promise.all([
    embeddings.embedQuery("What are vectors useful for in machine learning?"),
    embeddings.embedQuery("A group of parrots is called a pandemonium."),
]);

In [None]:
// Cosine similarity between the question embedding and the unrelated sentence's
// embedding; the score is the value this cell displays.
similarity.cosine(vector1, unrelatedVector);

0.6962144676957391

In [None]:
// Embed a sentence that is about the same topic as the question behind vector1.
const similarVector = await embeddings.embedQuery(
    "Vectors are representations of information."
);

// Same cosine comparison as for the unrelated sentence, this time against the
// related one; the score is the value this cell displays.
similarity.cosine(vector1, similarVector);

0.859059927625929

In [None]:
// Peer dependency
import * as parse from "pdf-parse";
import { PDFLoader } from "langchain/document_loaders/fs/pdf";
import {
    RecursiveCharacterTextSplitter
} from "langchain/text_splitter";

// Splitter configured for small pieces (chunkSize 128) with no overlap between them.
const splitter = new RecursiveCharacterTextSplitter({ chunkSize: 128, chunkOverlap: 0 });

// Read the lecture PDF from the local data directory.
const loader = new PDFLoader("./data/MachineLearning-Lecture01.pdf");
const rawCS229Docs = await loader.load();

// Break the loaded documents into the chunks that later get added to the vector store.
const splitDocs = await splitter.splitDocuments(rawCS229Docs);

In [None]:
import { MemoryVectorStore } from "langchain/vectorstores/memory";

// In-memory vector store, wired to the `embeddings` client defined earlier in
// the notebook.
const vectorstore = new MemoryVectorStore(embeddings);

In [None]:
// Add the chunks produced by the splitter to the vector store.
await vectorstore.addDocuments(splitDocs);

In [None]:
// Ask the store for the 4 chunks it ranks as most similar to the question.
const retrievedDocs = await vectorstore.similaritySearch("What is deep learning?", 4);

// Keep only the text of each hit so the displayed result stays readable.
const pageContents = retrievedDocs.map(({ pageContent }) => pageContent);

pageContents;

[
  "piece of research in machine learning, okay?",
  "are using a learning algorithm, perhaps without even being aware of it.",
  "some of my own excitement about machine learning to you.",
  "of the class, and then we'll start to talk a bit about machine learning."
]

# Retrievers

In [None]:
// Obtain a retriever from the vector store so it can be queried via `invoke`.
const retriever = vectorstore.asRetriever();

In [None]:
// Run the same question through the retriever; the cell displays the full
// Document objects (page text plus metadata) rather than just the text.
await retriever.invoke("What is deep learning?")

[
  Document {
    pageContent: [32m"piece of research in machine learning, okay?"[39m,
    metadata: {
      source: [32m"./data/MachineLearning-Lecture01.pdf"[39m,
      pdf: {
        version: [32m"1.10.100"[39m,
        info: {
          PDFFormatVersion: [32m"1.4"[39m,
          IsAcroFormPresent: [33mfalse[39m,
          IsXFAPresent: [33mfalse[39m,
          Title: [32m""[39m,
          Author: [32m""[39m,
          Creator: [32m"PScript5.dll Version 5.2.2"[39m,
          Producer: [32m"Acrobat Distiller 8.1.0 (Windows)"[39m,
          CreationDate: [32m"D:20080711112523-07'00'"[39m,
          ModDate: [32m"D:20080711112523-07'00'"[39m
        },
        metadata: Metadata { _metadata: [36m[Object: null prototype][39m },
        totalPages: [33m22[39m
      },
      loc: { pageNumber: [33m8[39m, lines: { from: [33m2[39m, to: [33m2[39m } }
    }
  },
  Document {
    pageContent: [32m"are using a learning algorithm, perhaps without even being aw