# Lesson 3: Vectorstores and embeddings

# Vectorstore ingestion

In [1]:
import "dotenv/config";

[Module: null prototype] { default: {} }

In [2]:
import { OpenAIEmbeddings } from "@langchain/openai";

// Embeddings client constructed with no options (library defaults).
// NOTE(review): presumably picks up the OpenAI API key from the environment
// loaded by "dotenv/config" in the first cell — confirm.
const embeddings = new OpenAIEmbeddings();

// Embed one sample sentence. This is the cell's final expression, so the
// resulting array of numbers is what the notebook displays as output.
await embeddings.embedQuery("This is some sample text");

[
   -0.010372737,  0.0023619137,  -0.000738098,  -0.010897608,   -0.011474964,
    0.022949928,  -0.014656986,  0.0017665146,   -0.01755689,    -0.01927584,
    0.005156845,    0.03419526,  -0.012183538,  0.0019223352,    0.004691023,
    0.013141425,   0.024642633,  0.0017369906,   0.004533562,   -0.006232828,
   -0.005114199, -0.0006815105,  -0.008187967,    0.01393529,   -0.008916224,
  -0.0040480574, -0.0007487594,  -0.019617004,  0.0040316554,  -0.0017140276,
    0.016061014,  -0.021873945, -0.0007725426,  -0.022254474,    0.006242669,
    0.007033254,  -0.011205968,  -0.013416981,   0.028815346,
  ... (output truncated)
]

In [3]:
import { similarity } from "ml-distance";

// Embed a machine-learning question and an unrelated sentence so their
// vectors can be compared with cosine similarity in the next cell.
// The two requests do not depend on each other, so they are issued
// concurrently rather than awaited one after the other.
const [vector1, unrelatedVector] = await Promise.all([
    embeddings.embedQuery(
        "What are vectors useful for in machine learning?"
    ),
    embeddings.embedQuery(
        "A group of parrots is called a pandemonium."
    ),
]);

In [4]:
// Cosine similarity between the question vector and the unrelated sentence's
// vector; the value is displayed as the cell output.
similarity.cosine(vector1, unrelatedVector);

0.6957264527346025

In [5]:
// Embed a sentence that is topically close to the question behind `vector1`.
const similarVector = await embeddings.embedQuery(
    "Vectors are representations of information."
);

// Cosine similarity of the related pair, shown as the cell output for
// comparison with the unrelated pair scored in the previous cell.
similarity.cosine(vector1, similarVector);

0.8588144744020122

In [6]:
// Peer dependency
// NOTE(review): `parse` is never referenced in this cell; presumably the import
// exists only so pdf-parse is loaded for PDFLoader — confirm before removing.
import * as parse from "pdf-parse";
import { PDFLoader } from "langchain/document_loaders/fs/pdf";
import { 
    RecursiveCharacterTextSplitter
} from "langchain/text_splitter";

// Loader pointed at the lecture transcript PDF on local disk.
const loader = new PDFLoader("./data/MachineLearning-Lecture01.pdf");

// Load the PDF into documents.
const rawCS229Docs = await loader.load();

// Splitter configured for small chunks (chunkSize 128) with no overlap
// between neighbouring chunks.
const splitter = new RecursiveCharacterTextSplitter({
  chunkSize: 128,
  chunkOverlap: 0,
});

// Split the loaded documents into chunks; these are what get added to the
// vector store in a later cell.
const splitDocs = await splitter.splitDocuments(rawCS229Docs);

In [7]:
import { MemoryVectorStore } from "langchain/vectorstores/memory";

// Vector store constructed with the `embeddings` client created earlier.
// No documents are passed here; they are added in the next cell.
const vectorstore = new MemoryVectorStore(embeddings);

In [8]:
// Ingest the split PDF chunks into the vector store.
// NOTE(review): presumably each chunk is embedded via `embeddings` at this
// point, i.e. this cell calls the embeddings API — confirm.
await vectorstore.addDocuments(splitDocs);

In [9]:
// Fetch the four stored chunks most similar to the question.
const retrievedDocs = await vectorstore.similaritySearch("What is deep learning?", 4);

// Keep only the text of each hit, leaving the metadata behind.
const pageContents = retrievedDocs.map(({ pageContent }) => pageContent);

// Final expression of the cell: rendered as the notebook output.
pageContents

[
  "piece of research in machine learning, okay?",
  "are using a learning algorithm, perhaps without even being aware of it.",
  "some of my own excitement about machine learning to you.",
  "of the class, and then we'll start to talk a bit about machine learning."
]

# Retrievers

In [10]:
// Obtain a retriever from the vector store, using asRetriever() with no
// arguments (library defaults).
const retriever = vectorstore.asRetriever();

In [11]:
// Query the retriever with the same question used for the direct similarity
// search; the returned Document objects (pageContent plus metadata) are
// displayed as the cell output.
await retriever.invoke("What is deep learning?")

[
  Document {
    pageContent: [32m"piece of research in machine learning, okay?"[39m,
    metadata: {
      source: [32m"./data/MachineLearning-Lecture01.pdf"[39m,
      pdf: {
        version: [32m"1.10.100"[39m,
        info: {
          PDFFormatVersion: [32m"1.4"[39m,
          IsAcroFormPresent: [33mfalse[39m,
          IsXFAPresent: [33mfalse[39m,
          Title: [32m""[39m,
          Author: [32m""[39m,
          Creator: [32m"PScript5.dll Version 5.2.2"[39m,
          Producer: [32m"Acrobat Distiller 8.1.0 (Windows)"[39m,
          CreationDate: [32m"D:20080711112523-07'00'"[39m,
          ModDate: [32m"D:20080711112523-07'00'"[39m
        },
        metadata: Metadata { _metadata: [36m[Object: null prototype][39m },
        totalPages: [33m22[39m
      },
      loc: { pageNumber: [33m8[39m, lines: { from: [33m2[39m, to: [33m2[39m } }
    }
  },
  Document {
    pageContent: [32m"are using a learning algorithm, perhaps without even being aw