# Indexing: Creating a Chroma Vectorstore

In [1]:
# Run the line of code below to check the version of langchain in the current environment.
# Substitute "langchain" with any other package name to check its version.

In [2]:
pip show langchain

Name: langchain
Version: 0.3.26
Summary: Building applications with LLMs through composability
Home-page: 
Author: 
Author-email: 
License: MIT
Location: C:\Users\Marcus\anaconda3\envs\langchain_env_py312\Lib\site-packages
Requires: langchain-core, langchain-text-splitters, langsmith, pydantic, PyYAML, requests, SQLAlchemy
Required-by: langchain-community
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Load the dotenv IPython extension, then run its %dotenv magic to bring environment
# variables into this kernel. The OpenAI key is expected to come from there
# (presumably a .env file next to the notebook — confirm locally).
%load_ext dotenv
%dotenv

In [11]:
# Import necessary modules and classes
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_chroma import Chroma

In [5]:
# Read the .docx file; only the first loaded document is used below
loader_docx = Docx2txtLoader("Introduction_to_Data_and_Data_Science_2.docx")
pages = loader_docx.load()

# First pass: split on Markdown headers, labelling "#" as the course title and "##" as the lecture title
md_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on = [
        ("#", "Course Title"),
        ("##", "Lecture Title"),
    ]
)
pages_md_split = md_splitter.split_text(pages[0].page_content)

# Collapse every run of whitespace (newlines included) into a single space, updating each section in place
for section in pages_md_split:
    section.page_content = ' '.join(section.page_content.split())

# Second pass: split the header sections on periods, configured with chunk_size 500 and chunk_overlap 50
char_splitter = CharacterTextSplitter(
    separator = ".",
    chunk_size = 500,
    chunk_overlap = 50
)
pages_char_split = char_splitter.split_documents(pages_md_split)

# OpenAI embeddings instance, pinned to the embedding model named here
embedding = OpenAIEmbeddings(model='text-embedding-ada-002')

In [6]:
# Sanity check: number of chunks produced by the character split.
# These are the documents that get embedded into the vectorstore.
len(pages_char_split)

20

In [7]:
# Create the vectorstore and embed all documents in a single call. A persist directory is not required,
# but if you do not specify one the vectorstore will only exist until the kernel is restarted or shut down.
#
# Each chunk is given a deterministic id based on its position. Without explicit ids a fresh random id is
# generated per chunk on every run, so re-running this cell would append another copy of every chunk to
# the persisted collection. With stable ids, a re-run updates the existing entries instead.
# NOTE: entries written by earlier runs under other ids are not removed — delete the directory to start clean.
chunk_ids = [f"chunk-{position}" for position in range(len(pages_char_split))]

vectorstore = Chroma.from_documents(documents = pages_char_split, 
                                    embedding = embedding, 
                                    ids = chunk_ids,
                                    persist_directory = "./intro-to-ds-lectures")

In [12]:
# Open the vectorstore that was persisted to disk instead of re-embedding the documents.
# It is pointed at the same directory and given the same embedding object as above.
vectorstore_from_directory = Chroma(
    embedding_function = embedding,
    persist_directory = "./intro-to-ds-lectures",
)