In [4]:
import os

from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
#from langchain_openai import OpenAIEmbeddings

# Define the directory containing the text file and the persistent directory
current_dir = os.path.dirname(os.path.abspath('Learning-LangChain'))
file_path = os.path.join(current_dir, "books", "odyssey.txt")
persistent_directory = os.path.join(current_dir, "db_test", "chroma_db")

# Check if the Chroma vector store already exists
if not os.path.exists(persistent_directory):
    print("Persistent directory does not exist. Initializing vector store...")

    # Ensure the text file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(
            f"The file {file_path} does not exist. Please check the path."
        )

    # Read the text content from the file
    loader = TextLoader(file_path, encoding='utf-8')
    documents = loader.load()

    # Split the document into chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.split_documents(documents) #a list of chunk (list of Document object as chunk)

    # Display information about the split documents
    print("\n--- Document Chunks Information ---")
    print(f"Number of document chunks: {len(docs)}")
    print(f"Sample chunk:\n{docs[0].page_content}\n")

    # Create embeddings
    print("\n--- Creating embeddings ---")
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Update to a valid embedding model if needed
    print("\n--- Finished creating embeddings ---")

    # Create the vector store and persist it automatically
    print("\n--- Creating vector store ---")
    db = Chroma.from_documents(
        docs, embeddings, persist_directory=persistent_directory)
    print("\n--- Finished creating vector store ---")

else:
    print("Vector store already exists. No need to initialize.")

Created a chunk of size 1141, which is longer than the specified 1000
Created a chunk of size 2086, which is longer than the specified 1000
Created a chunk of size 1121, which is longer than the specified 1000
Created a chunk of size 1366, which is longer than the specified 1000
Created a chunk of size 1011, which is longer than the specified 1000
Created a chunk of size 1639, which is longer than the specified 1000
Created a chunk of size 1219, which is longer than the specified 1000
Created a chunk of size 1875, which is longer than the specified 1000
Created a chunk of size 1307, which is longer than the specified 1000
Created a chunk of size 2271, which is longer than the specified 1000
Created a chunk of size 1430, which is longer than the specified 1000
Created a chunk of size 1763, which is longer than the specified 1000
Created a chunk of size 1575, which is longer than the specified 1000
Created a chunk of size 1028, which is longer than the specified 1000
Created a chunk of s

Persistent directory does not exist. Initializing vector store...

--- Document Chunks Information ---
Number of document chunks: 826
Sample chunk:
﻿The Project Gutenberg eBook of The Odyssey
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: The Odyssey

Author: Homer

Translator: Samuel Butler

Release date: April 1, 1999 [eBook #1727]
                Most recently updated: December 2, 2023

Language: English

Credits: Jim Tinsley and David Widger


*** START OF THE PROJECT GUTENBERG EBOOK THE ODYSSEY ***


[Illustration]


The Odyssey

by Homer

rendered into English prose for the use 

  from tqdm.autonotebook import tqdm, trange



--- Finished creating embeddings ---

--- Creating vector store ---

--- Finished creating vector store ---


## Some experiments

In [8]:
type(documents)

list

In [14]:
len(documents)

1

In [13]:
documents



In [12]:
len(docs)

826

In [28]:
docs

[Document(metadata={'source': 'C:\\Users\\ASUS\\Learning-LangChain\\books\\odyssey.txt'}, page_content='\ufeffThe Project Gutenberg eBook of The Odyssey\n    \nThis ebook is for the use of anyone anywhere in the United States and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever. You may copy it, give it away or re-use it under the terms\nof the Project Gutenberg License included with this ebook or online\nat www.gutenberg.org. If you are not located in the United States,\nyou will have to check the laws of the country where you are located\nbefore using this eBook.\n\nTitle: The Odyssey\n\nAuthor: Homer\n\nTranslator: Samuel Butler\n\nRelease date: April 1, 1999 [eBook #1727]\n                Most recently updated: December 2, 2023\n\nLanguage: English\n\nCredits: Jim Tinsley and David Widger\n\n\n*** START OF THE PROJECT GUTENBERG EBOOK THE ODYSSEY ***\n\n\n[Illustration]\n\n\nThe Odyssey\n\nby Homer\n\nrendered into English prose for the use of th

Cả documents và docs đều là một list các Document().

In [18]:
print(docs[0])
print('---------------------')
print(docs[1])

page_content='﻿The Project Gutenberg eBook of The Odyssey
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: The Odyssey

Author: Homer

Translator: Samuel Butler

Release date: April 1, 1999 [eBook #1727]
                Most recently updated: December 2, 2023

Language: English

Credits: Jim Tinsley and David Widger


*** START OF THE PROJECT GUTENBERG EBOOK THE ODYSSEY ***


[Illustration]


The Odyssey

by Homer

rendered into English prose for the use of those who cannot read the
original

Contents' metadata={'source': 'C:\\Users\\ASUS\\Learning-LangChain\\books\\odyssey.txt'}
------

In [24]:
print(documents)



In [23]:
for doc in documents:
    print(doc)
    print('--------------------------------')

page_content='﻿The Project Gutenberg eBook of The Odyssey
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: The Odyssey

Author: Homer

Translator: Samuel Butler

Release date: April 1, 1999 [eBook #1727]
                Most recently updated: December 2, 2023

Language: English

Credits: Jim Tinsley and David Widger


*** START OF THE PROJECT GUTENBERG EBOOK THE ODYSSEY ***




[Illustration]




The Odyssey

by Homer

rendered into English prose for the use of those who cannot read the
original

Contents

 PREFACE TO FIRST EDITION
 PREFACE TO SECOND EDITION
 THE ODYSSEY
 BOOK I.
 BOOK 

In [25]:
documents_list = []

In [26]:
for doc in documents:
    doc.metadata = {'source':'TESTING SOMETHING'} #doc.metadata là một dictionary
    documents_list.append(doc)

In [27]:
documents_list



In [11]:
text = 'Yann Lecun invented CNNs'
embedded_text = embeddings.embed_documents([text])

In [15]:
len(embedded_text[0]) #the length of embedded vector of the text

384

In [17]:
query = 'CNNs is used for image processing'
embedded_query = embeddings.embed_query(query)

In [19]:
len(embedded_query)

384

## Basic of Vector Strore

In [20]:
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.documents import Document

In [21]:
vector_store = InMemoryVectorStore(embedding=embeddings)

In [23]:
doc_1 = Document(page_content='Yann Lecun invented the CNNs', metadata={'souce':'news'})
doc_2 = Document(page_content='Kingma invented Adam', metadata={'souce':'news'})
docs = [doc_1, doc_2]

In [28]:
vector_store.add_documents(documents=docs, ids=['doc1', 'doc2']) #good practice to use ids

['doc1', 'doc2']

In [29]:
# vector_store.delete(ids=['doc1'])

In [30]:
query = 'Who invented the CNNs?'
results = vector_store.similarity_search(query, k=1)
results

[Document(id='e93bd67b-52bd-4d22-8c57-fff204cff1b2', metadata={'souce': 'news'}, page_content='Yann Lecun invented the CNNs')]