In [1]:
import os

from chromadb.config import Settings
from dotenv import load_dotenv
from langchain.document_loaders import (
    DirectoryLoader,
    PyPDFLoader,
    TextLoader,
    UnstructuredMarkdownLoader,
)
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

In [2]:
load_dotenv()

HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

DB_CHROMA_PATH = "./../vectorstore/db_chroma"
DATA_DIR = "./data"

In [13]:

def create_vector_database():
    """
    Creates a vector database using document loaders and embeddings.

    This function loads data from PDF, markdown and text files in the 'data/' directory,
    splits the loaded documents into chunks, transforms them into embeddings using HuggingFace,
    and finally persists the embeddings into a Chroma vector database.

    """
    # Initialize loaders for different file types
    pdf_loader = DirectoryLoader(DATA_DIR, glob="**/*.pdf", loader_cls=PyPDFLoader)
    markdown_loader = DirectoryLoader(
        DATA_DIR, glob="**/*.md", loader_cls=UnstructuredMarkdownLoader
    )
    text_loader = DirectoryLoader(DATA_DIR, glob="**/*.txt", loader_cls=TextLoader)

    all_loaders = [pdf_loader, markdown_loader, text_loader]

    # Load documents from all loaders
    loaded_documents = []
    for loader in all_loaders:
        loaded_documents.extend(loader.load())
    # print((loaded_documents[0].metadata['source']))

    # text_loader = DirectoryLoader(DATA_DIR, glob="**/*.txt", loader_cls=TextLoader)
    # loaded_documents = text_loader.load()

    # len(loaded_documents)
    # loaded_documents[0]

    # Split loaded documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunked_documents = text_splitter.split_documents(loaded_documents)
    # len(chunked_documents)
    print(chunked_documents[0].metadata["source"])
    
    # Initialize HuggingFace embeddings
    huggingface_embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": "cpu"},
    )

    # Create and persist a Chroma vector database from the chunked documents
    # vector_database = Chroma.from_documents(
    #     documents=chunked_documents,
    #     embedding=huggingface_embeddings,
    #     persist_directory=DB_CHROMA_PATH,
    # )
    # # vector_database.delete()
    # # vector_database.persist()

In [14]:
if __name__ == "__main__":
    create_vector_database()

data\2005.pdf


In [16]:
def create_vector_database2():
    """
    Creates a vector database using document loaders and embeddings.

    This function loads data from PDF, markdown and text files in the 'data/' directory,
    splits the loaded documents into chunks, transforms them into embeddings using HuggingFace,
    and finally persists the embeddings into a Chroma vector database.

    """
    # Initialize loaders for different file types
    pdf_loader = DirectoryLoader(DATA_DIR, glob="**/*.pdf", loader_cls=PyPDFLoader)
    markdown_loader = DirectoryLoader(
        DATA_DIR, glob="**/*.md", loader_cls=UnstructuredMarkdownLoader
    )
    text_loader = DirectoryLoader(DATA_DIR, glob="**/*.txt", loader_cls=TextLoader)

    all_loaders = [pdf_loader, markdown_loader, text_loader]

    # Load documents from all loaders
    loaded_documents = []
    for loader in all_loaders:
        loaded_documents.extend(loader.load())
        print((loader.metadata['source']))
create_vector_database2()

AttributeError: 'DirectoryLoader' object has no attribute 'metadata'

UnpicklingError: invalid load key, '\x00'.

In [5]:
import pandas as pd
sqlite_path=r'F:\python\vectorstore\db_chroma\chroma.sqlite3'

In [6]:
#code from chatgpt that I used to test the database

import sqlite3

# Connect to the SQLite database again
conn = sqlite3.connect(sqlite_path)
cursor = conn.cursor()

# Retrieve the list of tables in the database
tables = cursor.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
table_names = [table[0] for table in tables]

table_names


['migrations',
 'embeddings_queue',
 'collections',
 'collection_metadata',
 'segments',
 'segment_metadata',
 'embeddings',
 'embedding_metadata',
 'max_seq_id',
 'embedding_fulltext_search',
 'embedding_fulltext_search_data',
 'embedding_fulltext_search_idx',
 'embedding_fulltext_search_content',
 'embedding_fulltext_search_docsize',
 'embedding_fulltext_search_config']

In [7]:
import sqlite3
import pandas as pd

con = sqlite3.connect(sqlite_path)

df = pd.read_sql("SELECT * FROM embedding_metadata", con)
print(df.columns)
dic=set()
for i in range(len(df)):
    row=df.iloc[i]
    if row.key=='source':
        dic.add(row.string_value.split('\\')[-1])
print(dic)



Index(['id', 'key', 'string_value', 'int_value', 'float_value', 'bool_value'], dtype='object')
{'Training Transformers with 4-bit Integers.pdf', 'QLoRA- Efficient Finetuning of Quantized LLMs.txt', 'state_of_the_union.txt', '2005.pdf'}


In [8]:
con = sqlite3.connect(sqlite_path)
df = pd.read_sql("SELECT * FROM embeddings_queue", con)
df.to_sql("embeddings_queue", con, if_exists="replace", index=False)

1260

In [9]:
#get all the unique values in the metadata column

all_files={}
for i in range(len(df)):
    name=(df['metadata'][i].split(',')[0].split(':')[1].replace('"','').replace('}','').replace('{','').replace(' ','').split(',')[0].split('\\')[-1])
    if name not in all_files:
        all_files[name]=1
    else:
        all_files[name]+=1
all_files
# df['metadata'][0].split(',')[0].split(':')[1].replace('"','').replace('}','').replace('{','').replace(' ','').split(',')[0].split('\\')[-1]

{'2005.pdf': 25,
 'TrainingTransformerswith4-bitIntegers.pdf': 430,
 'QLoRA-EfficientFinetuningofQuantizedLLMs.txt': 590,
 'state_of_the_union.txt': 215}

In [10]:
con.close()

In [11]:
#read the folowing pickle file to get the embeddings
path=r"F:\python\vectorstore\db_chroma\6230c921-d4ae-425d-b0f0-d1cf7bd55133\index_metadata.pickle"


df2=pd.read_pickle(path)
df2


FileNotFoundError: [Errno 2] No such file or directory: 'F:\\python\\vectorstore\\db_chroma\\6230c921-d4ae-425d-b0f0-d1cf7bd55133\\index_metadata.pickle'