In [1]:
import glob
import os
import shutil
from dotenv import load_dotenv


import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go

from langchain_community.document_loaders import (
    DirectoryLoader,
    PyPDFLoader,
    TextLoader
)
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma

from huggingface_hub import login, HfApi



from langchain_core.documents import Document
from langchain_chroma import Chroma

from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

from langchain_classic.memory import ConversationBufferMemory
from langchain_classic.chains import ConversationalRetrievalChain
from langchain_community.embeddings import HuggingFaceEmbeddings


In [2]:

# ------------------------------
# Configuration
# ------------------------------
# Read the .env file before any variable lookup; override=True asks
# python-dotenv to let .env values win over variables already set in the
# process environment.
load_dotenv(override=True)

LOCAL_DB_PATH = "chroma_db"                # local directory where Chroma persists the DB
REPO_ID = "marufmullah50/rag-vector-db"    # new Hub repo that receives the vector DB
HF_TOKEN = os.environ.get("HF_TOKEN")      # None when the variable is not set

In [3]:
# Build a Chroma vector store from the PDF/Markdown files under file/ and
# upload the persisted database folder to the Hugging Face Hub.
#
# Raises ValueError when HF_TOKEN is missing, when no documents could be
# loaded, or when splitting yields no chunks, so the cell never embeds or
# uploads an empty database.

# Fail fast: login and the upload below both need a token.
if not HF_TOKEN:
    raise ValueError("HF_TOKEN not found.")

login(token=HF_TOKEN)

# ==============================
# Load
# ==============================
print("Loading documents...")

# Every entry directly under file/ is treated as one document category;
# its base name is stored on each document as metadata["doc_type"].
folders = glob.glob("file/*")
documents = []

for folder in folders:
    doc_type = os.path.basename(folder)

    pdf_loader = DirectoryLoader(
        folder,
        glob="**/*.pdf",
        loader_cls=PyPDFLoader
    )

    md_loader = DirectoryLoader(
        folder,
        glob="**/*.md",
        loader_cls=TextLoader,
        loader_kwargs={"encoding": "utf-8"}
    )

    for loader in [pdf_loader, md_loader]:
        # Best effort: load() sits inside the try, so an exception discards
        # everything this loader would have returned for the folder, but the
        # other loader and the remaining folders are still processed.
        try:
            folder_docs = loader.load()
            for doc in folder_docs:
                doc.metadata["doc_type"] = doc_type
                documents.append(doc)
        except Exception as e:
            print(f"Skipping some files: {e}")

print(f"Loaded {len(documents)} documents.")

# Stop here instead of continuing with nothing to index: the steps below
# would otherwise embed and upload an empty or stale database.
if not documents:
    raise ValueError("No documents found.")

# ==============================
# Split
# ==============================
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

chunks = splitter.split_documents(documents)

print(f"Created {len(chunks)} chunks.")

# Documents without extractable text (e.g. image-only PDFs) can produce zero
# chunks; there is nothing to embed in that case.
if not chunks:
    raise ValueError("No chunks created from the loaded documents.")

# ==============================
# Create Embeddings
# ==============================
print("Creating embeddings...")

# HuggingFaceEmbeddings is already imported in the notebook's import cell.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Delete if already exists, so re-running the cell does not append the same
# chunks to a collection left over from a previous run.
if os.path.exists(LOCAL_DB_PATH):
    Chroma(persist_directory=LOCAL_DB_PATH, embedding_function=embeddings).delete_collection()

# Create vectorstore
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=LOCAL_DB_PATH)
# NOTE: _collection is a private attribute of the Chroma wrapper; it is used
# here only to report how many entries were stored.
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

# ==============================
# Push to HF
# ==============================
print(f"Pushing {LOCAL_DB_PATH} to {REPO_ID} (private)...")

api = HfApi()

# exist_ok=True makes this call a no-op when the repo already exists.
# NOTE(review): private=True presumably only applies on creation; confirm an
# already-existing repo really is private.
api.create_repo(
    repo_id=REPO_ID,
    repo_type="model",
    private=True,
    exist_ok=True
)

api.upload_folder(
    folder_path=LOCAL_DB_PATH,
    repo_id=REPO_ID,
    repo_type="model"
)

print("✅ Vector DB pushed successfully!")


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Loading documents...
Loaded 6 documents.
Created 25 chunks.
Creating embeddings...


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Vectorstore created with 25 documents
Pushing chroma_db to marufmullah50/rag-vector-db (private)...


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

✅ Vector DB pushed successfully!
