# 🏗️ Colab Knowledge Builder

This notebook runs on Google Colab (using a T4 GPU) to process your PDF/DOCX files and build a local vector database.

**Steps:**
1. Install Dependencies
2. Clone your GitHub Repo (to get the `source` files)
3. Process Documents
4. Generate Embeddings & Vector DB
5. Zip and Download the Database

In [None]:
# @title 1. Install Dependencies
!pip install -q langchain langchain-community langchain-text-splitters sentence-transformers chromadb pypdf python-docx docx2txt

In [None]:
# @title 2. Clone Repository
import os
import shutil

# ‚ö†Ô∏è CHANGE THIS TO YOUR REPO URL
REPO_URL = "https://github.com/pisces9187-a11y/st_eng.git"

REPO_NAME = REPO_URL.split("/")[-1].replace(".git", "")

if os.path.exists(REPO_NAME):
    shutil.rmtree(REPO_NAME)

!git clone {REPO_URL}

In [None]:
# @title 3. Define Parsing Functions (Robust)
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import glob

# Documents are read from the 'source' folder at the top level of the cloned repo.
SOURCE_DIR = "/".join([REPO_NAME, "source"])

def _find_files(source_dir, extension):
    """Return sorted paths of regular files under ``source_dir`` ending in ``extension`` (case-insensitive)."""
    # Escape the directory so glob metacharacters in its name are taken literally.
    pattern = os.path.join(glob.escape(source_dir), "**", "*")
    return sorted(
        path
        for path in glob.glob(pattern, recursive=True)
        if path.lower().endswith(extension) and os.path.isfile(path)
    )


def _load_files(file_paths, loader_cls):
    """Load each path with ``loader_cls``, logging and skipping files that fail."""
    loaded = []
    for file_path in file_paths:
        file_name = os.path.basename(file_path)
        try:
            loaded.extend(loader_cls(file_path).load())
            print(f"✅ Loaded: {file_name}")
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole build.
            print(f"❌ Error loading {file_name}: {e}")
    return loaded


def load_documents(source_dir):
    """Recursively load every PDF and DOCX file found under ``source_dir``.

    PDFs are read with ``PyPDFLoader`` and DOCX files with ``Docx2txtLoader``.
    Extensions are matched case-insensitively and files are processed in sorted
    path order (all PDFs first, then all DOCX files), so repeated runs over the
    same tree yield the documents in the same order. A file that fails to load
    is reported and skipped.

    Args:
        source_dir: Directory to search recursively.

    Returns:
        A list with the documents returned by the loaders; empty when the
        directory is missing or contains no loadable files.
    """
    if not os.path.isdir(source_dir):
        # Make the cause visible instead of only reporting "0 files" below.
        print(f"⚠️ Source directory not found: {source_dir}")

    # 1. Find all files
    pdf_files = _find_files(source_dir, ".pdf")
    docx_files = _find_files(source_dir, ".docx")
    print(f"Found {len(pdf_files)} PDFs and {len(docx_files)} Docx files.")

    # 2. Load PDFs, then Docx files; failures are logged inside the helper
    documents = _load_files(pdf_files, PyPDFLoader)
    documents.extend(_load_files(docx_files, Docx2txtLoader))

    print(f"Total documents loaded: {len(documents)}")
    return documents

def split_text(documents, chunk_size=1000, chunk_overlap=200):
    """Split loaded documents into overlapping chunks for embedding.

    Args:
        documents: Documents to split, as returned by ``load_documents``.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared by consecutive chunks.

    Returns:
        The list of chunks produced by ``RecursiveCharacterTextSplitter``, or
        an empty list when there is nothing to split.
    """
    # Check for empty input first so no splitter is built for nothing.
    if not documents:
        print("⚠️ No documents to split!")
        return []

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Ask the splitter to record each chunk's start offset in its metadata.
        add_start_index=True,
    )

    chunks = text_splitter.split_documents(documents)
    print(f"Split into {len(chunks)} chunks.")
    return chunks

In [None]:
# @title 4. Build Vector DB (Using GPU)
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
import torch

# Check for GPU: embed on CUDA when torch reports a device, otherwise use the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Initialize Embedding Model (placed on the device selected above)
embedding_function = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': device}
)

# Load and Split
docs = load_documents(SOURCE_DIR)
chunks = split_text(docs)

# Persist DB
CHROMA_PATH = "chroma_db"
# Remove any database left by an earlier run so the result reflects only this
# build. This also happens when no chunks were produced, so step 5 can never
# pick up a stale directory.
if os.path.exists(CHROMA_PATH):
    shutil.rmtree(CHROMA_PATH)

if chunks:
    db = Chroma.from_documents(
        documents=chunks, 
        embedding=embedding_function, 
        persist_directory=CHROMA_PATH
    )
    print(f"✅ Database processed and saved to '{CHROMA_PATH}'.")
else:
    print("❌ No chunks created. Database not built.")

In [None]:
# @title 5. Zip and Download
# Archive the directory written in step 4 (CHROMA_PATH) rather than repeating
# its name here, so both cells always refer to the same location.
if os.path.exists(CHROMA_PATH):
    # shutil.make_archive writes a fresh "<CHROMA_PATH>.zip" each time.
    # "zip -r" would update an existing archive in place and keep entries from
    # a previous build inside the download.
    archive_path = shutil.make_archive(CHROMA_PATH, "zip", base_dir=CHROMA_PATH)
    # Imported here because google.colab only exists inside a Colab runtime.
    from google.colab import files
    files.download(archive_path)
else:
    print("❌ No database to zip.")