In [None]:
# @title  Clay City Times Vector DB (Persistent in Drive)
# ======================================================

# 1. Mount Google Drive
# The input folders and the vector DB are both read from / written to paths
# under /content/drive (see CONFIG below), so Drive has to be mounted before
# any other step runs.
from google.colab import drive
drive.mount('/content/drive')

# 2. Install dependencies
# NOTE: the leading `!` is IPython/Colab shell syntax, so this line only works
# inside a notebook cell, not when the file is run as a plain Python script.
# `-q` keeps pip's output quiet.
!pip install -q chromadb lxml sentence-transformers tqdm

# 3. Imports and setup
import os, glob, json
from tqdm import tqdm
from lxml import etree
import chromadb
from sentence_transformers import SentenceTransformer

# ======================================================
# CONFIG
# ======================================================
# Locations (everything lives under one Drive folder).
BASE_DIR = '/content/drive/MyDrive/research_2'
DB_DIR = os.path.join(BASE_DIR, 'vector_db')

# Names of the Chroma collection and of the embedding model.
COLLECTION_NAME = "clay_city_times"
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

# The DB directory must exist before the client is pointed at it.
os.makedirs(DB_DIR, exist_ok=True)

# PersistentClient stores the database on disk at DB_DIR, so no explicit
# save/persist step is needed later on.
client = chromadb.PersistentClient(path=DB_DIR)

# Re-use the collection if an earlier run already created it.
collection = client.get_or_create_collection(name=COLLECTION_NAME)

# Sentence-embedding model used to vectorise each issue's text.
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# ======================================================
# 4. Index Builder
# ======================================================
def extract_metadata(xml_path):
    """Collect the first non-empty text value of every tag in an XML file.

    Args:
        xml_path: Path of the XML metadata file to read.

    Returns:
        A dict mapping lower-cased tag names to their stripped text. Only
        descendants of the root element are visited, in document order; when
        a tag occurs more than once (compared case-insensitively) the first
        occurrence wins. Elements without text, or with whitespace-only
        text, are ignored. An empty dict is returned when the file cannot be
        read or parsed.
    """
    meta = {}
    try:
        tree = etree.parse(xml_path)
    except Exception as exc:
        # Metadata is best-effort: a broken or unreadable XML file must not
        # stop the indexing run, but the failure is reported so that missing
        # metadata can be told apart from a file that simply has none.
        print(f"Warning: Could not parse metadata file {xml_path}: {exc}")
        return meta

    # './/*' selects every descendant element; this is the explicit spelling
    # of the original './/' path, which relied on an implicit trailing '*'.
    for el in tree.findall('.//*'):
        # Defensive: skip nodes whose tag is not a plain string (lxml uses a
        # callable tag for comments and processing instructions).
        if not isinstance(el.tag, str):
            continue
        value = (el.text or '').strip()
        if not value:
            continue
        # setdefault keeps the first value seen for a given tag.
        meta.setdefault(el.tag.lower(), value)
    return meta

def add_issue_to_db(folder):
    """Embed one issue folder and upsert it into the Chroma collection.

    The folder is searched for an OCR text file matching ``*_djvu.txt`` and,
    optionally, a metadata file matching ``*_meta.xml``. The text (newlines
    replaced by spaces, truncated to 20000 characters) is embedded with the
    module-level ``model`` and stored in the module-level ``collection``
    together with whatever metadata could be extracted.

    Folders with no text file, an unreadable text file, or only blank text
    are skipped.

    Args:
        folder: Path of a single issue directory. Its base name is used as
            the document id, so indexing the same folder again updates the
            existing entry instead of creating a duplicate.

    Returns:
        None.
    """
    # glob returns matches in arbitrary order; sorting makes the choice of
    # file deterministic when a folder contains more than one match.
    txt_files = sorted(glob.glob(os.path.join(folder, '*_djvu.txt')))
    xml_files = sorted(glob.glob(os.path.join(folder, '*_meta.xml')))
    if not txt_files:
        return

    try:
        with open(txt_files[0], encoding='utf-8', errors='ignore') as f:
            text = f.read().replace('\n', ' ')
    except OSError as exc:
        # Covers a vanished file as well as permission / Drive I/O errors:
        # skip this issue rather than abort the whole indexing run.
        print(f"Warning: Could not read text file in {folder}: {exc}")
        return

    text = text[:20000]  # limit for Colab memory safety
    # NOTE(review): the embedding model presumably truncates its input to its
    # own maximum sequence length, so the vector likely reflects only the
    # start of the issue - confirm against the model documentation.

    # A file of blank lines becomes a run of spaces above; there is nothing
    # meaningful to embed in that case.
    if not text.strip():
        return

    metadata = extract_metadata(xml_files[0]) if xml_files else {}

    emb = model.encode(text)
    # normpath drops a trailing separator, which would otherwise make
    # basename() return an empty id.
    doc_id = os.path.basename(os.path.normpath(folder))

    # upsert adds the document if the id is new and overwrites it otherwise,
    # which avoids duplicate-id errors when the notebook is re-run.
    collection.upsert(
        ids=[doc_id],
        documents=[text],
        # Chroma releases commonly reject an empty metadata dict with a
        # ValueError, so omit metadata entirely when none was extracted.
        metadatas=[metadata] if metadata else None,
        # A plain list of floats is the most widely accepted embedding
        # format; older Chroma releases may not accept NumPy arrays here.
        embeddings=[emb.tolist()],
    )

# ======================================================
# 5. Build DB from all folders
# ======================================================
# Every directory entry named clay_city_times_* under BASE_DIR is treated as
# one issue; sorting keeps the processing order stable between runs.
folders = sorted(glob.glob(os.path.join(BASE_DIR, 'clay_city_times_*')))

if not folders:
    # Most likely Drive is not mounted or BASE_DIR is wrong; say so instead
    # of reporting success for a run that indexed nothing.
    print(f"Warning: no 'clay_city_times_*' folders found in {BASE_DIR}")
else:
    for issue_dir in tqdm(folders, desc="Indexing issues"):
        add_issue_to_db(issue_dir)

    # client.persist() is no longer needed with PersistentClient
    # (The original message contained a mis-decoded check-mark emoji.)
    print("✅ Vector DB created and saved persistently in:", DB_DIR)