In [None]:
# @title  Clay City Times (1901–1922) – High-Speed Downloader
# Mode: Static link list + aria2c parallel bulk fetch
# Target: /MyDrive/research_2/
# ==========================================================

# 1. Mount Google Drive
# BASE_DIR (defined in CONFIG below) lives under this mount point, so the
# mount has to happen before anything is written there.
from google.colab import drive
drive.mount('/content/drive')

# 2. Install tools
# `internetarchive` is used to search Archive.org for item identifiers,
# `tqdm` draws progress bars, and `aria2` provides the `aria2c` command
# used for the bulk download step.
!pip install -q internetarchive tqdm
!apt-get -qq install aria2

# Imported only after the installs above so a fresh runtime can resolve them.
import os, re, shutil
from tqdm import tqdm
import internetarchive as ia

# ==========================================================
# CONFIG
# ==========================================================
# Destination folder on the mounted Drive, and the local file that caches
# the list of URLs handed to aria2c.
BASE_DIR = '/content/drive/MyDrive/research_2'
LINK_LIST = '/content/clay_city_times_links.txt'
os.makedirs(BASE_DIR, exist_ok=True)

# Publication years to fetch (1901 through 1922 inclusive), as strings.
YEARS = list(map(str, range(1901, 1923)))

# Archive.org search: the creator, restricted to any of the years above.
_year_terms = [f'year:{y}' for y in YEARS]
QUERY = 'creator:"Clay City times" AND (' + ' OR '.join(_year_terms) + ')'

# ==========================================================
# 3. Build Static Download List (only once)
# ==========================================================
# The link list is a cache: once it exists, the Archive.org search is skipped.
# It is therefore only created when the search actually returned items, and it
# is written to a temporary file first so an interrupted run can never leave a
# truncated list behind that later runs would silently reuse.
if os.path.exists(LINK_LIST):
    print(f"Using existing link list: {LINK_LIST}")
else:
    print("Generating item list from Archive.org (first run only)...")
    results = list(ia.search_items(QUERY))
    if not results:
        print("WARNING: Archive.org search returned no items; "
              "link list not written, so the next run will search again.")
    else:
        tmp_link_list = LINK_LIST + '.tmp'
        with open(tmp_link_list, 'w') as f:
            for r in tqdm(results, desc="Building link list"):
                ident = r['identifier']
                # Two files per issue: the OCR text and the item metadata.
                f.write(f"https://archive.org/download/{ident}/{ident}_djvu.txt\n")
                f.write(f"https://archive.org/download/{ident}/{ident}_meta.xml\n")
        # Publish the finished list under its real name in one step.
        os.replace(tmp_link_list, LINK_LIST)
        print(f"List written to {LINK_LIST} ({len(results)} issues, 2 links each).")

# ==========================================================
# 4. High-speed Bulk Download
# ==========================================================
print("\nStarting aria2c parallel download...")
!aria2c -x 8 -s 8 -i /content/clay_city_times_links.txt \
  -d "/content/drive/MyDrive/research_2" \
  --continue=true --auto-file-renaming=false --summary-interval=30

# ==========================================================
# 5. Organize files into per-issue folders
# ==========================================================
print("\nOrganizing downloaded files...")

# A full issue date (YYYY-MM-DD or YYYY_MM_DD) anywhere in the file name; the
# look-arounds stop it from matching inside a longer run of digits.
_ISSUE_DATE_RE = re.compile(r'(?<!\d)\d{4}[\-_]\d{2}[\-_]\d{2}(?!\d)')
# Fallback when no full date is present: the first 19xx year in the name.
_ISSUE_YEAR_RE = re.compile(r'19\d{2}')


def _issue_folder_name(filename):
    """Return the per-issue folder name ("clay_city_times_<label>") for a file.

    The label is the issue date normalised to dashes when the name contains
    one, otherwise the first 19xx year found, otherwise "unknown".
    """
    date_match = _ISSUE_DATE_RE.search(filename)
    if date_match:
        label = date_match.group(0).replace('_', '-')
    else:
        year_match = _ISSUE_YEAR_RE.search(filename)
        label = year_match.group(0) if year_match else 'unknown'
    return f"clay_city_times_{label}"


move_failures = 0
for f in tqdm(os.listdir(BASE_DIR), desc="Organizing"):
    src = os.path.join(BASE_DIR, f)
    if not os.path.isfile(src):
        continue
    # Leave unfinished downloads (and their aria2 control files) in place so
    # a later aria2c run can resume them.
    if f.endswith('.aria2') or os.path.exists(src + '.aria2'):
        continue
    folder = os.path.join(BASE_DIR, _issue_folder_name(f))
    os.makedirs(folder, exist_ok=True)
    try:
        shutil.move(src, os.path.join(folder, f))
    except (OSError, shutil.Error) as e:
        # Keep going, but report the failure instead of hiding it.
        move_failures += 1
        print(f"\n  WARN: Could not move {f}: {e}")

if move_failures:
    print(f"\n{move_failures} file(s) could not be moved and remain in {BASE_DIR}.")

print("\n✅ All issues downloaded and organized under MyDrive/research_2/")


In [None]:
# @title
# SCRIPT C: (NEW) "ENRICH & BUILD" DATABASE SCRIPT
#
# This script REPLACES your old, simple database builder.
# It uses Chunking (langchain) and Entity Extraction (spacy)
# to build a much smarter database.
#
# Run this AFTER Script A (Converter) and Script B (Deduplicator).
#
# It will:
# 1. Install langchain and spacy.
# 2. Load both the Embedding model (for search) and an NER model (for metadata).
# 3. Scan the clean 'Text_Files' directory.
# 4. For each file, find its matching .xml file to get source metadata.
# 5. Break the text into small, 500-character chunks.
# 6. "Read" each chunk with spacy to find People, Places, and Orgs.
# 7. Add each chunk to ChromaDB with all this new metadata.
# 8. Move the final, enriched DB to Google Drive.

import os
import shutil
import time
import pickle
import sys
import xml.etree.ElementTree as ET # For reading the .xml files
from google.colab import drive
from tqdm.auto import tqdm # Progress bar

# ------------------------------------------------------------------
# Phase 1: Install Dependencies
# ------------------------------------------------------------------
print("--- Phase 1: Installing Dependencies ---")
# Packages are installed unpinned, so whatever versions are current at run
# time are what the imports below will see.
!pip install chromadb sentence-transformers langchain spacy -q
# Download the small, fast spacy model for Named Entity Recognition
!python -m spacy download en_core_web_sm -q
print("Dependencies (Chroma, Langchain, Spacy) installed.")

# These imports must come after the installs above.
import chromadb
from sentence_transformers import SentenceTransformer
# NOTE(review): newer langchain releases may expose this class from the
# separate `langchain_text_splitters` package instead -- confirm against the
# installed version if this import fails.
from langchain.text_splitter import RecursiveCharacterTextSplitter
import spacy

# ------------------------------------------------------------------
# Phase 2: Load Models
# ------------------------------------------------------------------
print("\n--- Phase 2: Loading AI Models ---")

# `embedding_model` turns text into vectors; `nlp` is the spaCy pipeline used
# for named-entity extraction. A failure to load either one is reported and
# then re-raised, since nothing later in the script can run without them.
try:
    print("Loading SentenceTransformer model (all-MiniLM-L6-v2) for embedding...")
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    print("Embedding model loaded.")

    print("Loading Spacy NER model (en_core_web_sm) for entity extraction...")
    nlp = spacy.load("en_core_web_sm")
    print("NER model loaded.")
except Exception as load_error:
    print(f"Error loading models: {load_error}")
    raise

# ------------------------------------------------------------------
# Phase 3: Configuration
# ------------------------------------------------------------------
print("\n--- Phase 3: Configuring Paths ---")

# Every path below lives under the Drive mount, so mount first and stop the
# script if that fails.
try:
    drive.mount('/content/drive', force_remount=True)
    print("Google Drive mounted.")
except Exception as mount_error:
    print(f"Error mounting drive: {mount_error}")
    raise

# --- CONFIGURED FOR CLAY CITY TIMES ---
# Root folder for all of this project's outputs.
MAIN_OUTPUT_DIRECTORY = '/content/drive/MyDrive/clay_city_times-2'

# Folder searched for the *_meta.xml files that carry source metadata.
# NOTE(review): the downloader cell in this file writes to
# MyDrive/research_2, not to this folder -- confirm this is the right location.
XML_SOURCE_DIRECTORY = '/content/drive/MyDrive/clay_city_times-2'
# -------------------------------------

# Cleaned and deduplicated .txt files (the output of Script B).
SOURCE_DIRECTORY = os.path.join(MAIN_OUTPUT_DIRECTORY, 'Text_Files')

# --- DB Build Paths (Local first, then GDrive) ---
LOCAL_DB_PATH = '/content/Clay_City_Enriched_DB'
FINAL_DB_PATH_ON_DRIVE = os.path.join(MAIN_OUTPUT_DIRECTORY, 'Vector_Database_Enriched')
COLLECTION_NAME = "clay_city_archive_v2" # New collection name

# Log of text files that have already been added to the database.
STATE_DIRECTORY = os.path.join(MAIN_OUTPUT_DIRECTORY, 'Script_State')
DB_LOG_FILE = os.path.join(STATE_DIRECTORY, 'db_processed_files_v2.log')

print(f"Source (Input): {SOURCE_DIRECTORY}")
print(f"XML Source (Metadata): {XML_SOURCE_DIRECTORY}")
print(f"Local DB Build Path: {LOCAL_DB_PATH}")
print(f"Final DB GDrive Path: {FINAL_DB_PATH_ON_DRIVE}")

# Make sure the state folder and the final DB folder exist on Drive.
for _required_dir in (STATE_DIRECTORY, FINAL_DB_PATH_ON_DRIVE):
    os.makedirs(_required_dir, exist_ok=True)

# ------------------------------------------------------------------
# Phase 4: Helper Functions
# ------------------------------------------------------------------

def parse_xml_metadata(txt_file_path):
    """
    Find the matching _meta.xml file for a .txt file and extract key metadata.

    The XML file is looked up under XML_SOURCE_DIRECTORY, in the same
    sub-folder the text file occupies relative to SOURCE_DIRECTORY, e.g.
    .../xt7b2r3nwv4x_djvu.txt -> .../xt7b2r3nwv4x_meta.xml

    Args:
        txt_file_path: Path to a text file located under SOURCE_DIRECTORY.

    Returns:
        A dict that always contains "source_file" (the text file's base name).
        When the XML file exists and parses, it also contains "date",
        "publisher", "county" and "title"; each is the element's stripped text,
        or "unknown" when the element is missing or empty. Values are therefore
        always strings, never None.

    Any error is reported with a warning and the "source_file"-only dict is
    returned; this function does not raise.
    """
    source_file = os.path.basename(txt_file_path)
    try:
        # Strip the suffix only from the end of the name; fall back to the
        # plain extension for text files that are not named *_djvu.txt.
        djvu_suffix = '_djvu.txt'
        if source_file.endswith(djvu_suffix):
            base_name = source_file[:-len(djvu_suffix)]
        else:
            base_name = os.path.splitext(source_file)[0]

        relative_dir = os.path.relpath(os.path.dirname(txt_file_path), SOURCE_DIRECTORY)
        xml_file_path = os.path.join(XML_SOURCE_DIRECTORY, relative_dir, f"{base_name}_meta.xml")

        if not os.path.exists(xml_file_path):
            # No XML file found (this is normal for your other projects)
            return {"source_file": source_file}

        root = ET.parse(xml_file_path).getroot()

        # ElementTree spells namespaced tags as "{uri}name". Reuse the root's
        # "{uri}" prefix (empty when there is no namespace) so children are
        # found in both namespaced and plain documents.
        ns_prefix = root.tag[:root.tag.index('}') + 1] if root.tag.startswith('{') else ''

        def find_tag(tag):
            el = root.find(f"{ns_prefix}{tag}")
            # A missing element and an empty one (<date/>) both count as unknown.
            text = el.text.strip() if el is not None and el.text else ""
            return text or "unknown"

        return {
            "source_file": source_file,
            "date": find_tag("date"),
            "publisher": find_tag("publisher"),
            "county": find_tag("county"),
            "title": find_tag("title"),
        }
    except Exception as e:
        print(f"  WARN: Could not parse XML for {txt_file_path}. {e}")
        return {"source_file": source_file}

def extract_entities(chunk_text):
    """
    Use the spaCy pipeline `nlp` to pull People, Places and Organizations
    out of a chunk of text.

    Args:
        chunk_text: The text of one chunk.

    Returns:
        A dict with the keys "people", "places" and "orgs". Each value is a
        ", "-joined string of the distinct entity names found, sorted
        alphabetically so the same chunk always produces the same metadata;
        the value is "" when nothing of that kind was found.
    """
    # Sets drop repeated mentions of the same name within the chunk.
    people, places, orgs = set(), set(), set()
    # GPE (geo-political entity) and LOC (location) both count as places.
    buckets = {"PERSON": people, "GPE": places, "LOC": places, "ORG": orgs}

    for ent in nlp(chunk_text).ents:
        bucket = buckets.get(ent.label_)
        if bucket is None:
            continue
        # Collapse line breaks and repeated spaces inside a name so that
        # "Clay\nCity" and "Clay City" are stored as one entry.
        name = " ".join(ent.text.split())
        if name:
            bucket.add(name)

    # Return as comma-separated strings (cleaner for ChromaDB metadata)
    return {
        "people": ", ".join(sorted(people)),
        "places": ", ".join(sorted(places)),
        "orgs": ", ".join(sorted(orgs)),
    }

# Shared splitter used in Phase 5: chunks of at most 500 characters, with a
# 50-character overlap between neighbouring chunks to keep some context.
_SPLITTER_SETTINGS = {"chunk_size": 500, "chunk_overlap": 50}
text_splitter = RecursiveCharacterTextSplitter(**_SPLITTER_SETTINGS)

# ------------------------------------------------------------------
# Phase 5: Scan, Chunk, Enrich, and Build Database
# ------------------------------------------------------------------
print("\n--- Phase 5: Building Enriched Vector Database ---")

# The processed-file log is kept on Drive, but the database is built under
# LOCAL_DB_PATH and only moved to Drive at the end. If the local copy is gone
# while Drive still holds a previous build, restore it so that the log and the
# database describe the same set of files.
if (not os.path.isdir(LOCAL_DB_PATH)
        and os.path.isdir(FINAL_DB_PATH_ON_DRIVE)
        and os.listdir(FINAL_DB_PATH_ON_DRIVE)):
    print(f"Restoring existing database from Drive: {FINAL_DB_PATH_ON_DRIVE}")
    try:
        shutil.copytree(FINAL_DB_PATH_ON_DRIVE, LOCAL_DB_PATH)
    except Exception as e:
        print(f"Could not restore DB from Drive, building from scratch. Error: {e}")
        shutil.rmtree(LOCAL_DB_PATH, ignore_errors=True)

print("Loading processing state...")
processed_files = set()
try:
    if os.path.exists(DB_LOG_FILE):
        with open(DB_LOG_FILE, 'r', encoding='utf-8') as f:
            processed_files = set(f.read().splitlines())
        print(f"Loaded {len(processed_files)} *file* (not chunk) records.")
except Exception as e:
    print(f"Could not load DB state, starting fresh. Error: {e}")

client = chromadb.PersistentClient(path=LOCAL_DB_PATH)
collection = client.get_or_create_collection(name=COLLECTION_NAME)
print(f"DB client running. Collection '{COLLECTION_NAME}' has {collection.count()} *chunks*.")

# A non-empty log next to an empty collection means the log refers to a
# database that no longer exists. Trusting it would skip every file and leave
# the database empty, so start the log over instead.
log_mode = 'a'
if processed_files and collection.count() == 0:
    print("WARN: Processing log has entries but the database is empty; "
          "ignoring the log and re-processing all files.")
    processed_files = set()
    log_mode = 'w'

start_time = time.time()
new_files_processed = 0
new_chunks_added = 0
BATCH_SIZE = 100 # Chunks embedded and written per database call


def _index_file(file_path):
    """Chunk, enrich and store one text file; return the number of chunks written.

    Chunks are written in slices of BATCH_SIZE and never share a batch with
    another file, so a failure here cannot affect chunks of other files.
    `upsert` is used so that re-processing a file whose earlier attempt was
    interrupted overwrites its chunks instead of failing on existing IDs.
    """
    # 1. Get Source Metadata (from XML); drop None values defensively.
    source_metadata = {
        key: value
        for key, value in parse_xml_metadata(file_path).items()
        if value is not None
    }

    # 2. Read and Chunk Text
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        content = f.read()
    if not content.strip():
        return 0
    chunks = text_splitter.split_text(content)

    written = 0
    for start in range(0, len(chunks), BATCH_SIZE):
        batch_documents = chunks[start:start + BATCH_SIZE]
        # 3. One ID per chunk: "<file path>_<chunk number within the file>".
        batch_ids = [f"{file_path}_{start + offset}" for offset in range(len(batch_documents))]
        # 4. Source metadata + entities found in the chunk + its position.
        batch_metadatas = [
            {**source_metadata, **extract_entities(chunk_text), "chunk_num": chunk_num}
            for chunk_num, chunk_text in enumerate(batch_documents, start)
        ]
        # 5. Embed and store this slice.
        embeddings = embedding_model.encode(batch_documents).tolist()
        collection.upsert(
            embeddings=embeddings,
            documents=batch_documents,
            metadatas=batch_metadatas,
            ids=batch_ids
        )
        written += len(batch_ids)
    return written


try:
    if not os.path.isdir(SOURCE_DIRECTORY):
        print(f"WARN: Source directory does not exist: {SOURCE_DIRECTORY}")

    # First, get a list of all files that need processing
    files_to_process = []
    for dirpath, _dirnames, filenames in os.walk(SOURCE_DIRECTORY):
        for filename in filenames:
            if filename.endswith(".txt"):
                file_path = os.path.join(dirpath, filename)
                # --- RESUMABILITY ---
                if file_path not in processed_files:
                    files_to_process.append(file_path)
    files_to_process.sort()  # stable processing order across runs

    print(f"Found {len(files_to_process)} new files to chunk and embed.")

    # Now, process them with a progress bar
    with open(DB_LOG_FILE, log_mode, encoding='utf-8') as f_log:
        for file_path in tqdm(files_to_process, desc="Processing files"):
            try:
                chunks_written = _index_file(file_path)
            except Exception as e:
                # Not logged, so the file is retried on the next run.
                print(f"\n  ERROR: Failed to process file {file_path}. {e}")
                continue

            # The file is logged only after all of its chunks are stored
            # (empty files are logged too, so they are not rescanned).
            f_log.write(file_path + '\n')
            f_log.flush()
            if chunks_written:
                new_files_processed += 1
                new_chunks_added += chunks_written

except Exception as e:
    print(f"\n--- An error occurred during embedding: {e} ---")
finally:
    print("\n--- Phase 5 Embedding finished ---")
    print(f"Time taken: {time.time() - start_time:.2f} seconds.")
    print(f"Total new files processed: {new_files_processed}")
    print(f"Total new CHUNKS added to DB: {new_chunks_added}")
    print(f"\nDatabase now contains {collection.count()} total chunks.")
    print(f"Database is built locally at: {LOCAL_DB_PATH}")

# ------------------------------------------------------------------
# Phase 6: Test Query (semantic search plus a text filter)
# ------------------------------------------------------------------
print("\n\n--- Phase 6: Running a Test Query ---")

if collection.count() > 0:
    query_text = "Who died in Stanton?"
    filter_term = "Stanton"

    print(f"Query: '{query_text}'")

    # This is only a smoke test, so a failure here is reported and must not
    # stop the script before the database is saved in Phase 7.
    results = None
    try:
        # 1. Create the embedding for the query
        query_embedding = embedding_model.encode([query_text]).tolist()

        # 2. Query the collection, keeping only chunks whose text mentions the
        #    filter term. The "places" metadata is a comma-separated string and
        #    Chroma's `where` filter has no "$like"/substring operator, so the
        #    substring match is done on the chunk text via `where_document`.
        results = collection.query(
            query_embeddings=query_embedding,
            n_results=3,
            where_document={"$contains": filter_term},
            include=['documents', 'metadatas', 'distances']
        )
    except Exception as e:
        print(f"Test query failed (the database itself is unaffected): {e}")

    if results is not None:
        print("--- Top 3 Results (Filtered for 'Stanton') ---")
        if 'ids' in results and results['ids'][0]:
            # Results are nested one list per query; only one query was sent.
            hits = zip(results['distances'][0], results['metadatas'][0], results['documents'][0])
            for rank, (distance, metadata, text) in enumerate(hits, start=1):
                document = text[:350] + "..."

                print(f"\nResult {rank} (Distance: {distance:.4f})")
                print(f"  Source: {metadata.get('source_file', 'N/A')} (Chunk {metadata.get('chunk_num', 'N/A')})")
                print(f"  Date: {metadata.get('date', 'N/A')}")
                print(f"  People: {metadata.get('people', 'N/A')}")
                print(f"  Places: {metadata.get('places', 'N/A')}")
                print(f"  Content: {document}")
        else:
            print("No results found matching both the query and the filter.")
else:
    print("Database is empty. No query performed.")

# ------------------------------------------------------------------
# Phase 7: Copy Finished DB to Google Drive
# ------------------------------------------------------------------
print("\n\n--- Phase 7: Moving Database to Google Drive ---")
print(f"Moving local DB from: {LOCAL_DB_PATH}")
print(f"                to: {FINAL_DB_PATH_ON_DRIVE}")

# Tracks where the database currently is, so the error message is accurate
# whichever step fails.
db_location = LOCAL_DB_PATH
try:
    if not os.path.isdir(LOCAL_DB_PATH):
        raise FileNotFoundError(f"Local database folder not found: {LOCAL_DB_PATH}")

    # Never replace a populated database on Drive with an empty local one.
    drive_has_db = (os.path.isdir(FINAL_DB_PATH_ON_DRIVE)
                    and bool(os.listdir(FINAL_DB_PATH_ON_DRIVE)))
    if drive_has_db and collection.count() == 0:
        raise RuntimeError(
            "Local database is empty but Drive already holds a database; "
            "refusing to overwrite it."
        )

    # Move into a staging folder next to the final location first. The
    # existing folder on Drive is only removed once the new database has
    # arrived in full, and shutil (unlike a `!mv` shell command) raises on
    # failure so problems are actually reported below.
    staging_path = FINAL_DB_PATH_ON_DRIVE.rstrip('/') + '.incoming'
    if os.path.exists(staging_path):
        shutil.rmtree(staging_path)  # leftover from an earlier failed run
    shutil.move(LOCAL_DB_PATH, staging_path)
    db_location = staging_path

    if os.path.exists(FINAL_DB_PATH_ON_DRIVE):
        print("Replacing previous database folder on Google Drive...")
        shutil.rmtree(FINAL_DB_PATH_ON_DRIVE)
    os.rename(staging_path, FINAL_DB_PATH_ON_DRIVE)
    db_location = FINAL_DB_PATH_ON_DRIVE

    print("\n--- Move complete! ---")
    print(f"Your persistent, ENRICHED database is now saved at: {FINAL_DB_PATH_ON_DRIVE}")
except Exception as e:
    print(f"\n--- ERROR: Could not move database to Google Drive ---")
    print(f"Error: {e}")
    print(f"Your database is still available at: {db_location}")

print("\n--- 'ENRICH & BUILD' SCRIPT FINISHED ---")