In [5]:
import os
import json
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import umap
from sentence_transformers import SentenceTransformer
from chromadb import Client
from chromadb.config import Settings

# Paths
output_path = "/workspaces/langgraph/data/"
json_file = os.path.join(output_path, "govexec_articles_latest.json")
vector_db_path = "./webscraping_vectors"

# Load scraped articles
def load_articles(filepath):
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"File not found: {filepath}")
    with open(filepath, "r", encoding="utf-8") as f:
        return json.load(f)

# Initialize embedding model
def initialize_model():
    return SentenceTransformer("all-MiniLM-L6-v2")

# Create or load vector database
def initialize_vector_db(persist_directory):
    os.makedirs(persist_directory, exist_ok=True)
    return Client(Settings(persist_directory=persist_directory))

# Process articles and store embeddings with detailed metadata
def embed_and_store(articles, model, vector_db):
    collection = vector_db.get_or_create_collection("webscraping_articles")
    detailed_metadata = []  # List to store detailed metadata for all articles

    for i, article in enumerate(articles):
        content = article.get("content", "")
        title = article.get("title", "No Title")
        link = article.get("link", "")
        date = article.get("date", "Unknown Date")  # Assuming articles have a 'date' field
        citations = article.get("citations", [])  # Assuming articles have a 'citations' field
        high_level_ideas = article.get("high_level_ideas", [])  # Assuming articles have this field

        if content:
            embedding = model.encode(content)
            doc_id = f"article_{i}"

            # Add to vector database
            collection.add(
                ids=[doc_id],
                documents=[content],
                metadatas=[{
                    "title": title,
                    "link": link,
                    "date": date,
                    "citations": citations,
                    "high_level_ideas": high_level_ideas
                }],
                embeddings=[embedding]
            )

            # Save detailed metadata
            detailed_metadata.append({
                "id": doc_id,
                "title": title,
                "link": link,
                "date": date,
                "citations": citations,
                "high_level_ideas": high_level_ideas,
                "content": content
            })

    # Save detailed metadata to a JSON file
    metadata_file = os.path.join(output_path, "detailed_metadata.json")
    with open(metadata_file, "w", encoding="utf-8") as f:
        json.dump(detailed_metadata, f, indent=4, ensure_ascii=False)

    print(f"Stored {len(articles)} articles in the vector database and saved detailed metadata.")

# Extract embeddings from the vector database
def extract_embeddings(vector_db):
    collection = vector_db.get_collection("webscraping_articles")
    results = collection.get(include=["embeddings", "metadatas"])
    embeddings = np.array(results["embeddings"])
    titles = [meta["title"] for meta in results["metadatas"]]
    return embeddings, titles

# Visualize embeddings using PCA and UMAP
def visualize_first_article(embeddings, titles):
    if len(embeddings) == 0:
        print("No embeddings available for visualization.")
        return

    # Select the first embedding and title
    first_embedding = embeddings[0].reshape(1, -1)  # Reshape for PCA/UMAP compatibility
    first_title = titles[0]

    if first_embedding.shape[0] < 2:
        print("Not enough data points for PCA/UMAP visualization.")
        return

    # PCA
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(first_embedding)

    # UMAP
    umap_reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
    umap_result = umap_reducer.fit_transform(first_embedding)

    # Plot PCA and UMAP
    plt.figure(figsize=(12, 6))

    # PCA Plot
    plt.subplot(1, 2, 1)
    plt.scatter(pca_result[:, 0], pca_result[:, 1], alpha=0.7, s=50, label=first_title)
    plt.title("PCA Visualization (First Article)")
    plt.xlabel("PCA Component 1")
    plt.ylabel("PCA Component 2")
    plt.legend()

    # UMAP Plot
    plt.subplot(1, 2, 2)
    plt.scatter(umap_result[:, 0], umap_result[:, 1], alpha=0.7, s=50, label=first_title)
    plt.title("UMAP Visualization (First Article)")
    plt.xlabel("UMAP Component 1")
    plt.ylabel("UMAP Component 2")
    plt.legend()

    plt.tight_layout()
    plt.show()

# Main workflow
if __name__ == "__main__":
    try:
        # Load articles
        articles = load_articles(json_file)
        print(f"Loaded {len(articles)} articles.")

        # Initialize model and vector database
        model = initialize_model()
        vector_db = initialize_vector_db(vector_db_path)

        # Embed and store articles
        embed_and_store(articles, model, vector_db)
        print("Embeddings stored successfully!")

        # # Extract embeddings
        # embeddings, titles = extract_embeddings(vector_db)
        # print(f"Extracted {len(embeddings)} embeddings.")

        # # Visualize the first article
        # visualize_first_article(embeddings, titles)

    except Exception as e:
        print(f"Error: {e}")

Loaded 25 articles.
Error: Expected metadata value to be a str, int, float, bool, or None, got [] which is a list in add.
