In [None]:
# Notebook setup: imports, local database configuration, and a connectivity check.
import sys
import os
from urllib.parse import quote

# Load .env first so the POSTGRES_* variables read below are available
from dotenv import load_dotenv
load_dotenv()

# IMPORTANT: Override DATABASE_URL for local notebook use (Docker PostgreSQL on port 5433)
# This MUST come after load_dotenv() to override the .env file
db_user = os.getenv('POSTGRES_USER')
db_pass = os.getenv('POSTGRES_PASSWORD')
db_name = os.getenv('POSTGRES_DB')

# Fail fast with a readable message: an unset variable would otherwise be
# interpolated into the URL as the literal text "None" and only surface later
# as a confusing authentication / connection error.
missing_vars = [
    var_name
    for var_name, value in (
        ('POSTGRES_USER', db_user),
        ('POSTGRES_PASSWORD', db_pass),
        ('POSTGRES_DB', db_name),
    )
    if value is None
]
if missing_vars:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(missing_vars)}. "
        "Set them in your .env file or shell before running this notebook."
    )

# Override with correct port for Docker.
# User and password are percent-encoded so characters such as '@', ':', '/' or '#'
# in the credentials cannot be mistaken for URL delimiters. safe='' also encodes '/'.
os.environ['DATABASE_URL'] = (
    f"postgresql+psycopg2://{quote(db_user, safe='')}:{quote(db_pass, safe='')}"
    f"@localhost:5433/{db_name}"
)

# Add parent directory to path (guarded so re-running the cell does not add duplicates)
if '..' not in sys.path:
    sys.path.append('..')

# Now import backend modules (DatabaseManager will use the DATABASE_URL we just set)
from backend.database import get_session
from backend.models import Document
import pandas as pd

# The password is masked here on purpose; never print the real DATABASE_URL.
print("✅ Configured for local connection to Docker PostgreSQL:")
print(f"   Connection: postgresql://{db_user}:***@localhost:5433/{db_name}")
print("\nTesting connection...")

# Quick connection test.
# This is a diagnostic only: any failure (including a failed import of
# health_check) is reported as text instead of stopping the notebook.
try:
    from backend.database import health_check
    if health_check():
        print("✅ Database connection successful!")
    else:
        print("❌ Database connection failed!")
except Exception as e:
    print(f"❌ Connection error: {e}")

In [None]:
# Pull every Document row from the database into a pandas DataFrame
def _document_row(doc):
    """Flatten one Document into a plain dict holding the fields shown in the overview table."""
    return {
        'doc_id': doc.doc_id,
        'title': doc.title,
        'date': doc.date,
        'salience': doc.salience,
        'salience_bool': doc.salience_bool,
        # Keep only a 200-character preview; missing or empty text becomes None
        'raw_text': None if not doc.raw_text else doc.raw_text[:200],
        'summary': doc.summary,
        'source': doc.source,
        'url': doc.url
    }


with get_session() as session:
    # Fetch all documents in one query
    documents = session.query(Document).all()

    # One dict per document, built while the session is still open
    doc_data = [_document_row(doc) for doc in documents]

    df = pd.DataFrame(doc_data)

print(f"Loaded {len(df)} documents")
df.head()

In [None]:
# Check data types and null values.
# DataFrame.info() prints a summary of `df`: each column's dtype and non-null
# count, plus the index range and memory usage. It prints the summary rather
# than returning it, so the cell has no Out[] value.
df.info()

In [None]:
# View a sample document with full details.
# Re-fetches the first loaded document inside a fresh session so that its related
# collections (categories, countries, projects, ...) can be read while the
# session is open.
if len(df) > 0:
    sample_doc_id = df['doc_id'].iloc[0]

    # pandas can hand back a NumPy scalar (e.g. numpy.int64) for numeric columns,
    # which some database drivers cannot bind as a query parameter. Convert it to
    # the equivalent built-in Python value; plain str/int ids have no .item() and
    # are left untouched.
    if hasattr(sample_doc_id, 'item'):
        sample_doc_id = sample_doc_id.item()

    with get_session() as session:
        doc = session.get(Document, sample_doc_id)

        if doc is None:
            # session.get() returns None when no row has this primary key, e.g. the
            # document was deleted after the DataFrame was built.
            print(f"Document {sample_doc_id!r} was not found in the database.")
        else:
            print(f"Document ID: {doc.doc_id}")
            print(f"Title: {doc.title}")
            print(f"Date: {doc.date}")
            print(f"Salience: {doc.salience}")
            print(f"Salience Bool: {doc.salience_bool}")
            print(f"\nCategories: {[cat.category for cat in doc.categories]}")
            print(f"Subcategories: {[sub.subcategory for sub in doc.subcategories]}")
            print(f"Initiating Countries: {[country.initiating_country for country in doc.initiating_countries]}")
            print(f"Recipient Countries: {[country.recipient_country for country in doc.recipient_countries]}")
            print(f"Projects: {[proj.project for proj in doc.projects]}")
            print(f"\nSummary: {doc.summary}")
            print(f"\nRaw Text (first 500 chars):\n{doc.raw_text[:500] if doc.raw_text else 'N/A'}")
else:
    print("No documents loaded; nothing to display.")