# BPC Hybrid Keyword + Embedding Analysis
## GDELT-Style Two-Stage Article Analysis

This notebook demonstrates:
1. **Keyword Filtering** (Stage 1) - like GDELT's Global Knowledge Graph (GKG) filtering
2. **Semantic Analysis** (Stage 2) - like GDELT's Global Similarity Graph (GSG) embeddings
3. **Combined workflows** for powerful article discovery and clustering

In [None]:
# Setup
import sys
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

# The scripts directory must be on sys.path before the project-local import below.
sys.path.append('../scripts')
from bpc_hybrid_analysis import BPCHybridSearch

# Build the hybrid searcher from the precomputed embeddings file.
# NOTE(review): the .pkl extension suggests a pickle file - only load files from a trusted source.
searcher = BPCHybridSearch(embeddings_path='../data/bpc_embeddings.pkl')

# Sanity report: how many articles were loaded and how wide each embedding vector is.
article_count = len(searcher.embeddings_data)
embedding_dim = searcher.embeddings_matrix.shape[1]
print(f"Loaded {article_count} articles")
print(f"Embedding dimensions: {embedding_dim}")

## 1. Explore Your Data
### See available tags and policy areas

In [None]:
# Collect the distinct tags, policy areas and related people, and count how often
# each tag occurs, in a single pass over the corpus.
all_tags = set()
all_policy_areas = set()
all_people = set()
tag_counts = Counter()  # dict subclass, so plain-dict style lookups keep working

for item in searcher.embeddings_data:
    # `or ()` also covers a key that is present but set to None.
    item_tags = item.get('tags') or ()
    all_tags.update(item_tags)
    tag_counts.update(item_tags)
    all_policy_areas.update(item.get('policy_areas') or ())
    all_people.update(item.get('related_people') or ())

print(f"Total unique tags: {len(all_tags)}")
print("\nTop 20 tags:")
# most_common(20) yields (tag, count) pairs ordered by descending count.
for tag, count in tag_counts.most_common(20):
    print(f"  {tag}: {count}")

print(f"\nTotal policy areas: {len(all_policy_areas)}")
print(f"Policy areas: {sorted(all_policy_areas)}")

# The people set was previously collected but never reported.
print(f"\nTotal related people: {len(all_people)}")

## 2. Two-Stage Analysis: Keyword Filter + Semantic Clustering
### Example: Analyzing AI/Technology articles

In [None]:
# Stage 1: Filter by keywords
tech_indices = searcher.keyword_filter(
    tags=['Technology', 'Artificial Intelligence']  # Adjust based on your actual tags
)

print(f"Found {len(tech_indices)} technology articles")

# Stage 2: Cluster semantically
# Never request more clusters than there are articles to cluster.
# NOTE(review): cluster_articles is defined in bpc_hybrid_analysis; presumably it
# rejects n_clusters > n_samples like most clustering implementations - confirm.
n_clusters = min(5, len(tech_indices))

if n_clusters == 0:
    # The tag names above are placeholders and may not exist in this dataset.
    clusters = {}
    print("No articles matched the tag filter - adjust `tags` above to match your data.")
else:
    clusters = searcher.cluster_articles(tech_indices, n_clusters=n_clusters)

# Display clusters
for cluster_id, articles in clusters.items():
    print(f"\n{'='*60}")
    print(f"CLUSTER {cluster_id} ({len(articles)} articles)")
    print('='*60)
    for i, article in enumerate(articles[:5], 1):  # Show the first 5 of each cluster
        print(f"{i}. {article['title']}")
        print(f"   Date: {article['date']}")
        print(f"   Policy Areas: {', '.join(article.get('policy_areas', []))}")
        print()

## 3. Semantic Search Within Filtered Set
### Example: Find articles about specific topics within a policy area

In [None]:
# Stage 1: narrow the corpus to articles filed under the Health policy area.
health_indices = searcher.keyword_filter(policy_areas=['Health'])
health_count = len(health_indices)
print(f"Searching within {health_count} health policy articles...", end="\n\n")

# Stage 2: rank only those articles against a free-text query.
query = "mental health crisis response and suicide prevention"
results = searcher.semantic_search(query, health_indices, top_k=10)

print(f"Top 10 matches for: '{query}'", end="\n\n")
for rank, hit in enumerate(results, start=1):
    print(f"{rank}. {hit['title']}")
    print(f"   Similarity: {hit['similarity_score']:.3f}")
    print(f"   Date: {hit['date']}")
    # A blank line separates consecutive hits.
    print(f"   URL: {hit['url']}", end="\n\n")

## 4. "More Like This" Analysis
### Find similar articles to a given article

In [None]:
# Use the first article in the dataset as the reference (swap in any URL from your data).
target_article = searcher.embeddings_data[0]
target_url = target_article['url']
target_title = target_article['title']

print(f"Finding articles similar to:\n'{target_title}'", end="\n\n")

# Look up the reference article's nearest neighbours by URL.
similar = searcher.find_similar_articles(target_url, top_k=10)

print("Most similar articles:")
for rank, match in enumerate(similar, start=1):
    policy_label = ', '.join(match.get('policy_areas', []))
    print(f"\n{rank}. {match['title']}")
    print(f"   Similarity: {match['similarity_score']:.3f}")
    print(f"   Date: {match['date']}")
    print(f"   Policy: {policy_label}")

## 5. Visualize Article Landscape
### Use t-SNE to create a 2D visualization (like GDELT's visualizations)

In [None]:
# Get a subset for visualization: every article, or a random sample when the corpus is large.
MAX_VIS_ARTICLES = 500   # cap on plotted points so t-SNE stays fast
TSNE_PERPLEXITY = 30     # preferred perplexity; lowered automatically for tiny corpora
SAMPLE_SEED = 42         # keeps the sample (and therefore the plot) reproducible

n_articles = len(searcher.embeddings_data)
if n_articles < 2:
    raise ValueError(f"t-SNE needs at least 2 articles, but only {n_articles} loaded")

if n_articles > MAX_VIS_ARTICLES:
    # Draw a seeded random sample instead of the first N rows, which would
    # over-represent whatever ordering the corpus happens to be stored in.
    rng = np.random.default_rng(SAMPLE_SEED)
    vis_indices = sorted(rng.choice(n_articles, size=MAX_VIS_ARTICLES, replace=False).tolist())
else:
    vis_indices = list(range(n_articles))
vis_embeddings = searcher.embeddings_matrix[vis_indices]

# Reduce to 2D using t-SNE
# scikit-learn requires perplexity < n_samples, so shrink it for small corpora.
perplexity = min(TSNE_PERPLEXITY, len(vis_indices) - 1)
print("Running t-SNE dimensionality reduction...")
tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
embeddings_2d = tsne.fit_transform(vis_embeddings)

# Prepare data for plotting: row i of embeddings_2d belongs to article vis_indices[i].
plot_data = []
for i, idx in enumerate(vis_indices):
    item = searcher.embeddings_data[idx]
    plot_data.append({
        'x': embeddings_2d[i, 0],
        'y': embeddings_2d[i, 1],
        'title': item['title'][:50],
        # `or` also covers an empty list, which would otherwise give a blank label.
        'policy_area': ', '.join(item.get('policy_areas') or ['Other'])[:30],
        'date': item.get('date', ''),
        'url': item.get('url', '')
    })

df_plot = pd.DataFrame(plot_data)

# Create interactive plot, one colour per (truncated) policy-area label
fig = px.scatter(
    df_plot,
    x='x',
    y='y',
    color='policy_area',
    hover_data=['title', 'date'],
    title='BPC Article Landscape (Semantic Embeddings)',
    width=1000,
    height=700
)

fig.update_traces(marker=dict(size=8, opacity=0.7))
fig.show()

## 6. Advanced: Compare Policy Areas Semantically
### Analyze how different policy areas cluster together

In [None]:
# Policy areas with fewer articles than this are skipped: their mean embedding is too noisy.
MIN_ARTICLES_PER_AREA = 5
TOP_PAIRS = 10  # how many most/least similar pairs to print

# Group article embeddings by policy area (an article counts towards every area it lists).
policy_area_embeddings = {}
for item in searcher.embeddings_data:
    for area in item.get('policy_areas', []):
        policy_area_embeddings.setdefault(area, []).append(item['embedding'])

# Calculate the mean embedding of each sufficiently large policy area
area_means = {
    area: np.mean(embeddings, axis=0)
    for area, embeddings in policy_area_embeddings.items()
    if len(embeddings) >= MIN_ARTICLES_PER_AREA
}

print(f"Analyzing {len(area_means)} major policy areas")

areas = list(area_means.keys())

if len(areas) < 2:
    # With no qualifying area cosine_similarity would raise, and a single area
    # has nothing to be compared against.
    print(
        f"Need at least two policy areas with {MIN_ARTICLES_PER_AREA}+ articles "
        "to compare - skipping similarity analysis."
    )
else:
    # Compute similarity matrix between policy areas (row/column order follows `areas`)
    area_matrix = np.array([area_means[area] for area in areas])
    similarity_matrix = cosine_similarity(area_matrix)

    # Visualize as heatmap
    plt.figure(figsize=(12, 10))
    sns.heatmap(
        similarity_matrix,
        xticklabels=areas,
        yticklabels=areas,
        cmap='YlOrRd',
        annot=False,
        square=True
    )
    plt.title('Semantic Similarity Between Policy Areas')
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

    # Collect each unordered pair once (upper triangle, excluding the diagonal).
    pairs = []
    for i in range(len(areas)):
        for j in range(i + 1, len(areas)):
            pairs.append((areas[i], areas[j], similarity_matrix[i, j]))

    ranked_pairs = sorted(pairs, key=lambda pair: pair[2], reverse=True)

    print("\nMost semantically similar policy areas:")
    for area1, area2, sim in ranked_pairs[:TOP_PAIRS]:
        print(f"  {area1} ↔ {area2}: {sim:.3f}")

    # Walk the ranking from the other end for the least similar pairs.
    print("\nLeast semantically similar policy areas:")
    for area1, area2, sim in ranked_pairs[::-1][:TOP_PAIRS]:
        print(f"  {area1} ↔ {area2}: {sim:.3f}")

## 7. Export Results
### Save filtered/analyzed results for further use

In [None]:
# Example export: restrict to Energy articles, then rank them against the query text.
query = "climate change and energy transition"
energy_indices = searcher.keyword_filter(policy_areas=['Energy'])
results = searcher.semantic_search(query, energy_indices, top_k=50)

# Flatten each hit into one CSV-friendly row; list fields become comma-separated strings.
export_rows = []
for hit in results:
    export_rows.append({
        'title': hit['title'],
        'date': hit['date'],
        'url': hit['url'],
        'similarity_score': hit['similarity_score'],
        'policy_areas': ', '.join(hit.get('policy_areas', [])),
        'tags': ', '.join(hit.get('tags', []))
    })
df_results = pd.DataFrame(export_rows)

# Write the table next to the other data files, without the DataFrame index column.
df_results.to_csv('../data/energy_climate_results.csv', index=False)
saved_count = len(df_results)
print(f"Saved {saved_count} results to energy_climate_results.csv")

# The last expression renders the first rows as the cell output.
df_results.head(10)