# Ingest News Articles for Scrantenna
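This notebook pulls news articles matching a search query from NewsAPI, adds a short distilled version of each article's title and description, and saves the result as JSON for further processing.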

In [ ]:
import requests
import json
import os
import re
from datetime import datetime
from typing import List, Dict, Tuple

# Try to import OpenAI for LLM-based distillation.
# `llm_available` / `client` are read by the distillation helpers below.
try:
    import openai
except ImportError:
    print("OpenAI not available. Install with: pip install openai")
    llm_available = False
    client = None
else:
    try:
        # Set up OpenAI client (will use OPENAI_API_KEY env var)
        client = openai.OpenAI()
        llm_available = True
    except Exception as exc:
        # Constructing the client can fail even when the package imports
        # (e.g. no API key configured, or an older openai release without
        # the `OpenAI` class). Treat that like a missing package so the
        # notebook keeps working with the fallback distiller.
        print(f"OpenAI client could not be created ({exc}); using fallback distillation")
        llm_available = False
        client = None

# Load SpaCy as fallback
try:
    import spacy
    nlp = spacy.load("en_core_web_sm")
    spacy_available = True
except Exception:
    # Best-effort: any import or model-loading failure disables spaCy, but
    # KeyboardInterrupt/SystemExit are no longer swallowed as a bare
    # `except:` would.
    print("SpaCy model not found. Install with: python -m spacy download en_core_web_sm")
    nlp = None
    spacy_available = False

# Define constants
# SECURITY NOTE(review): a NewsAPI key is committed in this file. Prefer
# supplying it through the NEWSAPI_KEY environment variable; the literal is
# kept only as a backward-compatible fallback and should be rotated/removed.
API_KEY = os.environ.get("NEWSAPI_KEY") or 'be93936988fd4df185bd56e8a11125a0'
QUERY = "Scranton"  # search term sent to NewsAPI
DATA_DIR = "../data/daily"  # output directory for the dated JSON files
# NOTE: QUERY is interpolated as-is; a query containing spaces or reserved
# URL characters would need percent-encoding first.
NEWS_URL = f"https://newsapi.org/v2/everything?q={QUERY}&apiKey={API_KEY}"

def create_distilled_version_llm(text: str) -> str:
    """Distill *text* into a short factual statement via the chat-completions API.

    Returns an empty string when the LLM is unavailable or *text* is empty.
    If the API call (or reading its result) raises, the error is printed and
    the result of ``create_distilled_version_fallback(text)`` is returned.
    """
    # Nothing to do without both a usable client and some input.
    if not (llm_available and text):
        return ""

    try:
        system_message = {
            "role": "system",
            "content": "Extract the core facts from news text into direct, precise statements. Use simple subject-verb-object format. Avoid referring to 'the article' or 'the story'. State facts directly as if reporting them yourself. Keep it under 100 characters."
        }
        user_message = {
            "role": "user",
            "content": f"Distill this news text: {text}"
        }
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[system_message, user_message],
            max_tokens=50,
            temperature=0.1
        )
        distilled = completion.choices[0].message.content
        return distilled.strip()
    except Exception as err:
        # Any failure degrades to the local, non-LLM distiller.
        print(f"LLM distillation failed: {err}")
        return create_distilled_version_fallback(text)

def create_distilled_version_fallback(text: str) -> str:
    """Distill *text* with simple string processing (no external services).

    Takes everything before the first ``.``, drops parenthetical asides,
    normalizes whitespace, removes a leading article (The/A/An), truncates
    to 80 characters and terminates the result with a period.

    Args:
        text: Source text; may be empty or ``None``.

    Returns:
        The distilled sentence ending in ``.``, or ``""`` when *text* is
        empty or contains no sentence content.
    """
    if not text:
        return ""

    # str.split always yields at least one element, so index 0 is safe.
    first_sentence = text.split('.')[0]

    # Remove parenthetical asides, then collapse the whitespace they leave
    # behind so "Smith (D) spoke" does not become "Smith  spoke".
    first_sentence = re.sub(r'\(.*?\)', '', first_sentence)
    first_sentence = re.sub(r'\s+', ' ', first_sentence).strip()

    # Strip a leading article. This runs after trimming so that a leading
    # parenthetical such as "(AP) The ..." does not block the match.
    first_sentence = re.sub(r'^(The|A|An)\s+', '', first_sentence)

    # Nothing meaningful left (e.g. input was "..." or only parentheses).
    if not first_sentence:
        return ""

    if len(first_sentence) > 80:
        # rstrip avoids a dangling space before the closing period.
        first_sentence = first_sentence[:80].rstrip()

    return first_sentence + "."

def create_distilled_version(text: str) -> str:
    """Distill *text*, preferring the LLM distiller when it is available."""
    # Pick the strategy once, then delegate.
    distill = (
        create_distilled_version_llm
        if llm_available
        else create_distilled_version_fallback
    )
    return distill(text)

def process_article(article: Dict) -> Dict:
    """Return a shallow copy of *article* augmented with distilled text fields.

    Adds ``title_distilled`` / ``description_distilled`` when the matching
    source field is present and non-empty, and always adds
    ``content_distilled``.
    """
    enriched = article.copy()

    # The API's own content field is truncated, so the description (or,
    # failing that, the title) stands in as the main body text.
    body_text = article.get('description', '') or article.get('title', '')

    # Distill each populated source field into its "<field>_distilled" twin.
    for field in ('title', 'description'):
        if article.get(field):
            enriched[f'{field}_distilled'] = create_distilled_version(article[field])

    enriched['content_distilled'] = create_distilled_version(body_text)

    return enriched

def fetch_news(timeout: float = 30.0) -> Dict:
    """Fetch news articles from NewsAPI.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: If the response status code is not 200; the message
            includes the response body.
    """
    response = requests.get(NEWS_URL, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch news articles: {response.text}")
    return response.json()

def save_news(news_data):
    """Save news articles with both original and distilled formats.

    Each article in ``news_data['articles']`` is passed through
    ``process_article`` and the result is written, together with run
    metadata, to ``DATA_DIR/scranton_news_<YYYY-MM-DD>.json``.

    Args:
        news_data: Mapping with an ``articles`` list (missing or null is
            treated as empty).

    Returns:
        Path of the JSON file that was written.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(DATA_DIR, exist_ok=True)

    # Process articles to add distilled versions
    processed_articles = [
        process_article(article)
        for article in (news_data.get('articles') or [])
    ]

    # One timestamp for both the metadata and the filename, so they cannot
    # disagree when the run straddles midnight.
    now = datetime.now()

    # Create data structure with metadata
    # NOTE(review): `has_distilled` and the status message below are keyed on
    # spaCy availability, but the fallback distiller in this file uses only
    # `re`; confirm whether these flags should depend on spaCy at all.
    output_data = {
        "query": QUERY,
        "fetched_at": now.isoformat(),
        "total_articles": len(processed_articles),
        "has_distilled": llm_available or spacy_available,
        "distillation_method": "llm" if llm_available else "fallback",
        "articles": processed_articles
    }

    # Save with date-based filename
    file_path = os.path.join(DATA_DIR, f"scranton_news_{now.strftime('%Y-%m-%d')}.json")
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2)

    print(f"Saved {len(processed_articles)} articles to {file_path}")
    if llm_available:
        print("✓ LLM-based distilled versions included")
    elif spacy_available:
        print("✓ Fallback distilled versions included")
    else:
        print("⚠ Distilled versions not generated")

    return file_path

# Fetch and save articles with intelligent distilled processing.
# NOTE: these statements run as soon as the cell/module executes; they perform
# the network request and write the dated JSON file. `saved_file` holds the
# path returned by save_news and is read by the display cell that follows.
news_data = fetch_news()
saved_file = save_news(news_data)

In [ ]:
# Display sample articles, showing original and distilled (SVO) text side by side
def display_sample_articles(file_path, num_samples=3):
    """Display sample articles showing both original and SVO versions.

    Args:
        file_path: Path to a JSON file containing an ``articles`` list.
        num_samples: Maximum number of articles to print.

    The distilled text is read from the ``*_distilled`` keys; the ``*_svo``
    keys are consulted as a fallback for files that use that naming. Missing
    or null fields are shown as ``N/A`` / ``Unknown`` instead of raising.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"\n{'='*80}")
    print(f"Sample Articles from {file_path}")
    print(f"{'='*80}\n")

    for i, article in enumerate(data['articles'][:num_samples], start=1):
        # JSON null decodes to None, so `.get(key, default)` alone is not
        # enough: a present-but-null value would bypass the default.
        source_name = (article.get('source') or {}).get('name') or 'Unknown'
        title = article.get('title') or 'N/A'
        description = article.get('description') or 'N/A'
        title_distilled = (
            article.get('title_distilled') or article.get('title_svo') or 'N/A'
        )
        description_distilled = (
            article.get('description_distilled')
            or article.get('description_svo')
            or 'N/A'
        )

        print(f"Article {i}:")
        print(f"Source: {source_name}")
        print(f"\nOriginal Title: {title}")
        print(f"SVO Title: {title_distilled}")
        print(f"\nOriginal Description: {description[:150]}...")
        print(f"SVO Description: {description_distilled}")
        print(f"{'-'*80}\n")

# Show samples from the saved file.
# Guarded so this cell does nothing (instead of raising NameError) when
# `saved_file` has not been assigned in the current session.
if 'saved_file' in locals():
    display_sample_articles(saved_file)