# SVO News Transformer

This notebook transforms news articles into a boringly precise Subject-Verb-Object (SVO) format.

In [None]:
import json
import spacy
import pandas as pd
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Tuple
import re

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

In [None]:
# Dependency labels consulted while assembling triplets.
_SUBJECT_DEPS = ("nsubj", "nsubjpass")
# NOTE(review): "pobj" normally hangs off a preposition rather than the verb,
# so it may never match among the root verb's direct children - confirm.
_OBJECT_DEPS = ("dobj", "pobj", "attr")
_MODIFIER_DEPS = ("compound", "amod", "det")


def _phrase_with_modifiers(head) -> str:
    """Return *head* and its compound/amod/det children joined in sentence order."""
    tokens = [sub for sub in head.children if sub.dep_ in _MODIFIER_DEPS]
    tokens.append(head)
    # Order by position in the document so "The quick brown fox" is not
    # emitted as "brown quick The fox".
    tokens.sort(key=lambda tok: tok.i)
    return " ".join(tok.text for tok in tokens)


def extract_svo_triplets(text: str) -> List[Tuple[str, str, str]]:
    """Extract Subject-Verb-Object triplets from text using SpaCy dependency parsing.

    Only tokens that are both tagged ``VERB`` and are the ``ROOT`` of their
    sentence are considered. For each such verb, its direct children supply
    the subject and the object; each is expanded with its own compound,
    adjectival and determiner children.

    Args:
        text: Raw text to parse with the module-level ``nlp`` pipeline.

    Returns:
        A list of ``(subject, verb, object)`` string tuples. A root verb with
        no subject is skipped; a missing object is reported as the
        placeholder ``"[no object]"``.
    """
    doc = nlp(text)
    triplets = []

    for sent in doc.sents:
        for token in sent:
            if token.pos_ != "VERB" or token.dep_ != "ROOT":
                continue

            subject = None
            obj = None

            # When several children qualify, the last one in iteration order wins.
            for child in token.children:
                if child.dep_ in _SUBJECT_DEPS:
                    subject = _phrase_with_modifiers(child)
                elif child.dep_ in _OBJECT_DEPS:
                    obj = _phrase_with_modifiers(child)

            # A triplet is only meaningful when a subject was found.
            if subject:
                triplets.append((subject, token.text, obj or "[no object]"))

    return triplets

In [None]:
def simplify_to_svo(text: str) -> str:
    """Convert text to boring, precise SVO statements.

    Args:
        text: Raw text to simplify.

    Returns:
        Up to five "Subject verb object." sentences joined by single spaces.
        When no triplet can be extracted, a sentence listing up to three
        named entities found in the text, or a fixed "nothing extracted"
        message when there are no entities either.
    """
    triplets = extract_svo_triplets(text)

    if not triplets:
        # Fallback: report the first few named entities instead of statements.
        doc = nlp(text)
        entities = [(ent.text, ent.label_) for ent in doc.ents]
        if entities:
            return f"Article mentions {', '.join([f'{e[0]} ({e[1]})' for e in entities[:3]])}"
        return "No clear statements extracted."

    # Convert triplets to simple sentences
    svo_sentences = []
    for subj, verb, obj in triplets[:5]:  # Limit to 5 main statements
        # Upper-case only the first character: str.capitalize() would also
        # lowercase the rest and mangle proper nouns and acronyms
        # ("President Biden" -> "President biden").
        sentence = f"{subj[:1].upper()}{subj[1:]} {verb} {obj}."
        svo_sentences.append(sentence)

    return " ".join(svo_sentences)

In [None]:
def transform_article_to_svo(article: Dict) -> Dict:
    """Return a shallow copy of *article* whose text fields are rewritten as SVO.

    For each of ``title``, ``description`` and ``content`` that holds a truthy
    value, the original text is kept under ``original_<field>`` and the field
    itself is replaced by its simplified form. The ``[+N chars]`` marker is
    stripped from ``content`` before simplification. The copy is always tagged
    with ``format = 'svo'``; the input dict is left untouched.
    """
    result = article.copy()

    for field, backup_key in (
        ('title', 'original_title'),
        ('description', 'original_description'),
        ('content', 'original_content'),
    ):
        raw_text = article.get(field)
        if not raw_text:
            continue

        result[backup_key] = raw_text
        if field == 'content':
            # Drop the truncation marker so it is not parsed as article text.
            raw_text = re.sub(r'\[\+\d+ chars\]', '', raw_text)
        result[field] = simplify_to_svo(raw_text)

    result['format'] = 'svo'
    return result

In [None]:
# Load the latest news data
data_dir = Path("../data/daily")
# Reverse lexicographic order puts the greatest file name first.
# NOTE(review): presumably the names embed a sortable date, which makes that
# file the most recent one - confirm against the naming scheme.
news_files = sorted(data_dir.glob("scranton_news_*.json"), reverse=True)

if news_files:
    latest_file = news_files[0]
    print(f"Loading news from: {latest_file}")

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(latest_file, 'r', encoding='utf-8') as f:
        news_data = json.load(f)

    print(f"Found {len(news_data['articles'])} articles")
else:
    # NOTE: latest_file and news_data stay undefined in this branch; the
    # cells that follow need both.
    print("No news files found")

In [None]:
# Run every loaded article through the SVO transformer
svo_articles = [transform_article_to_svo(entry) for entry in news_data['articles']]

# Bundle the converted articles with where they came from and when
svo_data = dict(
    format="svo",
    original_file=str(latest_file),
    generated_at=datetime.now().isoformat(),
    articles=svo_articles,
)

# Make sure the output directory is there
svo_dir = Path("../data/svo")
svo_dir.mkdir(exist_ok=True)

# Output name mirrors the input name, prefixed with "svo_"
svo_filename = svo_dir / ("svo_" + latest_file.name)
with svo_filename.open('w') as handle:
    json.dump(svo_data, handle, indent=2)

print(f"SVO version saved to: {svo_filename}")

In [None]:
# Toggle view: print one article either as SVO or as originally written
def display_article_toggle(article_idx: int, show_svo: bool = True):
    """Print the article at *article_idx* from ``svo_articles``.

    With ``show_svo`` true the transformed title, description and content are
    printed. Otherwise the ``original_*`` copies are printed, falling back to
    the current field value when no original copy was stored. Source name and
    publication timestamp are printed in both modes.
    """
    entry = svo_articles[article_idx]
    rule = "=" * 80
    mode = 'SVO' if show_svo else 'Original'

    print(rule)
    print(f"Article {article_idx + 1} - Format: {mode}")
    print(rule)

    sections = (
        ("Title", 'title', 'original_title'),
        ("Description", 'description', 'original_description'),
        ("Content", 'content', 'original_content'),
    )
    for position, (heading, svo_key, original_key) in enumerate(sections):
        value = entry.get(svo_key, 'N/A')
        if not show_svo:
            value = entry.get(original_key, value)
        # Every section after the first is preceded by a blank line.
        lead = "" if position == 0 else "\n"
        print(f"{lead}{heading}: {value}")

    source_name = entry.get('source', {}).get('name', 'Unknown')
    published = entry.get('publishedAt', 'Unknown')
    print(f"\nSource: {source_name}")
    print(f"Published: {published}")

In [None]:
# Interactive toggle demo
from IPython.display import clear_output
import ipywidgets as widgets

# Create widgets
# IntSlider rejects max < min, so the upper bound is clamped to 0 when no
# articles were produced; the slider is disabled in that case.
article_selector = widgets.IntSlider(
    value=0,
    min=0,
    max=max(len(svo_articles) - 1, 0),
    description='Article:',
    disabled=not svo_articles,
)

format_toggle = widgets.ToggleButtons(
    options=['SVO', 'Original'],
    description='Format:',
    tooltips=['Boring precise SVO format', 'Original article text']
)

output = widgets.Output()

def update_display(change):
    """Redraw the output area for the currently selected article and format.

    ``change`` is the event payload passed by the widget observers; it is not
    inspected because the current widget values are read directly.
    """
    with output:
        clear_output()
        if not svo_articles:
            # Nothing to index into; show a placeholder instead of raising.
            print("No articles to display")
            return
        display_article_toggle(
            article_selector.value,
            show_svo=(format_toggle.value == 'SVO')
        )

# Set up observers
article_selector.observe(update_display, names='value')
format_toggle.observe(update_display, names='value')

# Display interface
display(widgets.VBox([article_selector, format_toggle, output]))

# Initial display
update_display(None)

In [None]:
# Side-by-side look at the first few titles before and after transformation
banner = "=" * 80
print("Sample transformations:")
print(banner)

for number, article in enumerate(svo_articles[:3], start=1):
    print(f"\nArticle {number}:")
    # The original title is cut to 100 characters; the ellipsis is always appended.
    print(f"Original: {article.get('original_title', 'N/A')[:100]}...")
    print(f"SVO:      {article.get('title', 'N/A')}")
    print("-" * 40)