# News Feed Function
Use Google News RSS to get relevant items within a selected time period.

In [None]:
import feedparser
import urllib.parse
import time
from datetime import datetime
import os
from dotenv import load_dotenv
from cerebras.cloud.sdk import Cerebras
from newspaper import Article
import json
import difflib
import subprocess
import requests

#  Only include items on/after Sept 25, 2025
start = datetime(2025, 9, 25, 0, 0, 0)

def get_news_feed(query: str, limit: int = 15, start_date: datetime = start):
    """Fetch Google News RSS results for a search and keep the recent ones.

    Args:
        query: Free-text search; it is URL-encoded before being placed in
            the Google News RSS search URL.
        limit: Stop once this many distinct titles have been collected.
        start_date: Naive datetime cutoff, compared against each entry's
            publication time expressed in UTC. Entries published before it
            are skipped. A falsy value disables the date filter.

    Returns:
        A ``(output_dict, output)`` tuple. ``output_dict`` maps each kept
        title to ``{"link": ..., "published": ...}`` (a repeated title
        overwrites the earlier entry). ``output`` is a string with one
        ``"<title> - <published>"`` line per kept entry, separated by blank
        lines, with surrounding whitespace stripped.
    """
    # Encode the query into a URL
    encoded_query = urllib.parse.quote(query)
    feed_url = f"https://news.google.com/rss/search?q={encoded_query}"

    feed = feedparser.parse(feed_url)

    lines = []
    output_dict = {}

    for entry in feed.entries:
        published = getattr(entry, "published", None)
        published_parsed = getattr(entry, "published_parsed", None)

        # Skip if no timestamp
        if not published_parsed:
            continue

        # feedparser reports parsed dates as a UTC struct_time. Build the
        # datetime straight from its fields: round-tripping through
        # time.mktime() would treat the value as *local* time and shift it
        # by an hour whenever DST is in effect.
        try:
            entry_date = datetime(*published_parsed[:6])
        except (TypeError, ValueError):
            # Malformed timestamp (e.g. out-of-range field): treat as missing.
            continue

        # Filter: skip if before start_date
        if start_date and entry_date < start_date:
            continue

        title = entry.title
        link = entry.link

        lines.append(f"{title} - {published if published else 'No timestamp'}")
        output_dict[title] = {
            "link": link,
            "published": published,
        }

        if len(output_dict) >= limit:
            break

    return output_dict, "\n\n".join(lines).strip()

# Starter Prompt and Tools
The main prompt which is used for all future prompts, along with the initial tools for hooking and marking items.

In [None]:
# Base conversation shared by every later LLM call: later cells copy this list
# and append one extra assistant message describing the current step.
#
# NOTE(review): the system prompt says the model receives "the top 5 results
# per search", but get_news_feed() is called with its default limit of 15 --
# confirm which number is intended.
# NOTE(review): the prompt demands EXACTLY 7 searches while the worked example
# only lists three -- presumably intentional, verify.
start_messages = [
  {"role": "system", "content": """
  You are a helpful AI. You will be connected to an RSS feed based on the user's request. 
  Instead of being reactive, you will be proactive to the RSS feed and contact the user 
  when any item matches. If something might match, call the 'Mark' tool.
   
  There will not always be relevant items, so do not call the 'Mark' tool because you feel obligated.

  Workflow:
  1. User sends request.
  2. Use the 'Hook' tool to create EXACTLY 7 distinct searches. YOU MAY NOT DO LESS THAN 7.
  3. You will receive the top 5 results per search. Use 'Mark' to flag relevant ones.
   
  Rules for searches:
  - No superficial variations. Do not create searches that only differ by one vague word 
    (e.g., "OpenAI research" vs. "OpenAI innovation").
  - Each search must represent a distinct angle of the request (e.g., policy, technical 
    breakthroughs, collaborations, controversies, societal impacts).
  - All searches must remain clearly relevant to the user request. Do not drift into 
    unrelated areas just to make them different.
  - Keep searches concise: 2-5 words each.
  - Prioritize recall. Err on the side of including items that might be relevant. 
    Avoid narrowing too much.

  Example user request: "I want to be notified if there's any news about governments 
  creating new regulations specifically for AI safety research."
  Three good searches:
  - "government AI safety regulation"
  - "policy frameworks for AI risk research"
  - "AI governance oversight research initiatives"

  These are all relevant, but capture different aspects of the request. 

  Aim to reduce false negatives at all costs. If an item has ANY possibility of being relevant, you must include it. ONLY remove the titles that are OBVIOUSLY irrelevant to the user's request.
  """},
  # The user's standing request; edit this message to change what is monitored.
  {"role": "user", "content": "Hello! I want to be notified if there's any news about climate change affecting global food security. Thanks!"},
]

# Raw text of the user's request, reused in later prompts and printouts.
user_query = start_messages[1]["content"]

def _string_list_tool(name, description, param, param_description):
    """Build a strict function-tool spec whose single required argument is a list of strings."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "strict": True,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": {
                    param: {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": param_description,
                    },
                },
                "required": [param],
            },
        },
    }


# Tools offered to the model during search creation and title filtering:
# 'mark' flags article titles, 'hook' requests the RSS searches to run.
start_tools = [
    _string_list_tool(
        "mark",
        "Mark an RSS item as relevant to the user's request.",
        "titles",
        "The titles of the relevant articles.",
    ),
    _string_list_tool(
        "hook",
        "Create an RSS feed with Google News.",
        "searches",
        "The RSS searches you want to make. You can use spaces",
    ),
]

# Chat Function
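Sends an API call to Cerebras with optional tools, and retries the call (up to three attempts) when a tool call is required but the model did not make one. If Cerebras has reached its daily limit, route the call through OpenRouter instead. Switching between the two is manual: comment/uncomment the relevant line in `chat`.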

In [None]:
# Pull variables from a local .env file into the process environment.
load_dotenv()

# API_KEY is handed to the Cerebras client; OR_KEY is the OpenRouter bearer token.
api_key = os.environ.get("API_KEY")
or_key = os.environ.get("OR_KEY")

client = Cerebras(api_key=api_key)

def cerebras_completion(messages, tools):
    """Run one chat completion through the Cerebras SDK client.

    Args:
        messages: Chat messages to send.
        tools: Tool specs forwarded to the API (may be None).

    Returns:
        ``(message_resp, tool_name, tool_contents)``. For a structured tool
        call, ``tool_name``/``tool_contents`` come from the first tool call
        and ``message_resp`` is the message content as returned. When the
        model instead wrote the tool call as raw JSON in the content,
        the name/arguments are taken from that JSON and ``message_resp`` is
        None. Otherwise both tool fields are None.
    """
    chat_completion = client.chat.completions.create(
        messages=messages,
        tools=tools,
        model="llama-4-scout-17b-16e-instruct",
    )

    message = chat_completion.choices[0].message
    message_resp = message.content
    tool_name, tool_contents = None, None

    # Case 1: structured tool call
    if message.tool_calls:
        call = message.tool_calls[0].function
        tool_name = call.name
        tool_contents = call.arguments

    # Case 2: the model sometimes emits the tool call as a raw JSON object in
    # the content instead of using the tool-call mechanism.
    elif message_resp:
        try:
            parsed = json.loads(message_resp)
        except (TypeError, ValueError):
            parsed = None

        # Only treat the content as a tool call when it really has that
        # shape; any other JSON-looking reply is kept as plain content
        # (same behaviour as openrouter_completion).
        if isinstance(parsed, dict) and "name" in parsed and "arguments" in parsed:
            tool_name = parsed["name"]
            tool_contents = parsed["arguments"]
            message_resp = None

    return message_resp, tool_name, tool_contents

def openrouter_completion(messages, tools):
    """Run one chat completion through the OpenRouter HTTP API.

    The request pins the provider order to ``["cerebras"]`` with fallbacks
    disabled.

    Args:
        messages: Chat messages to send.
        tools: Tool specs for the model; omitted from the request when falsy.

    Returns:
        ``(message_resp, tool_name, tool_contents)`` with the same meaning as
        ``cerebras_completion``: tool fields come from the first structured
        tool call, or from a raw-JSON tool call found in the content (in
        which case ``message_resp`` is None), else they are None.

    Raises:
        requests.HTTPError: The API answered with an error status.
        requests.Timeout: No response within the timeout.
        RuntimeError: The response body carried no ``choices``.
    """
    payload = {
        "model": "meta-llama/llama-4-scout",
        "messages": messages,
        "provider": {
            "order": ["cerebras"],
            "allow_fallbacks": False
        }
    }
    # Leave the key out entirely rather than sending "tools": null.
    if tools:
        payload["tools"] = tools

    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {or_key}",
            "Content-Type": "application/json",
        },
        data=json.dumps(payload),
        # Without a timeout a stalled connection would hang the notebook forever.
        timeout=120,
    )
    # Surface auth / rate-limit failures directly instead of a later KeyError.
    response.raise_for_status()

    # Parse top-level JSON
    resp = response.json()
    choices = resp.get("choices") if isinstance(resp, dict) else None
    if not choices:
        detail = resp.get("error", resp) if isinstance(resp, dict) else resp
        raise RuntimeError(f"OpenRouter returned no choices: {detail}")

    message = choices[0]["message"]
    message_resp = message.get("content")
    tool_name, tool_contents = None, None

    # Case 1: structured tool call
    if message.get("tool_calls"):
        call = message["tool_calls"][0]["function"]
        tool_name = call["name"]
        tool_contents = call["arguments"]

    # Case 2: raw JSON in content
    elif message_resp:
        try:
            parsed = json.loads(message_resp)
        except (TypeError, ValueError):
            parsed = None

        if isinstance(parsed, dict) and "name" in parsed and "arguments" in parsed:
            tool_name = parsed["name"]
            tool_contents = parsed["arguments"]
            message_resp = None  # tool call, not plain content

    return message_resp, tool_name, tool_contents

def chat(messages, tools=None, need_tool=False):
    """Ask the model for a completion, retrying when a required tool call is missing.

    Makes at most three attempts. With ``need_tool`` set, an attempt whose
    reply carries no tool name is retried; the last attempt's result is
    returned even if it still has no tool call.

    Returns:
        The ``(message, tool_name, tool_contents)`` tuple from the provider call.
    """
    max_attempts = 3
    result = None

    for _attempt in range(max_attempts):
        # Manual provider switch: swap the two lines below to use Cerebras directly.
        # result = cerebras_completion(messages, tools)
        result = openrouter_completion(messages, tools)

        got_tool = bool(result[1])
        if got_tool or not need_tool:
            break  # good enough; otherwise loop again for a tool call

    return result

# Filter by Title
Get the LLM to go through all items by title and filter out obviously irrelevant ones, avoiding False Negatives.

In [None]:
# The model does not always echo a title verbatim, so look it up fuzzily.
def find_best_match(model_title, news_dict):
    """Return the ``news_dict`` value whose key is closest to ``model_title``.

    Uses difflib with a similarity cutoff of 0.5; returns ``""`` when no key
    is close enough.
    """
    candidates = difflib.get_close_matches(
        model_title, news_dict.keys(), n=1, cutoff=0.5
    )
    return news_dict[candidates[0]] if candidates else ""

def _tool_args(raw):
    """Normalise tool-call arguments (JSON string or dict) to a dict; {} if unusable."""
    if isinstance(raw, dict):
        return raw
    if isinstance(raw, str):
        try:
            decoded = json.loads(raw)
        except ValueError:
            return {}
        return decoded if isinstance(decoded, dict) else {}
    return {}


# Initial chat, get RSS setup
message, tool_name, tool_contents = chat(start_messages, start_tools, True)
searches = _tool_args(tool_contents).get("searches")
if not isinstance(searches, list) or not searches:
    # Nothing below can run without searches; fail with a readable message
    # instead of a TypeError/KeyError from parsing a missing tool call.
    raise RuntimeError("The model did not return any searches through the 'hook' tool.")

all_rss_items = [] # everything pulled from RSS
chosen_titles = [] # everything chosen by model
all_news_dicts = [] # raw rss dicts

print("=== USER PROMPT ===")
print(user_query)

print("\n=== ALL SEARCHES ===")

valid_items = 0

for search in searches:
    # Stop issuing searches once enough RSS items have been collected.
    if valid_items >= 12:
        print("! STOPPED SEARCHING !")
        break

    output_dict, output_str = get_news_feed(search)

    if not output_str:
        print(f"{search} - EMPTY")
        continue

    valid_items += len(output_dict)
    print(search)

    loop_messages = list(start_messages) + [
        {"role": "assistant", "content": f"{output_str} This is a list of the most recent RSS items for the search '{search}'. I will now use tool 'mark' if any of the items' titles seem like they could possibly apply to the user's query. I will avoid False Negatives, preferring False Positives. I will NOT use 'hook' because I already did that."},
    ]
    _, tool_name, tool_contents = chat(loop_messages, start_tools, True)

    # Handle tool calling issues: arguments may arrive as a JSON string or a
    # dict, and may be missing entirely if the model never called a tool.
    # Start from an empty list each iteration so a failed call cannot reuse
    # the previous search's titles.
    titles = _tool_args(tool_contents).get("titles", [])
    if not isinstance(titles, list):
        titles = []
    titles = [t for t in titles if isinstance(t, str)]

    all_rss_items.extend(output_dict.keys())
    chosen_titles.extend(titles)
    all_news_dicts.append(output_dict)

# Deduplicate RSS titles
all_rss_items = list(dict.fromkeys(all_rss_items))
chosen_titles = list(dict.fromkeys(chosen_titles))

# Merge all dicts
combined_news_dict = {}
for nd in all_news_dicts:
    combined_news_dict.update(nd)

# Map chosen titles to links ("" when no RSS title is close enough)
chosen_dict = {}
for t in chosen_titles:
    chosen_dict[t] = find_best_match(t, combined_news_dict)

print("\n=== ALL RSS ITEMS LOOKED AT ===")
for item in all_rss_items:
    print(item)

print("\n=== ALL CHOSEN ITEMS ===")
for t, meta in chosen_dict.items():
    if isinstance(meta, dict):
        print(f"{t} -> {meta['link']}")
    else:
        print(f"{t} -> {meta}")

# Filter by Content
Get the LLM to go through the items by content, given the first 3000 chars to reduce token usage. If an item passes this second filter, create a ~200-word summary with the important information, facts, etc. for report creation.

In [None]:
def resolve_google_news_url(url: str) -> str:
    """Resolve a Google News link by running ``resolve_google_news.py`` on it.

    The helper script is run as a subprocess with ``url`` as its only
    argument; its stripped standard output is returned.

    Raises:
        subprocess.CalledProcessError: The script exited with a non-zero status.
        OSError: The interpreter could not be launched.
    """
    import sys  # local import: only this helper needs it

    # Launch the same interpreter that runs this code. A bare "python" may be
    # absent from PATH or point at an environment without the script's
    # dependencies.
    interpreter = sys.executable or "python"
    return subprocess.check_output(
        [interpreter, "resolve_google_news.py", url],
        text=True,
    ).strip()

def get_main_content(url: str) -> str:
    """Download and parse the article at ``url``; return its text.

    Best-effort: any download or parse error yields an empty string so that
    the caller can skip the item.
    """
    try:
        article = Article(url)
        article.download()
        article.parse()

        return article.text
    except Exception:
        # Deliberately broad, but not a bare ``except``: KeyboardInterrupt and
        # SystemExit must still be able to stop a long-running loop.
        return ""

# Schema of the boolean verdict argument of the content-stage 'mark' tool.
_relevant_param = {
    "type": "boolean",
    "description": "Use True if it is relevant, False if it is not.",
}

# Schema of the free-text explanation argument (optional in the tool spec).
_reason_param = {
    "type": "string",
    "description": "ALWAYS FILL THIS OUT IF RELEVANT IS TRUE. A detailed explanation (approximately 200 words). Specifically, spend most of the words on the important details and info the page contains, and the last bit on how it is relevant to the user query. For the main content, make sure to include all specific information such as names, proper nouns, places, events, times, groups, etc. Your goal is not summarization as much as it is to gather the very relevent and specific information. Do not make this generic - include the important stuff that the article goes over. Pack as much specific information in the main content as you can, and then at the very end tie it back to the user.",
}

# Single tool offered while judging an article by its content.
eval_tools = [
    {
        "type": "function",
        "function": {
            "name": "mark",
            "strict": True,
            "description": "Mark the article/page as relevant or not.",
            "parameters": {
                "type": "object",
                "properties": {
                    "relevant": _relevant_param,
                    "reason": _reason_param,
                },
                "required": ["relevant"],
            },
        },
    },
]

import ast
import re

# Items that passed the content filter, as [title, resolved link, published date, reason].
passed_items = []

for item, meta in chosen_dict.items():
    print("=== ITEM ===")
    print(item)

    # find_best_match returns "" when no RSS title was close enough to the
    # model's title; there is no link or date to work with in that case.
    if not isinstance(meta, dict):
        print("! No matching RSS item for this title !")
        continue

    date = meta["published"]

    link = meta["link"]
    try:
        link = resolve_google_news_url(link)
    except (subprocess.CalledProcessError, OSError) as err:
        # One unresolvable link should not abort the remaining items.
        print(f"! Could not resolve link: {err} !")
        continue

    content = get_main_content(link)[:3000]

    if len(content) < 200:
        print("! Article is empty or a stub !")
        continue

    eval_messages = list(start_messages) + [
        {
            "role": "assistant",
            "content": f"""
    I am currently evaluating whether this article is relevant to the user query: '{user_query}'.

    ARTICLE TITLE: {item}
    ARTICLE CONTENT (first 3000 chars):
    {content}

    INSTRUCTIONS:
    1. I will decide strictly if the article is relevant to the query. I will NOT mark it relevant just because it mentions a keyword. If it does not address the query, I will mark `relevant = false`.
    2. If relevant = true, I'll:
    - Write a detailed explanation (200-250 words).
    - Focus on concrete details that appear in the article. I will NOT generalize.
    - Cover at least 90% of the important content from this excerpt.
    - End the explanation by explicitly tying the article back to the user query.
    3. If relevant = false:
    - I will not write any explanation or summary. I'll only return `relevant = false`.

    I will not say things such as "contains specific details". Instead, I will provide the exact specific details, not just mention that they exist.
    
    Specific Details to Always Include (when present):
    - Numbers, dates, and statistics (percentages, counts, totals, averages, ranges, rankings)
    - Names of people and groups (individuals, organizations, companies, institutions, agencies)
    - Geographic references (countries, cities, regions, local areas)
    - Events and milestones (announcements, launches, agreements, disasters, protests, meetings)
    - Quotes and statements (from officials, experts, witnesses, participants)
    - Policies and rules (laws, regulations, programs, reforms, restrictions, standards)
    - Technologies and methods (tools, systems, processes, techniques)
    - Economic indicators (prices, costs, investments, budgets, trade figures)
    - Social impacts (effects on communities, health, education, migration, lifestyles)
    - Environmental factors (weather, climate, land, water, resources, ecosystems)
    - Other obviously relevant things not on this list.

    I am REQUIRED to say ALL specific details I see that are relevant. I will NOT cut ANY of them.
    
    Additionally, I will use quotes for important information that matters verbatum.
    I will *literally* use a minimum of 200 words.

    My output must strictly use the `mark` function schema.
    """
        }
    ]
    _, tool_name, tool_contents = chat(eval_messages, eval_tools, True)

    # Handle tool calling issues: arguments may be a dict, a JSON string, or
    # a Python-style dict literal (single quotes / True / False).
    parsed = None
    if tool_contents:
        if isinstance(tool_contents, dict):
            parsed = tool_contents
        elif isinstance(tool_contents, str):
            try:
                parsed = json.loads(tool_contents)
            except json.JSONDecodeError:
                # Map JSON keywords to Python ones on word boundaries only, so
                # words such as "untrue" inside the text are left alone.
                fixed = re.sub(
                    r"\b(true|false|null)\b",
                    lambda m: {"true": "True", "false": "False", "null": "None"}[m.group(1)],
                    tool_contents,
                )
                try:
                    # literal_eval only accepts literals; model output is
                    # untrusted and must never reach eval().
                    parsed = ast.literal_eval(fixed)
                except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                    parsed = tool_contents
        else:
            parsed = tool_contents

    # If the AI marked the item as relevant, add to list
    if isinstance(parsed, dict) and parsed.get("relevant") is True:
        passed_items.append([item, link, date, parsed.get("reason", "")])

    print(tool_name, parsed)

# Report Generation
Generate the report given the list of items. Report is made in Markdown, using links for all sources.

In [None]:
def create_content_str(items):
    """Render the passed items as labelled text sections for the report prompt.

    Each item is a ``(name, link, date, reason)`` sequence; the sections are
    concatenated in order, each one ending with a blank line.
    """
    sections = []
    for name, link, date, reason in items:
        sections.append(
            f"=== ITEM NAME ===\n{name}\n"
            f"=== ITEM LINK (To cite) ===\n{link}\n"
            f"=== ITEM DATE ===\n{date}\n"
            f"=== ITEM INFO (LLM generated) ===\n{reason}\n\n"
        )
    return "".join(sections)

# Prompt for the final report: the base conversation plus one assistant
# message carrying every item that passed the content filter, followed by the
# writing instructions.
report_messages = list(start_messages) + [
    {"role": "assistant", "content": f"""
    {create_content_str(passed_items)}
    These are all items relevant to the query: '{user_query}'.

    INSTRUCTIONS:
    1. Write 750 words AT MINIMUM in Markdown with clear structure using # (H1) and ## (H2) headings. Do NOT overuse these - you can and SHOULD do multiple paragraphs under one.
    2. Use the most reputable source for each piece of information and avoid duplication.
    3. Use specific information such as numbers, events, people, etc. where useful; do not avoid using these.
    4. Naturally connect all information back to the query, and combine sources when appropriate.
    5. Begin by addressing the query directly, explaining what has developed since the last interaction ({start}) up to today ({datetime.now()}), including how much time has passed.
    6. Never write dates (like "2025-09-29", "Sep 29, 2025", or UTC strings). Always write relative time only, e.g. "3 hours ago", "2 days ago", or "2 weeks ago".
    7. Conclude by explaining why the updates matter, adding context rather than summarizing obvious knowledge.
    8. Do not mention being an AI or proactive agent, and do not use words like "proactive." Write directly to the reader ("you") when appropriate.
    9. Always cite inline like this: ([Source Website Name](https://example.com) - TIME AGO). 
       - Parentheses must wrap the citation. 
       - The clickable text must ALWAYS be the EXACT website name, NOT the article title, NOT the raw link.
       - Place citations immediately after the information, not at the end.
       - YOU ARE NOT ALLOWED TO CASUALLY CITE THINGS LIKE "For example, SOURCE said..." YOU ARE REQUIRED TO CITE IT AT THE END OF TALKING ABOUT THE CONTENT.
    
    CONTEXT:
    The goal is to provide timely updates on new developments since the last interaction, not background knowledge. The writing should feel polished, informative, and up-to-date.

    Remember, 750 words MINIMUM.
    """}
]

if passed_items:
    message, _, _ = chat(report_messages)
else:
    # With no source items the prompt would still demand 750 cited words,
    # which can only be satisfied by invented content. Report the empty
    # result instead of calling the model.
    message = "No relevant items were found for this request, so no report was generated."

print(message)