# Setup

In [1]:

# 1. Install necessary libraries (Run this cell once)
!pip install google-genai pandas requests feedparser beautifulsoup4
# 2. Import libraries
import pandas as pd
import json
from google import genai
from google.genai.errors import APIError
import os
from google.colab import drive
import requests
try:
    # Use Colab's specific method for reliable secret access
    from google.colab import userdata
except ImportError:
    # Fallback for non-Colab environments
    userdata = None

print("Setup complete. Libraries imported.")

# 3. Mount Google Drive (Essential for accessing the JSON file)
# Force remount to avoid mounting issues
drive.mount('/content/drive', force_remount=True)
print("Google Drive mounted.")

# 4. LLM API Key and Initialization (Loads key from Colab Secrets)
GEMINI_API_KEY = None
if userdata:
    GEMINI_API_KEY = userdata.get('GEMINI_API_KEY')
else:
    GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')

if not GEMINI_API_KEY:
    raise ValueError("GEMINI_API_KEY not found. Please set it in Colab Secrets (üîë).")

try:
    # Initialize the Gemini Client
    client = genai.Client(api_key=GEMINI_API_KEY)
    print("Gemini Client initialized successfully.")
except Exception as e:
    print(f"Error initializing Gemini client: {e}")

# Define the model to use for rewriting
REWRITING_MODEL = 'gemini-2.5-flash'
print(f"Using model: {REWRITING_MODEL}")

Collecting feedparser
  Downloading feedparser-6.0.12-py3-none-any.whl.metadata (2.7 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading feedparser-6.0.12-py3-none-any.whl (81 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m81.5/81.5 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6046 sha256=e3eea24190201b192b5f3305ed8d1fc5854230bedc5d942de275ed0c24ae0938
  Stored in directory: /root/.cache/pip/wheels/03/f5/1a/23761066dac1d0e8e683e5fdb27e12de53209d05a4a37e6246
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser
Successfully installed feedparser-6.0.12 sgmllib3k-1.0.0


# Data Loading and Preparation

In [3]:
# IMPORTANT: Adjust this path if your file is in a different folder!
FILE_NAME = '/content/drive/MyDrive/NewsBot_AI/fetched_articles.json'

# Load previously fetched articles from FILE_NAME into `articles_df`.
# On any load problem `articles_df` is set to an empty DataFrame so the
# rest of the notebook can test `articles_df.empty`.
try:
    articles_df = pd.read_json(FILE_NAME, orient='records')
except FileNotFoundError:
    print(f"\nERROR: File '{FILE_NAME}' not found.")
    print("ACTION: Running fetch below to generate it.")
    articles_df = pd.DataFrame()
except ValueError as e:
    # Raised by pandas when the file exists but is not valid JSON.
    print(f"\nERROR: Could not parse '{FILE_NAME}' as JSON: {e}")
    print("ACTION: Running fetch below to generate it.")
    articles_df = pd.DataFrame()
else:
    if articles_df.empty:
        # An empty JSON array yields a frame with no columns and no rows;
        # indexing a column or row 0 would raise, so just report it.
        print(f"\nWARNING: '{FILE_NAME}' contains no articles.")
    else:
        # Simple cleanup: fill any potential empty body text to prevent LLM errors
        if 'body_text_html' not in articles_df.columns:
            articles_df['body_text_html'] = ''
        articles_df['body_text_html'] = articles_df['body_text_html'].fillna('')

        print(f"\nSuccessfully loaded {articles_df.shape[0]} articles from {FILE_NAME}")

        # Display the first article's details to confirm data integrity
        sample_article = articles_df.iloc[0]
        print(f"Sample Article Title: {sample_article.get('title', '')}")

# Fetch News via RSS Aggregation (Simulating FreshRSS: Curated feeds + full extraction/normalization)
import feedparser
import requests
from bs4 import BeautifulSoup  # For XPath-like extraction (e.g., //div[@class='content'])
import json
import pandas as pd  # Already imported
import os
from hashlib import md5  # For deduplication by title hash

# Curated RSS feeds for technology news (full text/excerpts, per current sources)
rss_feeds = [
    "https://www.techrepublic.com/feed/",  # TechRepublic: Business tech news
    "https://www.cnet.com/rss/news/",  # CNET: Tech reviews/news
    "https://feeds.bbci.co.uk/news/technology/rss.xml"  # BBC Technology: Global tech coverage
]

raw_articles = []
seen_titles = set()  # MD5 hashes of titles already collected (deduplication)


def _fetch_full_text(link, timeout=10):
    """Download `link` and return its whitespace-normalized main text.

    Returns None when the request fails or the response status is not 200,
    so the caller can keep whatever excerpt the feed supplied.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (compatible; NewsBot Student Project)'}
    try:
        resp = requests.get(link, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # One unreachable article must not abort the rest of the feed.
        print(f"  Could not fetch full text from {link}: {e}")
        return None
    if resp.status_code != 200:
        return None

    soup = BeautifulSoup(resp.text, 'html.parser')
    # XPath-like extraction: //article or //div[@class='content'] or //main
    content_div = (
        soup.find('article')
        or soup.find('div', class_='content')
        or soup.find('main')
        or soup.find('div', id='article-body')
    )
    if content_div:
        text = content_div.get_text(separator=' ', strip=True)
    else:
        text = resp.text[:3000]  # Fallback truncate
    # Clean: Remove extra whitespace, limit for LLM
    return ' '.join(text.split())[:5000]


# NOTE: this fetch runs unconditionally, including when the JSON file was
# loaded successfully above; it then overwrites both the file and articles_df.
for feed_url in rss_feeds:
    try:
        print(f"Fetching from {feed_url}...")
        feed = feedparser.parse(feed_url)

        # feedparser reports download/parse problems through `bozo` instead of
        # raising, so surface them when nothing usable came back.
        if feed.get('bozo') and not feed.entries:
            print(f"  Feed returned no entries: {feed.get('bozo_exception')}")

        for entry in feed.entries[:4]:  # Limit to 4 per feed for demo (total ~12, deduped)
            title = entry.get('title', '').strip()
            title_hash = md5(title.encode()).hexdigest()  # Normalize/dedup
            if title_hash in seen_titles:
                continue  # Skip duplicates
            seen_titles.add(title_hash)

            link = entry.get('link', '')
            topic = 'technology'  # Default; refine from categories if available
            author = entry.get('author', '')
            pub_date = entry.get('published', '')

            # Extract topic tags from tags/categories if present
            topic_tags = [tag.get('term') for tag in entry.get('tags', []) if tag.get('term')] or ['technology']

            # Get full body: Use summary if available, else fetch from link (FreshRSS-style scraping)
            # `or [{}]` also covers a present-but-empty content list.
            body_html = entry.get('summary', '') or (entry.get('content') or [{}])[0].get('value', '')
            if len(body_html) < 200 and link:  # If short, fetch full (respectful)
                full_text = _fetch_full_text(link)
                if full_text is not None:
                    body_html = full_text

            article = {
                # Built from the MD5 of the title: the built-in hash() of a str
                # is salted per interpreter process and `% 1000` leaves only
                # 1000 buckets, so it was neither stable nor unique.
                "original_id": f"freshrss-{title_hash[:12]}",
                "title": title,
                "topic": topic,
                "body_text_html": body_html,  # Full extracted text
                "author": author,
                "publication_date": pub_date,
                "tags": topic_tags[:5]  # Limit tags
            }
            raw_articles.append(article)

    except Exception as e:
        print(f"Error parsing {feed_url}: {e}")

if raw_articles:
    # Save to JSON (in your Google Drive path)
    fetched_file = FILE_NAME
    os.makedirs(os.path.dirname(fetched_file), exist_ok=True)
    with open(fetched_file, 'w', encoding='utf-8') as f:
        json.dump(raw_articles, f, indent=4)

    print(f"Successfully fetched {len(raw_articles)} articles via RSS aggregation (FreshRSS-style) and saved to {fetched_file}")

    # Load into DataFrame
    articles_df = pd.DataFrame(raw_articles)
    articles_df['body_text_html'] = articles_df['body_text_html'].fillna('')

    # Display sample
    print(f"Sample Title: {articles_df.iloc[0]['title'][:100]}...")
    print(f"Sample Body Snippet: {articles_df.iloc[0]['body_text_html'][:150]}...")
else:
    print("No articles fetched. Check internet or feeds.")


Successfully loaded 10 articles from /content/drive/MyDrive/NewsBot_AI/fetched_articles.json
Sample Article Title: Cybersecurity Starts With You: Lessons From Phishing, Ransomware, and Real-World Mistakes
Fetching from https://www.techrepublic.com/feed/...
Fetching from https://www.cnet.com/rss/news/...
Fetching from https://feeds.bbci.co.uk/news/technology/rss.xml...
Successfully fetched 12 articles via RSS aggregation (FreshRSS-style) and saved to /content/drive/MyDrive/NewsBot_AI/fetched_articles.json
Sample Title: Cybersecurity Starts With You: Lessons From Phishing, Ransomware, and Real-World Mistakes...
Sample Body Snippet: <p>This Cybersecurity Awareness Month, see how real-world phishing and ransomware attacks reveal why every employee plays a role in protection.</p>
<p...


# Persona Definition

In [4]:
# Attribute names shared by every persona profile, in the order the rows use.
_PERSONA_FIELDS = ("age", "tone", "style_description", "length_target")

# One row per persona: (name, age, tone, style description, length target).
_PERSONA_ROWS = [
    (
        "Gen Z Trailblazer",
        19,
        "Witty, meme-savvy, and packed with Gen Z flair (e.g., 'spill the tea,' 'no cap,' 'big mood,' 'vibes on fleek').",
        "Ultra-casual, bold, and dripping with pop culture references, like a TikTok live sesh.",
        "Keep it short, punchy, and scroll-friendly (max 120 words).",
    ),
    (
        "Corporate Visionary",
        50,
        "Confident, strategic, with a dash of futurist jargon (e.g., 'disruptive innovation,' 'ecosystem leverage,' 'next-gen paradigm').",
        "Polished, forward-thinking, and tailored for C-suite briefings.",
        "Deliver a crisp executive insight (max 90 words).",
    ),
    (
        "Eco-Philosopher",
        35,
        "Thoughtful, eco-conscious, with poetic undertones and ethical musings (e.g., 'harmony with nature,' 'sustainable ethos,' 'planetary balance').",
        "Reflective, nature-inspired, blending tech with environmental wisdom.",
        "Craft a meditative summary with depth (max 200 words).",
    ),
    (
        "Cyberpunk Dreamer",
        28,
        "Edgy, dystopian, infused with neon-lit slang (e.g., 'data jack,' 'synth-life,' 'grid runner,' 'chrome hustle').",
        "Gritty, futuristic, like a cyberpunk novel excerpt.",
        "Keep it sleek and immersive (max 150 words).",
    ),
]

# Persona name -> profile dict consumed by the rewriting prompt.
PERSONAS = {
    persona_name: dict(zip(_PERSONA_FIELDS, profile_values))
    for persona_name, *profile_values in _PERSONA_ROWS
}

print("\nDefined 4 creative User Personas for personalized rewriting.")


Defined 4 creative User Personas for personalized rewriting.


# The News Rewriting Function (LLM Engine)

In [5]:
def rewrite_article(article: pd.Series, persona_name: str) -> dict:
    """Rewrite one article for a persona by prompting the Gemini API.

    Args:
        article: Record with at least 'title', 'topic' and 'body_text_html';
            'original_id' is used when present (None otherwise).
        persona_name: Key into the module-level PERSONAS mapping.

    Returns:
        A dict with the keys 'title', 'content_text', 'original_article' and
        'persona'. Every failure path (unknown persona, API error, empty model
        response, unexpected exception) returns the same keys plus an 'error'
        key holding the message, so callers can always read the four standard
        keys and detect failure with `'error' in result`.
    """
    # Looked up once with .get so a record without an id cannot raise KeyError
    # from inside the exception handlers below.
    original_id = article.get('original_id')

    def _failure(title: str, message: str) -> dict:
        # Uniform shape for every failure path.
        return {
            "title": title,
            "content_text": message,
            "original_article": original_id,
            "persona": persona_name,
            "error": message,
        }

    persona = PERSONAS.get(persona_name)
    if not persona:
        return _failure("Error", f"Persona '{persona_name}' not found.")

    # 1. System Prompt (High-level instruction for the model)
    system_instruction = (
        "You are an expert content personalizer running on an AI News Platform. "
        "Your task is to rewrite the provided news article to perfectly match the target user persona's style, tone, and length. "
        "DO NOT use HTML tags. ONLY output the rewritten text."
    )

    # 2. User Prompt (The payload sent to the model)
    user_prompt = f"""
    --- REWRITING INSTRUCTIONS (PERSONA: {persona_name}) ---

    * **TARGET STYLE:** Rewrite the content in a {persona['style_description']} style.
    * **TONE:** The tone must be {persona['tone']}.
    * **LENGTH:** {persona['length_target']}
    * **AGE/DEMO:** Address a reader who is approximately {persona['age']} years old.

    --- ORIGINAL ARTICLE CONTENT ---

    Title: {article['title']}
    Original Source Topic: {article['topic']}
    Original Text:

    {article['body_text_html']}
    """

    print(f"-> Rewriting '{article['title'][:50]}...' for persona: {persona_name}...")

    try:
        # Call the Gemini API
        response = client.models.generate_content(
            model=REWRITING_MODEL,
            contents=[user_prompt],
            config=genai.types.GenerateContentConfig(
                system_instruction=system_instruction
            )
        )

        rewritten_text = response.text
        if not rewritten_text:
            # NOTE(review): the SDK is expected to give None here when the
            # response carries no text part (e.g. blocked output) - confirm.
            return _failure(
                "Error",
                "The model returned no text (the response may have been blocked or empty)."
            )

        # Structure the output as a RewrittenArticle object
        return {
            "title": f"[{persona_name} Rewritten] {article['title']}",
            "content_text": rewritten_text,
            "original_article": original_id,
            "persona": persona_name
        }

    except APIError as e:
        return _failure("API Error", f"API ERROR: Could not rewrite article. {e}")
    except Exception as e:
        return _failure("Error", f"An unexpected error occurred: {e}")

# Demonstration

In [6]:
# Demonstration: rewrite the first loaded article once per persona and print
# each rewritten version after a snippet of the original text.
if articles_df.empty:
    print("\nSkipping demonstration because no articles were loaded. Check Section 2.")
else:
    # Use the first article as the common source for every persona.
    article_to_rewrite = articles_df.iloc[0]

    print("\n=======================================================")
    print("  DEMONSTRATION START")
    print(f"  Source Article: {article_to_rewrite['title']}")
    print("=======================================================\n")

    # Run all four rewrites up front, in this fixed order, so the progress
    # lines appear together before any result is shown.
    demo_order = (
        "Gen Z Trailblazer",
        "Corporate Visionary",
        "Eco-Philosopher",
        "Cyberpunk Dreamer",
    )
    genz_result, corp_result, eco_result, cyber_result = (
        rewrite_article(article_to_rewrite, demo_persona) for demo_persona in demo_order
    )

    # Show where the rewrites started from.
    print("\n-------------------------------------------------------")
    print("  ORIGINAL TEXT SNIPPET (First 150 chars)")
    print("-------------------------------------------------------")
    print(f"{article_to_rewrite['body_text_html'][:150]}...\n")

    # One labelled section per persona, in the same order as the calls above.
    for rewritten in (genz_result, corp_result, eco_result, cyber_result):
        print(f"\n--- Output for Persona: {rewritten['persona']} ---")
        print(rewritten['content_text'])


  DEMONSTRATION START
  Source Article: Cybersecurity Starts With You: Lessons From Phishing, Ransomware, and Real-World Mistakes

-> Rewriting 'Cybersecurity Starts With You: Lessons From Phishi...' for persona: Gen Z Trailblazer...
-> Rewriting 'Cybersecurity Starts With You: Lessons From Phishi...' for persona: Corporate Visionary...
-> Rewriting 'Cybersecurity Starts With You: Lessons From Phishi...' for persona: Eco-Philosopher...
-> Rewriting 'Cybersecurity Starts With You: Lessons From Phishi...' for persona: Cyberpunk Dreamer...

-------------------------------------------------------
  ORIGINAL TEXT SNIPPET (First 150 chars)
-------------------------------------------------------
<p>This Cybersecurity Awareness Month, see how real-world phishing and ransomware attacks reveal why every employee plays a role in protection.</p>
<p...


--- Output for Persona: Gen Z Trailblazer ---
Okay, besties, spill the tea: it's Cybersecurity Awareness Month, and we need to talk. Ever heard o