In [None]:
!mkdir -p /content/NewsAgent/app
!mkdir -p /content/NewsAgent/agents


In [None]:
import os
os.environ['GEMINI_API_KEY'] = "AIzaSyCZELmN8Re-v2eMaVRHROKguyjEYpSU9hY"
os.environ['GCS_BUCKET'] = 'news-hub'
os.environ['GCS_PREFIX'] = 'news_data'


In [None]:
%cd /content/NewsAgent
#!chmod +x deploy.sh
#!./deploy.sh


/content/NewsAgent


In [None]:
%%writefile app/news_agent_main.py
"""
Fixed FastAPI Application with improved dashboard rendering
"""

from fastapi import FastAPI, HTTPException
from fastapi.responses import HTMLResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import os
import sys
import logging
from datetime import datetime
from html import escape

# Add paths
sys.path.append('/app')
sys.path.append('/app/agents')

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(
    title="Multi-Agent News Intelligence API",
    description="AI-powered trending news analysis",
    version="1.0.0"
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Initialize orchestrator
try:
    from news_orchestrator_main import MainNewsOrchestrator
    orchestrator = MainNewsOrchestrator(os.getenv("GEMINI_API_KEY"))
    logger.info("✅ Orchestrator initialized successfully")
except Exception as e:
    logger.error(f"❌ Failed to initialize orchestrator: {str(e)}")
    orchestrator = None

class QueryRequest(BaseModel):
    query: str
    max_results: int = 5

def safe_truncate(text, max_length=250):
    """Safely truncate text and add ellipsis"""
    if not text:
        return ""
    text = str(text).strip()
    if len(text) <= max_length:
        return text
    return text[:max_length].rsplit(' ', 1)[0] + "..."

def safe_html_escape(text):
    """Safely escape HTML characters"""
    if not text:
        return ""
    return escape(str(text))

@app.get("/", response_class=HTMLResponse)
async def dashboard():
    """Improved trending news dashboard with fixed rendering"""

    if not orchestrator:
        return """
        <html><body style="font-family: Arial, sans-serif; padding: 20px;">
        <h1>📰 Multi-Agent News Intelligence</h1>
        <div style="color: red; padding: 15px; border: 1px solid red; background: #ffe6e6;">
            ❌ System not available. Check configuration.
        </div>
        </body></html>
        """

    try:
        trending = orchestrator.get_trending_news()

        if not trending['success']:
            return """
            <html><body style="font-family: Arial, sans-serif; padding: 20px;">
            <h1>📰 Multi-Agent News Intelligence</h1>
            <div style="color: orange; padding: 15px; border: 1px solid orange; background: #fff3cd;">
                ⚠️ No trending news available. Run batch pipeline first.
            </div>
            </body></html>
            """

        # Calculate statistics
        total_india = sum(data.get('count', 0) for data in trending['data'].get('India', {}).values())
        total_global = sum(data.get('count', 0) for data in trending['data'].get('Global', {}).values())
        total_articles = total_india + total_global

        # Generate professional HTML dashboard
        html = f"""
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="UTF-8">
            <meta name="viewport" content="width=device-width, initial-scale=1.0">
            <title>Multi-Agent News Intelligence</title>
            <style>
                body {{
                    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
                    margin: 0; padding: 20px; background: #f5f7fa; line-height: 1.6;
                }}
                .header {{
                    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                    color: white; padding: 30px; border-radius: 12px; margin-bottom: 30px;
                    box-shadow: 0 4px 20px rgba(0,0,0,0.1);
                }}
                .header h1 {{ margin: 0 0 10px 0; font-size: 2.5em; }}
                .header p {{ margin: 5px 0; opacity: 0.9; }}
                .header small {{ opacity: 0.8; }}
                .stats {{
                    display: flex; gap: 20px; margin-bottom: 30px; flex-wrap: wrap;
                }}
                .stat-card {{
                    background: white; padding: 25px; border-radius: 10px;
                    box-shadow: 0 3px 15px rgba(0,0,0,0.08); flex: 1; min-width: 200px;
                    text-align: center; transition: transform 0.2s ease;
                }}
                .stat-card:hover {{ transform: translateY(-2px); }}
                .stat-card h3 {{ margin: 0 0 10px 0; color: #666; font-size: 1.1em; }}
                .stat-card h2 {{ margin: 0 0 5px 0; font-size: 2.5em; color: #333; }}
                .stat-card p {{ margin: 0; color: #888; }}
                .region {{
                    background: white; border-radius: 12px; padding: 30px;
                    margin: 25px 0; box-shadow: 0 4px 20px rgba(0,0,0,0.08);
                }}
                .region h2 {{
                    margin: 0 0 25px 0; color: #333; border-bottom: 3px solid #667eea;
                    padding-bottom: 10px; font-size: 1.8em;
                }}
                .category {{
                    margin: 25px 0; padding: 20px; background: #f8faff;
                    border-radius: 10px; border-left: 5px solid #667eea;
                }}
                .category h3 {{
                    margin: 0 0 15px 0; color: #333; font-size: 1.4em;
                    display: flex; align-items: center; gap: 10px;
                }}
                .category-count {{
                    background: #667eea; color: white; padding: 4px 12px;
                    border-radius: 20px; font-size: 0.9em; font-weight: normal;
                }}
                .ai-summary {{
                    background: #e8f4fd; border-left: 4px solid #2196f3;
                    padding: 15px; margin: 15px 0; border-radius: 5px; font-style: italic;
                }}
                .story {{
                    background: white; margin: 15px 0; padding: 20px;
                    border-radius: 8px; border-left: 4px solid #4caf50;
                    box-shadow: 0 2px 8px rgba(0,0,0,0.05);
                }}
                .story h4 {{
                    margin: 0 0 8px 0; color: #333; font-size: 1.1em; line-height: 1.4;
                }}
                .story-meta {{
                    color: #666; font-size: 0.9em; margin-bottom: 10px;
                    display: flex; align-items: center; gap: 15px;
                }}
                .story p {{
                    margin: 0; color: #555; line-height: 1.5;
                }}
                .api-section {{
                    background: linear-gradient(135deg, #e3f2fd 0%, #f3e5f5 100%);
                    border: 1px solid #2196f3; padding: 25px; border-radius: 10px;
                    margin: 40px 0 20px 0;
                }}
                .api-section h2 {{ margin: 0 0 15px 0; color: #1976d2; }}
                .api-section ul {{ margin: 10px 0; padding-left: 20px; }}
                .api-section li {{ margin: 8px 0; }}
                .api-section code {{
                    background: #f5f5f5; padding: 2px 6px; border-radius: 3px;
                    font-family: 'Courier New', monospace; color: #d32f2f;
                }}
                .footer {{
                    text-align: center; margin-top: 40px; padding: 20px;
                    color: #666; border-top: 1px solid #eee;
                }}
            </style>
        </head>
        <body>
            <div class="header">
                <h1>🔥 Multi-Agent News Intelligence Dashboard</h1>
                <p>AI-powered trending news analysis across India and Global markets</p>
                <small>Last updated: {safe_html_escape(trending.get('generation_time', 'Unknown'))}</small>
            </div>

            <div class="stats">
                <div class="stat-card">
                    <h3>🇮🇳 India News</h3>
                    <h2>{total_india}</h2>
                    <p>Trending articles</p>
                </div>
                <div class="stat-card">
                    <h3>🌍 Global News</h3>
                    <h2>{total_global}</h2>
                    <p>International stories</p>
                </div>
                <div class="stat-card">
                    <h3>📊 Total Coverage</h3>
                    <h2>{total_articles}</h2>
                    <p>Articles analyzed</p>
                </div>
            </div>
        """

        # Add trending news by region
        category_icons = {
            'sports': '⚽', 'politics': '🏛️', 'technology': '💻',
            'health': '🏥', 'crime': '🚨', 'entertainment': '🎬'
        }

        for region_name, region_data in trending['data'].items():
            if not region_data:
                continue

            region_icon = "🇮🇳" if region_name == "India" else "🌍"
            html += f"""
            <div class="region">
                <h2>{region_icon} {safe_html_escape(region_name)} Trending News</h2>
            """

            # Limit categories to prevent page being too long
            for category, category_data in list(region_data.items())[:6]:  # Max 6 categories
                icon = category_icons.get(category, '📰')

                html += f"""
                <div class="category">
                    <h3>
                        {icon} {safe_html_escape(category.title())}
                        <span class="category-count">{category_data.get('count', 0)} stories</span>
                    </h3>
                """

                # Add AI summary if available
                if 'ai_summary' in category_data:
                    html += f"""
                    <div class="ai-summary">
                        <strong>AI Summary:</strong> {safe_html_escape(safe_truncate(category_data['ai_summary'], 200))}
                    </div>
                    """

                # Add top stories (limit to 3)
                for i, story in enumerate(category_data.get('top_stories', [])[:3], 1):
                    title = safe_html_escape(safe_truncate(story.get('title', 'No title'), 120))
                    source = safe_html_escape(story.get('source', 'Unknown source'))
                    description = safe_html_escape(safe_truncate(story.get('description', 'No description'), 200))

                    html += f"""
                    <div class="story">
                        <h4>{i}. {title}</h4>
                        <div class="story-meta">
                            <span><strong>Source:</strong> {source}</span>
                        </div>
                        <p>{description}</p>
                    </div>
                    """

                html += "</div>"

            html += "</div>"

        # Add API documentation
        html += """
            <div class="api-section">
                <h2>🔗 API Endpoints</h2>
                <p><strong>Access your data programmatically:</strong></p>
                <ul>
                    <li><code>GET /api/trending</code> - Get all trending news</li>
                    <li><code>GET /api/trending/India</code> - Get India trending news</li>
                    <li><code>GET /api/trending/Global</code> - Get Global trending news</li>
                    <li><code>GET /api/trending/{region}/{category}</code> - Get specific category</li>
                    <li><code>POST /api/query</code> - Ask questions about news</li>
                    <li><code>GET /docs</code> - Interactive API documentation</li>
                </ul>
            </div>

            <div class="footer">
                <p>Powered by Multi-Agent AI • Deployed on Google Cloud Run</p>
            </div>
        </body>
        </html>
        """

        return html

    except Exception as e:
        logger.error(f"Dashboard error: {str(e)}")
        return f"""
        <html><body style="font-family: Arial, sans-serif; padding: 20px;">
        <h1>📰 Multi-Agent News Intelligence</h1>
        <div style="color: red; padding: 15px; border: 1px solid red; background: #ffe6e6;">
            ❌ Error loading dashboard: {safe_html_escape(str(e))}
        </div>
        </body></html>
        """

# Keep all other endpoints the same
@app.get("/api/trending")
async def get_all_trending():
    if not orchestrator:
        raise HTTPException(status_code=503, detail="Orchestrator not available")
    return orchestrator.get_trending_news()

@app.get("/api/trending/{region}")
async def get_region_trending(region: str):
    if region not in ['India', 'Global']:
        raise HTTPException(status_code=400, detail="Region must be 'India' or 'Global'")
    if not orchestrator:
        raise HTTPException(status_code=503, detail="Orchestrator not available")
    return orchestrator.get_trending_news(region=region)

@app.get("/api/trending/{region}/{category}")
async def get_category_trending(region: str, category: str):
    valid_categories = ["sports", "politics", "technology", "health", "crime", "entertainment"]
    if region not in ['India', 'Global']:
        raise HTTPException(status_code=400, detail="Region must be 'India' or 'Global'")
    if category not in valid_categories:
        raise HTTPException(status_code=400, detail=f"Category must be one of: {valid_categories}")
    if not orchestrator:
        raise HTTPException(status_code=503, detail="Orchestrator not available")
    return orchestrator.get_trending_news(region=region, category=category)

# Keep your existing POST endpoint
@app.post("/admin/run-trending-extraction")
async def run_trending_extraction_post():
    """Admin endpoint to trigger trending extraction via POST"""

    logger.info("🔥 Triggered trending extraction via POST")

    try:
        from trending_pipeline_fixed import trending_extractor_fixed

        # Run the extraction
        result = await trending_extractor_fixed.run_trending_extraction()

        if result:
            # Clear API cache to load new data
            if orchestrator:
                orchestrator.trending_cache = None

            return {
                "success": True,
                "message": "Trending extraction completed successfully",
                "timestamp": datetime.utcnow().isoformat(),
                "regions_processed": list(result.keys()) if isinstance(result, dict) else [],
                "triggered_by": "POST"
            }
        else:
            return {
                "success": False,
                "message": "Trending extraction failed - check logs",
                "timestamp": datetime.utcnow().isoformat()
            }

    except Exception as e:
        logger.error(f"❌ Trending extraction error: {str(e)}")
        return {
            "success": False,
            "message": f"Exception during extraction: {str(e)}",
            "timestamp": datetime.utcnow().isoformat()
        }

# ADD THIS NEW GET ENDPOINT
@app.get("/admin/run-trending-extraction")
async def run_trending_extraction_get():
    """Admin endpoint to trigger trending extraction via GET (for scheduler compatibility)"""

    logger.info("🔥 Triggered trending extraction via GET")

    try:
        from trending_pipeline_fixed import trending_extractor_fixed

        # Run the extraction
        result = await trending_extractor_fixed.run_trending_extraction()

        if result:
            # Clear API cache to load new data
            if orchestrator:
                orchestrator.trending_cache = None

            return {
                "success": True,
                "message": "Trending extraction completed successfully (via GET)",
                "timestamp": datetime.utcnow().isoformat(),
                "regions_processed": list(result.keys()) if isinstance(result, dict) else [],
                "triggered_by": "GET"
            }
        else:
            return {
                "success": False,
                "message": "Trending extraction failed - check logs",
                "timestamp": datetime.utcnow().isoformat()
            }

    except Exception as e:
        logger.error(f"❌ Trending extraction error: {str(e)}")
        return {
            "success": False,
            "message": f"Exception during extraction: {str(e)}",
            "timestamp": datetime.utcnow().isoformat()
        }

@app.post("/api/query")
async def process_query(request: QueryRequest):
    if not orchestrator:
        raise HTTPException(status_code=503, detail="Orchestrator not available")
    result = await orchestrator.answer_query(request.query, request.max_results)
    return result

@app.get("/health")
async def health_check():
    return {
        "status": "healthy",
        "service": "Multi-Agent News Intelligence",
        "timestamp": datetime.utcnow().isoformat(),
        "orchestrator_status": "available" if orchestrator else "unavailable"
    }

if __name__ == "__main__":
    import uvicorn
    port = int(os.getenv("PORT", 8080))
    uvicorn.run(app, host="0.0.0.0", port=port)


Overwriting app/main_fixed.py


In [None]:
%%writefile Dockerfile
FROM python:3.11-slim

WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
    gcc \
    g++ \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Set environment variables
ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1
ENV PORT=8080

# Copy requirements and install Python dependencies
COPY requirements.txt ./
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY ./app ./app
COPY ./agents ./agents
COPY *.py ./

# Create non-root user for security
RUN addgroup --gid 1001 --system appgroup && \
    adduser --system --gid 1001 --disabled-password --no-create-home --home /app appuser && \
    chown -R appuser:appgroup /app

USER appuser

# Health check
HEALTHCHECK --interval=30s --timeout=30s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:$PORT/health || exit 1

# Expose port
EXPOSE $PORT

# Run the application
CMD ["uvicorn", "app.main_fixed:app", "--host", "0.0.0.0", "--port", "8080"]

Writing Dockerfile


In [None]:
%%writefile requirements.txt
fastapi==0.104.1
uvicorn[standard]==0.24.0
pandas==2.0.3
numpy==1.24.3
google-cloud-storage==2.10.0
sentence-transformers==2.2.2
faiss-cpu==1.7.4
pyarrow==12.0.1
protobuf==3.20.3
google-generativeai==0.3.2
newspaper3k==0.2.8
beautifulsoup4==4.12.2
nltk==3.8.1
tf-keras==2.15.0
transformers==4.35.2
torch==2.1.0
huggingface_hub==0.25.2

Writing requirements.txt


In [5]:
%%writefile app/main.py
"""
FastAPI Application for Multi-Agent News Intelligence
Production-ready version
"""

from fastapi import FastAPI, HTTPException
from fastapi.responses import HTMLResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import os
import sys
import logging
from datetime import datetime

# Add paths
sys.path.append('/app')
sys.path.append('/app/agents')

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(
    title="Multi-Agent News Intelligence API",
    description="AI-powered trending news analysis across India and Global markets",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc"
)

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Initialize orchestrator
try:
    from news_orchestrator_main import MainNewsOrchestrator
    orchestrator = MainNewsOrchestrator(os.getenv("GEMINI_API_KEY"))
    logger.info("✅ Orchestrator initialized successfully")
except Exception as e:
    logger.error(f"❌ Failed to initialize orchestrator: {str(e)}")
    orchestrator = None

class QueryRequest(BaseModel):
    query: str
    max_results: int = 5

@app.get("/", response_class=HTMLResponse)
async def dashboard():
    """Professional trending news dashboard"""

    if not orchestrator:
        return """
        <html><body style="font-family: Arial, sans-serif; padding: 20px;">
        <h1>📰 Multi-Agent News Intelligence</h1>
        <div style="color: red; padding: 15px; border: 1px solid red; background: #ffe6e6;">
            ❌ System not available. Please check configuration.
        </div>
        </body></html>
        """

    try:
        trending = orchestrator.get_trending_news()

        if not trending['success']:
            return """
            <html><body style="font-family: Arial, sans-serif; padding: 20px;">
            <h1>📰 Multi-Agent News Intelligence</h1>
            <div style="color: orange; padding: 15px; border: 1px solid orange; background: #fff3cd;">
                ⚠️ No trending news available. Please run the batch pipeline first.
            </div>
            </body></html>
            """

        # Generate professional HTML dashboard
        html = f"""
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="UTF-8">
            <meta name="viewport" content="width=device-width, initial-scale=1.0">
            <title>Multi-Agent News Intelligence</title>
            <style>
                body {{
                    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
                    margin: 0; padding: 20px; background: #f5f7fa;
                }}
                .header {{
                    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                    color: white; padding: 30px; border-radius: 10px; margin-bottom: 30px;
                }}
                .stats {{
                    display: flex; gap: 20px; margin-bottom: 30px; flex-wrap: wrap;
                }}
                .stat-card {{
                    background: white; padding: 20px; border-radius: 8px;
                    box-shadow: 0 2px 10px rgba(0,0,0,0.1); flex: 1; min-width: 200px;
                }}
                .region {{
                    background: white; border-radius: 10px; padding: 25px;
                    margin: 20px 0; box-shadow: 0 4px 15px rgba(0,0,0,0.1);
                }}
                .category {{
                    margin: 20px 0; padding: 20px; background: #f8f9ff;
                    border-radius: 8px; border-left: 4px solid #667eea;
                }}
                .story {{
                    background: white; margin: 10px 0; padding: 15px;
                    border-radius: 6px; border-left: 3px solid #28a745;
                    box-shadow: 0 2px 5px rgba(0,0,0,0.05);
                }}
                .api-docs {{
                    background: #e3f2fd; border: 1px solid #2196f3;
                    padding: 20px; border-radius: 8px; margin: 30px 0;
                }}
            </style>
        </head>
        <body>
            <div class="header">
                <h1>🔥 Multi-Agent News Intelligence Dashboard</h1>
                <p>AI-powered trending news analysis across India and Global markets</p>
                <small>Last updated: {trending.get('generation_time', 'Unknown')}</small>
            </div>
        """

        # Add statistics
        total_india = sum(data['count'] for data in trending['data'].get('India', {}).values())
        total_global = sum(data['count'] for data in trending['data'].get('Global', {}).values())

        html += f"""
            <div class="stats">
                <div class="stat-card">
                    <h3>🇮🇳 India News</h3>
                    <h2>{total_india}</h2>
                    <p>Trending articles</p>
                </div>
                <div class="stat-card">
                    <h3>🌍 Global News</h3>
                    <h2>{total_global}</h2>
                    <p>International stories</p>
                </div>
                <div class="stat-card">
                    <h3>📊 Total Coverage</h3>
                    <h2>{total_india + total_global}</h2>
                    <p>Articles analyzed</p>
                </div>
            </div>
        """

        # Add trending news by region
        for region_name, region_data in trending['data'].items():
            if not region_data:
                continue

            region_icon = "🇮🇳" if region_name == "India" else "🌍"
            html += f"""
            <div class="region">
                <h2>{region_icon} {region_name} Trending News</h2>
            """

            for category, category_data in region_data.items():
                category_icons = {
                    'sports': '⚽', 'politics': '🏛️', 'technology': '💻',
                    'health': '🏥', 'crime': '🚨', 'entertainment': '🎬'
                }
                icon = category_icons.get(category, '📰')

                html += f"""
                <div class="category">
                    <h3>{icon} {category.title()}</h3>
                    <p><strong>{category_data['count']} stories</strong></p>
                """

                if 'ai_summary' in category_data:
                    html += f"<p><strong>AI Summary:</strong> {category_data['ai_summary']}</p>"

                # Add top stories
                for i, story in enumerate(category_data.get('top_stories', [])[:3], 1):
                    html += f"""
                    <div class="story">
                        <h4>{i}. {story['title']}</h4>
                        <p><strong>Source:</strong> {story['source']}</p>
                        <p>{story['description']}</p>
                    </div>
                    """

                html += "</div>"

            html += "</div>"

        # Add API documentation
        html += """
            <div class="api-docs">
                <h2>🔗 API Endpoints</h2>
                <p><strong>Access programmatically:</strong></p>
                <ul>
                    <li><code>GET /api/trending</code> - All trending news</li>
                    <li><code>GET /api/trending/India</code> - India trending news</li>
                    <li><code>GET /api/trending/Global</code> - Global trending news</li>
                    <li><code>GET /api/trending/{region}/{category}</code> - Specific category</li>
                    <li><code>POST /api/query</code> - Ask questions about news</li>
                    <li><code>GET /docs</code> - Interactive API documentation</li>
                </ul>
            </div>
        </body>
        </html>
        """

        return html

    except Exception as e:
        logger.error(f"Dashboard error: {str(e)}")
        return f"""
        <html><body style="font-family: Arial, sans-serif; padding: 20px;">
        <h1>📰 Multi-Agent News Intelligence</h1>
        <div style="color: red; padding: 15px; border: 1px solid red; background: #ffe6e6;">
            ❌ Error loading dashboard: {str(e)}
        </div>
        </body></html>
        """

@app.get("/api/trending")
async def get_all_trending():
    """Get all trending news - API endpoint"""
    if not orchestrator:
        raise HTTPException(status_code=503, detail="Orchestrator not available")

    try:
        result = orchestrator.get_trending_news()
        return result
    except Exception as e:
        logger.error(f"Trending API error: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/api/trending/{region}")
async def get_region_trending(region: str):
    """Get trending news for specific region"""
    if region not in ['India', 'Global']:
        raise HTTPException(status_code=400, detail="Region must be 'India' or 'Global'")

    if not orchestrator:
        raise HTTPException(status_code=503, detail="Orchestrator not available")

    try:
        result = orchestrator.get_trending_news(region=region)
        return result
    except Exception as e:
        logger.error(f"Region trending error: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/api/trending/{region}/{category}")
async def get_category_trending(region: str, category: str):
    """Get trending news for specific region and category"""
    valid_categories = ["sports", "politics", "technology", "health", "crime", "entertainment"]

    if region not in ['India', 'Global']:
        raise HTTPException(status_code=400, detail="Region must be 'India' or 'Global'")

    if category not in valid_categories:
        raise HTTPException(status_code=400, detail=f"Category must be one of: {valid_categories}")

    if not orchestrator:
        raise HTTPException(status_code=503, detail="Orchestrator not available")

    try:
        result = orchestrator.get_trending_news(region=region, category=category)
        return result
    except Exception as e:
        logger.error(f"Category trending error: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

@app.post("/api/query")
async def process_query(request: QueryRequest):
    """Process specific news queries"""
    if not orchestrator:
        raise HTTPException(status_code=503, detail="Orchestrator not available")

    try:
        result = await orchestrator.answer_query(request.query, request.max_results)
        return result
    except Exception as e:
        logger.error(f"Query processing error: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

@app.get("/health")
async def health_check():
    """Health check for Cloud Run"""
    return {
        "status": "healthy",
        "service": "Multi-Agent News Intelligence",
        "timestamp": datetime.utcnow().isoformat(),
        "orchestrator_status": "available" if orchestrator else "unavailable"
    }

@app.get("/api/status")
async def system_status():
    """Detailed system status"""
    if not orchestrator:
        return {"status": "error", "message": "Orchestrator not initialized"}

    try:
        trending = orchestrator.get_trending_news()

        return {
            "status": "operational",
            "trending_available": trending['success'],
            "regions": list(trending.get('data', {}).keys()),
            "last_update": trending.get('generation_time'),
            "total_categories": sum(len(region_data) for region_data in trending.get('data', {}).values())
        }
    except Exception as e:
        return {"status": "degraded", "error": str(e)}

if __name__ == "__main__":
    import uvicorn
    port = int(os.getenv("PORT", 8080))
    uvicorn.run(app, host="0.0.0.0", port=port)


Writing app/main.py


In [6]:
%%writefile news_orchestrator_main.py
"""
Main News Orchestrator: Trending News First Approach
"""

import os
import json
import pandas as pd
import faiss
import asyncio
from datetime import datetime, timedelta
from google.cloud import storage
from sentence_transformers import SentenceTransformer
import google.generativeai as genai

class MainNewsOrchestrator:
    """Main orchestrator focusing on trending news first"""

    def __init__(self, gemini_api_key=None):
        self.gcs_bucket = os.getenv("GCS_BUCKET", "news-hub")
        self.gemini_api_key = gemini_api_key

        # Initialize Gemini for queries
        if self.gemini_api_key:
            genai.configure(api_key=self.gemini_api_key)
            self.llm = genai.GenerativeModel('gemini-1.5-pro')
        else:
            self.llm = None

        # Initialize embedder for queries
        try:
            self.embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        except:
            self.embedder = None

        # Cache for data
        self.trending_cache = None
        self.full_dataset_cache = None

        print("🎯 Main News Orchestrator Ready!")

    def get_trending_news(self, region=None, category=None):
        """Get trending news (primary function)"""
        print("🔥 Fetching trending news...")

        # Load trending summary from cache or GCS
        trending_data = self._load_trending_summary()

        if not trending_data:
            return {
                "success": False,
                "message": "No trending news available. Please run trending extraction.",
                "data": {}
            }

        # Filter by region/category if specified
        if region and region in trending_data:
            if category and category in trending_data[region]:
                return {
                    "success": True,
                    "message": f"Trending {category} news in {region}",
                    "data": {region: {category: trending_data[region][category]}},
                    "generation_time": trending_data.get('generation_time')
                }
            else:
                return {
                    "success": True,
                    "message": f"All trending news in {region}",
                    "data": {region: trending_data[region]},
                    "generation_time": trending_data.get('generation_time')
                }

        # Return all trending news
        return {
            "success": True,
            "message": "All trending news",
            "data": {
                "India": trending_data.get('India', {}),
                "Global": trending_data.get('Global', {})
            },
            "generation_time": trending_data.get('generation_time')
        }

    def _load_trending_summary(self):
        """Load trending summary from GCS"""

        if self.trending_cache:
            return self.trending_cache

        try:
            # Try today and yesterday
            for days_back in range(2):
                date_obj = datetime.utcnow().date() - timedelta(days=days_back)
                date_str = date_obj.strftime("%Y-%m-%d")

                storage_client = storage.Client()
                bucket = storage_client.bucket(self.gcs_bucket)
                blob = bucket.blob(f"trending/{date_str}/summary.json")

                if blob.exists():
                    content = blob.download_as_text()
                    trending_data = json.loads(content)

                    # Cache it
                    self.trending_cache = trending_data
                    print(f"✅ Loaded trending data from {date_str}")
                    return trending_data

            return None

        except Exception as e:
            print(f"⚠️ Failed to load trending summary: {str(e)}")
            return None

    async def answer_query(self, user_query, max_results=5):
        """Answer specific queries using full dataset"""
        print(f"🔍 Processing query: '{user_query}'")

        if not self.embedder:
            return {
                "success": False,
                "message": "Query system not available",
                "query": user_query
            }

        # Load full dataset index
        success = self._load_full_dataset()

        if not success:
            return {
                "success": False,
                "message": "Full dataset not available for queries",
                "query": user_query
            }

        try:
            # Create query embedding
            query_embedding = self.embedder.encode([user_query], normalize_embeddings=True)
            query_embedding = query_embedding.astype('float32')

            # Search full dataset
            distances, indices = self.full_dataset_index.search(query_embedding, max_results)

            # Get relevant articles
            relevant_articles = []
            for dist, idx in zip(distances[0], indices[0]):
                if dist > 0.3 and idx < len(self.full_dataset_meta):
                    article = self.full_dataset_meta.iloc[idx].to_dict()
                    article['relevance_score'] = float(dist)
                    relevant_articles.append(article)

            if not relevant_articles:
                return {
                    "success": True,
                    "message": f"No relevant articles found for '{user_query}'",
                    "query": user_query,
                    "articles": []
                }

            # Generate AI summary if available
            if self.llm:
                summary = await self._generate_query_summary(user_query, relevant_articles)
            else:
                summary = self._generate_basic_query_summary(user_query, relevant_articles)

            return {
                "success": True,
                "message": summary,
                "query": user_query,
                "articles": relevant_articles[:3],
                "total_found": len(relevant_articles)
            }

        except Exception as e:
            return {
                "success": False,
                "message": f"Query processing failed: {str(e)}",
                "query": user_query
            }

    def _load_full_dataset(self):
        """Load full dataset index for queries"""

        if hasattr(self, 'full_dataset_index'):
            return True

        try:
            today_str = datetime.utcnow().date().strftime("%Y-%m-%d")

            storage_client = storage.Client()
            bucket = storage_client.bucket(self.gcs_bucket)

            # Download index and metadata
            index_blob = bucket.blob(f"full_dataset/{today_str}/index.faiss")
            meta_blob = bucket.blob(f"full_dataset/{today_str}/metadata.parquet")

            if index_blob.exists() and meta_blob.exists():
                # Download files
                local_index = f"/tmp/query_index.faiss"
                local_meta = f"/tmp/query_meta.parquet"

                index_blob.download_to_filename(local_index)
                meta_blob.download_to_filename(local_meta)

                # Load into memory
                self.full_dataset_index = faiss.read_index(local_index)
                self.full_dataset_meta = pd.read_parquet(local_meta)

                print(f"✅ Loaded full dataset: {len(self.full_dataset_meta)} articles")
                return True

            return False

        except Exception as e:
            print(f"⚠️ Failed to load full dataset: {str(e)}")
            return False

    async def _generate_query_summary(self, query, articles):
        """Generate AI summary for query results"""

        # Prepare article summaries
        article_texts = []
        for article in articles[:3]:
            title = article.get('title', 'No title')
            source = article.get('source', 'Unknown')
            desc = article.get('description', '')[:100]

            article_texts.append(f"- {title} ({source}): {desc}")

        newline = '\n'
        prompt = f"""Answer this news query based on the provided articles: "{query}"

Relevant articles:
{newline.join(article_texts)}

Provide a concise, informative answer that directly addresses the user's question using information from these articles. Keep it under 150 words."""

        try:
            response = await asyncio.to_thread(self.llm.generate_content, prompt)
            return response.text.strip()
        except Exception as e:
            return self._generate_basic_query_summary(query, articles)

    def _generate_basic_query_summary(self, query, articles):
        """Generate basic summary without AI"""

        if not articles:
            return f"No relevant articles found for '{query}'"

        top_article = articles[0]
        return f"Found {len(articles)} articles related to '{query}'. Top result: '{top_article.get('title', 'No title')}' from {top_article.get('source', 'Unknown source')}."

# Initialize main orchestrator
main_orchestrator = MainNewsOrchestrator


Overwriting news_orchestrator_main.py


In [7]:
%%writefile trending_pipeline.py
"""
Fixed Trending Pipeline: Better region detection for Global vs India
"""

import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from pathlib import Path
from google.cloud import storage
from transformers import pipeline
import json
import asyncio
import google.generativeai as genai

class TrendingNewsExtractor:
    """Extract trending news with improved region detection"""

    def __init__(self):
        self.gcs_bucket = os.getenv("GCS_BUCKET", "news-hub")
        self.gcs_prefix = os.getenv("GCS_PREFIX", "news_data")
        self.categories = ["sports", "politics", "technology", "health", "crime", "entertainment"]

        # Initialize Gemini
        self.gemini_api_key = os.getenv("GEMINI_API_KEY")
        if self.gemini_api_key:
            genai.configure(api_key=self.gemini_api_key)
            self.llm = genai.GenerativeModel('gemini-1.5-pro')
            print("🤖 Gemini enabled for trending analysis")
        else:
            self.llm = None

        # Load classifier
        try:
            self.classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=-1)
            print("✅ Classification model loaded")
        except Exception as e:
            print(f"⚠️ Classifier failed: {str(e)}")
            self.classifier = None

    def get_latest_24hr_data(self):
        """Download latest 24hr news data"""
        today = datetime.utcnow().date()
        yesterday = today - timedelta(days=1)

        for date_to_try in [today, yesterday]:
            date_str = date_to_try.strftime("%Y-%m-%d")

            try:
                storage_client = storage.Client()
                bucket = storage_client.bucket(self.gcs_bucket)

                for filename in [f"news_with_content_{date_str}.csv", f"data_{date_str}.csv"]:
                    blob_name = f"{self.gcs_prefix}/{filename}"
                    blob = bucket.blob(blob_name)

                    if blob.exists():
                        local_path = f"/tmp/{filename}"
                        blob.download_to_filename(local_path)
                        print(f"✅ Downloaded 24hr data: {blob_name}")
                        return local_path, date_str
            except Exception as e:
                continue

        return None, None

    def assign_region_smart(self, df):
        """Improved region assignment logic"""
        print("🌍 Assigning regions with improved logic...")

        def detect_region(row):
            """Detect if article is India-focused or Global"""

            # Check country field first
            if 'country' in row and pd.notna(row['country']):
                country = str(row['country']).strip().upper()
                if country in ['IN', 'IND', 'INDIA']:
                    return 'India'

            # Check source field for Indian sources
            if 'source' in row and pd.notna(row['source']):
                source = str(row['source']).lower()
                indian_sources = [
                    'times of india', 'toi', 'hindustan times', 'indian express',
                    'ndtv', 'zee news', 'aaj tak', 'india today', 'news18',
                    'firstpost', 'livemint', 'economic times', 'dna india',
                    'deccan herald', 'the hindu', 'outlook india'
                ]

                if any(indian_source in source for indian_source in indian_sources):
                    return 'India'

            # Check title and description for India-related keywords
            text_fields = []
            for field in ['title', 'description', 'content']:
                if field in row and pd.notna(row[field]):
                    text_fields.append(str(row[field]).lower())

            combined_text = ' '.join(text_fields)

            # Strong India indicators
            india_keywords = [
                'india', 'indian', 'delhi', 'mumbai', 'bangalore', 'chennai',
                'kolkata', 'hyderabad', 'pune', 'modi', 'bjp', 'congress',
                'rupee', 'bollywood', 'ipl', 'bcci'
            ]

            # Count India-related mentions
            india_score = sum(1 for keyword in india_keywords if keyword in combined_text)

            # If significant India mentions, classify as India
            if india_score >= 2:
                return 'India'

            # Check for global indicators
            global_keywords = [
                'usa', 'america', 'uk', 'britain', 'china', 'europe', 'russia',
                'ukraine', 'nato', 'un ', 'world', 'international', 'global'
            ]

            global_score = sum(1 for keyword in global_keywords if keyword in combined_text)

            # Default classification logic
            if global_score > india_score:
                return 'Global'
            elif india_score > 0:
                return 'India'
            else:
                return 'Global'  # Default to Global for unclear cases

        # Apply region detection
        df['region'] = df.apply(detect_region, axis=1)

        # Show distribution
        region_counts = df['region'].value_counts()
        print(f"📊 Region distribution:")
        for region, count in region_counts.items():
            print(f"  {region}: {count} articles")

        return df

    def categorize_articles(self, df):
        """Categorize articles using local model"""
        print(f"🔄 Categorizing {len(df)} articles...")

        categories = []
        for _, row in df.iterrows():
            title = str(row.get('title', ''))
            description = str(row.get('description', ''))
            content = str(row.get('content', ''))[:200]

            category = self._categorize_single_article(title, description, content)
            categories.append(category)

        df['pred_category'] = categories

        # Filter to target categories only
        df_filtered = df[df['pred_category'].isin(self.categories)]

        print(f"✅ Filtered to {len(df_filtered)} articles in target categories")
        return df_filtered

    def _categorize_single_article(self, title, description, content):
        """Categorize single article"""
        if not self.classifier:
            return self._keyword_categorize(f"{title} {description} {content}")

        try:
            text = f"{title}. {description}. {content}".strip()
            if len(text) < 10:
                return 'general'

            result = self.classifier(text, self.categories)

            if result and 'labels' in result and 'scores' in result:
                best_category = result['labels'][0].lower()
                confidence = result['scores'][0]

                if confidence > 0.4 and best_category in self.categories:
                    return best_category

            return self._keyword_categorize(text)

        except Exception:
            return self._keyword_categorize(f"{title} {description}")

    def _keyword_categorize(self, text):
        """Keyword-based categorization fallback"""
        text_lower = str(text).lower()

        keywords = {
            'sports': ['sport', 'game', 'match', 'player', 'team', 'football', 'cricket', 'tennis'],
            'politics': ['politic', 'government', 'election', 'minister', 'parliament', 'vote'],
            'technology': ['tech', 'ai', 'software', 'computer', 'digital', 'app', 'startup'],
            'health': ['health', 'medical', 'doctor', 'hospital', 'covid', 'vaccine', 'medicine'],
            'crime': ['crime', 'police', 'arrest', 'court', 'murder', 'theft', 'investigation'],
            'entertainment': ['movie', 'film', 'music', 'celebrity', 'actor', 'entertainment']
        }

        scores = {}
        for category, words in keywords.items():
            scores[category] = sum(1 for word in words if word in text_lower)

        return max(scores.items(), key=lambda x: x[1])[0] if max(scores.values()) > 0 else 'general'

    def extract_trending_by_category_region(self, df):
        """Extract trending news by category and region"""
        print("📈 Extracting trending news...")

        trending_summary = {
            'India': {},
            'Global': {},
            'generation_time': datetime.utcnow().isoformat(),
            'data_date': datetime.utcnow().date().strftime("%Y-%m-%d")
        }

        # Process each region
        for region in ['India', 'Global']:
            region_df = df[df['region'] == region]

            print(f"📍 Processing {region}: {len(region_df)} articles")

            if len(region_df) == 0:
                print(f"⚠️ No articles found for {region} region")
                continue

            # Process each category
            for category in self.categories:
                category_df = region_df[region_df['pred_category'] == category]

                if len(category_df) == 0:
                    continue

                trending_stories = self._extract_category_trends(category_df, category, region)

                if trending_stories:
                    trending_summary[region][category] = trending_stories

        return trending_summary

    def _extract_category_trends(self, category_df, category, region):
        """Extract trending stories for a specific category"""

        # Sort by relevance/recency
        category_df = category_df.copy()

        # Simple trending score
        category_df['trending_score'] = (
            category_df['source'].notna().astype(int) * 0.3 +
            category_df['description'].str.len().fillna(0) / 100 * 0.4 +
            np.random.random(len(category_df)) * 0.3
        )

        # Get top 5 trending stories
        top_stories = category_df.nlargest(5, 'trending_score')

        trending_stories = {
            'count': len(category_df),
            'top_stories': [],
            'summary': f"Found {len(category_df)} {category} stories from {region}"
        }

        for _, story in top_stories.iterrows():
            story_data = {
                'title': str(story.get('title', 'No title')),
                'source': str(story.get('source', 'Unknown')),
                'description': str(story.get('description', ''))[:200] + "...",
                'trending_score': float(story.get('trending_score', 0)),
                'region': story.get('region', region)
            }
            trending_stories['top_stories'].append(story_data)

        return trending_stories

    async def generate_ai_summaries(self, trending_summary):
        """Generate AI summaries for trending topics"""
        if not self.llm:
            return trending_summary

        print("🤖 Generating AI summaries for trending topics...")

        for region in ['India', 'Global']:
            if region not in trending_summary:
                continue

            for category in trending_summary[region].keys():
                try:
                    category_data = trending_summary[region][category]

                    stories_text = []
                    for story in category_data['top_stories'][:3]:
                        stories_text.append(f"- {story['title']} ({story['source']})")

                    if stories_text:
                        newline = '\n'
                        prompt = f"""Create a brief trending news summary for {category} news in {region}.

Top stories:
{newline.join(stories_text)}

Write a 2-3 sentence summary highlighting the key trends and developments. Be concise and informative."""

                        response = await asyncio.to_thread(self.llm.generate_content, prompt)

                        if response and response.text:
                            trending_summary[region][category]['ai_summary'] = response.text.strip()

                except Exception as e:
                    print(f"⚠️ AI summary failed for {region}-{category}: {str(e)}")
                    continue

        return trending_summary

    def save_trending_summary(self, trending_summary, date_str):
        """Save trending summary to GCS"""
        print("💾 Saving trending summary...")

        try:
            local_path = f"/tmp/trending_summary_{date_str}.json"
            with open(local_path, 'w') as f:
                json.dump(trending_summary, f, indent=2, ensure_ascii=False)

            storage_client = storage.Client()
            bucket = storage_client.bucket(self.gcs_bucket)

            blob_name = f"trending/{date_str}/summary.json"
            blob = bucket.blob(blob_name)
            blob.upload_from_filename(local_path)

            print(f"✅ Trending summary saved: gs://{self.gcs_bucket}/{blob_name}")
            return True

        except Exception as e:
            print(f"❌ Failed to save trending summary: {str(e)}")
            return False

    async def run_trending_extraction(self):
        """Main trending extraction pipeline"""
        print("🔥 Starting FIXED Trending News Extraction")
        print("=" * 50)

        try:
            # Step 1: Get latest 24hr data
            data_path, date_str = self.get_latest_24hr_data()

            if not data_path:
                print("❌ No 24hr data available")
                return False

            # Step 2: Load and assign regions smartly
            df = pd.read_csv(data_path)
            print(f"✅ Loaded {len(df)} articles from last 24hrs")

            df = self.assign_region_smart(df)

            # Step 3: Categorize articles
            df_categorized = self.categorize_articles(df)

            # Step 4: Extract trending by category/region
            trending_summary = self.extract_trending_by_category_region(df_categorized)

            # Step 5: Generate AI summaries
            trending_summary = await self.generate_ai_summaries(trending_summary)

            # Step 6: Save trending summary
            success = self.save_trending_summary(trending_summary, date_str)

            if success:
                print("\n🎉 FIXED TRENDING EXTRACTION COMPLETE!")
                print("=" * 40)

                # Show detailed summary
                for region in ['India', 'Global']:
                    if region in trending_summary and trending_summary[region]:
                        print(f"\n📍 {region}:")
                        for category, data in trending_summary[region].items():
                            print(f"  ✅ {category}: {data['count']} stories")
                    else:
                        print(f"\n📍 {region}: No trending stories found")

                return trending_summary
            else:
                return False

        except Exception as e:
            print(f"❌ Trending extraction failed: {str(e)}")
            return False

# Initialize fixed trending extractor
trending_extractor_fixed = TrendingNewsExtractor()


Overwriting trending_pipeline_fixed.py


In [8]:
%%writefile agents/base_agent.py
import os
import json
import asyncio
from abc import ABC, abstractmethod
from typing import Dict, Any, List
import google.generativeai as genai
from dataclasses import dataclass
from datetime import datetime

@dataclass
class AgentResponse:
    success: bool
    data: Dict[Any, Any]
    message: str
    agent_name: str
    timestamp: str

class BaseAgent(ABC):
    def __init__(self, name: str):
        self.name = name
        self.gemini_api_key = os.getenv("GEMINI_API_KEY")

        if self.gemini_api_key:
            genai.configure(api_key=self.gemini_api_key)
            self.llm = genai.GenerativeModel('gemini-1.5-pro')
            self.log_activity("✅ Agent initialized successfully")
        else:
            self.llm = None
            self.log_activity("⚠️ No Gemini API key")

    @abstractmethod
    async def execute(self, task: Dict[str, Any]) -> AgentResponse:
        pass

    async def _generate_content(self, prompt: str) -> str:
        """Fixed Gemini API call - using asyncio.to_thread"""
        if not self.llm:
            return "Error: Gemini API not available"

        try:
            self.log_activity("🤖 Calling Gemini API in thread...")
            response = await asyncio.to_thread(self.llm.generate_content, prompt)
            self.log_activity("✅ Gemini API call successful")
            return response.text
        except Exception as e:
            error_msg = f"Gemini API error: {str(e)}"
            self.log_activity(f"❌ {error_msg}")
            return error_msg

    def log_activity(self, message: str):
        timestamp = datetime.now().strftime("%H:%M:%S")
        print(f"[{timestamp}] [{self.name}] {message}")

    def create_response(self, success: bool, data: Dict[Any, Any], message: str) -> AgentResponse:
        return AgentResponse(
            success=success,
            data=data,
            message=message,
            agent_name=self.name,
            timestamp=datetime.now().isoformat()
        )


Overwriting agents/base_agent.py


In [9]:
%%writefile agents/search_agent.py
import pandas as pd
import numpy as np
import faiss
import os
from datetime import datetime, timedelta
from typing import Dict, Any, List
from .base_agent import BaseAgent, AgentResponse

class SearchAgent(BaseAgent):
    def __init__(self):
        super().__init__("SearchAgent")
        self.local_index = None
        self.global_index = None
        self.local_meta = None
        self.global_meta = None
        self._load_indices()

    def _load_indices(self):
        self.log_activity("🔍 Searching for news indices...")

        for days_back in range(10):
            date_str = (datetime.today() - timedelta(days=days_back)).strftime("%Y-%m-%d")

            try:
                for scope in ['local', 'global']:
                    faiss_path = f"/content/NewsAgent/artifacts/{date_str}/{scope}.faiss"
                    meta_path = f"/content/NewsAgent/artifacts/{date_str}/{scope}_meta.parquet"

                    if os.path.exists(faiss_path) and os.path.exists(meta_path):
                        if scope == 'local':
                            self.local_index = faiss.read_index(faiss_path)
                            self.local_meta = pd.read_parquet(meta_path)
                        else:
                            self.global_index = faiss.read_index(faiss_path)
                            self.global_meta = pd.read_parquet(meta_path)

                        self.log_activity(f"✅ Loaded {scope} data from {date_str}")

            except Exception as e:
                continue

        if not self.local_index and not self.global_index:
            self.log_activity("❌ No news indices found! Run data pipeline first.")

    async def execute(self, task: Dict[str, Any]) -> AgentResponse:
        query = task.get('query', '').strip()
        scope = task.get('scope', 'both').lower()
        top_k = task.get('top_k', 5)

        if not query:
            return self.create_response(
                success=False,
                data={},
                message="❌ No search query provided"
            )

        self.log_activity(f"🔍 Searching for: '{query}' (scope: {scope}, top_k: {top_k})")

        try:
            enhanced_query = await self._enhance_query(query)
            results = self._mock_search_results(enhanced_query, scope, top_k)

            return self.create_response(
                success=True,
                data={
                    "articles": results,
                    "original_query": query,
                    "enhanced_query": enhanced_query,
                    "search_scope": scope,
                    "total_results": len(results)
                },
                message=f"✅ Found {len(results)} relevant articles"
            )

        except Exception as e:
            return self.create_response(
                success=False,
                data={"error_details": str(e)},
                message=f"❌ Search failed: {str(e)}"
            )

    async def _enhance_query(self, query: str) -> str:
        prompt = f"""Enhance this search query for better news search: "{query}"
        Add relevant keywords and synonyms. Return only the enhanced query."""

        try:
            enhanced = await self._generate_content(prompt)
            return enhanced.strip() if enhanced else query
        except:
            return query

    def _mock_search_results(self, query: str, scope: str, top_k: int) -> List[Dict]:
        # Mock results when no real data available
        mock_articles = [
            {
                "title": f"Sample {scope} news article about {query}",
                "description": f"This is a sample article related to {query}",
                "source": "Sample News",
                "relevance_score": 0.95,
                "source_type": scope
            }
        ]
        return mock_articles[:top_k]


Overwriting agents/search_agent.py


In [10]:
%%writefile agents/analysis_agent.py
import pandas as pd
import json
from datetime import datetime, timedelta
from collections import Counter
from typing import Dict, Any, List
from .base_agent import BaseAgent, AgentResponse

class AnalysisAgent(BaseAgent):
    def __init__(self):
        super().__init__("AnalysisAgent")
        self.log_activity("🔍 Analysis Agent initialized and ready")

    async def execute(self, task: Dict[str, Any]) -> AgentResponse:
        analysis_type = task.get('type', 'trending').lower()
        days_back = task.get('days', 3)
        category = task.get('category', 'all').lower()
        limit = task.get('limit', 10)

        self.log_activity(f"📊 Starting {analysis_type} analysis")

        try:
            if analysis_type == 'trending':
                result = await self._analyze_trending(days_back, category, limit)
            elif analysis_type == 'sentiment':
                result = await self._analyze_sentiment(days_back, category)
            elif analysis_type == 'sources':
                result = await self._analyze_sources(days_back, category)
            else:
                result = await self._analyze_trending(days_back, category, limit)

            return self.create_response(
                success=True,
                data=result,
                message=f"✅ {analysis_type.title()} analysis completed"
            )

        except Exception as e:
            return self.create_response(
                success=False,
                data={"error_details": str(e)},
                message=f"❌ Analysis failed: {str(e)}"
            )

    async def _analyze_trending(self, days_back: int, category: str, limit: int) -> Dict:
        self.log_activity(f"🔥 Analyzing trending topics for last {days_back} days")

        newline = '\n'
        prompt = f"""Analyze trending topics for {category} news over {days_back} days.
        Return JSON format:
        {{
            "trending_topics": [
                {{"topic": "AI Technology", "frequency": "high", "relevance": "high"}},
                {{"topic": "Sports Updates", "frequency": "medium", "relevance": "medium"}}
            ],
            "key_insights": ["insight1", "insight2"],
            "time_analysis": "summary of temporal patterns"
        }}"""

        try:
            analysis = await self._generate_content(prompt)
            try:
                result = json.loads(analysis)
            except:
                result = {"trending_analysis": analysis}

            result["analysis_metadata"] = {
                "total_articles": "mock_data",
                "time_range": f"{days_back} days",
                "category_filter": category
            }

            return result

        except Exception as e:
            return {
                "error": str(e),
                "mock_trending": [
                    {"topic": "Technology", "frequency": "high"},
                    {"topic": "Sports", "frequency": "medium"}
                ]
            }

    async def _analyze_sentiment(self, days_back: int, category: str) -> Dict:
        self.log_activity(f"😊 Analyzing sentiment for last {days_back} days")

        prompt = f"""Analyze sentiment of {category} news over {days_back} days.
        Return JSON: {{"overall_sentiment": "positive/negative/neutral",
                     "sentiment_distribution": {{"positive": 0.4, "negative": 0.3, "neutral": 0.3}}}}"""

        try:
            analysis = await self._generate_content(prompt)
            try:
                result = json.loads(analysis)
            except:
                result = {"sentiment_analysis": analysis}

            return result
        except:
            return {"mock_sentiment": "neutral", "confidence": "medium"}

    async def _analyze_sources(self, days_back: int, category: str) -> Dict:
        self.log_activity(f"📰 Analyzing sources for last {days_back} days")

        return {
            "source_statistics": {
                "total_sources": 50,
                "credible_sources": 35,
                "diversity_score": 0.8
            },
            "top_sources": {
                "BBC": 25,
                "Reuters": 20,
                "Times of India": 15
            }
        }


Overwriting agents/analysis_agent.py


In [11]:
%%writefile agents/source_agent.py
import json
from urllib.parse import urlparse
from typing import Dict, Any, List
from .base_agent import BaseAgent, AgentResponse

class SourceAgent(BaseAgent):
    def __init__(self):
        super().__init__("SourceAgent")
        self.credible_sources = {
            'high_credibility': ['reuters', 'bbc', 'associated press', 'bloomberg'],
            'medium_credibility': ['cnn', 'times of india', 'hindustan times'],
            'questionable': ['unknown', 'blog', 'unverified']
        }
        self.log_activity("📰 Source credibility database loaded")

    async def execute(self, task: Dict[str, Any]) -> AgentResponse:
        source_url = task.get('url', '').strip()
        source_name = task.get('source', '').strip()
        analysis_type = task.get('analysis_type', 'comprehensive').lower()

        if not source_url and not source_name:
            return self.create_response(
                success=False,
                data={},
                message="❌ Either source URL or name required"
            )

        self.log_activity(f"🔍 Analyzing source: {source_name or source_url}")

        try:
            if analysis_type == 'credibility':
                result = await self._analyze_credibility(source_url, source_name)
            else:
                result = await self._comprehensive_analysis(source_url, source_name)

            return self.create_response(
                success=True,
                data=result,
                message=f"✅ Source analysis completed"
            )

        except Exception as e:
            return self.create_response(
                success=False,
                data={"error_details": str(e)},
                message=f"❌ Analysis failed: {str(e)}"
            )

    async def _analyze_credibility(self, url: str, source: str) -> Dict:
        database_score, level = self._check_source_database(source)

        prompt = f"""Analyze credibility of news source: {source} (URL: {url})
        Return JSON: {{"credibility_score": 0.0-1.0, "credibility_level": "high/medium/low",
                     "trust_indicators": ["indicator1"], "red_flags": ["flag1"]}}"""

        try:
            analysis = await self._generate_content(prompt)
            try:
                ai_result = json.loads(analysis)
            except:
                ai_result = {"credibility_analysis": analysis}
        except:
            ai_result = {}

        return {
            "credibility_assessment": {
                "database_score": database_score,
                "database_level": level,
                **ai_result
            },
            "source_metadata": {"source_name": source, "source_url": url}
        }

    async def _comprehensive_analysis(self, url: str, source: str) -> Dict:
        credibility = await self._analyze_credibility(url, source)

        return {
            "comprehensive_analysis": {
                "credibility": credibility,
                "overall_rating": "good" if credibility["credibility_assessment"]["database_score"] > 0.7 else "fair"
            }
        }

    def _check_source_database(self, source: str) -> tuple:
        if not source:
            return 0.5, "unknown"

        source_lower = source.lower()

        for level, sources in self.credible_sources.items():
            for known_source in sources:
                if known_source in source_lower:
                    if level == 'high_credibility':
                        return 0.9, 'high'
                    elif level == 'medium_credibility':
                        return 0.7, 'medium'
                    else:
                        return 0.3, 'questionable'

        return 0.5, "unknown"


Overwriting agents/source_agent.py


In [12]:
%%writefile deploy.sh
#!/bin/bash

# Configuration
PROJECT_ID="news-471020"  # Replace with your project ID
SERVICE_NAME="news-intelligence-api"
REGION="us-central1"
IMAGE_NAME="gcr.io/${PROJECT_ID}/${SERVICE_NAME}"

echo "🚀 Deploying Multi-Agent News Intelligence to Cloud Run"
echo "=================================================="

# Set active project
gcloud config set project $PROJECT_ID

# Enable required APIs
echo "📋 Enabling required APIs..."
gcloud services enable cloudbuild.googleapis.com
gcloud services enable run.googleapis.com
gcloud services enable containerregistry.googleapis.com

# Build Docker image
echo "🔨 Building Docker image..."
gcloud builds submit --tag $IMAGE_NAME .

# Deploy to Cloud Run
echo "🚀 Deploying to Cloud Run..."
gcloud run deploy $SERVICE_NAME \
  --image $IMAGE_NAME \
  --region $REGION \
  --platform managed \
  --allow-unauthenticated \
  --memory 4Gi \
  --cpu 2 \
  --timeout 900 \
  --concurrency 100 \
  --min-instances 1 \
  --max-instances 10 \
  --set-env-vars GEMINI_API_KEY=$GEMINI_API_KEY,GCS_BUCKET=$GCS_BUCKET,GCS_PREFIX=$GCS_PREFIX

# Get service URL
SERVICE_URL=$(gcloud run services describe $SERVICE_NAME --region=$REGION --format='value(status.url)')

echo ""
echo "✅ Deployment Complete!"
echo "=================================================="
echo "🌐 Service URL: $SERVICE_URL"
echo "📖 API Docs: $SERVICE_URL/docs"
echo "📊 Dashboard: $SERVICE_URL"
echo "💚 Health Check: $SERVICE_URL/health"


Overwriting deploy.sh


In [13]:
%%writefile setup_env.sh
#!/bin/bash

echo "🔧 Setting up environment variables"
echo "=================================="

# Replace these with your actual values
export PROJECT_ID="news-471020"
export GCS_BUCKET="news-hub"
export GCS_PREFIX="news_data"
export GEMINI_API_KEY="AIzaSyCZELmN8Re-v2eMaVRHROKguyjEYpSU9hY"
export REGION="us-central1"

# Authenticate and set project
gcloud auth login
gcloud config set project $PROJECT_ID

# Make deployment script executable
chmod +x deploy.sh

echo "✅ Environment configured!"
echo "🚀 Run './deploy.sh' to deploy your service"


Overwriting setup_env.sh


In [None]:
!chmod +x deploy.sh
!./deploy.sh

🚀 Deploying Multi-Agent News Intelligence to Cloud Run
Updated property [core/project].
📋 Enabling required APIs...
🔨 Building Docker image...
Creating temporary archive of 11 file(s) totalling 53.6 KiB before compression.
Uploading tarball of [.] to [gs://news-471020_cloudbuild/source/1757523977.140903-b34a58f5ced948a0ab254a272def9c63.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/news-471020/locations/global/builds/5298720b-0c06-4cf2-a1f0-3d35a6ba5794].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds/5298720b-0c06-4cf2-a1f0-3d35a6ba5794?project=1062778057195 ].
Waiting for build to complete. Polling interval: 1 second(s).
 REMOTE BUILD OUTPUT
starting build "5298720b-0c06-4cf2-a1f0-3d35a6ba5794"

FETCHSOURCE
Fetching storage object: gs://news-471020_cloudbuild/source/1757523977.140903-b34a58f5ced948a0ab254a272def9c63.tgz#1757523977427006
Copying gs://news-471020_cloudbuild/source/1757523977.140903-b34a58f5ced948a0ab254a272def9c63.tgz#175752397

In [None]:
%cd /content/NewsAgent
!gcloud builds submit --tag gcr.io/news-471020/news-intelligence-api .
!gcloud run deploy news-intelligence-api \
  --image gcr.io/news-471020/news-intelligence-api \
  --region us-central1 \
  --platform managed \
  --allow-unauthenticated \
  --set-env-vars GEMINI_API_KEY=$GEMINI_API_KEY,GCS_BUCKET=news-hub,GCS_PREFIX=news_data


/content/NewsAgent
Creating temporary archive of 11 file(s) totalling 57.1 KiB before compression.
Uploading tarball of [.] to [gs://news-471020_cloudbuild/source/1757846637.390885-1b2c1e37167c4e67ab4c6091d6d6ad63.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/news-471020/locations/global/builds/cdb12178-9a23-43e4-ba97-97856555a890].
Logs are available at [ https://console.cloud.google.com/cloud-build/builds/cdb12178-9a23-43e4-ba97-97856555a890?project=1062778057195 ].
Waiting for build to complete. Polling interval: 1 second(s).
 REMOTE BUILD OUTPUT
starting build "cdb12178-9a23-43e4-ba97-97856555a890"

FETCHSOURCE
Fetching storage object: gs://news-471020_cloudbuild/source/1757846637.390885-1b2c1e37167c4e67ab4c6091d6d6ad63.tgz#1757846637672942
Copying gs://news-471020_cloudbuild/source/1757846637.390885-1b2c1e37167c4e67ab4c6091d6d6ad63.tgz#1757846637672942...
/ [1 files][ 15.0 KiB/ 15.0 KiB]                                                
Operation completed over 1 objec

In [None]:
# Run this in your Colab notebook to generate trending data immediately
from trending_pipeline_fixed import trending_extractor_fixed
import asyncio
import os

# Set environment variables
os.environ['GEMINI_API_KEY'] = 'AIzaSyCZELmN8Re-v2eMaVRHROKguyjEYpSU9hY'  # Your API key from logs
os.environ['GCS_BUCKET'] = 'news-hub'
os.environ['GCS_PREFIX'] = 'news_data'

print("🔥 Running trending extraction directly in Colab...")

async def run_trending():
    try:
        result = await trending_extractor_fixed.run_trending_extraction()

        if result:
            print("✅ SUCCESS! Trending data generated")

            # Check if files were created
            from google.cloud import storage
            from datetime import datetime

            today_str = datetime.utcnow().strftime("%Y-%m-%d")
            storage_client = storage.Client()
            bucket = storage_client.bucket("news-hub")
            blob = bucket.blob(f"trending/{today_str}/summary.json")

            if blob.exists():
                print(f"✅ File created: gs://news-hub/trending/{today_str}/summary.json")
                print("🎉 Dashboard should now show today's trending news!")

                # Force API to refresh cache
                print("🔄 Visit your dashboard now to see fresh trending news:")
                print("https://news-intelligence-api-1062778057195.us-central1.run.app")

            else:
                print("⚠️ File not found - check pipeline logs above")
        else:
            print("❌ FAILED - check error messages above")

    except Exception as e:
        print(f"💥 ERROR: {str(e)}")
        import traceback
        print(f"Full traceback: {traceback.format_exc()}")

# Execute the trending extraction
await run_trending()


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

🤖 Gemini enabled for trending analysis


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


✅ Classification model loaded
🔥 Running trending extraction directly in Colab...
🔥 Starting FIXED Trending News Extraction
✅ Downloaded 24hr data: news_data/news_with_content_2025-09-13.csv
✅ Loaded 780 articles from last 24hrs
🌍 Assigning regions with improved logic...
📊 Region distribution:
  Global: 446 articles
  India: 334 articles
🔄 Categorizing 780 articles...
✅ Filtered to 748 articles in target categories
📈 Extracting trending news...
📍 Processing India: 321 articles
📍 Processing Global: 427 articles
🤖 Generating AI summaries for trending topics...
💾 Saving trending summary...
✅ Trending summary saved: gs://news-hub/trending/2025-09-13/summary.json

🎉 FIXED TRENDING EXTRACTION COMPLETE!

📍 India:
  ✅ sports: 52 stories
  ✅ politics: 50 stories
  ✅ technology: 72 stories
  ✅ health: 40 stories
  ✅ crime: 61 stories
  ✅ entertainment: 46 stories

📍 Global:
  ✅ sports: 74 stories
  ✅ politics: 25 stories
  ✅ technology: 112 stories
  ✅ health: 75 stories
  ✅ crime: 59 stories
  ✅