<a href="https://colab.research.google.com/github/navonilmandal/AI-driven-automated-News-Video-Generator-pipeline./blob/main/ai_video_gen_from_news.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab cell 1
# Install the packages imported by the scraper cell below: requests, feedparser,
# beautifulsoup4 (imported as bs4) and python-dateutil (imported as dateutil).
!pip install -q requests feedparser beautifulsoup4 python-dateutil


  Preparing metadata (setup.py) ... [?25l[?25hdone
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.5/81.5 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone


In [None]:
# Colab cell 2
# Mount Google Drive at /content/drive; a later cell copies the generated
# scripts into /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [None]:
# Colab cell 3: prompt for the API keys without echoing them (leave blank to skip a key)
import getpass, os

NEWSAPI_KEY = getpass.getpass("Enter your NewsAPI key (press Enter to skip): ")
# The OpenAI key is only needed by the later script-generation cells.
OPENAI_KEY = getpass.getpass("Enter your OpenAI key (press Enter to skip): ")

# Export only the keys that were actually entered. They live in the process
# environment of this Colab session and nowhere else.
os.environ.update({
    env_name: secret
    for env_name, secret in (("NEWSAPI_KEY", NEWSAPI_KEY), ("OPENAI_KEY", OPENAI_KEY))
    if secret
})


Enter your NewsAPI key (press Enter to skip): ··········
Enter your OpenAI key (press Enter to skip): ··········


In [None]:
# Colab cell 4: optimized scraper function (works with NewsAPI if key given, else Google News RSS)
import os, json, logging
from typing import List, Dict, Optional
from datetime import datetime
from urllib.parse import urlparse

import requests, feedparser
from bs4 import BeautifulSoup
from dateutil import parser as dateparse
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Configure root logging; records are rendered as "LEVEL: message".
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

# Endpoint queried by fetch_from_newsapi.
NEWSAPI_URL = "https://newsapi.org/v2/top-headlines"
# Default feed for fetch_from_google_rss (the query string asks for hl=en-IN, gl=IN).
GOOGLE_NEWS_RSS = "https://news.google.com/rss?hl=en-IN&gl=IN&ceid=IN:en"
# Timeout passed to the NewsAPI HTTP request.
DEFAULT_TIMEOUT = 10

def requests_session_with_retries(total_retries: int = 3, backoff_factor: float = 0.3):
    """Build a ``requests.Session`` whose HTTP and HTTPS requests retry automatically.

    Args:
        total_retries: Passed to urllib3's ``Retry`` as ``total``.
        backoff_factor: Passed to ``Retry`` as ``backoff_factor``.

    Returns:
        A session with a retrying ``HTTPAdapter`` mounted for both URL schemes.
        The retry policy covers GET and POST requests and the status codes
        429, 500, 502, 503 and 504.
    """
    retry_policy = Retry(
        total=total_retries,
        backoff_factor=backoff_factor,
        status_forcelist=(429, 500, 502, 503, 504),
        allowed_methods=["GET", "POST"],
    )
    session = requests.Session()
    # Each scheme gets its own adapter instance built from the same policy.
    for scheme in ("https://", "http://"):
        session.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return session

def clean_html(html: Optional[str]) -> str:
    """Strip markup from ``html`` and return plain text with whitespace collapsed.

    ``None`` or an empty string yields ``""``.
    """
    if html:
        soup = BeautifulSoup(html, "html.parser")
        raw_text = soup.get_text(separator=" ", strip=True)
        # Collapse every run of whitespace (including newlines) to one space.
        return " ".join(raw_text.split())
    return ""

def normalize_date(datestr: Optional[str]) -> Optional[str]:
    """Convert a date string to ISO 8601 text.

    Returns ``None`` when ``datestr`` is missing/empty or cannot be parsed.
    """
    if datestr:
        try:
            return dateparse.parse(datestr).isoformat()
        except Exception:
            # Any parsing problem is treated the same as "no date".
            pass
    return None

def _make_key(title: str, link: str) -> str:
    return (title or "").strip().lower() + "|" + (link or "").strip().lower()

def fetch_from_newsapi(api_key: str, country: str = "in", page_size: int = 8) -> List[Dict]:
    """Fetch top headlines from NewsAPI and map them to this notebook's article dicts.

    Args:
        api_key: NewsAPI key. It is sent in the ``X-Api-Key`` header instead of the
            query string, because ``requests`` embeds the full request URL in its
            error messages and those messages are logged below (and end up in the
            saved notebook output).
        country: Value sent as the ``country`` query parameter.
        page_size: Value sent as ``pageSize``; also caps the number of articles returned.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``summary``,
        ``published`` and ``source``. Any failure (network error, HTTP error
        status, undecodable body) is logged as a warning and yields ``[]`` so the
        caller can fall back to another source.
    """
    params = {"country": country, "pageSize": page_size}
    headers = {"X-Api-Key": api_key}
    try:
        # The context manager closes the session's pooled connections afterwards.
        with requests_session_with_retries() as sess:
            resp = sess.get(NEWSAPI_URL, params=params, headers=headers, timeout=DEFAULT_TIMEOUT)
            resp.raise_for_status()
            data = resp.json()
        articles = []
        # "articles" may be absent or null in the payload; treat both as empty.
        for a in (data.get("articles") or [])[:page_size]:
            articles.append({
                "title": clean_html(a.get("title") or ""),
                # Empty string rather than None when absent, matching the RSS fetcher.
                "link": a.get("url") or "",
                "summary": clean_html(a.get("description") or a.get("content") or ""),
                "published": normalize_date(a.get("publishedAt")),
                "source": (a.get("source") or {}).get("name") or "NewsAPI"
            })
        return articles
    except Exception as e:
        logging.warning("NewsAPI fetch failed: %s", e)
        return []

def fetch_from_google_rss(rss_url: str = GOOGLE_NEWS_RSS, max_items: int = 10) -> List[Dict]:
    """Read an RSS feed with feedparser and map its entries to article dicts.

    Args:
        rss_url: Feed location handed to ``feedparser.parse``.
        max_items: Maximum number of entries to convert.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``summary``,
        ``published`` and ``source``. ``source`` is the entry's source title when
        it has one, otherwise the host name of the entry link. Returns ``[]``
        (after logging a warning) when the feed cannot be read.
    """
    try:
        feed = feedparser.parse(rss_url)
        # feedparser signals fetch/parse problems through the `bozo` flag rather
        # than by raising, so an unreadable feed would otherwise pass silently.
        if feed.get("bozo") and not feed.entries:
            logging.warning("Google RSS fetch failed: %s", feed.get("bozo_exception") or "unreadable feed")
            return []
        items = []
        for entry in feed.entries[:max_items]:
            link = entry.get("link", "")
            source_title = (entry.get("source") or {}).get("title")
            items.append({
                "title": clean_html(entry.get("title", "")),
                "link": link,
                "summary": clean_html(entry.get("summary", "") or entry.get("description", "")),
                "published": normalize_date(entry.get("published") or entry.get("updated")),
                # Fall back to the link's host when the source has no usable title
                # (previously a title-less source produced None).
                "source": source_title or urlparse(link).netloc
            })
        return items
    except Exception as e:
        logging.warning("Google RSS fetch failed: %s", e)
        return []

def get_trending_news(top_k: int = 6, newsapi_key: Optional[str] = None) -> List[Dict]:
    """Return up to ``top_k`` de-duplicated trending articles.

    NewsAPI is queried first when ``newsapi_key`` is given. If it supplies fewer
    than ``top_k`` articles (or no key is given), the Google News RSS feed is
    used to fill the gap. Articles are de-duplicated on their title/link key,
    keeping NewsAPI results ahead of RSS results.

    Args:
        top_k: Number of articles wanted; coerced with ``int()``. A value of zero
            or less returns ``[]`` without any network access (a negative value
            used to return one article because of the negative slice bound).
        newsapi_key: NewsAPI key, or ``None``/empty to skip NewsAPI.

    Returns:
        A list of at most ``top_k`` article dicts.
    """
    top_k = int(top_k)
    if top_k <= 0:
        return []
    results = []
    # 1) try NewsAPI if provided
    if newsapi_key:
        logging.info("Using NewsAPI...")
        results = fetch_from_newsapi(newsapi_key, page_size=top_k)
        if len(results) >= top_k:
            return results[:top_k]
    # 2) fallback and supplement with RSS
    logging.info("Using Google News RSS fallback/supplement...")
    # Ask for twice as many entries so duplicates can be dropped and top_k still met.
    rss = fetch_from_google_rss(max_items=top_k*2)
    seen = set()
    merged = []
    for a in (results + rss):
        key = _make_key(a.get("title",""), a.get("link",""))
        if key in seen:
            continue
        seen.add(key)
        merged.append(a)
        if len(merged) >= top_k:
            break
    return merged

# Convenience: run and save
def fetch_and_save(top_k: int = 6, save_path: str = "trending_news.json"):
    """Fetch trending articles, write them to ``save_path`` as UTF-8 JSON and return them.

    The NewsAPI key is read from the ``NEWSAPI_KEY`` environment variable; when
    it is unset or empty, ``None`` is passed on and only the RSS source is used.
    """
    api_key = os.environ.get("NEWSAPI_KEY") or None
    articles = get_trending_news(top_k=top_k, newsapi_key=api_key)
    serialized = json.dumps(articles, ensure_ascii=False, indent=2)
    with open(save_path, "w", encoding="utf-8") as out_file:
        out_file.write(serialized)
    logging.info("Saved %d articles to %s", len(articles), save_path)
    return articles

# Example run (uncomment to run here)
# articles = fetch_and_save(top_k=6, save_path="/content/trending_news.json")
# print(articles[:2])


In [None]:
# Colab cell 5: run the fetch, save to local or Drive
# Writes /content/trending_news.json, the file the later script-generation
# cells read back.
articles = fetch_and_save(top_k=6, save_path="/content/trending_news.json")
print("Fetched:", len(articles))
# Preview every article; the summary is cut to its first 200 characters and
# "..." is appended unconditionally.
for i,a in enumerate(articles, start=1):
    print(f"\n[{i}] {a['title']}\nSource: {a['source']}\nLink: {a['link']}\nSummary: {a['summary'][:200]}...")


Fetched: 6

[1] Mamata Banerjee demands PM’s apology over ‘Bankim da’ remark in Lok Sabha - The Hindu
Source: The Hindu
Link: https://news.google.com/rss/articles/CBMi2AFBVV95cUxPMkdBLUttOGRhZjN4eWNlekFkTnNRWXgyZ1kteXJLd2Nmdy1DQV9QRDlvRTVKZ1Q5UHpobVF5X3FZLVcwLW51Zy1NSUdrMEZPNXFEOWtwUVozdDNoa2ZqRFZqVHJQclg3bExUd1dNMmUxVVFmZWZSd1RsNFNCWDJiczZRZWhNVE1VVl9aNnVyS1l3T0JIVXpjVnF4dTZHdDFaOEZ3UlNSamZONkU1VXEzUzdqMl9xYWM3MnNfOVlyWDRSSHVZajZiVkdKaTRFSTFmRmxBMEp0UkjSAd8BQVVfeXFMTXRUV2t4SU5IUlZFMWxvTWhvUExMSVJFQ2FyZ2JkRUhWV0tQNlZNanFodTZaYjVqZEgzMUNLU2Z1WC1UZ3E3d2tTZEJ3Q1Vmckl1MWQ3UWY5U1NkV0FfYk9HYU1BQ1BFU2xFUkZ5dVpHbzk4Sm1pV1lZMmN0Sy15MWJIaGJuSUttVXhIalA4NFpwUjB1OGt0Y1M2WTA3R1BPckdnNVhkeEd1YXljRjFwdFVhTF9WcDJKTVEyR0hKUWZRM1Vpc05jaC1uS3A2QWhwMmZBT3Nsa1NSNF96TDN5aw?oc=5
Summary: Mamata Banerjee demands PM’s apology over ‘Bankim da’ remark in Lok Sabha The Hindu Amit Shah Rebuts Priyanka Gandhi On Vande Mataram, Slams Nehru, Indira Gandhi NDTV 'Vande Mataram' debate: TMC MPs h...

[2] How Goa nightcl

In [None]:
# Colab cell: install/upgrade the modern OpenAI client
# The generator cells below use the `from openai import OpenAI` client style.
!pip install -q --upgrade openai


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/1.0 MB[0m [31m15.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Colab cell 2: securely read OpenAI key (press Enter to skip)
import os, getpass

# Reuse a key already exported in this session; only prompt (without echo) when
# none is present.
OPENAI_KEY = os.environ.get("OPENAI_KEY")
if not OPENAI_KEY:
    OPENAI_KEY = getpass.getpass("Enter your OpenAI key (press Enter to skip): ")

if not OPENAI_KEY:
    print("No OpenAI key provided — using local fallback.")
else:
    os.environ["OPENAI_KEY"] = OPENAI_KEY
    print("OpenAI key set for this session.")


OpenAI key set for this session.


In [None]:
# Colab cell: updated script generator (works with openai>=1.0.0) and no NLTK fallback

import os, time, json, logging, re
from typing import Dict, List, Optional
from functools import wraps

logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

# The key is expected in the environment (exported by the earlier getpass cell);
# a missing key becomes the empty string.
OPENAI_KEY = os.environ.get("OPENAI_KEY") or ""
client = None

if not OPENAI_KEY:
    logging.info("No OpenAI key found; will use fallback generator only.")
else:
    try:
        # openai>=1.0.0 exposes a client class instead of module-level calls.
        from openai import OpenAI
        client = OpenAI(api_key=OPENAI_KEY)
        logging.info("OpenAI client initialized.")
    except Exception as init_error:
        # Import or construction failed: carry on with the fallback generator.
        logging.warning("Failed to initialize OpenAI client: %s", init_error)
        client = None

# --- retry decorator ---
def retry_on_exception(max_retries=3, base_delay=1.0, allowed_exceptions=(Exception,)):
    """Decorator factory that retries the wrapped call with exponential backoff.

    Args:
        max_retries: Total number of attempts. Values below 1 are treated as 1 so
            the wrapped function always runs at least once (previously the loop
            never executed and the wrapper silently returned ``None``).
        base_delay: Seconds slept after the first failure; doubled after each
            further failure.
        allowed_exceptions: Exception class (or tuple of classes) that triggers a
            retry. Anything else propagates immediately.

    Returns:
        A decorator. The decorated function returns the wrapped function's
        result, or re-raises the last caught exception once all attempts fail.
    """
    attempts = max(1, max_retries)

    def deco(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            delay = base_delay
            for attempt in range(1, attempts + 1):
                try:
                    return func(*args, **kwargs)
                except allowed_exceptions as e:
                    logging.warning("Attempt %d/%d failed: %s", attempt, attempts, e)
                    if attempt == attempts:
                        # Out of attempts: surface the original exception.
                        raise
                    time.sleep(delay)
                    delay *= 2
        return wrapper
    return deco

# --- OpenAI-based generator (modern client) ---
@retry_on_exception(max_retries=3, base_delay=1.0, allowed_exceptions=(Exception,))
def generate_script_openai(title: str, summary: str, tone: str = "neutral, engaging", word_target: int = 110) -> str:
    """
    Uses modern OpenAI client (OpenAI()) and chat completions endpoint.

    Args:
        title: Headline placed in the prompt.
        summary: Context placed in the prompt.
        tone: Tone requested from the model.
        word_target: Approximate word count requested from the model.

    Returns:
        The narration text of the first choice, stripped of surrounding whitespace.

    Raises:
        RuntimeError: If no client is configured, or the response carries no text.
            Errors raised by the client call itself propagate as well. Every
            exception is retried by the decorator before it reaches the caller.
    """
    if client is None:
        raise RuntimeError("OpenAI client not available")

    system_prompt = (
        "You are a concise, engaging news narrator for short-form videos. "
        "Produce a single short narration suitable for a 30–60 second video."
    )
    user_prompt = (
        f"Headline: {title}\n\n"
        f"Summary/Context: {summary}\n\n"
        "Requirements:\n"
        f"- Produce ~{word_target} words (approximate) — enough for a 30–60 second spoken narration.\n"
        "- Use short, clear sentences. No bullet lists or extra sections.\n"
        "- Start with a 1-line hook (1 short sentence) to grab attention.\n"
        "- Then provide 2–3 sentences that explain the core info.\n"
        "- End with one 1-line closing sentence (a concise wrap-up or call-to-action).\n"
        f"- Tone: {tone}.\n"
        "Return only the narration text. Do not add metadata or commentary."
    )

    # new API: client.chat.completions.create(...)
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        max_tokens=320,
        temperature=0.25
    )
    # response shape: resp.choices[0].message.content
    # `message` is a model object in openai>=1.0, not a dict, so `.get("content")`
    # raised AttributeError on every call; read the attribute instead.
    choices = getattr(resp, "choices", None) or []
    text = getattr(choices[0].message, "content", None) if choices else None
    if not text or not text.strip():
        # Never hand back str(resp): an object repr is not narration text. Raising
        # lets the caller switch to the local fallback generator.
        raise RuntimeError("OpenAI response contained no narration text")
    return text.strip()

# --- Local fallback: regex sentence splitter (no NLTK) ---
def split_sentences_regex(text: str) -> List[str]:
    """Break ``text`` into sentences at whitespace that follows ``.``, ``!`` or ``?``.

    Empty or missing input gives ``[]``; blank fragments are dropped.
    """
    if not text:
        return []
    # The lookbehind keeps the punctuation attached to the sentence it ends.
    fragments = re.split(r'(?<=[.!?])\s+', text.strip())
    trimmed = (fragment.strip() for fragment in fragments)
    return [fragment for fragment in trimmed if fragment]

def generate_script_fallback(title: str, summary: str, tone: str = "neutral, engaging", word_target: int = 110) -> str:
    """
    Lightweight fallback that avoids NLTK and aims for a 30-60s narration.

    The script is "<hook> <up to three summary sentences> <closing>". The hook
    is the title when it has at most 12 words, otherwise a generic lead-in.

    Args:
        title: Headline; ``None`` is treated as empty.
        summary: Article summary; ``None`` is treated as empty.
        tone: Accepted for signature parity with the OpenAI generator; unused here.
        word_target: Approximate word budget. A script more than 20 words over
            the budget is cut to its first ``word_target`` words.

    Returns:
        The narration text.
    """
    terminal = ('.', '!', '?')
    closing = "Stay tuned for more updates."
    title = (title or "").strip()
    summary = (summary or "").strip()
    title_is_short = len(title.split()) <= 12

    def _end_sentence(text: str) -> str:
        # Append a period unless the text already ends a sentence.
        return text if text.endswith(terminal) else text + '.'

    if not summary:
        # Say the headline exactly once (a short title used to be emitted twice).
        if not title:
            return closing
        lead = "" if title_is_short else "Here's a quick update: "
        return f"{lead}{_end_sentence(title)} {closing}"

    sents = split_sentences_regex(summary)
    # prefer the first few short sentences (at most three)
    selected = []
    for sent in sents:
        if len(selected) >= 3:
            break
        # very long sentence: keep only the text before its first comma
        if len(sent.split()) > 45:
            sent = re.split(r',\s*', sent)[0].strip()
            if not sent:
                continue
        selected.append(_end_sentence(sent))

    if not title_is_short:
        hook = "Here's an update:"
    elif title and not title.endswith(terminal + (':',)):
        # Terminate the headline so it does not run into the first body sentence.
        hook = title + '.'
    else:
        hook = title
    script = " ".join(part for part in (hook, " ".join(selected), closing) if part)

    # crude word-trim to approximate word_target
    words = script.split()
    if word_target > 0 and len(words) > word_target + 20:
        script = _end_sentence(" ".join(words[:word_target]).rstrip(",;:"))
    return script

# --- unified generator ---
def generate_script(title: str, summary: str, tone: str = "neutral, engaging", word_target: int = 110) -> str:
    """Produce a narration, preferring OpenAI and falling back to the local generator.

    The OpenAI generator is tried only when a client is configured. If it raises,
    the failure is logged as a warning and the fallback generator is used.
    """
    generator_kwargs = dict(title=title, summary=summary, tone=tone, word_target=word_target)
    try:
        if client:
            return generate_script_openai(**generator_kwargs)
    except Exception as err:
        logging.warning("OpenAI generation failed, using fallback: %s", err)
    return generate_script_fallback(**generator_kwargs)

# --- batch util ---
def batch_generate_scripts(articles: List[Dict], tone: str = "neutral, engaging", word_target: int = 110,
                           out_path: str = "/content/generated_scripts.json") -> List[Dict]:
    """Generate a narration for every article and save the results as JSON.

    Args:
        articles: Article dicts; ``title`` and ``summary`` are read from each.
        tone: Forwarded to the script generator.
        word_target: Forwarded to the script generator.
        out_path: File that receives the UTF-8 JSON list.

    Returns:
        Copies of the input dicts, each with an added ``script`` entry.
    """
    total = len(articles)
    results = []
    for position, article in enumerate(articles, start=1):
        title = article.get("title") or ""
        summary = article.get("summary") or ""
        logging.info("Generating script %d/%d: %s", position, total, title[:80])
        try:
            script_text = generate_script(title, summary, tone=tone, word_target=word_target)
        except Exception as err:
            logging.error("Generation failed for article %d: %s", position, err)
            script_text = generate_script_fallback(title, summary, tone=tone, word_target=word_target)
        # Copy so the caller's article dict is left untouched.
        results.append({**article, "script": script_text})
    with open(out_path, "w", encoding="utf-8") as out_file:
        json.dump(results, out_file, ensure_ascii=False, indent=2)
    logging.info("Saved %d scripts to %s", len(results), out_path)
    return results


In [None]:
# Colab cell: Robust OpenAI extraction + generator (replace previous version)
import os, time, logging, re
from functools import wraps
from typing import List, Dict

logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

# Build the OpenAI client only when a key is present in the environment.
OPENAI_KEY = os.environ.get("OPENAI_KEY") or ""
client = None
if not OPENAI_KEY:
    logging.info("No OpenAI key found; will use fallback generator only.")
else:
    try:
        from openai import OpenAI
        client = OpenAI(api_key=OPENAI_KEY)
        logging.info("OpenAI client initialized.")
    except Exception as init_error:
        # Leave `client` as None so the generators use the local fallback.
        logging.warning("Failed to initialize OpenAI client: %s", init_error)
        client = None

def retry_on_exception(max_retries=3, base_delay=1.0, allowed_exceptions=(Exception,)):
    """Decorator factory: retry the wrapped function with exponential backoff.

    Args:
        max_retries: Total number of attempts; anything below 1 counts as 1, so
            the wrapped function is always called (it used to be skipped, with
            the wrapper returning ``None``).
        base_delay: Sleep in seconds after the first failed attempt; it doubles
            after every later failure.
        allowed_exceptions: Exception class or tuple of classes that are retried.
            Other exceptions propagate at once.

    Returns:
        A decorator whose wrapper returns the function's result or re-raises the
        final caught exception when every attempt has failed.
    """
    attempts = max(1, max_retries)

    def deco(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            delay = base_delay
            for attempt in range(1, attempts + 1):
                try:
                    return func(*args, **kwargs)
                except allowed_exceptions as e:
                    logging.warning("Attempt %d/%d failed: %s", attempt, attempts, e)
                    if attempt == attempts:
                        # Last attempt: let the caller see the real error.
                        raise
                    time.sleep(delay)
                    delay *= 2
        return wrapper
    return deco

def _extract_text_from_choice(choice) -> str:
    """
    Return the message text from various SDK shapes:
    - choice.message (object) with .content
    - choice.message (dict) with ['content']
    - choice.get('message') ...
    - choice.text or choice.get('text') (older style)
    - finally, str(choice)
    """
    # 1) try common attribute path: choice.message.content
    try:
        msg = getattr(choice, "message", None)
        if msg is not None:
            # if msg is a dict-like
            if isinstance(msg, dict):
                content = msg.get("content")
                if content:
                    return content
            # if msg is an object with .content
            content = getattr(msg, "content", None)
            if content:
                return content
    except Exception:
        pass

    # 2) try attribute on choice itself: .message.content (already tried) or .text
    try:
        txt = getattr(choice, "text", None)
        if txt:
            return txt
    except Exception:
        pass

    # 3) try dict-like access
    try:
        if isinstance(choice, dict):
            # older response may be {'message': {'content': '...'}}
            msg = choice.get("message")
            if isinstance(msg, dict):
                c = msg.get("content")
                if c:
                    return c
            # older 'text' field
            t = choice.get("text")
            if t:
                return t
    except Exception:
        pass

    # 4) fallback: try str(choice)
    try:
        return str(choice)
    except Exception:
        return ""

@retry_on_exception(max_retries=3, base_delay=1.0, allowed_exceptions=(Exception,))
def generate_script_openai(title: str, summary: str, tone: str = "neutral, engaging", word_target: int = 110) -> str:
    """
    Ask the OpenAI chat completions endpoint for a short narration and return its text.

    Args:
        title: Headline placed in the prompt.
        summary: Context placed in the prompt.
        tone: Tone requested from the model.
        word_target: Approximate word count requested from the model.

    Returns:
        The first non-empty text found among the response choices, stripped.

    Raises:
        RuntimeError: If no client is configured or no text can be extracted.
            Errors from the client call propagate too. The decorator retries
            every exception before it reaches the caller.
    """
    if client is None:
        raise RuntimeError("OpenAI client not available")

    system_prompt = (
        "You are a concise, engaging news narrator for short-form videos. "
        "Produce a single short narration suitable for a 30–60 second video."
    )
    user_prompt = (
        f"Headline: {title}\n\n"
        f"Summary/Context: {summary}\n\n"
        "Requirements:\n"
        f"- Produce ~{word_target} words (approximate) — enough for a 30–60 second spoken narration.\n"
        "- Use short, clear sentences. No bullet lists or extra sections.\n"
        "- Start with a 1-line hook (1 short sentence) to grab attention.\n"
        "- Then provide 2–3 sentences that explain the core info.\n"
        "- End with one 1-line closing sentence (a concise wrap-up or call-to-action).\n"
        f"- Tone: {tone}.\n"
        "Return only the narration text. Do not add metadata or commentary."
    )

    # call the chat completions endpoint on the modern client
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        max_tokens=320,
        temperature=0.25
    )

    # Look up the choices in two explicit steps. The earlier one-liner
    # `getattr(...) or resp.get(...) if isinstance(resp, dict) else None` parsed as
    # `(... or ...) if ... else None`, so `choices` was None for every SDK response
    # object and the function returned the repr of the whole response.
    choices = getattr(resp, "choices", None)
    if choices is None and isinstance(resp, dict):
        choices = resp.get("choices")

    # With no choices at all, treat the response itself as one choice-like object.
    candidates = list(choices) if choices else [resp]
    for ch in candidates:
        txt = _extract_text_from_choice(ch)
        # Skip empty results, and skip a result that is merely str(ch): the
        # extractor may fall back to the object's repr, which is not narration.
        if isinstance(txt, str) and txt.strip() and txt != str(ch):
            return txt.strip()

    # Raising (instead of returning str(resp)) lets the caller use the local fallback.
    raise RuntimeError("Unable to extract text from OpenAI response")

# Keep the fallback generator you already have (ensure defined in your notebook).
# Example minimal fallback if not present:
def split_sentences_regex(text: str):
    """Split ``text`` into trimmed, non-empty sentences.

    A sentence boundary is whitespace preceded by ``.``, ``!`` or ``?``.
    Falsy input returns an empty list.
    """
    import re
    sentences = []
    if text:
        for chunk in re.split(r'(?<=[.!?])\s+', text.strip()):
            chunk = chunk.strip()
            if chunk:
                sentences.append(chunk)
    return sentences

def generate_script_fallback(title: str, summary: str, tone: str = "neutral, engaging", word_target: int = 110) -> str:
    """Build a narration locally: "<hook> <up to three summary sentences> <closing>".

    Args:
        title: Headline; ``None`` counts as empty. It is used as the hook when it
            has at most 12 words, otherwise a generic lead-in is used.
        summary: Article summary; ``None`` counts as empty.
        tone: Kept for signature parity with the OpenAI generator; unused here.
        word_target: Approximate word budget. A script more than 20 words over
            it is cut to its first ``word_target`` words.

    Returns:
        The narration text.
    """
    terminal = ('.', '!', '?')
    closing = "Stay tuned for more updates."
    title = (title or "").strip()
    summary = (summary or "").strip()
    title_is_short = len(title.split()) <= 12

    def _end_sentence(text: str) -> str:
        # Append a period unless the text already ends a sentence.
        return text if text.endswith(terminal) else text + '.'

    if not summary:
        # With no summary the headline is the whole story: say it once only
        # (a short title used to appear twice).
        if not title:
            return closing
        lead = "" if title_is_short else "Here's a quick update: "
        return f"{lead}{_end_sentence(title)} {closing}"

    selected = []
    for sent in split_sentences_regex(summary):
        if len(selected) >= 3:
            break
        if len(sent.split()) > 45:
            # Over-long sentence: keep only what precedes its first comma.
            sent = re.split(r',\s*', sent)[0].strip()
            if not sent:
                continue
        selected.append(_end_sentence(sent))

    if not title_is_short:
        hook = "Here's an update:"
    elif title and not title.endswith(terminal + (':',)):
        # Close the headline so it does not run into the first body sentence.
        hook = title + '.'
    else:
        hook = title
    script = " ".join(part for part in (hook, " ".join(selected), closing) if part)

    words = script.split()
    if word_target > 0 and len(words) > word_target + 20:
        script = _end_sentence(" ".join(words[:word_target]).rstrip(",;:"))
    return script

# Unified generator wrapper remains the same:
def generate_script(title: str, summary: str, tone: str = "neutral, engaging", word_target: int = 110) -> str:
    """Return a narration from OpenAI when possible, otherwise from the local fallback.

    OpenAI is used only when a client is configured. Any exception it raises is
    logged as a warning and the fallback generator takes over.
    """
    def _call(generator):
        # Both generators share the same keyword signature.
        return generator(title=title, summary=summary, tone=tone, word_target=word_target)

    try:
        if client:
            return _call(generate_script_openai)
    except Exception as failure:
        logging.warning("OpenAI generation failed, using fallback: %s", failure)
    return _call(generate_script_fallback)

# If you want to sanity-check: call generate_script on a sample:
# print(generate_script("Sample Headline", "This is a sample summary sentence. Another sentence for test."))


In [None]:
# Diagnostic cell — run this to check what's happening (Colab)
import os, json, logging, re, time
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

print("===== Diagnostic start =====")
# Report whether a key is available without printing the key itself.
OPENAI_KEY = os.environ.get("OPENAI_KEY") or ""
print("OPENAI_KEY set in env:", bool(OPENAI_KEY))

client = None
if not OPENAI_KEY:
    print("No OpenAI key — will use fallback generator only.")
else:
    try:
        from openai import OpenAI
        client = OpenAI(api_key=OPENAI_KEY)
        print("OpenAI client initialized:", bool(client))
    except Exception as init_error:
        print("OpenAI client initialization FAILED:", repr(init_error))

# Simple robust extractor (for safety when we call client)
def _extract_text_from_choice(choice):
    try:
        msg = getattr(choice, "message", None)
        if msg:
            if isinstance(msg, dict):
                c = msg.get("content")
                if c: return c
            c = getattr(msg, "content", None)
            if c: return c
    except Exception:
        pass
    try:
        txt = getattr(choice, "text", None)
        if txt: return txt
    except Exception:
        pass
    try:
        if isinstance(choice, dict):
            msg = choice.get("message")
            if isinstance(msg, dict):
                c = msg.get("content")
                if c: return c
            t = choice.get("text")
            if t: return t
    except Exception:
        pass
    try:
        return str(choice)
    except Exception:
        return ""

# Test OpenAI generation if client present
def test_openai():
    """Send one tiny chat request and print whatever text can be extracted.

    Prints a skip notice when no client is configured. Every exception is
    caught and printed so the rest of the diagnostic cell still runs.
    """
    try:
        if not client:
            print("Skipping OpenAI test (no client).")
            return
        print("Running small OpenAI test (one request)...")
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role":"system","content":"You are a tester bot."},
                {"role":"user","content":"Say hi in one short sentence."}
            ],
            max_tokens=30,
            temperature=0.0
        )
        # Attribute access first; dict lookup only when the attribute gave nothing.
        choices = getattr(resp, "choices", None)
        if not choices and isinstance(resp, dict):
            choices = resp.get("choices")
        if not choices:
            print("OpenAI response (single):", repr(_extract_text_from_choice(resp)))
            return
        # Lazily extract so scanning stops at the first choice that yields text.
        extracted = (_extract_text_from_choice(ch) for ch in choices)
        first_text = next((txt for txt in extracted if txt), None)
        if first_text is not None:
            print("OpenAI response (choice):", repr(first_text.strip()))
            return
        print("OpenAI response contained no extracted text; raw resp repr:")
        print(resp)
    except Exception as e:
        print("OpenAI test failed:", repr(e))

# Local fallback generator (regex sentence splitter)
def split_sentences_regex(text):
    """Return the trimmed, non-empty sentences of ``text`` (``[]`` for falsy input).

    Sentences are split at whitespace that follows ``.``, ``!`` or ``?``.
    """
    if text:
        stripped = (part.strip() for part in re.split(r'(?<=[.!?])\s+', text.strip()))
        return list(filter(None, stripped))
    return []

def generate_script_fallback(title, summary, tone="neutral, engaging", word_target=110):
    """Build a narration without any API call.

    Output shape: "<hook> <up to three summary sentences> <closing>". The hook is
    the title when it has at most 12 words, otherwise a generic lead-in.

    Args:
        title: Headline; ``None`` is treated as empty.
        summary: Article summary; ``None`` is treated as empty.
        tone: Unused; kept so the signature matches the OpenAI generator.
        word_target: Approximate word budget. A script more than 20 words over
            it is cut to its first ``word_target`` words.

    Returns:
        The narration text.
    """
    terminal = ('.', '!', '?')
    closing = "Stay tuned for more updates."
    title = (title or "").strip()
    summary = (summary or "").strip()
    title_is_short = len(title.split()) <= 12

    def _end_sentence(text):
        # Append a period unless the text already ends a sentence.
        return text if text.endswith(terminal) else text + '.'

    if not summary:
        # Emit the headline once; a short title used to be repeated.
        if not title:
            return closing
        lead = "" if title_is_short else "Here's a quick update: "
        return f"{lead}{_end_sentence(title)} {closing}"

    selected = []
    for sent in split_sentences_regex(summary):
        if len(selected) >= 3:
            break
        if len(sent.split()) > 45:
            # Over-long sentence: keep only the part before its first comma.
            sent = re.split(r',\s*', sent)[0].strip()
            if not sent:
                continue
        selected.append(_end_sentence(sent))

    if not title_is_short:
        hook = "Here's an update:"
    elif title and not title.endswith(terminal + (':',)):
        # Without this the headline ran straight into the first body sentence.
        hook = title + '.'
    else:
        hook = title
    script = " ".join(part for part in (hook, " ".join(selected), closing) if part)

    words = script.split()
    if word_target > 0 and len(words) > word_target + 20:
        script = _end_sentence(" ".join(words[:word_target]).rstrip(",;:"))
    return script

# Run tests
# Live API check; it prints a skip notice when no client is configured.
test_openai()

# Run fallback test
# Exercises the local generator on a fixed three-sentence sample (no network).
print("\nRunning fallback generator test:")
sample_title = "Sample Headline: Market update"
sample_summary = ("Stock markets saw mixed trading today as investors digested a string of corporate earnings. "
                  "Analysts say tech stocks led gains despite concerns over inflation. Trading volumes were moderate.")
fallback_script = generate_script_fallback(sample_title, sample_summary, word_target=80)
print("Fallback script output:\n", fallback_script)

# Attempt to generate from your articles file if it exists
# This is the path the scraper cell saved to; note that `articles` is rebound here.
ARTICLES_PATH = "/content/trending_news.json"
if os.path.exists(ARTICLES_PATH):
    print("\nFound articles file at", ARTICLES_PATH)
    try:
        with open(ARTICLES_PATH, "r", encoding="utf-8") as f:
            articles = json.load(f)
        if not articles:
            print("Articles file empty.")
        else:
            # Only the first article is used as a sample.
            a = articles[0]
            print("\nSample article loaded (title):", a.get("title"))
            # Try OpenAI generation first (if client), else fallback
            if client:
                try:
                    # reuse test call but with prompt for script
                    resp = client.chat.completions.create(
                        model="gpt-4o-mini",
                        messages=[
                            {"role":"system","content":"You are a concise news narrator."},
                            {"role":"user","content": f"Headline: {a.get('title')}\nSummary: {a.get('summary')}\nProduce one short narration of ~90 words."}
                        ],
                        max_tokens=200,
                        temperature=0.2
                    )
                    # Attribute access first, dict lookup second (explicitly parenthesised).
                    choices = getattr(resp, "choices", None) or (resp.get("choices") if isinstance(resp, dict) else None)
                    got = None
                    if choices:
                        # Keep the first choice that yields any text.
                        for ch in choices:
                            t = _extract_text_from_choice(ch)
                            if t:
                                got = t.strip(); break
                    else:
                        got = _extract_text_from_choice(resp)
                    print("\nOpenAI-generated script (sample):\n", got or "(no text extracted)")
                except Exception as e:
                    # Any API or extraction error: show it, then show the local fallback.
                    print("OpenAI generation for article failed; falling back. Error:", repr(e))
                    print("\nFallback script for article:\n", generate_script_fallback(a.get('title',''), a.get('summary','')))
            else:
                print("\nGenerating fallback script from article:\n", generate_script_fallback(a.get('title',''), a.get('summary','')))
    except Exception as e:
        # Covers unreadable or invalid JSON and anything raised while handling the sample.
        print("Failed to read/parse articles file:", repr(e))
else:
    print("\nNo articles file found at", ARTICLES_PATH, "- run Step 1 (scraper) or change path.")

print("\n===== Diagnostic end =====")


===== Diagnostic start =====
OPENAI_KEY set in env: True
OpenAI client initialized: True
Running small OpenAI test (one request)...
OpenAI response (choice): 'Hi there!'

Running fallback generator test:
Fallback script output:
 Sample Headline: Market update Stock markets saw mixed trading today as investors digested a string of corporate earnings. Analysts say tech stocks led gains despite concerns over inflation. Trading volumes were moderate. Stay tuned for more updates.

Found articles file at /content/trending_news.json

Sample article loaded (title): Mamata Banerjee demands PM’s apology over ‘Bankim da’ remark in Lok Sabha - The Hindu

OpenAI-generated script (sample):
 West Bengal Chief Minister Mamata Banerjee has called for an apology from Prime Minister Narendra Modi over his remarks referring to 'Bankim da' in the Lok Sabha. This incident has sparked a heated debate, with TMC MPs staging a silent protest against what they perceive as an insult to Bengal's cultural icons. Un

In [None]:
# Run this in Colab to generate scripts for all articles and save them
import os, json
ARTICLES_PATH = "/content/trending_news.json"   # update if you saved elsewhere
OUT_PATH = "/content/generated_scripts.json"

# Fail early with a clear message when the scraper output is missing.
if not os.path.exists(ARTICLES_PATH):
    raise FileNotFoundError(f"{ARTICLES_PATH} not found. Run the scraper step first or update the path.")

with open(ARTICLES_PATH, "r", encoding="utf-8") as f:
    articles = json.load(f)

print(f"Loaded {len(articles)} articles. Generating scripts...")
# NOTE(review): batch_generate_scripts looks up generate_script (and the helpers
# behind it) when it is called, so it uses whichever definitions were executed
# most recently in this kernel. Confirm the intended cells have been run.
generated = batch_generate_scripts(articles, tone="urgent but factual", word_target=110, out_path=OUT_PATH)
print(f"Done — generated {len(generated)} scripts and saved to {OUT_PATH}")

# quick preview
# First three scripts, each cut to 500 characters.
for i, g in enumerate(generated[:3], start=1):
    print(f"\n--- Script {i} ({g.get('source')}) ---\n{g.get('script')[:500]}\n")


Loaded 6 articles. Generating scripts...
Done — generated 6 scripts and saved to /content/generated_scripts.json

--- Script 1 (The Hindu) ---
ChatCompletion(id='chatcmpl-CkrpwvjlghlMyFvD4dqxY54Xd0S2W', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Tensions are rising in Indian politics. West Bengal Chief Minister Mamata Banerjee is demanding an apology from Prime Minister Narendra Modi over his controversial "Bankim da" remark made in the Lok Sabha. This statement has sparked outrage among TMC MPs, who staged a silent protest, claiming it insults Bengal\'s cultural icons. Meanwhil


--- Script 2 (Hindustan Times) ---
ChatCompletion(id='chatcmpl-CkrqKkBGjB4Ai2syMVxCNthfvGIDj', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='In a shocking turn of events, Goa nightclub owners Saurabh and Gaurav Luthra fled to Thailand just hours after a devastating fire at their establishment. Repo

In [None]:
# Sanity-check the saved scripts file: report how many entries it holds and
# show the first script in full.
import json

with open("/content/generated_scripts.json", "r", encoding="utf-8") as f:
    data = json.load(f)

print("Total scripts:", len(data))
print("\nSample script:\n")
# An empty file (no articles scraped) would otherwise raise IndexError here.
print(data[0]["script"] if data else "(no scripts were generated)")


Total scripts: 6

Sample script:

ChatCompletion(id='chatcmpl-CkrpwvjlghlMyFvD4dqxY54Xd0S2W', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Tensions are rising in Indian politics. West Bengal Chief Minister Mamata Banerjee is demanding an apology from Prime Minister Narendra Modi over his controversial "Bankim da" remark made in the Lok Sabha. This statement has sparked outrage among TMC MPs, who staged a silent protest, claiming it insults Bengal\'s cultural icons. Meanwhile, Union Home Minister Amit Shah defended the government\'s stance on the Vande Mataram debate, igniting further clashes with opposition leaders. As the political landscape heats up, all eyes are on how this controversy will unfold. Stay tuned for more updates on this developing story.', refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool_calls=None))], created=1765286072, model='gpt-4o-mini-2024-07-18', object='chat.completion',

In [None]:
# Mount Google Drive so generated artefacts can be copied out of the
# ephemeral Colab filesystem. Drive is also mounted near the top of the
# notebook; when it is already mounted Colab only reports that fact.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import shutil

# Copy the generated scripts to Drive. shutil.copy raises if the source file is
# missing or Drive is not mounted, whereas a failed `!cp` only prints an error
# and lets the following cells keep running.
shutil.copy("/content/generated_scripts.json", "/content/drive/MyDrive/generated_scripts.json");


In [None]:
# Colab cell 1: install required libraries
!pip install -q moviepy pillow requests gTTS tqdm


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/98.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Colab cell 2: prepare synthetic images + narration audio
import ast, os, json, re, textwrap, pathlib, random
from PIL import Image, ImageDraw, ImageFilter, ImageFont
from gtts import gTTS

ASSETS_DIR = "/content/assets"
pathlib.Path(ASSETS_DIR).mkdir(parents=True, exist_ok=True)

# Load the first script
SCRIPTS_PATH = "/content/generated_scripts.json"
with open(SCRIPTS_PATH, "r", encoding="utf-8") as f:
    scripts = json.load(f)

if not scripts:
    raise ValueError(f"{SCRIPTS_PATH} contains no scripts. Run the script-generation step first.")


def _spoken_text(raw):
    """Return the text that should actually be narrated.

    The saved scripts shown in this notebook's outputs contain the whole
    ``ChatCompletion(...)`` repr instead of just the assistant message. When
    that shape is detected, the quoted ``content=...`` literal is pulled out
    and decoded; any other input is returned stripped but otherwise unchanged.
    The proper fix belongs in the script generator; this only keeps already
    generated files usable.
    """
    text = (raw or "").strip()
    if text.startswith("ChatCompletion("):
        # Match the Python string literal following `content=`, honouring
        # backslash escapes so an escaped quote does not end the match early.
        match = re.search(
            r"""content=('(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*")""",
            text,
            flags=re.DOTALL,
        )
        if match:
            try:
                return ast.literal_eval(match.group(1)).strip()
            except (ValueError, SyntaxError):
                pass  # malformed/truncated repr: fall back to the raw text
    return text


entry = scripts[0]
title = entry.get("title", "News Story")
script_text = _spoken_text(entry.get("script_180") or entry.get("script") or entry.get("summary"))
if not script_text:
    raise ValueError("The first entry has no script or summary text to narrate.")

print("Using:", title)

# --- Synthetic cinematic gradient background generator ---
def create_gradient_image(out_path, size=(1080,1920), hue_shift=0):
    """Render a blurred vertical colour gradient and save it as a JPEG.

    Args:
        out_path: Destination file path for the JPEG.
        size: (width, height) of the image in pixels.
        hue_shift: Rotation of the gradient colours around the colour wheel,
            in degrees. 0 (the default) produces the unshifted gradient.

    Returns:
        out_path, unchanged, for convenient chaining.
    """
    import colorsys  # local import: only used when a hue shift is requested

    w, h = size
    img = Image.new("RGB", (w, h))
    draw = ImageDraw.Draw(img)
    hue_offset = (hue_shift % 360) / 360.0

    # One horizontal line per row; the colour is interpolated top to bottom.
    for y in range(h):
        ratio = y / h
        r = int(100 + 80 * ratio)
        g = int(60 + 140 * (1 - ratio))
        b = int(120 + 60 * ratio)
        if hue_offset:
            # Rotate the hue only; saturation and value keep the same ramp.
            hue, sat, val = colorsys.rgb_to_hsv(r / 255, g / 255, b / 255)
            r, g, b = (
                int(round(channel * 255))
                for channel in colorsys.hsv_to_rgb((hue + hue_offset) % 1.0, sat, val)
            )
        draw.line([(0, y), (w, y)], fill=(r, g, b))

    img = img.filter(ImageFilter.GaussianBlur(radius=12))
    img.save(out_path, "JPEG", quality=90)
    return out_path

# Create 3 images
# Each background gets its own hue so consecutive video segments are visually
# distinct; the first one (shift 0) keeps the base palette.
image_paths = []
for i in range(3):
    out = f"{ASSETS_DIR}/bg_{i+1}.jpg"
    create_gradient_image(out, hue_shift=i * 40)
    image_paths.append(out)

print("Synthetic images created:", image_paths)

# --- Create narration audio using gTTS ---
tts_path = f"{ASSETS_DIR}/narration.mp3"
tts = gTTS(script_text, lang="en")
tts.save(tts_path)

print("Narration saved:", tts_path)

# Save manifest
# The manifest is the hand-off to the video assembly cell, which reads it back
# as UTF-8. Write it with the same explicit encoding and keep non-ASCII
# characters (e.g. typographic quotes in titles) readable in the file.
manifest = {
    "title": title,
    "script": script_text,
    "images": image_paths,
    "audio": tts_path
}
with open(f"{ASSETS_DIR}/manifest.json", "w", encoding="utf-8") as f:
    json.dump(manifest, f, indent=2, ensure_ascii=False)

print("Assets ready in", ASSETS_DIR)


Using: Mamata Banerjee demands PM’s apology over ‘Bankim da’ remark in Lok Sabha - The Hindu
Synthetic images created: ['/content/assets/bg_1.jpg', '/content/assets/bg_2.jpg', '/content/assets/bg_3.jpg']
Narration saved: /content/assets/narration.mp3
Assets ready in /content/assets


In [None]:
# Fixed MoviePy assembly cell — renders caption PNGs using PIL.draw.textbbox (no ImageMagick)
from moviepy.editor import ImageClip, AudioFileClip, CompositeVideoClip, concatenate_videoclips, vfx
from PIL import Image, ImageDraw, ImageFont
import textwrap, json, os, pathlib

# Load the asset manifest and derive the per-image timing for the video.
ASSETS_DIR = "/content/assets"
with open(os.path.join(ASSETS_DIR, "manifest.json"), "r", encoding="utf-8") as f:
    manifest = json.load(f)

images = manifest["images"]
audio_path = manifest["audio"]
# A null script in the manifest would break the caption split below.
script_text = manifest["script"] or ""

if not images:
    # Without this check the division below fails with an opaque ZeroDivisionError.
    raise ValueError("manifest.json lists no images; re-run the asset preparation cell.")

audio = AudioFileClip(audio_path)
duration = audio.duration

W, H = 1080, 1920
segments = len(images)
segment_dur = max(2.0, duration / segments)  # ensure at least 2s per image

# split caption text into segments
# One caption per image: each takes `words_per_seg` consecutive words (at
# least 8), the final caption absorbs whatever is left, and a caption that
# ends up empty is shown as "...".
words = script_text.split() or [" "]
words_per_seg = max(8, len(words) // segments)
captions = [
    " ".join(
        words[
            n * words_per_seg : (len(words) if n == segments - 1 else (n + 1) * words_per_seg)
        ]
    ).strip()
    or "..."
    for n in range(segments)
]

# utility: create caption PNG using PIL (robust measurement using textbbox)
def _load_caption_font(fontsize, font_path=None):
    """Return a TrueType font at `fontsize`, falling back to PIL's built-in font."""
    try:
        if font_path and os.path.exists(font_path):
            return ImageFont.truetype(font_path, fontsize)
        return ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", fontsize)
    except Exception:
        # Deliberate best effort: a caption in a fallback font beats no video.
        try:
            # Newer Pillow can scale its built-in font; older releases cannot.
            return ImageFont.load_default(size=fontsize)
        except Exception:
            return ImageFont.load_default()


def _wrap_caption_lines(text, font, max_width):
    """Greedily pack the words of `text` into lines at most `max_width` pixels wide."""
    probe = ImageDraw.Draw(Image.new("RGBA", (10, 10), (0, 0, 0, 0)))
    lines = []
    current = ""
    for word in (text or "").split():
        candidate = (current + " " + word).strip()
        left, _, right, _ = probe.textbbox((0, 0), candidate, font=font)
        if right - left <= max_width:
            current = candidate
        else:
            # A single word wider than max_width still gets its own line.
            if current:
                lines.append(current)
            current = word
    if current:
        lines.append(current)
    return lines


def create_caption_png(text, out_path, width=W-120, fontsize=56, font_path=None, fill=(255,255,255,255), bg=(0,0,0,180)):
    """
    Render wrapped text into a transparent PNG sized to width. Returns path.
    Uses ImageDraw.textbbox for reliable measurements across PIL versions.

    Args:
        text: Caption text; it is word-wrapped to fit `width`.
        out_path: Destination PNG path. Its parent directory is created if needed.
        width: Maximum text width in pixels; the image is 40px wider for padding.
        fontsize: Font size used when a TrueType font can be loaded.
        font_path: Optional TrueType font file; DejaVu Sans Bold is used otherwise.
        fill: RGBA colour of the text.
        bg: RGBA colour of the backing panel drawn behind the text.

    Returns:
        out_path.
    """
    font = _load_caption_font(fontsize, font_path)
    lines = _wrap_caption_lines(text, font, width)

    # measure height from a sample with an ascender and a descender
    probe = ImageDraw.Draw(Image.new("RGBA", (10, 10), (0, 0, 0, 0)))
    sample_box = probe.textbbox((0, 0), "Ay", font=font)
    line_height = (sample_box[3] - sample_box[1]) + 8
    img_h = line_height * len(lines) + 40
    img_w = width + 40

    # draw the semi-opaque panel on its own layer so its alpha blends correctly
    rect_margin = 8
    rect = [rect_margin, rect_margin, img_w - rect_margin, img_h - rect_margin]
    overlay = Image.new("RGBA", (img_w, img_h), (0, 0, 0, 0))
    panel = ImageDraw.Draw(overlay)
    if hasattr(panel, "rounded_rectangle"):
        panel.rounded_rectangle(rect, radius=18, fill=bg)
    else:
        # Pillow releases without rounded_rectangle get square corners.
        panel.rectangle(rect, fill=bg)
    img = Image.alpha_composite(Image.new("RGBA", (img_w, img_h), (0, 0, 0, 0)), overlay)
    draw = ImageDraw.Draw(img)

    # draw centered text lines
    y = 20
    for ln in lines:
        box = draw.textbbox((0, 0), ln, font=font)
        x = (img_w - (box[2] - box[0])) // 2
        draw.text((x, y), ln, font=font, fill=fill)
        y += line_height

    # save PNG; a bare filename has no directory part, and makedirs("") would fail
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    img.save(out_path, format="PNG")
    return out_path

# create caption pngs
# One PNG per caption, numbered from 1; create_caption_png returns the path it
# wrote, so the list can be built directly from its results.
pathlib.Path(ASSETS_DIR).mkdir(parents=True, exist_ok=True)
caption_paths = [
    create_caption_png(caption, os.path.join(ASSETS_DIR, f"caption_{number}.png"), fontsize=56)
    for number, caption in enumerate(captions, start=1)
]

# Build clips
# One composite per background image: the image scaled and cropped to the
# vertical frame with a slow zoom, and its caption PNG overlaid near the bottom.
clips = []
for img_path, cap_path in zip(images, caption_paths):
    # base image -> ensure it fills vertical frame
    base = ImageClip(img_path).set_duration(segment_dur)

    # resize to at least H height then center-crop to WxH
    base = base.resize(height=H)
    if base.w > W:
        x1 = int((base.w - W) / 2)
        base = base.crop(x1=x1, width=W)
    else:
        base = base.resize(width=W)
        if base.h > H:
            # Narrow sources overshoot vertically after widening; trim them
            # around the centre, mirroring the horizontal case above.
            y1 = int((base.h - H) / 2)
            base = base.crop(y1=y1, height=H)

    # apply slow zoom-in (Ken Burns)
    base = base.fx(vfx.resize, lambda t: 1 + 0.02 * t)

    # caption image clip (small, positioned above bottom)
    caption_clip = ImageClip(cap_path).set_duration(segment_dur)
    if caption_clip.w > (W - 60):
        caption_clip = caption_clip.resize(width=(W - 60))
    caption_clip = caption_clip.set_position(("center", H - 360))

    comp = CompositeVideoClip([base, caption_clip], size=(W, H)).set_duration(segment_dur)
    clips.append(comp)

# Concatenate and set audio
video = concatenate_videoclips(clips, method="compose")

# adjust duration to match audio if needed
if abs(video.duration - audio.duration) > 0.1:
    if video.duration < audio.duration:
        # stretch the final segment so the picture lasts as long as the narration
        extra = audio.duration - video.duration
        clips[-1] = clips[-1].set_duration(clips[-1].duration + extra)
        video = concatenate_videoclips(clips, method="compose")
    else:
        # the 2s-per-image minimum can outlast a short narration; trim to the audio
        video = video.subclip(0, audio.duration)

video = video.set_audio(audio)

OUT_PATH = "/content/final_moviepy_video_no_imagemagick_v2.mp4"
print("Rendering final video to:", OUT_PATH)
try:
    video.write_videofile(OUT_PATH, fps=24, codec="libx264", audio_codec="aac", threads=4, bitrate="4M")
finally:
    # Release the ffmpeg reader processes and file handles even if rendering
    # fails; otherwise they linger for the rest of the kernel session.
    video.close()
    audio.close()

print("Final video saved to:", OUT_PATH)


Rendering final video to: /content/final_moviepy_video_no_imagemagick_v2.mp4
Moviepy - Building video /content/final_moviepy_video_no_imagemagick_v2.mp4.
MoviePy - Writing audio in final_moviepy_video_no_imagemagick_v2TEMP_MPY_wvf_snd.mp4




MoviePy - Done.
Moviepy - Writing video /content/final_moviepy_video_no_imagemagick_v2.mp4





Moviepy - Done !
Moviepy - video ready /content/final_moviepy_video_no_imagemagick_v2.mp4
Final video saved to: /content/final_moviepy_video_no_imagemagick_v2.mp4


In [None]:
import shutil

# Copy the rendered video to Drive. shutil.copy raises if the file is missing or
# Drive is not mounted, so the success message below is only printed when the
# copy really happened (a failed `!cp` would not stop the cell).
shutil.copy("/content/final_moviepy_video_no_imagemagick_v2.mp4", "/content/drive/MyDrive/")
print("Saved to Google Drive → MyDrive folder")


Saved to Google Drive → MyDrive folder
