In [None]:
import requests
from bs4 import BeautifulSoup
import time
import re

HEADERS = {"User-Agent": "Mozilla/5.0"}

def yahoo_search(query, num_results=10):
    search_url = f"https://search.yahoo.com/search?p={query}"
    try:
        response = requests.get(search_url, headers=HEADERS, timeout=5)
        soup = BeautifulSoup(response.text, 'html.parser')
        results = []

        for a in soup.select("a[href^='http']"):
            href = a['href']
            if any(domain in href for domain in ['moneycontrol.com', 'reuters.com', 'livemint.com', 'cnbc.com', 'economictimes.indiatimes.com']):
                if href not in results:
                    results.append(href)
            if len(results) >= num_results:
                break
        return results
    except Exception as e:
        print(f"❌ Yahoo Search failed: {e}")
        return []

def extract_bullet_points(content, keywords):
    bullets = []
    sentences = re.split(r'\.|\?|!', content)
    for sentence in sentences:
        if all(k.lower() in sentence.lower() for k in keywords):
            bullets.append("• " + sentence.strip())
    if not bullets:
        for sentence in sentences:
            if any(k.lower() in sentence.lower() for k in keywords):
                bullets.append("• " + sentence.strip())
    return bullets[:3]

def scrape_article(url, keywords):
    try:
        resp = requests.get(url, headers=HEADERS, timeout=5)
        resp.encoding = resp.apparent_encoding
        soup = BeautifulSoup(resp.text, 'html.parser')

        title = soup.title.text.strip() if soup.title else "No Title"
        paragraphs = soup.find_all(['p', 'article'])
        content = ' '.join([p.get_text() for p in paragraphs if p.get_text().strip()])

        if any(k.lower() in content.lower() for k in keywords):
            bullets = extract_bullet_points(content, keywords)
            print(f"\n🔗 URL: {url}")
            print(f"📰 Title: {title}")
            print("📌 Reasons:")
            for b in bullets:
                print(b)
        else:
            print(f"⏭ Skipped (no keyword match): {url}")
    except Exception as e:
        print(f"❌ Failed to scrape {url}: {e}")

if __name__ == "__main__":
    # 👇 Input line here
    keywords_input = input("🔍 Enter keywords (comma-separated): ")
    keywords = [k.strip() for k in keywords_input.split(',') if k.strip()]
    query = '+'.join(keywords + ['site:moneycontrol.com OR site:reuters.com OR site:cnbc.com OR site:livemint.com OR site:economictimes.indiatimes.com'])

    print(f"\n🔎 Searching Yahoo for: {query.replace('+', ' ')}")
    urls = yahoo_search(query)

    print(f"\n✅ Found {len(urls)} articles. Now filtering...\n")
    for url in urls:
        scrape_article(url, keywords)
        time.sleep(1.5)


In [1]:
import requests
from bs4 import BeautifulSoup
import time

HEADERS = {"User-Agent": "Mozilla/5.0"}

def get_article_links(base_url, path_filter, domain):
    try:
        response = requests.get(base_url, headers=HEADERS, timeout=5)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = []
        for a in soup.find_all('a', href=True):
            href = a['href']
            if path_filter in href:
                full_url = href if href.startswith("http") else domain + href
                if full_url not in links:
                    links.append(full_url)
            if len(links) >= 5:
                break
        return links
    except Exception as e:
        print(f"❌ Failed to fetch links from {base_url}: {e}")
        return []

def extract_bullet_points(content, keywords):
    bullets = []
    sentences = content.split('.')
    # Prioritize sentences with all keywords
    for sentence in sentences:
        if all(k.lower() in sentence.lower() for k in keywords):
            bullets.append("• " + sentence.strip())
    # Fallback to any keyword
    if not bullets:
        for sentence in sentences:
            if any(k.lower() in sentence.lower() for k in keywords):
                bullets.append("• " + sentence.strip())
    return bullets[:3]

def scrape_and_filter_article(url, keywords):
    try:
        resp = requests.get(url, headers=HEADERS, timeout=5)
        resp.encoding = resp.apparent_encoding
        soup = BeautifulSoup(resp.text, 'html.parser')

        title = soup.title.text.strip() if soup.title else "No Title"
        paragraphs = soup.find_all('p')
        content = ' '.join([p.get_text() for p in paragraphs if p.get_text().strip()])

        if any(k.lower() in content.lower() for k in keywords):
            bullets = extract_bullet_points(content, keywords)
            print(f"\n🔗 URL: {url}")
            print(f"📰 Title: {title}")
            print("📌 Reasons:")
            for b in bullets:
                print(b)
        else:
            print(f"⏭ Skipped (no keyword match): {url}")
    except Exception as e:
        print(f"❌ Failed to scrape {url}: {e}")

# 🌐 Expanded list of structured & relevant market news sources
sources = [
    {
        "name": "Reuters",
        "url": "https://www.reuters.com/business/",
        "filter": "/business/",
        "domain": "https://www.reuters.com"
    },
    {
        "name": "CNBC",
        "url": "https://www.cnbc.com/finance/",
        "filter": "/202",
        "domain": "https://www.cnbc.com"
    },
    {
        "name": "MarketWatch",
        "url": "https://www.marketwatch.com/latest-news?mod=top_nav",
        "filter": "/story/",
        "domain": "https://www.marketwatch.com"
    },
    {
        "name": "Screener",
        "url": "https://www.screener.in/news/",
        "filter": "/news/",
        "domain": "https://www.screener.in"
    },
    {
        "name": "Moneycontrol",
        "url": "https://www.moneycontrol.com/news/business/",
        "filter": "/news/",
        "domain": "https://www.moneycontrol.com"
    },
    {
        "name": "Yahoo Finance",
        "url": "https://finance.yahoo.com/",
        "filter": "/news/",
        "domain": "https://finance.yahoo.com"
    },
    {
        "name": "Economic Times",
        "url": "https://economictimes.indiatimes.com/markets",
        "filter": "/markets/",
        "domain": "https://economictimes.indiatimes.com"
    },
    {
        "name": "LiveMint",
        "url": "https://www.livemint.com/market",
        "filter": "/market/",
        "domain": "https://www.livemint.com"
    }
]

if __name__ == "__main__":
    # 👇👇👇 INPUT LINE HERE 👇👇👇
    keywords_input = input("🔍 Enter keywords (comma-separated): ")
    keywords = [k.strip() for k in keywords_input.split(',') if k.strip()]

    all_urls = []
    print(f"\n📡 Scanning multiple sites for mentions of: {', '.join(keywords)}\n")

    for source in sources:
        print(f"🌐 {source['name']} ...")
        urls = get_article_links(source['url'], source['filter'], source['domain'])
        all_urls.extend(urls)
        time.sleep(1)

    print(f"\n✅ Scraped {len(all_urls)} total articles. Now filtering...\n")
    for url in all_urls:
        scrape_and_filter_article(url, keywords)
        time.sleep(1.5)


🔍 Enter keywords (comma-separated): hdfc

📡 Scanning multiple sites for mentions of: hdfc

🌐 Reuters ...
🌐 CNBC ...
🌐 MarketWatch ...
🌐 Screener ...
🌐 Moneycontrol ...
🌐 Yahoo Finance ...
🌐 Economic Times ...
🌐 LiveMint ...

✅ Scraped 25 total articles. Now filtering...

⏭ Skipped (no keyword match): https://www.cnbc.com/2023/05/11/market-strategist-survey-forecast.html
⏭ Skipped (no keyword match): https://www.cnbc.com/2025/03/19/cnbc-pro-stock-lists-here-are-the-latest-stocks-including-all-weather-plays.html
⏭ Skipped (no keyword match): https://www.cnbc.com/2025/04/11/stocks-making-the-biggest-moves-midday-aapl-stla-blk-ulcc-and-jpm.html
⏭ Skipped (no keyword match): https://www.cnbc.com/2025/04/11/jamie-dimon-expects-sp-500-earnings-estimates-to-fall-amid-uncertainty.html
⏭ Skipped (no keyword match): https://www.cnbc.com/2025/04/11/tariffs-spell-trouble-for-vcs-amid-klarna-stubhub-ipo-delays.html
⏭ Skipped (no keyword match): https://www.moneycontrol.com/news/
⏭ Skipped (no keywor