In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timedelta
import re
from google import genai
import os

In [2]:
def _clean_text(tag):
    """Return the stripped text of ``tag``, or None if the tag is missing or has no text.

    ``get_text`` is used instead of ``.string`` because ``.string`` is None
    whenever a tag has more than one child (e.g. a title containing ``<b>``).
    """
    if tag is None:
        return None
    text = tag.get_text(" ", strip=True)
    return text or None


def get_article(article_url):
    """
    Scrapes detailed information from a Moneycontrol article page.

    Args:
        article_url (str): URL of the article.

    Returns:
        dict: Dictionary with the keys ``title``, ``summary``, ``timestamp``,
        ``author``, ``img_url``, ``content`` (each a str or None) and ``tags``
        (a list of str, possibly empty).

    Raises:
        RuntimeError: If the page cannot be downloaded or parsed. The original
            exception is attached as ``__cause__``.
    """
    try:
        response = requests.get(article_url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
        response.raise_for_status()
        # Decode leniently so a few invalid bytes do not abort the whole article.
        soup = BeautifulSoup(response.content.decode("utf-8", "ignore"), "html.parser")
    except Exception as e:
        # Chain the cause so the real failure stays visible in tracebacks.
        raise RuntimeError(f"Failed to fetch or parse article: {e}") from e

    article = {}

    # Title / summary
    article["title"] = _clean_text(soup.find(attrs={"class": "article_title"}))
    article["summary"] = _clean_text(soup.find(attrs={"class": "article_desc"}))

    # Timestamp: concatenate the stripped text of each direct child, no separator.
    time_tag = soup.find(attrs={"class": "article_schedule"})
    time_date_string = ""
    if time_tag:
        for element in time_tag.contents:
            if hasattr(element, 'string') and element.string and element.string.strip():
                time_date_string += element.string.strip()
    article["timestamp"] = time_date_string or None

    # Author: first <span> inside the content block
    article["author"] = _clean_text(soup.select_one(".content_block span"))

    # Image URL: taken from the "data-src" attribute only
    img_tag = soup.select_one(".article_image img")
    article["img_url"] = img_tag["data-src"] if img_tag and img_tag.has_attr("data-src") else None

    # Article content: non-empty paragraphs that are direct children of the wrapper
    paragraphs = (p.get_text(strip=True) for p in soup.select(".content_wrapper > p"))
    content = [text for text in paragraphs if text]
    article["content"] = " ".join(content) if content else None

    # Tags, with any leading '#' removed
    tag_texts = (a.get_text(strip=True) for a in soup.select(".tags_first_line > a"))
    article["tags"] = [text.lstrip("#") for text in tag_texts if text]

    return article


In [3]:
def get_news_links(pages=1):
    """
    Scrapes news article links from Moneycontrol's 'stocks' listing pages.

    Every anchor inside an element with class ``clearfix`` is resolved against
    the listing page URL. Only links under the 'markets' subsection are kept.

    Args:
        pages (int): Number of paginated pages to scrape. Page indices run
            from 0 to ``pages - 1``.

    Returns:
        list: Unique 'markets' article URLs in first-seen order, so repeated
        runs over the same pages return the same ordering.
    """
    base_url = "https://www.moneycontrol.com/news/business/stocks/page-{}/"
    market_prefix = "https://www.moneycontrol.com/news/business/markets/"

    headers = {'User-Agent': 'Mozilla/5.0'}
    # A dict works as an insertion-ordered set: de-duplicates, keeps order stable.
    unique_links = {}

    # NOTE(review): indices start at 0, so the first request is ".../page-0/".
    # Confirm the site serves that page and that it differs from page-1.
    for i in range(pages):
        url = base_url.format(i)
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            # Best effort: a failed listing page is reported and skipped.
            print(f"❌ Failed to fetch page {i}: {e}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        for container in soup.find_all(class_='clearfix'):
            for a in container.find_all('a', href=True):
                full_url = urljoin(url, a['href'])
                # Keep article links only; drop the bare section index itself.
                if full_url.startswith(market_prefix) and full_url != market_prefix:
                    unique_links.setdefault(full_url, None)

    return list(unique_links)


In [4]:
# Collect 'markets' article URLs from 10 listing pages (one HTTP request per page).
updated_links = get_news_links(10)

In [5]:
def scrape_articles_multithreaded(links, get_article_func, max_workers=10):
    """
    Run ``get_article_func`` over every link using a pool of worker threads.

    Results are gathered as each task finishes, so the output order follows
    completion order rather than the order of ``links``. A link whose call
    raises is reported on stdout and left out of the result.

    Args:
        links (list): URLs to process.
        get_article_func (callable): Called once per URL with that URL as its only argument.
        max_workers (int): Size of the thread pool.

    Returns:
        list: Return values of the calls that succeeded.
    """
    fetched = []

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Remember which URL each future belongs to, for the log lines below.
        pending = {}
        for url in links:
            pending[pool.submit(get_article_func, url)] = url

        for finished in as_completed(pending):
            url = pending[finished]
            try:
                fetched.append(finished.result())
                print(f"✅ Completed: {url}")
            except Exception as err:
                print(f"❌ Failed: {url} | Reason: {err}")

    return fetched


In [6]:
# Download and parse every collected link concurrently; failed links are printed and skipped.
all_articles_raw = scrape_articles_multithreaded(updated_links, get_article)

✅ Completed: https://www.moneycontrol.com/news/business/markets/bharat-forge-emerges-as-l1-bidder-in-army-s-tender-with-cqb-carbine-designed-by-drdo-13163413.html
✅ Completed: https://www.moneycontrol.com/news/business/markets/sebi-grants-investors-more-flexibility-to-co-invest-with-alternative-investment-funds-aifs-13138707.html
✅ Completed: https://www.moneycontrol.com/news/business/markets/m-m-shares-in-focus-as-it-incorporates-arm-mahindra-advanced-technologies-12988023.html
✅ Completed: https://www.moneycontrol.com/news/business/markets/bharat-electronics-shares-in-focus-on-rs-2-210-crore-order-win-from-defence-ministry-12987866.html
✅ Completed: https://www.moneycontrol.com/news/business/markets/mutual-fund-assets-double-to-31-of-bank-deposits-in-10-years-uday-kotak-says-savers-turning-investors-13152025.html
✅ Completed: https://www.moneycontrol.com/news/business/markets/wall-street-trades-lower-as-trump-threatens-china-with-steeper-tariffs-markets-near-bear-territory-12987885.h

In [7]:
def parse_article_timestamp(timestamp: str) -> datetime | None:
    """
    Parse a Moneycontrol-style timestamp into a datetime object.

    Args:
        timestamp (str): The raw timestamp string from the article.

    Returns:
        datetime | None: A datetime object if parsing is successful, else None.
    """
    try:
        if not timestamp:
            return None

        # Normalize string
        timestamp = timestamp.strip()
        # Remove trailing time zone and slashes, e.g., "/ 09:30 IST"
        timestamp = re.sub(r"/\s*\d{2}:\d{2}\s*IST", "", timestamp)

        # Extract date and (optional) time
        match = re.search(r'([A-Za-z]+\s+\d{1,2},\s+\d{4})/?\s*(\d{2}:\d{2})?', timestamp)
        if not match:
            return None

        date_part = match.group(1)
        time_part = match.group(2) or "00:00"

        full_string = f"{date_part} {time_part}"
        return datetime.strptime(full_string, "%B %d, %Y %H:%M")
    
    except Exception as e:
        print(f"❌ Error parsing timestamp: {timestamp} | {e}")
        return None

def filter_recent_articles(articles: list[dict], hours: int = 24) -> list[dict]:
    """
    Filter articles published in the last `hours` hours.

    Article timestamps are naive wall-clock values in Indian Standard Time
    (the scraped strings carry an "IST" suffix). The cutoff is therefore
    computed from the current time in IST, not from the machine's local
    clock, so the window is the same wherever the notebook runs.

    Args:
        articles (list): List of article dicts (each containing a 'timestamp' key).
        hours (int): Time range to filter in hours (default 24).

    Returns:
        list: Articles published within the given time window, in input order.
        Articles whose timestamp is missing or unparseable are dropped.
    """
    # Local import: the notebook's import cell only brings in datetime and timedelta.
    from datetime import timezone

    ist = timezone(timedelta(hours=5, minutes=30))
    # Drop tzinfo so the value compares against the naive parsed timestamps.
    now_ist = datetime.now(ist).replace(tzinfo=None)
    cutoff = now_ist - timedelta(hours=hours)

    recent = []
    for article in articles:
        published = parse_article_timestamp(article.get("timestamp", ""))
        if published and published >= cutoff:
            recent.append(article)

    return recent


# Keep only the scraped articles whose parsed timestamp falls within the last 24 hours.
recent_articles_raw = filter_recent_articles(all_articles_raw, hours=24)

In [8]:
def format_articles_to_string(articles: list[dict]) -> str:
    """
    Takes a list of article dicts and returns a formatted string with
    title, timestamp, and content for each article.

    Articles without usable content (key missing, None, not a string, or
    whitespace only) are skipped. A missing or empty title/timestamp is
    replaced by "No Title" / "No Timestamp". This also covers keys that are
    present but set to None.

    Args:
        articles (list): List of article dictionaries.

    Returns:
        str: The formatted articles joined by blank lines, each followed by
        an 80-dash separator. Empty string if nothing qualifies.
    """
    result = []
    for article in articles:
        content = article.get("content")

        # Skip articles with no content
        if not isinstance(content, str) or not content.strip():
            continue

        # `or` rather than a .get() default: the default would only apply when
        # the key is absent, not when its value is None.
        title = article.get("title") or "No Title"
        timestamp = article.get("timestamp") or "No Timestamp"

        formatted = f"📰 {title}\n🕒 {timestamp}\n\n{content.strip()}\n{'-' * 80}"
        result.append(formatted)

    return "\n\n".join(result)


In [9]:
# Render the recent articles (title, timestamp, content) as a single text blob.
recent_articles_text = format_articles_to_string(recent_articles_raw)

In [10]:
import os
import textwrap
from google import genai

def create_morning_report(recent_articles_text):
    """
    Ask Gemini to turn raw market news into a premarket report.

    Args:
        recent_articles_text (str): Formatted article text used as the raw
            input for the report.

    Returns:
        The ``text`` attribute of the ``generate_content`` response.

    Raises:
        ValueError: If ``recent_articles_text`` is empty or whitespace only.
            Without source material the model would have to invent prices and
            levels, so no request is made.
    """
    if not recent_articles_text or not recent_articles_text.strip():
        raise ValueError("No article text supplied; refusing to generate a report from empty input.")

    # Dedent the template so the model does not receive source-code indentation.
    # The article text is appended afterwards, so its own layout is untouched
    # and cannot affect the dedent.
    instructions = textwrap.dedent("""\
        You are a financial analyst generating a comprehensive Premarket Report for equity traders in India.

        Based on the following raw market news and updates, write a concise, actionable, and well-structured report suitable to be read by traders before the Indian stock market opens.

        The report should include:
        - 🔔 A crisp summary of global cues (GIFT Nifty, US markets, crude, gold, dollar index, bond yields, Asian markets)
        - 📊 Domestic market setup: Nifty/Sensex close, support/resistance levels, VIX, FII/DII flows, PCR
        - 🔍 Stocks in Focus with reason (news impact, earnings, regulatory update, deals, etc.)
        - 💹 Top Trading Ideas (stock, CMP, buy/sell, target, SL)
        - 📢 Corporate actions or events (dividends, bonus, board meetings, SME listings)
        - 🧾 Bulk/Block deals or fund flow highlights
        - ⚠️ Risks to watch (macro, geopolitical, etc.)
        - ✅ A strategy summary to guide the trading day

        Remove any fluff before the first emoji section like 🔔 or 📊.
        Format the output using emojis and headers to make it engaging and scannable. Keep the tone clear, professional, and trader-friendly.

        Here is the raw input:""")
    prompt = f"{instructions}\n\n{recent_articles_text}"

    # The API key is read from the environment, never from the notebook.
    client = genai.Client(api_key=os.getenv('GEMINI_API_KEY'))

    # Generate text
    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=prompt
    )

    return response.text


In [21]:
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import markdown

def send_email_report(report_text, subject="📈 Premarket Report", to_email="mdnishan006@gmail.com"):
    """
    Email a markdown report through Gmail's SMTP-over-SSL endpoint.

    The report is sent as ``multipart/alternative`` with the raw markdown as
    the plain-text part and its HTML rendering as the preferred part.

    Credentials come from the environment:
        GMAIL_APP_PASSWORD: Gmail app password for the sender account (required).
        GMAIL_ADDRESS: Sender address (optional; defaults to the previous
            hard-coded sender).

    Args:
        report_text (str): Report body in markdown.
        subject (str): Email subject line.
        to_email (str): Recipient address.

    Returns:
        None. Success or failure of the send is printed; SMTP errors are not raised.
    """
    # SECURITY: the app password used to be hard-coded here. Treat that value
    # as leaked and revoke it. Secrets must come from the environment.
    from_email = os.getenv("GMAIL_ADDRESS", "mdnishan006@gmail.com")
    app_password = os.getenv("GMAIL_APP_PASSWORD")
    if not app_password:
        print("❌ Failed to send email: GMAIL_APP_PASSWORD environment variable is not set.")
        return

    # Convert markdown to HTML
    report_html = markdown.markdown(report_text)

    # Construct the email
    msg = MIMEMultipart("alternative")
    msg['From'] = from_email
    msg['To'] = to_email
    msg['Subject'] = subject

    # Attach both versions. In multipart/alternative the last part is the
    # preferred one, so plain text goes first and HTML second.
    msg.attach(MIMEText(report_text, "plain", "utf-8"))
    msg.attach(MIMEText(report_html, "html", "utf-8"))

    # Send the email via Gmail SMTP
    try:
        with smtplib.SMTP_SSL("smtp.gmail.com", 465) as server:
            server.login(from_email, app_password)
            server.send_message(msg)
        print("✅ Email sent successfully.")
    except Exception as e:
        print(f"❌ Failed to send email: {e}")


In [None]:
# Generate the premarket report from the formatted article text, then email it.
morning_report = create_morning_report(recent_articles_text)
send_email_report(morning_report, subject="📈 Daily Premarket Report", to_email="mdnishan006@gmail.com")