In [1]:
!pip install feedparser

Defaulting to user installation because normal site-packages is not writeable


In [2]:
!pip install newspaper3k

Defaulting to user installation because normal site-packages is not writeable


In [3]:
!pip install feedparser requests beautifulsoup4 newspaper3k tqdm transformers torch yfinance pandas lxml[html_clean] regex

Defaulting to user installation because normal site-packages is not writeable


In [None]:
import os
import re
import time
import json
import requests
from datetime import datetime, timedelta
from urllib.parse import quote_plus
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import yfinance as yf
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from concurrent.futures import ThreadPoolExecutor, as_completed
import warnings
warnings.filterwarnings('ignore')

# ---------------- CONFIG ----------------
START_DATE = "2015-01-01"
END_DATE = "2023-12-31"
OUTPUT_DIR = "output_news"
CACHE_DIR = os.path.join(OUTPUT_DIR, "cache")
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(CACHE_DIR, exist_ok=True)

# GDELT parameters
GDELT_API_BASE = "https://api.gdeltproject.org/api/v2/doc/doc"

# Search queries for Nifty 50
SEARCH_QUERIES = [
    "Nifty 50 India",
    "NSE Nifty"
]

# Trusted news sources
PREFERRED_SOURCES = [
    "bloomberg.com",
    "reuters.com",
    "economictimes.indiatimes.com",
    "moneycontrol.com",
    "livemint.com",
    "business-standard.com",
    "thehindubusinessline.com"
]

# Performance settings
MAX_ARTICLES_PER_QUERY = 250
MAX_WORKERS = 10  # Parallel threads
REQUEST_DELAY = 0.2  # Reduced delay
BATCH_SIZE = 50  # Process in batches

# Models
SUMMARIZER_MODEL = "sshleifer/distilbart-cnn-12-6"
FINBERT_MODEL = "ProsusAI/finbert"
# ----------------------------------------

class FastGDELTCollector:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        self.cache_file = os.path.join(CACHE_DIR, "article_cache.json")
        self.article_cache = self.load_cache()

    def load_cache(self):
        """Load cached articles to avoid re-fetching"""
        if os.path.exists(self.cache_file):
            try:
                with open(self.cache_file, 'r') as f:
                    return json.load(f)
            except:
                return {}
        return {}

    def save_cache(self):
        """Save article cache"""
        with open(self.cache_file, 'w') as f:
            json.dump(self.article_cache, f)

    def query_gdelt(self, query, start_date, end_date, max_records=250):
        """Query GDELT for articles"""
        articles = []

        start_str = start_date.strftime("%Y%m%d000000")
        end_str = end_date.strftime("%Y%m%d235959")

        params = {
            "query": query,
            "mode": "artlist",
            "maxrecords": max_records,
            "format": "json",
            "startdatetime": start_str,
            "enddatetime": end_str,
            "sourcelang": "eng"
        }

        try:
            response = self.session.get(GDELT_API_BASE, params=params, timeout=30)

            if response.status_code == 200:
                data = response.json()
                articles = data.get("articles", [])

            time.sleep(REQUEST_DELAY)

        except Exception:
            pass

        return articles

    def collect_articles_by_period(self, queries, start_date, end_date, period_days=180):
        """Collect articles in larger periods for speed"""
        all_articles = []
        seen_urls = set()

        current_date = start_date
        periods = []

        while current_date <= end_date:
            period_end = min(current_date + timedelta(days=period_days), end_date)
            periods.append((current_date, period_end))
            current_date = period_end + timedelta(days=1)

        print(f"Collecting articles from {len(periods)} time periods...")

        for period_start, period_end in tqdm(periods, desc="Querying GDELT"):
            for query in queries:
                articles = self.query_gdelt(query, period_start, period_end, MAX_ARTICLES_PER_QUERY)

                for article in articles:
                    url = article.get("url", "")

                    if not url or url in seen_urls:
                        continue

                    domain = self.extract_domain(url)
                    is_preferred = any(source in domain for source in PREFERRED_SOURCES)

                    # Filter: only preferred sources for speed
                    if is_preferred:
                        article["is_preferred"] = True
                        article["domain"] = domain
                        seen_urls.add(url)
                        all_articles.append(article)

        print(f"Collected {len(all_articles)} unique articles from preferred sources")
        return all_articles

    def extract_domain(self, url):
        """Extract domain from URL"""
        try:
            from urllib.parse import urlparse
            return urlparse(url).netloc
        except:
            return ""

    def fetch_article_text(self, url):
        """Fetch article text with caching"""
        # Check cache first
        if url in self.article_cache:
            return self.article_cache[url]

        try:
            response = self.session.get(url, timeout=10)

            if response.status_code != 200:
                self.article_cache[url] = None
                return None

            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove unwanted elements
            for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
                tag.decompose()

            # Try article containers
            text = ""
            article = soup.find('article')
            if article:
                paragraphs = article.find_all('p')
                text = '\n'.join([p.get_text().strip() for p in paragraphs if len(p.get_text().strip()) > 30])

            # Fallback
            if len(text) < 100:
                paragraphs = soup.find_all('p')
                text = '\n'.join([p.get_text().strip() for p in paragraphs if len(p.get_text().strip()) > 30])

            text = re.sub(r'\s+', ' ', text).strip()

            if len(text) > 100:
                self.article_cache[url] = text
                return text

            self.article_cache[url] = None
            return None

        except Exception:
            self.article_cache[url] = None
            return None

def process_article_batch(articles, collector, summarizer, finbert_tok, finbert_model, device):
    """Process a batch of articles"""
    results = []

    for article in articles:
        url = article.get("url", "")
        title = article.get("title", "")
        seendate = article.get("seendate", "")
        domain = article.get("domain", "")

        try:
            # Parse date
            date = None
            if seendate:
                try:
                    date = datetime.strptime(seendate[:8], "%Y%m%d").date()
                except:
                    pass

            if not date:
                continue

            # Fetch text
            text = collector.fetch_article_text(url)

            if not text or len(text) < 200:
                continue

            text = re.sub(r'\s+', ' ', text).strip()

            # Quick summary (limit length for speed)
            try:
                summary = summarizer(
                    text[:800],
                    max_length=60,
                    min_length=20,
                    truncation=True,
                    do_sample=False
                )[0]["summary_text"]
            except Exception:
                summary = text[:200] + "..."

            # FinBERT sentiment
            try:
                inputs = finbert_tok(
                    text[:1500],
                    return_tensors="pt",
                    truncation=True,
                    max_length=512
                )

                if device == 0:
                    inputs = {k: v.to('cuda') for k, v in inputs.items()}

                with torch.no_grad():
                    logits = finbert_model(**inputs).logits[0].cpu().numpy()

                probs = torch.softmax(torch.tensor(logits), dim=-1).numpy()
                neg, neu, pos = probs
                score = float(pos - neg)

                if score > 0.05:
                    label = "positive"
                elif score < -0.05:
                    label = "negative"
                else:
                    label = "neutral"

            except Exception:
                label, pos, neu, neg, score = "neutral", 0.33, 0.34, 0.33, 0.0

            results.append({
                "date": date.isoformat(),
                "source": domain,
                "url": url,
                "title": title,
                "text": text[:1000],  # Truncate for storage
                "summary": summary,
                "sentiment_label": label,
                "sentiment_pos": float(pos),
                "sentiment_neu": float(neu),
                "sentiment_neg": float(neg),
                "sentiment_score": float(score),
                "is_preferred_source": True
            })

        except Exception:
            continue

    return results

def main():
    start_time = time.time()

    # Parse dates
    start_date = datetime.fromisoformat(START_DATE)
    end_date = datetime.fromisoformat(END_DATE)

    # Check for existing progress
    progress_file = os.path.join(CACHE_DIR, "progress.json")
    checkpoint_file = os.path.join(OUTPUT_DIR, "nifty_news_gdelt_raw_checkpoint.csv")

    if os.path.exists(checkpoint_file):
        print("Found existing checkpoint. Loading...")
        existing_df = pd.read_csv(checkpoint_file)
        print(f"Loaded {len(existing_df)} articles from checkpoint")
        response = input("Continue from checkpoint? (y/n): ")
        if response.lower() != 'y':
            existing_df = None
    else:
        existing_df = None

    # Load models
    print("\nLoading models...")
    device = 0 if torch.cuda.is_available() else -1

    summarizer = pipeline(
        "summarization",
        model=SUMMARIZER_MODEL,
        device=device,
        batch_size=8 if device == 0 else 1
    )

    finbert_tok = AutoTokenizer.from_pretrained(FINBERT_MODEL)
    finbert_model = AutoModelForSequenceClassification.from_pretrained(FINBERT_MODEL)
    finbert_model.eval()

    if torch.cuda.is_available():
        finbert_model.to('cuda')
        print("✓ Using GPU\n")
    else:
        print("✓ Using CPU\n")

    # Initialize collector
    collector = FastGDELTCollector()

    # Collect articles
    if existing_df is None:
        print("="*60)
        print("COLLECTING ARTICLES")
        print("="*60)

        articles = collector.collect_articles_by_period(
            SEARCH_QUERIES,
            start_date,
            end_date,
            period_days=180  # Larger periods for speed
        )

        if not articles:
            print("\nNo articles found. Exiting.")
            return

        # Sort by date
        articles.sort(key=lambda x: x.get('seendate', ''), reverse=True)

        # Process in batches with progress saving
        print(f"\n{'='*60}")
        print("PROCESSING ARTICLES")
        print('='*60)

        all_rows = []

        for i in tqdm(range(0, len(articles), BATCH_SIZE), desc="Processing batches"):
            batch = articles[i:i+BATCH_SIZE]

            rows = process_article_batch(
                batch, collector, summarizer,
                finbert_tok, finbert_model, device
            )

            all_rows.extend(rows)

            # Save checkpoint every 5 batches
            if (i // BATCH_SIZE) % 5 == 0 and all_rows:
                df_checkpoint = pd.DataFrame(all_rows)
                df_checkpoint.to_csv(checkpoint_file, index=False)

        # Save cache
        collector.save_cache()

        print(f"\n✓ Successfully processed: {len(all_rows)} articles")

        if not all_rows:
            print("\nNo valid articles. Exiting.")
            return

        df_raw = pd.DataFrame(all_rows)
    else:
        df_raw = existing_df

    # Clean and prepare data
    df_raw['date'] = pd.to_datetime(df_raw['date']).dt.date
    df_raw = df_raw.sort_values('date')
    df_raw = df_raw.drop_duplicates(subset=['url'])

    # Save raw articles
    raw_path = os.path.join(OUTPUT_DIR, "nifty_news_gdelt_raw.csv")
    df_raw.to_csv(raw_path, index=False)
    print(f"\n✓ Saved {len(df_raw)} articles to: {raw_path}")

    # Aggregate daily sentiment
    print("\nAggregating daily sentiment...")

    agg = df_raw.groupby('date').agg(
        n_items=('sentiment_score', 'count'),
        mean_sentiment_score=('sentiment_score', 'mean'),
        median_sentiment_score=('sentiment_score', 'median'),
        std_sentiment_score=('sentiment_score', 'std'),
        pos_pct=('sentiment_label', lambda s: (s == 'positive').sum() / len(s)),
        neg_pct=('sentiment_label', lambda s: (s == 'negative').sum() / len(s)),
        neutral_pct=('sentiment_label', lambda s: (s == 'neutral').sum() / len(s))
    ).reset_index()

    # Download Nifty 50 prices
    print("Downloading Nifty 50 price data...")

    try:
        nifty = yf.download(
            "^NSEI",
            start=START_DATE,
            end=(datetime.fromisoformat(END_DATE) + timedelta(days=1)).isoformat(),
            progress=False
        )

        if isinstance(nifty.columns, pd.MultiIndex):
            nifty.columns = nifty.columns.get_level_values(0)

        nifty = nifty.reset_index()[['Date', 'Open', 'High', 'Low', 'Close', 'Volume']]
        nifty.columns = ['date', 'nifty_open', 'nifty_high', 'nifty_low', 'nifty_close', 'nifty_volume']
        nifty['date'] = pd.to_datetime(nifty['date']).dt.date

        # Merge
        daily = pd.merge(agg, nifty, on='date', how='outer')
        daily = daily.sort_values('date')

        # Calculate returns
        daily['nifty_return_1d'] = daily['nifty_close'].pct_change()
        daily['nifty_return_5d'] = daily['nifty_close'].pct_change(periods=5)
        daily['nifty_return_20d'] = daily['nifty_close'].pct_change(periods=20)

        # Rolling sentiment
        daily['sentiment_ma_7d'] = daily['mean_sentiment_score'].rolling(7, min_periods=1).mean()
        daily['sentiment_ma_30d'] = daily['mean_sentiment_score'].rolling(30, min_periods=1).mean()

        daily_path = os.path.join(OUTPUT_DIR, "nifty_news_gdelt_daily.csv")
        daily.to_csv(daily_path, index=False)
        print(f"✓ Saved daily data to: {daily_path}")

    except Exception as e:
        print(f"✗ Error with price data: {e}")

    # Summary statistics
    elapsed = time.time() - start_time

    print("\n" + "="*60)
    print("SUMMARY")
    print("="*60)
    print(f"Total articles: {len(df_raw)}")
    print(f"Date range: {df_raw['date'].min()} to {df_raw['date'].max()}")
    print(f"Unique sources: {df_raw['source'].nunique()}")

    print(f"\nTop sources:")
    print(df_raw['source'].value_counts().head())

    print(f"\nSentiment distribution:")
    print(df_raw['sentiment_label'].value_counts())

    print(f"\nSentiment stats:")
    print(f"  Mean: {df_raw['sentiment_score'].mean():.4f}")
    print(f"  Median: {df_raw['sentiment_score'].median():.4f}")

    print(f"\nArticles per year:")
    df_raw['year'] = pd.to_datetime(df_raw['date']).dt.year
    print(df_raw['year'].value_counts().sort_index())
    
    print(f"\nTotal runtime: {elapsed/60:.1f} minutes")
    print(f"Output: {OUTPUT_DIR}")
    print("="*60)

    # Clean up checkpoint
    if os.path.exists(checkpoint_file):
        os.remove(checkpoint_file)

if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm



Loading models...
✓ Using CPU

COLLECTING ARTICLES
Collecting articles from 19 time periods...


Querying GDELT: 100%|██████████| 19/19 [01:03<00:00,  3.33s/it]


Collected 2285 unique articles from preferred sources

PROCESSING ARTICLES


Processing batches:  35%|███▍      | 16/46 [1:13:00<2:09:41, 259.39s/it]Your max_length is set to 60, but your input_length is only 48. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)
Processing batches:  93%|█████████▎| 43/46 [4:10:39<17:49, 356.66s/it]  Your max_length is set to 60, but your input_length is only 51. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=25)
Processing batches:  96%|█████████▌| 44/46 [4:16:41<11:56, 358.36s/it]Your max_length is set to 60, but your input_length is only 51. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=25)
Your max_length is set to 60, but your inp


✓ Successfully processed: 1942 articles

✓ Saved 1942 articles to: output_news\nifty_news_gdelt_raw.csv

Aggregating daily sentiment...
Downloading Nifty 50 price data...



1 Failed download:
['^NSEI']: ValueError('unconverted data remains: T00:00:00')


✓ Saved daily data to: output_news\nifty_news_gdelt_daily.csv

SUMMARY
Total articles: 1942
Date range: 2017-09-27 to 2024-01-01
Unique sources: 11

Top sources:
source
economictimes.indiatimes.com             915
www.business-standard.com                607
articles.economictimes.indiatimes.com    234
www.thehindubusinessline.com              66
hindi.business-standard.com               47
Name: count, dtype: int64

Sentiment distribution:
sentiment_label
positive    1440
negative     306
neutral      196
Name: count, dtype: int64

Sentiment stats:
  Mean: 0.3761
  Median: 0.6436

Articles per year:
year
2017     64
2018    402
2019    358
2020    335
2021    298
2022    130
2023    342
2024     13
Name: count, dtype: int64

Total runtime: 266.4 minutes
Output: output_news
