In [12]:
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
import json
import requests
from typing import Dict, List, Optional, Tuple
import sqlite3
import hashlib
import time
import re
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import logging
import random

from mycongif import ConfigManager

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class RobustStockNewsEventsFetcher:
    def __init__(self, db_path: str = "db/stock_news.db"):
        self.db_path = db_path
        self.session = requests.Session()
        self.apikeys = ConfigManager("private/apikeys.ini")
        
        # Rotate user agents to avoid detection
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0'
        ]
        
        self.init_database()
        
        # Updated news sources with better strategies
        self.news_sources = {
            'yahoo_finance_api': {
                'function': self._fetch_yahoo_news_api,
                'description': 'Yahoo Finance News API'
            },
            'alpha_vantage_news': {
                'function': self._fetch_alpha_vantage_news,
                'description': 'Alpha Vantage News API'
            },
            'newsapi': {
                'function': self._fetch_newsapi,
                'description': 'NewsAPI.org'
            },
            # 'financial_modeling_prep': {
            #     'function': self._fetch_fmp_news,
            #     'description': 'Financial Modeling Prep'
            # },
            'polygon_news': {
                'function': self._fetch_polygon_news,
                'description': 'Polygon.io News'
            }
        }
    
    def init_database(self):
        """Initialize SQLite database with required tables"""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        
        # Create news table
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS news_articles (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                ticker TEXT NOT NULL,
                title TEXT NOT NULL,
                url TEXT NOT NULL,
                content TEXT,
                summary TEXT,
                publisher TEXT,
                author TEXT,
                publish_date DATETIME,
                scraped_date DATETIME DEFAULT CURRENT_TIMESTAMP,
                source_site TEXT,
                content_hash TEXT UNIQUE,
                sentiment_score REAL,
                UNIQUE(url, ticker)
            )
        ''')
        
        # Create events table
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS stock_events (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                ticker TEXT NOT NULL,
                event_date DATE,
                event_type TEXT,
                event_description TEXT,
                event_value REAL,
                scraped_date DATETIME DEFAULT CURRENT_TIMESTAMP,
                UNIQUE(ticker, event_date, event_type, event_description)
            )
        ''')
        
        # Create price movements table
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS price_movements (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                ticker TEXT NOT NULL,
                movement_date DATE,
                price_change_percent REAL,
                close_price REAL,
                volume INTEGER,
                movement_type TEXT,
                scraped_date DATETIME DEFAULT CURRENT_TIMESTAMP,
                UNIQUE(ticker, movement_date)
            )
        ''')
        
        # Create indexes for better performance
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_news_ticker_date ON news_articles(ticker, publish_date)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_events_ticker_date ON stock_events(ticker, event_date)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_movements_ticker_date ON price_movements(ticker, movement_date)')
        
        conn.commit()
        conn.close()
    
    def get_content_hash(self, content: str) -> str:
        """Generate MD5 hash of content to detect duplicates"""
        return hashlib.md5(content.encode('utf-8')).hexdigest()
    
    def is_news_exists(self, url: str, ticker: str, content_hash: str) -> bool:
        """Check if news article already exists in database"""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        
        cursor.execute('''
            SELECT id FROM news_articles 
            WHERE (url = ? AND ticker = ?) OR content_hash = ?
        ''', (url, ticker, content_hash))
        
        exists = cursor.fetchone() is not None
        conn.close()
        return exists
    
    def save_news_to_db(self, news_data: Dict, ticker: str):
        """Save news article to database if it doesn't exist"""
        content_hash = self.get_content_hash(news_data.get('content', '') + news_data.get('title', ''))
        
        if self.is_news_exists(news_data.get('url', ''), ticker, content_hash):
            logger.info(f"News already exists: {news_data.get('title', 'Unknown')[:50]}...")
            return False
        
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        
        try:
            cursor.execute('''
                INSERT INTO news_articles 
                (ticker, title, url, content, summary, publisher, author, publish_date, source_site, content_hash)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                ticker,
                news_data.get('title', ''),
                news_data.get('url', ''),
                news_data.get('content', ''),
                news_data.get('summary', ''),
                news_data.get('publisher', ''),
                news_data.get('author', ''),
                news_data.get('publish_date'),
                news_data.get('source_site', ''),
                content_hash
            ))
            conn.commit()
            logger.info(f"Saved new news: {news_data.get('title', 'Unknown')[:50]}...")
            return True
        except sqlite3.IntegrityError:
            logger.info(f"Duplicate news detected: {news_data.get('title', 'Unknown')[:50]}...")
            return False
        finally:
            conn.close()
    
    def _get_random_headers(self):
        """Get random headers to avoid detection"""
        return {
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
    
    def _fetch_yahoo_news_api(self, ticker: str) -> List[Dict]:
        """Fetch news using Yahoo Finance's internal API"""
        articles = []
        try:
            # Try Yahoo's news API endpoint
            url = f"https://query1.finance.yahoo.com/v1/finance/search?q={ticker}"
            headers = self._get_random_headers()
            
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                data = response.json()
                # Process news if available in response
                if 'news' in data:
                    for item in data['news'][:10]:
                        articles.append({
                            'title': item.get('title', ''),
                            'url': item.get('link', ''),
                            'content': item.get('summary', ''),
                            'summary': item.get('summary', '')[:200] + '...' if len(item.get('summary', '')) > 200 else item.get('summary', ''),
                            'publisher': item.get('publisher', 'Yahoo Finance'),
                            'author': '',
                            'publish_date': self._convert_timestamp(item.get('providerPublishTime')),
                            'source_site': 'yahoo_finance_api'
                        })
            
            # Alternative: Try yfinance news (more reliable)
            try:
                stock = yf.Ticker(ticker)
                news = stock.news
                for item in news[:10]:
                    articles.append({
                        'title': item.get('title', ''),
                        'url': item.get('link', ''),
                        'content': item.get('summary', ''),
                        'summary': item.get('summary', '')[:200] + '...' if len(item.get('summary', '')) > 200 else item.get('summary', ''),
                        'publisher': item.get('publisher', 'Yahoo Finance'),
                        'author': '',
                        'publish_date': self._convert_timestamp(item.get('providerPublishTime')),
                        'source_site': 'yfinance_news'
                    })
            except Exception as e:
                logger.debug(f"yfinance news error: {e}")
                
        except Exception as e:
            logger.error(f"Yahoo News API error: {e}")
        
        return articles
    
    def _fetch_alpha_vantage_news(self, ticker: str) -> List[Dict]:
        """Fetch news from Alpha Vantage (requires API key)"""
        articles = []
        # Note: This requires an API key from Alpha Vantage
        # You can get a free one at https://www.alphavantage.co/support/#api-key
        
        api_key = self.apikeys.get_key("alpha_vantage", "apikey") # Replace with your API key
        if api_key == "YOUR_ALPHA_VANTAGE_API_KEY":
            logger.info("Alpha Vantage API key not provided, skipping...")
            return articles
        
        try:
            url = f"https://www.alphavantage.co/query?function=NEWS_SENTIMENT&tickers={ticker}&apikey={api_key}"
            response = requests.get(url, timeout=10)
            
            if response.status_code == 200:
                data = response.json()
                if 'feed' in data:
                    for item in data['feed'][:10]:
                        articles.append({
                            'title': item.get('title', ''),
                            'url': item.get('url', ''),
                            'content': item.get('summary', ''),
                            'summary': item.get('summary', '')[:200] + '...' if len(item.get('summary', '')) > 200 else item.get('summary', ''),
                            'publisher': item.get('source', 'Alpha Vantage'),
                            'author': item.get('authors', [''])[0] if item.get('authors') else '',
                            'publish_date': item.get('time_published', ''),
                            'source_site': 'alpha_vantage'
                        })
        except Exception as e:
            logger.error(f"Alpha Vantage error: {e}")
        
        return articles
    
    def _fetch_newsapi(self, ticker: str) -> List[Dict]:
        """Fetch news from NewsAPI.org (requires API key)"""
        articles = []
        # Note: This requires an API key from NewsAPI.org
        # You can get a free one at https://newsapi.org/
        
        api_key = self.apikeys.get_key("newsapi", "apikey")  # Replace with your API key
        if api_key == "YOUR_NEWSAPI_KEY":
            logger.info("NewsAPI key not provided, skipping...")
            return articles
        
        try:
            # Get company name for better search results
            stock = yf.Ticker(ticker)
            company_name = stock.info.get('longName', ticker)
            
            url = f"https://newsapi.org/v2/everything?q={company_name}&apiKey={api_key}&sortBy=publishedAt&pageSize=10"
            response = requests.get(url, timeout=10)
            
            if response.status_code == 200:
                data = response.json()
                if 'articles' in data:
                    for item in data['articles']:
                        articles.append({
                            'title': item.get('title', ''),
                            'url': item.get('url', ''),
                            'content': item.get('content', ''),
                            'summary': item.get('description', '')[:200] + '...' if len(item.get('description', '')) > 200 else item.get('description', ''),
                            'publisher': item.get('source', {}).get('name', 'NewsAPI'),
                            'author': item.get('author', ''),
                            'publish_date': item.get('publishedAt', ''),
                            'source_site': 'newsapi'
                        })
        except Exception as e:
            logger.error(f"NewsAPI error: {e}")
        
        return articles
    
    def _fetch_fmp_news(self, ticker: str) -> List[Dict]:
        return Exception()
        """Fetch news from Financial Modeling Prep (requires API key)"""
        articles = []
        # Note: This requires an API key from Financial Modeling Prep
        # You can get a free one at https://financialmodelingprep.com/
        
        api_key = self.apikeys.get_key("fmp", "apikey")  # Replace with your API key
        if api_key == "YOUR_FMP_API_KEY":
            logger.info("FMP API key not provided, skipping...")
            return articles
        
        try:
            url = f"https://financialmodelingprep.com/api/v3/stock_news?tickers={ticker}&limit=10&apikey={api_key}"
            response = requests.get(url, timeout=10)
            
            if response.status_code == 200:
                data = response.json()
                for item in data:
                    articles.append({
                        'title': item.get('title', ''),
                        'url': item.get('url', ''),
                        'content': item.get('text', ''),
                        'summary': item.get('text', '')[:200] + '...' if len(item.get('text', '')) > 200 else item.get('text', ''),
                        'publisher': item.get('site', 'FMP'),
                        'author': '',
                        'publish_date': item.get('publishedDate', ''),
                        'source_site': 'fmp'
                    })
        except Exception as e:
            logger.error(f"FMP error: {e}")
        
        return articles
    
    def _fetch_polygon_news(self, ticker: str) -> List[Dict]:
        """Fetch news from Polygon.io (requires API key)"""
        articles = []
        # Note: This requires an API key from Polygon.io
        # You can get a free one at https://polygon.io/
        
        api_key = self.apikeys.get_key("polygon", "apikey")  # Replace with your API key
        if api_key == "YOUR_POLYGON_API_KEY":
            logger.info("Polygon API key not provided, skipping...")
            return articles
        
        try:
            url = f"https://api.polygon.io/v2/reference/news?ticker={ticker}&limit=10&apikey={api_key}"
            response = requests.get(url, timeout=10)
            
            if response.status_code == 200:
                data = response.json()
                if 'results' in data:
                    for item in data['results']:
                        articles.append({
                            'title': item.get('title', ''),
                            'url': item.get('article_url', ''),
                            'content': item.get('description', ''),
                            'summary': item.get('description', '')[:200] + '...' if len(item.get('description', '')) > 200 else item.get('description', ''),
                            'publisher': item.get('publisher', {}).get('name', 'Polygon'),
                            'author': item.get('author', ''),
                            'publish_date': item.get('published_utc', ''),
                            'source_site': 'polygon'
                        })
        except Exception as e:
            logger.error(f"Polygon error: {e}")
        
        return articles
    
    def _convert_timestamp(self, timestamp):
        """Convert various timestamp formats to standard format"""
        if not timestamp:
            return datetime.now().strftime('%Y-%m-%d')
        
        try:
            if isinstance(timestamp, (int, float)):
                # Unix timestamp
                return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d')
            else:
                # String format
                return pd.to_datetime(timestamp).strftime('%Y-%m-%d')
        except:
            return datetime.now().strftime('%Y-%m-%d')
    
    def scrape_news_for_ticker(self, ticker: str, days_back: int = 30) -> int:
        """Scrape news from multiple sources for a given ticker"""
        total_new_articles = 0
        
        for source_name, source_config in self.news_sources.items():
            logger.info(f"Fetching from {source_config['description']} for {ticker}...")
            
            try:
                # Fetch articles using source-specific function
                articles = source_config['function'](ticker)
                
                # Save new articles to database
                new_count = 0
                for article in articles:
                    if self.save_news_to_db(article, ticker):
                        new_count += 1
                
                total_new_articles += new_count
                logger.info(f"Found {len(articles)} articles from {source_name}, {new_count} were new")
                
                # Be respectful to APIs
                time.sleep(2)
                
            except Exception as e:
                logger.error(f"Error fetching from {source_name} for {ticker}: {str(e)}")
                continue
        
        return total_new_articles
    
    def get_stock_data_and_news(self, ticker: str, years_back: int = 10) -> Dict:
        """Get comprehensive stock data including scraped news"""
        logger.info(f"Starting comprehensive data collection for {ticker}")
        
        # First, scrape fresh news
        new_articles_count = self.scrape_news_for_ticker(ticker, days_back=365)
        logger.info(f"Scraped {new_articles_count} new articles for {ticker}")
        
        # Get traditional yfinance data
        stock_data = self._get_yfinance_data(ticker, years_back)
        
        # Get news from database
        db_news = self._get_news_from_db(ticker, days_back=365)
        
        # Combine all data
        result = stock_data.copy()
        result['scraped_news'] = db_news
        result['scraped_news_count'] = len(db_news)
        result['new_articles_found'] = new_articles_count
        
        return result
    
    def _get_yfinance_data(self, ticker: str, years_back: int) -> Dict:
        """Get stock data from yfinance (existing functionality)"""
        try:
            stock = yf.Ticker(ticker)
            end_date = datetime.now()
            start_date = end_date - timedelta(days=years_back * 365)
            
            info = stock.info
            hist_data = stock.history(start=start_date, end=end_date)
            actions = stock.actions
            
            if not actions.empty:
                actions_filtered = actions[actions.index >= start_date.strftime('%Y-%m-%d')]
            else:
                actions_filtered = pd.DataFrame()
            
            significant_events = self._identify_significant_events(hist_data)
            
            return {
                'ticker': ticker,
                'company_name': info.get('longName', 'N/A'),
                'sector': info.get('sector', 'N/A'),
                'industry': info.get('industry', 'N/A'),
                'date_range': {
                    'start': start_date.strftime('%Y-%m-%d'),
                    'end': end_date.strftime('%Y-%m-%d')
                },
                'corporate_actions': self._format_actions(actions_filtered),
                'significant_price_events': significant_events,
                'summary_stats': {
                    'dividends_count': len(actions_filtered[actions_filtered['Dividends'] > 0]) if not actions_filtered.empty else 0,
                    'stock_splits_count': len(actions_filtered[actions_filtered['Stock Splits'] > 0]) if not actions_filtered.empty else 0,
                    'significant_events_count': len(significant_events)
                }
            }
        except Exception as e:
            logger.error(f"Error getting yfinance data: {e}")
            return {'error': str(e)}
    
    def _get_news_from_db(self, ticker: str, days_back: int = 365) -> List[Dict]:
        """Retrieve news articles from database"""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        
        cutoff_date = (datetime.now() - timedelta(days=days_back)).strftime('%Y-%m-%d')
        
        cursor.execute('''
            SELECT title, url, content, summary, publisher, author, publish_date, source_site, scraped_date
            FROM news_articles 
            WHERE ticker = ? AND (publish_date >= ? OR publish_date IS NULL)
            ORDER BY publish_date DESC, scraped_date DESC
        ''', (ticker, cutoff_date))
        
        rows = cursor.fetchall()
        conn.close()
        
        news_list = []
        for row in rows:
            news_list.append({
                'title': row[0],
                'url': row[1],
                'content': row[2],
                'summary': row[3],
                'publisher': row[4],
                'author': row[5],
                'publish_date': row[6],
                'source_site': row[7],
                'scraped_date': row[8]
            })
        
        return news_list
    
    def _format_actions(self, actions: pd.DataFrame) -> List[Dict]:
        """Format corporate actions data"""
        if actions.empty:
            return []
        
        formatted_actions = []
        for date, row in actions.iterrows():
            date_str = date.strftime('%Y-%m-%d') if hasattr(date, 'strftime') else str(date)
            
            if row['Dividends'] > 0:
                formatted_actions.append({
                    'date': date_str,
                    'type': 'Dividend',
                    'amount': float(row['Dividends'])
                })
            if row['Stock Splits'] > 0:
                formatted_actions.append({
                    'date': date_str,
                    'type': 'Stock Split',
                    'ratio': float(row['Stock Splits'])
                })
        
        return formatted_actions
    
    def _identify_significant_events(self, hist_data: pd.DataFrame, threshold: float = 0.05) -> List[Dict]:
        """Identify significant price movements"""
        if hist_data.empty:
            return []
        
        hist_data['Daily_Return'] = hist_data['Close'].pct_change()
        significant_days = hist_data[abs(hist_data['Daily_Return']) > threshold]
        
        events = []
        for date, row in significant_days.iterrows():
            date_str = date.strftime('%Y-%m-%d') if hasattr(date, 'strftime') else str(date)
            
            events.append({
                'date': date_str,
                'price_change_percent': round(float(row['Daily_Return']) * 100, 2),
                'close_price': round(float(row['Close']), 2),
                'volume': int(row['Volume']),
                'type': 'Significant Price Movement'
            })
        
        events.sort(key=lambda x: x['date'], reverse=True)
        return events[:50]
    
    def print_comprehensive_summary(self, data: Dict):
        """Print comprehensive summary including scraped news"""
        if 'error' in data:
            print(f"Error: {data['error']}")
            return
        
        print(f"\n=== Comprehensive Summary for {data['ticker']} ({data['company_name']}) ===")
        print(f"Sector: {data['sector']}")
        print(f"Industry: {data['industry']}")
        print(f"Date Range: {data['date_range']['start']} to {data['date_range']['end']}")
        
        print(f"\n=== News Summary ===")
        print(f"  • Total Scraped Articles: {data['scraped_news_count']}")
        print(f"  • New Articles Found: {data['new_articles_found']}")
        print(f"  • Corporate Actions: {data['summary_stats']['dividends_count']} dividends, {data['summary_stats']['stock_splits_count']} splits")
        print(f"  • Significant Price Events: {data['summary_stats']['significant_events_count']}")
        
        # Show recent scraped news
        if data['scraped_news']:
            print(f"\n=== Recent News (Top 5) ===")
            for i, news in enumerate(data['scraped_news'][:5]):
                print(f"{i+1}. {news['title']}")
                print(f"   Source: {news['publisher']} ({news['source_site']}) | Date: {news['publish_date']}")
                print(f"   URL: {news['url']}")
                if news['summary']:
                    print(f"   Summary: {news['summary'][:150]}...")
                print()
        
        # Setup instructions
        print(f"\n=== Setup Instructions for Full Functionality ===")
        print("To get news from all sources, you'll need API keys (all have free tiers):")
        print("1. Alpha Vantage: https://www.alphavantage.co/support/#api-key")
        print("2. NewsAPI: https://newsapi.org/")
        print("3. Financial Modeling Prep: https://financialmodelingprep.com/")
        print("4. Polygon.io: https://polygon.io/")
        print("\nReplace the 'YOUR_*_API_KEY' placeholders in the code with your actual keys.")
    
    def export_to_json(self, data: Dict, filename: str = None):
        """Export data to JSON file"""
        if filename is None:
            filename = f"{data.get('ticker', 'unknown')}_comprehensive_data_{datetime.now().strftime('%Y%m%d')}.json"
        
        # Make data JSON serializable
        json_data = self._make_json_serializable(data)
        
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(json_data, f, indent=2, ensure_ascii=False, default=str)
        
        print(f"Data exported to {filename}")
    
    def _make_json_serializable(self, obj):
        """Convert objects to JSON serializable format"""
        if isinstance(obj, dict):
            return {key: self._make_json_serializable(value) for key, value in obj.items()}
        elif isinstance(obj, list):
            return [self._make_json_serializable(item) for item in obj]
        elif isinstance(obj, pd.Timestamp):
            return obj.strftime('%Y-%m-%d %H:%M:%S')
        elif isinstance(obj, datetime):
            return obj.strftime('%Y-%m-%d %H:%M:%S')
        elif hasattr(obj, 'item'):
            return obj.item()
        elif pd.isna(obj):
            return None
        else:
            return obj


# Example usage
def main():
    # Initialize the enhanced fetcher
    fetcher = RobustStockNewsEventsFetcher()
    
    # Example: Get comprehensive data for Boeing
    ticker = "BA"
    
    print("=== Robust Stock News and Events Fetcher ===")
    print("Features:")
    print("  • Multiple API-based news sources (more reliable than web scraping)")
    print("  • SQLite database storage with duplicate detection")
    print("  • Comprehensive stock event tracking")
    print("  • Fallback strategies for when sources fail")
    print("\nRequired packages: pip install yfinance pandas beautifulsoup4 requests")
    print(f"\nFetching comprehensive data for {ticker}...")
    
    # Get comprehensive data
    data = fetcher.get_stock_data_and_news(ticker, years_back=2)
    
    # Print comprehensive summary
    fetcher.print_comprehensive_summary(data)
    
    # Export to JSON
    fetcher.export_to_json(data)

In [13]:
# Initialize fetcher
fetcher = RobustStockNewsEventsFetcher()
    
# Example: Get comprehensive data for Boeing
ticker = "BA"

print("=== Robust Stock News and Events Fetcher ===")
print("Features:")
print("  • Multiple API-based news sources (more reliable than web scraping)")
print("  • SQLite database storage with duplicate detection")
print("  • Comprehensive stock event tracking")
print("  • Fallback strategies for when sources fail")
print("\nRequired packages: pip install yfinance pandas beautifulsoup4 requests")
print(f"\nFetching comprehensive data for {ticker}...")

# Get comprehensive data
data = fetcher.get_stock_data_and_news(ticker, years_back=2)

# Print comprehensive summary
fetcher.print_comprehensive_summary(data)

# Export to JSON
fetcher.export_to_json(data)

INFO:__main__:Starting comprehensive data collection for BA
INFO:__main__:Fetching from Yahoo Finance News API for BA...


Configuration loaded from: private/apikeys.ini
=== Robust Stock News and Events Fetcher ===
Features:
  • Multiple API-based news sources (more reliable than web scraping)
  • SQLite database storage with duplicate detection
  • Comprehensive stock event tracking
  • Fallback strategies for when sources fail

Required packages: pip install yfinance pandas beautifulsoup4 requests

Fetching comprehensive data for BA...


INFO:__main__:News already exists: ...
INFO:__main__:News already exists: ...
INFO:__main__:News already exists: ...
INFO:__main__:News already exists: ...
INFO:__main__:News already exists: ...
INFO:__main__:News already exists: ...
INFO:__main__:News already exists: ...
INFO:__main__:News already exists: ...
INFO:__main__:News already exists: ...
INFO:__main__:News already exists: ...
INFO:__main__:Found 10 articles from yahoo_finance_api, 0 were new
INFO:__main__:Fetching from Alpha Vantage News API for BA...
INFO:__main__:Saved new news: INVESTOR ALERT: Pomerantz Law Firm Investigates Cl...
INFO:__main__:Saved new news: Wall Street Hits Record Highs, Nike Jumps 18%: Wha...
INFO:__main__:Saved new news: RTX Clinches a $250M Deal From MELCO to Produce ES...
INFO:__main__:Saved new news: Howmet vs. Textron: Which Aerospace & Defense Stoc...
INFO:__main__:Saved new news: Lockheed Secures a $250M Contract Involving F-35 F...
INFO:__main__:Saved new news: A CAPITOL FOURTH CELEBRATES 45 Y


=== Comprehensive Summary for BA (The Boeing Company) ===
Sector: Industrials
Industry: Aerospace & Defense
Date Range: 2023-06-29 to 2025-06-28

=== News Summary ===
  • Total Scraped Articles: 29
  • New Articles Found: 28
  • Corporate Actions: 0 dividends, 0 splits
  • Significant Price Events: 15

=== Recent News (Top 5) ===
1. INVESTOR ALERT: Pomerantz Law Firm Investigates Claims On Behalf of Investors of The Boeing Company - BA - Boeing  ( NYSE:BA ) 
   Source: Benzinga (alpha_vantage) | Date: 20250628T140000
   URL: https://www.benzinga.com/pressreleases/25/06/g46157463/investor-alert-pomerantz-law-firm-investigates-claims-on-behalf-of-investors-of-the-boeing-company
   Summary: NEW YORK, June 28, 2025 ( GLOBE NEWSWIRE ) -- Pomerantz LLP is investigating claims on behalf of investors of The Boeing Company ( "Boeing" or the "Co...

2. Wall Street Hits Record Highs, Nike Jumps 18%: What's Moving Markets Friday? Wall Street Hits Record Highs, Nike Jumps 18%: What's Moving Market

In [14]:
sqldata = fetcher._get_news_from_db(ticker="BA")

In [19]:
sqldata[1]

{'title': "Wall Street Hits Record Highs, Nike Jumps 18%: What's Moving Markets Friday? Wall Street Hits Record Highs, Nike Jumps 18%: What's Moving Markets Friday? - Boeing  ( NYSE:BA ) ",
 'url': 'https://www.benzinga.com/markets/equities/25/06/46147570/markets-today-news-wall-street-dow-jones-nasdaq-sp500-nike',
 'content': "S&P 500 and Nasdaq 100 hit new record highs on trade optimism. Nike jumps 18% after earnings beat, leading Dow's charge toward 44,000. Market-moving news hits Benzinga Pro first-get a 30-minute edge and save 60% this 4th of July.",
 'summary': "S&P 500 and Nasdaq 100 hit new record highs on trade optimism. Nike jumps 18% after earnings beat, leading Dow's charge toward 44,000. Market-moving news hits Benzinga Pro first-get a 30-minute edge a...",
 'publisher': 'Benzinga',
 'author': 'Piero Cingari',
 'publish_date': '20250627T171124',
 'source_site': 'alpha_vantage',
 'scraped_date': '2025-06-28 22:06:51'}

In [35]:
ticker = "BA"
company_name = "The Boeing Company"
api_key = apikeys.get_key("newsapi", "apikey")
# url = f"https://financialmodelingprep.com/api/v3/stock_news?tickers={ticker}&limit=10&apikey={api_key}"
url = f"https://newsapi.org/v2/everything?q={company_name}&apiKey={api_key}&sortBy=publishedAt&pageSize=100"
response = requests.get(url, timeout=100000)
data = response.json()

In [42]:
data

{'status': 'ok',
 'totalResults': 857,
 'articles': [{'source': {'id': None, 'name': 'Expansion.com'},
   'author': 'Antonio Santamaría',
   'title': 'Así invierten los jeques que quieren dominar el mundo',
   'description': 'Qatar y Abu Dabi están presentes desde hace décadas en el capital de grandes empresas occidentales a través de sus fondos soberanos. Arabia Saudí sigue sus pasos para reducir su dependencia del petróleo.\xa0Leer',
   'url': 'https://www.expansion.com/empresas/2025/06/28/685ed7d3468aebf61a8b4589.html',
   'urlToImage': 'https://phantom-expansion.uecdn.es/02136266517ab87d0803235963f2a7bf/crop/22x24/2023x1359/resize/1200/f/webp/assets/multimedia/imagenes/2025/06/27/17510459782938.jpg',
   'publishedAt': '2025-06-27T22:34:11Z',
   'content': 'Qatar y Abu Dabi están presentes desde hace décadas en el capital de grandes empresas occidentales a través de sus fondos soberanos. Arabia Saudí sigue sus pasos para reducir su dependencia del petró… [+20389 chars]'},
  {'source

In [41]:
data["articles"][0]["content"]

'Qatar y Abu Dabi están presentes desde hace décadas en el capital de grandes empresas occidentales a través de sus fondos soberanos. Arabia Saudí sigue sus pasos para reducir su dependencia del petró… [+20389 chars]'

In [30]:
# "UL6SJRQMIVMO4IQR"

from mycongif import ConfigManager
apikeys = ConfigManager("private/apikeys.ini")
api_key = apikeys.get_key("newsapi", "apikey")
api_key

Configuration loaded from: private/apikeys.ini


'27a6e8043c1640528c513db3b78a4ebe'

In [10]:
apikeys.add_key("polygon", "apikey", "vOV6QumNVGPIwq2pjRCf1U0MS0APOFDA")
apikeys.add_key("polygon", "url_apikey", "https://polygon.io")
apikeys.save_config()

Created new section: [polygon]
Added/Updated key: [polygon]apikey = vOV6QumNVGPIwq2pjRCf1U0MS0APOFDA
Added/Updated key: [polygon]url_apikey = https://polygon.io
Configuration saved to: private/apikeys.ini


In [77]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import time
import json
import re
from typing import List, Dict, Tuple, Optional
import yfinance as yf
import logging

logger = logging.getLogger(__name__)


class YahooFinanceData:
    """Class for retrieving and managing Yahoo Finance company data"""
    
    def __init__(self):
        """Initialize Yahoo Finance data handler"""
        self._cache = {}
        self._cache_timeout = 300  # 5 minutes cache
    
    def get_company_info(self, ticker: str, use_cache: bool = True) -> Dict:
        """
        Get comprehensive company information from Yahoo Finance
        
        Args:
            ticker: Stock ticker symbol
            use_cache: Whether to use cached data if available
            
        Returns:
            Dictionary with comprehensive company information
        """
        ticker = ticker.upper()
        
        # Check cache first
        if use_cache and self._is_cached_data_valid(ticker):
            logger.info(f"Using cached data for {ticker}")
            return self._cache[ticker]['data']
        
        try:
            stock = yf.Ticker(ticker)
            info = stock.info
            
            # Get basic company information
            company_data = {
                'ticker': ticker,
                'longName': info.get('longName', ticker),
                'shortName': info.get('shortName', ticker),
                'sector': info.get('sector', ''),
                'industry': info.get('industry', ''),
                'website': info.get('website', ''),
                'country': info.get('country', ''),
                'city': info.get('city', ''),
                'state': info.get('state', ''),
                'exchange': info.get('exchange', ''),
                'currency': info.get('currency', ''),
                'marketCap': info.get('marketCap', 0),
                'employees': info.get('fullTimeEmployees', 0),
                'businessSummary': info.get('longBusinessSummary', ''),
                
                # Financial metrics
                'currentPrice': info.get('currentPrice', 0),
                'previousClose': info.get('previousClose', 0),
                'open': info.get('open', 0),
                'dayLow': info.get('dayLow', 0),
                'dayHigh': info.get('dayHigh', 0),
                'volume': info.get('volume', 0),
                'averageVolume': info.get('averageVolume', 0),
                'fiftyTwoWeekLow': info.get('fiftyTwoWeekLow', 0),
                'fiftyTwoWeekHigh': info.get('fiftyTwoWeekHigh', 0),
                'peRatio': info.get('trailingPE', 0),
                'forwardPE': info.get('forwardPE', 0),
                'dividendRate': info.get('dividendRate', 0),
                'dividendYield': info.get('dividendYield', 0),
                'beta': info.get('beta', 0),
                'enterpriseValue': info.get('enterpriseValue', 0),
                'priceToBook': info.get('priceToBook', 0),
                'returnOnEquity': info.get('returnOnEquity', 0),
                'returnOnAssets': info.get('returnOnAssets', 0),
                'debtToEquity': info.get('debtToEquity', 0),
                'grossMargins': info.get('grossMargins', 0),
                'operatingMargins': info.get('operatingMargins', 0),
                'profitMargins': info.get('profitMargins', 0),
                
                # Revenue and earnings
                'totalRevenue': info.get('totalRevenue', 0),
                'revenueGrowth': info.get('revenueGrowth', 0),
                'earningsGrowth': info.get('earningsGrowth', 0),
                'earningsQuarterlyGrowth': info.get('earningsQuarterlyGrowth', 0),
                
                # Analyst recommendations
                'recommendationKey': info.get('recommendationKey', ''),
                'recommendationMean': info.get('recommendationMean', 0),
                'numberOfAnalystOpinions': info.get('numberOfAnalystOpinions', 0),
                'targetHighPrice': info.get('targetHighPrice', 0),
                'targetLowPrice': info.get('targetLowPrice', 0),
                'targetMeanPrice': info.get('targetMeanPrice', 0),
                'targetMedianPrice': info.get('targetMedianPrice', 0),
                
                # Additional fields
                'lastFiscalYearEnd': info.get('lastFiscalYearEnd', 0),
                'nextFiscalYearEnd': info.get('nextFiscalYearEnd', 0),
                'mostRecentQuarter': info.get('mostRecentQuarter', 0),
                
                # Metadata
                'data_fetch_timestamp': time.time()
            }
            
            # Cache the data
            self._cache[ticker] = {
                'data': company_data,
                'timestamp': time.time()
            }
            
            logger.info(f"Retrieved comprehensive company info for {ticker}: {company_data['longName']}")
            return company_data
            
        except Exception as e:
            logger.warning(f"Failed to get company info for {ticker}: {e}")
            # Return minimal data structure
            minimal_data = {
                'ticker': ticker,
                'longName': ticker,
                'shortName': ticker,
                'sector': '',
                'industry': '',
                'data_fetch_timestamp': time.time()
            }
            return minimal_data
    
    def get_financial_summary(self, ticker: str) -> Dict:
        """
        Get a comprehensive financial summary for the company
        
        Args:
            ticker: Stock ticker symbol
            
        Returns:
            Dictionary with formatted financial metrics and ratios
        """
        try:
            company_info = self.get_company_info(ticker)
            
            # Calculate additional metrics
            current_price = company_info.get('currentPrice', 0)
            previous_close = company_info.get('previousClose', 0)
            price_change = current_price - previous_close if current_price and previous_close else 0
            price_change_percent = (price_change / previous_close * 100) if previous_close else 0
            
            # Market cap formatting
            market_cap = company_info.get('marketCap', 0)
            market_cap_formatted = self._format_large_number(market_cap)
            
            # Enterprise value formatting
            enterprise_value = company_info.get('enterpriseValue', 0)
            enterprise_value_formatted = self._format_large_number(enterprise_value)
            
            # Revenue formatting
            total_revenue = company_info.get('totalRevenue', 0)
            total_revenue_formatted = self._format_large_number(total_revenue)
            
            summary = {
                'basic_info': {
                    'ticker': company_info['ticker'],
                    'company_name': company_info['longName'],
                    'sector': company_info['sector'],
                    'industry': company_info['industry'],
                    'country': company_info['country'],
                    'website': company_info['website'],
                    'employees': company_info['employees']
                },
                'price_data': {
                    'current_price': current_price,
                    'previous_close': previous_close,
                    'price_change': round(price_change, 2),
                    'price_change_percent': round(price_change_percent, 2),
                    'day_low': company_info['dayLow'],
                    'day_high': company_info['dayHigh'],
                    'fifty_two_week_low': company_info['fiftyTwoWeekLow'],
                    'fifty_two_week_high': company_info['fiftyTwoWeekHigh'],
                    'volume': company_info['volume'],
                    'average_volume': company_info['averageVolume']
                },
                'valuation_metrics': {
                    'market_cap': market_cap,
                    'market_cap_formatted': market_cap_formatted,
                    'enterprise_value': enterprise_value,
                    'enterprise_value_formatted': enterprise_value_formatted,
                    'pe_ratio': company_info['peRatio'],
                    'forward_pe': company_info['forwardPE'],
                    'price_to_book': company_info['priceToBook'],
                    'beta': company_info['beta']
                },
                'financial_metrics': {
                    'total_revenue': total_revenue,
                    'total_revenue_formatted': total_revenue_formatted,
                    'revenue_growth': company_info['revenueGrowth'],
                    'earnings_growth': company_info['earningsGrowth'],
                    'gross_margins': company_info['grossMargins'],
                    'operating_margins': company_info['operatingMargins'],
                    'profit_margins': company_info['profitMargins'],
                    'return_on_equity': company_info['returnOnEquity'],
                    'return_on_assets': company_info['returnOnAssets'],
                    'debt_to_equity': company_info['debtToEquity']
                },
                'dividend_info': {
                    'dividend_rate': company_info['dividendRate'],
                    'dividend_yield': company_info['dividendYield']
                },
                'analyst_data': {
                    'recommendation': company_info['recommendationKey'],
                    'recommendation_mean': company_info['recommendationMean'],
                    'analyst_count': company_info['numberOfAnalystOpinions'],
                    'target_high': company_info['targetHighPrice'],
                    'target_low': company_info['targetLowPrice'],
                    'target_mean': company_info['targetMeanPrice'],
                    'target_median': company_info['targetMedianPrice']
                },
                'business_summary': company_info['businessSummary']
            }
            
            return summary
            
        except Exception as e:
            logger.error(f"Error getting financial summary for {ticker}: {e}")
            return {'error': str(e)}
    
    def get_search_terms(self, ticker: str) -> List[str]:
        """
        Generate search terms based on company information
        
        Args:
            ticker: Stock ticker symbol
            
        Returns:
            List of search terms for news queries
        """
        try:
            company_info = self.get_company_info(ticker)
            terms = []
            
            # Primary ticker symbol
            terms.append(ticker.upper())
            
            # Company names
            long_name = company_info.get('longName', '')
            if long_name and long_name != ticker and len(long_name) > 3:
                terms.append(f'"{long_name}"')
            
            short_name = company_info.get('shortName', '')
            if short_name and short_name not in [ticker, long_name] and len(short_name) > 3:
                terms.append(f'"{short_name}"')
            
            # Sector-specific terms for better coverage
            sector = company_info.get('sector', '')
            if sector and any(keyword in long_name.lower() for keyword in ['bank', 'financial', 'insurance']):
                terms.append(f'"{long_name}" OR "{ticker}" financial')
            elif sector and any(keyword in long_name.lower() for keyword in ['tech', 'software', 'cloud']):
                terms.append(f'"{long_name}" OR "{ticker}" technology')
            
            # Limit to avoid rate limiting and improve relevance
            return terms[:3]
            
        except Exception as e:
            logger.warning(f"Error generating search terms for {ticker}: {e}")
            return [ticker.upper()]
    
    def get_company_context(self, ticker: str) -> Dict:
        """
        Get essential company context for news articles
        
        Args:
            ticker: Stock ticker symbol
            
        Returns:
            Dictionary with essential company context
        """
        try:
            company_info = self.get_company_info(ticker)
            return {
                'ticker': company_info['ticker'],
                'company_name': company_info['longName'],
                'sector': company_info['sector'],
                'industry': company_info['industry'],
                'current_price': company_info.get('currentPrice', 0),
                'market_cap': company_info.get('marketCap', 0),
                'market_cap_formatted': self._format_large_number(company_info.get('marketCap', 0))
            }
        except Exception as e:
            logger.warning(f"Error getting company context for {ticker}: {e}")
            return {
                'ticker': ticker.upper(),
                'company_name': ticker,
                'sector': '',
                'industry': ''
            }
    
    def _is_cached_data_valid(self, ticker: str) -> bool:
        """Check if cached data is still valid"""
        if ticker not in self._cache:
            return False
        
        cache_age = time.time() - self._cache[ticker]['timestamp']
        return cache_age < self._cache_timeout
    
    def _format_large_number(self, number: float) -> str:
        """Format large numbers with appropriate suffixes"""
        if not number:
            return "N/A"
        
        if number >= 1e12:
            return f"${number/1e12:.2f}T"
        elif number >= 1e9:
            return f"${number/1e9:.2f}B"
        elif number >= 1e6:
            return f"${number/1e6:.2f}M"
        elif number >= 1e3:
            return f"${number/1e3:.2f}K"
        else:
            return f"${number:.2f}"
    
    def clear_cache(self):
        """Clear the internal cache"""
        self._cache.clear()
        logger.info("Yahoo Finance cache cleared")


class NewsWire:
    """Enhanced NewsWire class for fetching complete news articles with full content extraction"""
    
    def __init__(self, apikeys_manager, yahoo_finance_data: Optional[YahooFinanceData] = None):
        """
        Initialize NewsWire with API keys manager and optional Yahoo Finance data
        
        Args:
            apikeys_manager: Object that provides get_key(service, key_name) method
            yahoo_finance_data: Optional YahooFinanceData instance for company information
        """
        self.apikeys = apikeys_manager
        self.yahoo_finance = yahoo_finance_data or YahooFinanceData()
        
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Cache-Control': 'max-age=0'
        })
        
        # Domains to skip for content extraction
        self.problematic_domains = {
            'twitter.com', 'x.com', 'facebook.com', 'instagram.com', 
            'youtube.com', 'linkedin.com', 'reddit.com', 'tiktok.com'
        }
        
        # Common unwanted CSS selectors
        self.unwanted_selectors = [
            'script', 'style', 'nav', 'header', 'footer', 'aside', 
            '.advertisement', '.ad', '.ads', '.social-share', 
            '.related-articles', '.comments', '.newsletter-signup',
            '[class*="ad"]', '[id*="ad"]', '.sidebar', '.menu',
            '.cookie-banner', '.popup', '.modal'
        ]
        
        # Article content selectors (ordered by specificity)
        self.article_selectors = [
            'article[role="main"]',
            'main article',
            '[data-module="ArticleBody"]',
            '.article-body__content',
            '.story-body__inner',
            '.entry-content',
            '.post-content', 
            '.article-content',
            '.story-content',
            '.article-text',
            '.post-body',
            '.content-body',
            '.article-body',
            'article',
            '[role="main"]',
            'main .content',
            '.main-content',
            '.content'
        ]

    def filter_company_name(self, name):
        stopwords = ["The", "Company", "Inc"]
        for n in stopwords:
            name = name.replace(n, "")
        return name.strip()

    def fetch_newsapi(self, ticker: str, page_size: int = 10, include_company_data: bool = True) -> Dict:
        """
        Fetch news from NewsAPI.org with full article content extraction
        
        Args:
            ticker: Stock ticker symbol
            page_size: Number of articles to fetch (max 100)
            include_company_data: Whether to include Yahoo Finance company data
            
        Returns:
            Dictionary containing articles and optional company information
        """
        result = {
            'ticker': ticker.upper(),
            'articles': [],
            'company_info': {},
            'extraction_stats': {},
            'fetch_timestamp': time.time()
        }
        
        api_key = self.apikeys.get_key("newsapi", "apikey")
        if not api_key or api_key == "YOUR_NEWSAPI_KEY":
            logger.info("NewsAPI key not provided, skipping...")
            return result
        
        try:
            # Get search terms from Yahoo Finance data if available
            search_terms = [self.filter_company_name(self.yahoo_finance.get_company_info("BA")["longName"])]# self.yahoo_finance.get_search_terms(ticker)
            logger.info(f"Using search terms: {search_terms}")
            
            # Get company context for articles
            company_context = self.yahoo_finance.get_company_context(ticker)
            
            # Include full company data if requested
            if include_company_data:
                result['company_info'] = self.yahoo_finance.get_company_info(ticker)
            
            articles = []
            for term in search_terms:
                term_articles = self._fetch_articles_for_term(term, api_key, page_size // len(search_terms))
                articles.extend(term_articles)
                
                # Break if we have enough articles
                if len(articles) >= page_size:
                    break
            
            # Remove duplicates and limit to requested size
            articles = self._deduplicate_articles(articles)[:page_size]
            
            # Add company context to each article
            for article in articles:
                article['company_info'] = company_context
            
            # Extract full content for each article
            self._extract_full_content_batch(articles)
            
            result['articles'] = articles
            result['extraction_stats'] = self.get_extraction_stats(articles)
            
            logger.info(f"Successfully fetched {len(articles)} articles for {ticker}")
            
        except Exception as e:
            logger.error(f"NewsAPI fetch error: {e}")
            result['error'] = str(e)
        
        return result

    def _fetch_articles_for_term(self, term: str, api_key: str, limit: int) -> List[Dict]:
        """Fetch articles for a specific search term"""
        articles = []
        
        try:
            url = (f"https://newsapi.org/v2/everything?"
                   f"q={term}&"
                   f"apiKey={api_key}&"
                   f"sortBy=publishedAt&"
                   f"pageSize={limit}&"
                   f"language=en")
            print(url)
            
            response = self.session.get(url, timeout=15)
            print(response)
            response.raise_for_status()
            
            data = response.json()
            print(data)
            if data.get('status') == 'ok' and 'articles' in data:
                for item in data['articles']:
                    article = self._create_article_dict(item)
                    if self._is_valid_article(article):
                        articles.append(article)
            
        except Exception as e:
            logger.error(f"Error fetchisdfng articles for term '{term}': {e}")
        
        return articles

    def _create_article_dict(self, item: Dict) -> Dict:
        """Create standardized article dictionary with enhanced metadata"""
        return {
            'title': item.get('title', '').strip(),
            'url': item.get('url', ''),
            'content': item.get('content', ''),
            'summary': item.get('description', ''),  # Full summary without truncation
            'publisher': item.get('source', {}).get('name', 'NewsAPI'),
            'author': item.get('author', ''),
            'publish_date': item.get('publishedAt', ''),
            'source_site': 'newsapi',
            'image_url': item.get('urlToImage', ''),
            'extraction_success': False,
            'content_length': 0,
            'extraction_method': None,
            'fetch_timestamp': time.time(),
            'company_info': {}  # Will be populated later
        }

    def _is_valid_article(self, article: Dict) -> bool:
        """Check if article meets basic quality criteria"""
        return (article.get('title') and 
                article.get('url') and 
                len(article.get('title', '')) > 10 and
                not self._is_problematic_url(article.get('url', '')))

    def _is_problematic_url(self, url: str) -> bool:
        """Check if URL is from a problematic domain"""
        if not url:
            return True
        
        try:
            domain = urlparse(url).netloc.lower()
            return any(prob_domain in domain for prob_domain in self.problematic_domains)
        except:
            return True

    def _deduplicate_articles(self, articles: List[Dict]) -> List[Dict]:
        """Remove duplicate articles based on URL and title similarity"""
        seen_urls = set()
        seen_titles = set()
        unique_articles = []
        
        for article in articles:
            url = article.get('url', '')
            title = article.get('title', '').lower().strip()
            
            if url not in seen_urls and title not in seen_titles:
                seen_urls.add(url)
                seen_titles.add(title)
                unique_articles.append(article)
        
        return unique_articles

    def _extract_full_content_batch(self, articles: List[Dict]) -> None:
        """Extract full content for a batch of articles"""
        for i, article in enumerate(articles):
            try:
                full_content, method = self._fetch_full_article_content(article['url'])
                
                if full_content:
                    article['content'] = full_content
                    article['extraction_success'] = True
                    article['content_length'] = len(full_content)
                    article['extraction_method'] = method
                    logger.info(f"Successfully extracted content from {article['url']} using {method}")
                else:
                    logger.warning(f"Failed to extract content from {article['url']}")
                
                # Rate limiting - be respectful to websites
                if i < len(articles) - 1:  # Don't sleep after the last article
                    time.sleep(1.5)
                    
            except Exception as e:
                logger.error(f"Error extracting content from {article['url']}: {e}")

    def _fetch_full_article_content(self, url: str) -> Tuple[str, Optional[str]]:
        """
        Fetch full article content from URL
        
        Returns:
            Tuple of (content, extraction_method)
        """
        if not url or self._is_problematic_url(url):
            return "", None
        
        try:
            response = self.session.get(url, timeout=20, allow_redirects=True)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'html.parser')
            self._remove_unwanted_elements(soup)
            
            # Try multiple extraction strategies
            content, method = self._extract_content_with_strategies(soup)
            
            if content and len(content.strip()) > 200:
                cleaned_content = self._clean_extracted_text(content)
                if len(cleaned_content) > 200:
                    return cleaned_content, method
                    
        except Exception as e:
            logger.warning(f"Failed to fetch content from {url}: {e}")
        
        return "", None

    def _remove_unwanted_elements(self, soup: BeautifulSoup) -> None:
        """Remove unwanted elements from the soup"""
        for selector in self.unwanted_selectors:
            for element in soup.select(selector):
                element.decompose()

    def _extract_content_with_strategies(self, soup: BeautifulSoup) -> Tuple[str, Optional[str]]:
        """Try multiple content extraction strategies"""
        
        # Strategy 1: JSON-LD structured data
        content = self._extract_from_json_ld(soup)
        if content:
            return content, "json-ld"
        
        # Strategy 2: Article-specific selectors
        content = self._extract_from_selectors(soup)
        if content:
            return content, "css-selectors"
        
        # Strategy 3: Paragraph-based extraction
        content = self._extract_from_paragraphs(soup)
        if content:
            return content, "paragraphs"
        
        # Strategy 4: Fallback to body text
        content = self._extract_from_body(soup)
        if content:
            return content, "body-fallback"
        
        return "", None

    def _extract_from_json_ld(self, soup: BeautifulSoup) -> str:
        """Extract content from JSON-LD structured data"""
        json_scripts = soup.find_all('script', type='application/ld+json')
        
        for script in json_scripts:
            try:
                data = json.loads(script.string)
                if isinstance(data, list):
                    data = data[0] if data else {}
                
                if isinstance(data, dict):
                    # Try different JSON-LD properties
                    for prop in ['articleBody', 'text', 'description']:
                        if prop in data and isinstance(data[prop], str):
                            content = data[prop].strip()
                            if len(content) > 300:
                                return content
                                
            except (json.JSONDecodeError, TypeError, IndexError):
                continue
        
        return ""

    def _extract_from_selectors(self, soup: BeautifulSoup) -> str:
        """Extract content using CSS selectors"""
        for selector in self.article_selectors:
            elements = soup.select(selector)
            if elements:
                text = elements[0].get_text(separator=' ', strip=True)
                if len(text) > 300:
                    return text
        return ""

    def _extract_from_paragraphs(self, soup: BeautifulSoup) -> str:
        """Extract content from paragraph elements"""
        paragraphs = soup.find_all('p')
        if not paragraphs:
            return ""
        
        good_paragraphs = []
        skip_phrases = {
            'subscribe', 'newsletter', 'cookie', 'privacy policy', 
            'terms of service', 'follow us', 'share this', 'advertisement'
        }
        
        for p in paragraphs:
            text = p.get_text(strip=True)
            if (len(text) > 40 and 
                not any(phrase in text.lower() for phrase in skip_phrases)):
                good_paragraphs.append(text)
        
        if len(good_paragraphs) >= 3:
            return ' '.join(good_paragraphs)
        
        return ""

    def _extract_from_body(self, soup: BeautifulSoup) -> str:
        """Fallback: extract from body element"""
        body = soup.find('body')
        if body:
            text = body.get_text(separator=' ', strip=True)
            if len(text) > 300:
                return text
        return ""

    def _clean_extracted_text(self, text: str) -> str:
        """Clean and normalize extracted text"""
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text)
        
        # Remove common unwanted phrases
        unwanted_patterns = [
            r'Advertisement\s*',
            r'ADVERTISEMENT\s*',
            r'Skip to main content\s*',
            r'Subscribe to our newsletter\s*',
            r'Sign up for our newsletter\s*',
            r'Follow us on\s*',
            r'Share this article\s*',
            r'Related articles?\s*',
            r'More from this author\s*',
            r'Click here to\s*',
            r'Read more:?\s*'
        ]
        
        for pattern in unwanted_patterns:
            text = re.sub(pattern, '', text, flags=re.IGNORECASE)
        
        return text.strip()

    def get_extraction_stats(self, articles: List[Dict]) -> Dict:
        """Get statistics about content extraction success"""
        if not articles:
            return {}
        
        total = len(articles)
        successful = sum(1 for a in articles if a.get('extraction_success'))
        methods = {}
        
        for article in articles:
            method = article.get('extraction_method')
            if method:
                methods[method] = methods.get(method, 0) + 1
        
        avg_length = sum(a.get('content_length', 0) for a in articles) / total if total > 0 else 0
        
        return {
            'total_articles': total,
            'successful_extractions': successful,
            'success_rate': successful / total if total > 0 else 0,
            'extraction_methods': methods,
            'average_content_length': int(avg_length)
        }

    def __del__(self):
        """Clean up session when object is destroyed"""
        if hasattr(self, 'session'):
            self.session.close()

In [78]:
# Initialize
newswire = NewsWire(apikeys)

# Get detailed financial summary
financial_summary = newswire.yahoo_finance.get_financial_summary("BA")
print(f"Current Price: ${financial_summary['price_data']['current_price']}")
print(f"Market Cap: {financial_summary['valuation_metrics']['market_cap_formatted']}")

# Get comprehensive data
result = newswire.fetch_newsapi("BA", page_size=15)

# # Access company information
# company_info = result['company_info']
# print(f"Market Cap: ${company_info['marketCap']:,}")
# print(f"P/E Ratio: {company_info['peRatio']}")


# Access articles with company context
for article in result['articles']:
    print(f"Title: {article['title']}")
    print(f"Company: {article['company_info']['company_name']}")
    print(f"Content Length: {article['content_length']} chars")

INFO:__main__:Retrieved comprehensive company info for BA: The Boeing Company
INFO:__main__:Using cached data for BA
INFO:__main__:Using search terms: ['Boeing']
INFO:__main__:Using cached data for BA
INFO:__main__:Using cached data for BA


Current Price: $214.55
Market Cap: $161.77B
https://newsapi.org/v2/everything?q=Boeing&apiKey=27a6e8043c1640528c513db3b78a4ebe&sortBy=publishedAt&pageSize=15&language=en


ERROR:__main__:Error fetchisdfng articles for term 'Boeing': Expecting value: line 1 column 1 (char 0)
INFO:__main__:Successfully fetched 0 articles for BA


<Response [200]>


In [72]:
newswire.yahoo_finance.get_company_info("BA")["longName"]


INFO:__main__:Retrieved comprehensive company info for BA: The Boeing Company


'The Boeing Company'