In [1]:
import pandas as pd
import newspaper
import re
import time
import requests
import json
import os
from datetime import datetime
from urllib.parse import quote, urlparse
import logging

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("article_search.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

class ArticleFinder:
    def __init__(self, api_key, search_engine_id, output_dir="downloaded_articles"):
        """
        Initialize the ArticleFinder with required credentials and settings
        
        Args:
            api_key: Google API key
            search_engine_id: Google Custom Search Engine ID
            output_dir: Directory to save HTML files
        """
        self.api_key = api_key
        self.search_engine_id = search_engine_id
        self.output_dir = output_dir
        
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        logger.info(f"HTML files will be saved to {output_dir}")
        
        # Track API usage to avoid exceeding limits
        self.search_count = 0
        self.results = []
    
    def extract_slug(self, url):
        """Extract the article slug from RSS feed URLs"""
        # Try multiple patterns to extract slugs from different URL formats

        # Pattern for specific sections (newstopstories, techtopstories, etc.)
        match = re.search(r'usatoday-(\w+)~(.*?)(?:/|$)', url)
        if match:
            slug = match.group(2)
            search_term = slug.replace('-', ' ')
            return slug, search_term

        # Pattern for nation-topstories format
        match = re.search(r'usatodaycom(\w+)-topstories~(.*?)(?:/|$)', url)
        if match:
            slug = match.group(2)
            search_term = slug.replace('-', ' ')
            return slug, search_term

        # Last resort: Extract the last segment of the URL path
        parts = url.rstrip('/').split('/')
        if parts and len(parts) > 0:
            last_segment = parts[-1]
            # Check if it looks like a slug (contains dashes)
            if '-' in last_segment and not last_segment.startswith('~'):
                search_term = last_segment.replace('-', ' ')
                return last_segment, search_term

        return None, None
    
    def dedupe_by_slug(urls, extract_slug_func):
        seen_slugs = set()
        unique_urls = []

        for url in urls:
            slug, _ = extract_slug_func(url)
            if slug and slug not in seen_slugs:
                seen_slugs.add(slug)
                unique_urls.append(url)

        return unique_urls

    def search_for_article(self, search_term, site="usatoday.com"):
        """
        Search for an article using Google's Custom Search API
        
        Args:
            search_term: The term to search for
            site: The site to restrict search to
            
        Returns:
            The URL of the first search result, or None if no results
        """
        # Properly format the search query with site restriction
        query = f"site:{site} {search_term}"
        logger.info(f"Searching for: {query}")
        
        # Track API usage (Google CSE has limits)
        self.search_count += 1
        if self.search_count % 10 == 0:
            logger.info(f"Search API count: {self.search_count}")
        
        # Custom Search API URL
        search_url = "https://www.googleapis.com/customsearch/v1"
        
        # Parameters for the API request
        params = {
            "key": self.api_key,
            "cx": self.search_engine_id,
            "q": query,
            "num": 5  # Request 5 results
        }
        
        try:
            # Make the API request with proper error handling
            response = requests.get(search_url, params=params)
            
            if response.status_code == 200:
                data = response.json()
                
                # Check if we have search items
                if "items" in data and len(data["items"]) > 0:
                    # Get the first result URL
                    result_url = data["items"][0]["link"]
                    result_title = data["items"][0]["title"]
                    logger.info(f"Found article: {result_title} at {result_url}")
                    return result_url, result_title
                else:
                    logger.warning(f"No search results found for '{query}'")
                    if "searchInformation" in data:
                        logger.info(f"Total results: {data['searchInformation'].get('totalResults', 0)}")
            else:
                # Detailed error logging
                logger.error(f"Search API error: {response.status_code}")
                logger.error(f"Error details: {response.text}")
                
                # Handle specific error codes
                if response.status_code == 403:
                    logger.error("Error 403: API quota exceeded or invalid credentials")
                elif response.status_code == 429:
                    logger.warning("Error 429: Rate limit exceeded, waiting before retry")
                    time.sleep(10)  # Wait longer for rate limit
                    
        except Exception as e:
            logger.error(f"Search API exception: {str(e)}")
        
        return None, None
    
    def download_and_parse_article(self, url):
        """
        Download and parse an article using newspaper3k
        
        Args:
            url: URL of the article to download
            
        Returns:
            Dictionary with article data and success status
        """
        logger.info(f"Downloading article from: {url}")
        
        try:
            article = newspaper.Article(url)
            article.download()
            
            # Save the HTML content
            html_content = article.html
            
            # Create a filename based on the URL
            parsed_url = urlparse(url)
            domain = parsed_url.netloc.replace(".", "_")
            path = parsed_url.path.strip("/").replace("/", "_")
            if not path:
                path = "index"
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"{domain}_{path}_{timestamp}.html"
            filepath = os.path.join(self.output_dir, filename)
            
            # Save the HTML
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(html_content)
            
            # Parse the article
            article.parse()
            
            return {
                "url": url,
                "title": article.title,
                "text": article.text[:500] + "..." if len(article.text) > 500 else article.text,
                "publish_date": str(article.publish_date),
                "authors": article.authors,
                "html_saved_path": filepath,
                "html_size": len(html_content),
                "text_size": len(article.text),
                "success": True
            }
            
        except Exception as e:
            logger.error(f"Error downloading/parsing {url}: {str(e)}")
            return {
                "url": url,
                "error": str(e),
                "success": False
            }
    
    def process_rss_urls(self, urls, max_urls=None, results_file="article_results.jsonl"):
        """
        Process a list of RSS feed URLs to find and download the actual articles

        Args:
            urls: List of RSS feed URLs
            max_urls: Maximum number of URLs to process (None for all)
            results_file: JSONL file to save results to incrementally

        Returns:
            Pandas DataFrame with article data
        """
        self.results = []
        total_urls = len(urls)

        if max_urls:
            urls = urls[:max_urls]
            logger.info(f"Processing {len(urls)} of {total_urls} URLs")
        else:
            logger.info(f"Processing all {total_urls} URLs")

        # Create or truncate the JSONL file at the start
        with open(results_file, 'w') as f:
            f.write('')  # Just create/truncate the file

        for i, url in enumerate(urls):
            logger.info(f"Processing {i+1}/{len(urls)}: {url}")

            # Extract the slug
            slug, search_term = self.extract_slug(url)
            if not slug:
                logger.warning(f"Could not extract slug from URL: {url}")

                # Create failed entry
                result = {
                    "original_rss_url": url,
                    "slug": None,
                    "search_term": None,
                    "found_url": None,
                    "found_title": None,
                    "success": False,
                    "error": "Could not extract slug",
                    "timestamp": datetime.now().isoformat()
                }

                # Append to JSONL file
                self._append_to_jsonl(result, results_file)

                # Also keep in memory
                self.results.append(result)
                continue

            logger.info(f"Extracted slug: {slug}")
            logger.info(f"Search term: {search_term}")

            # Search for the article - just one simple search approach
            found_url, found_title = self.search_for_article(search_term)

            if not found_url:
                logger.warning(f"No article found for slug: {slug}")
                
                # Create failed entry
                result = {
                    "original_rss_url": url,
                    "slug": slug,
                    "search_term": search_term,
                    "found_url": None,
                    "found_title": None,
                    "success": False,
                    "error": "No article found",
                    "timestamp": datetime.now().isoformat()
                }
                
                # Append to JSONL file
                self._append_to_jsonl(result, results_file)
                
                # Also keep in memory
                self.results.append(result)
                continue

            # Download and parse the article
            article_data = self.download_and_parse_article(found_url)

            # Add original URL info and search results
            result = {
                "original_rss_url": url,
                "slug": slug,
                "search_term": search_term,
                "found_url": found_url,
                "found_title": found_title,
                "timestamp": datetime.now().isoformat()
            }
            result.update(article_data)

            # Append to JSONL file
            self._append_to_jsonl(result, results_file)

            # Also keep in memory
            self.results.append(result)

            # Be nice to servers
            time.sleep(2)

        # Create DataFrame from final results
        df = pd.DataFrame(self.results)

        # Save final results as CSV as well
        self.save_results("article_results.csv", df)

        return df

    def _append_to_jsonl(self, result, jsonl_file):
        """
        Append a single result to a JSONL file

        Args:
            result: Result dictionary to append
            jsonl_file: Path to JSONL file
        """
        # Create a copy of the result to avoid modifying the original
        result_copy = result.copy()

        # If the result contains HTML content, truncate it to save space
        if "html_content" in result_copy:
            result_copy["html_content"] = f"[HTML content truncated, {len(result_copy['html_content'])} bytes]"

        # If the result contains article text, truncate it if it's too long
        if "text" in result_copy and len(result_copy["text"]) > 1000:
            result_copy["text"] = result_copy["text"][:1000] + "..."

        # Append to the JSONL file
        with open(jsonl_file, 'a', encoding='utf-8') as f:
            f.write(json.dumps(result_copy) + '\n')

        logger.debug(f"Appended result for {result_copy.get('slug', 'unknown')} to {jsonl_file}")
    
    def save_results(self, filename="article_results.csv", df=None):
        """
        Save results to a CSV file using append mode

        Args:
            filename: Name of the CSV file
            df: DataFrame to save (uses self.results if None)

        Returns:
            The DataFrame that was saved
        """
        if df is None:
            df = pd.DataFrame(self.results)

        # Check if file exists to determine if header should be written
        file_exists = os.path.isfile(filename)

        # Append to file if it exists, create new file if it doesn't
        df.to_csv(filename, mode='a', header=not file_exists, index=False)

        logger.info(f"Results {'appended to' if file_exists else 'saved to'} {filename}")

        return df

In [2]:
# Example usage
if __name__ == "__main__":
    API_KEY = ""
    SEARCH_ENGINE_ID = ""

    with open('usat_urls.json', 'r', encoding='utf-8') as f:
        raw_urls = json.load(f)
    
    ## Already Processed
    # Path to your log file
    log_path = "article_search.log"

    # Read and extract rssfeeds URLs
    with open(log_path, "r") as f:
        log_data = f.read()

    # Regex pattern for rssfeeds URLs
    pattern = r"http[s]?://rssfeeds[^\s]+"
    rssfeed_urls = re.findall(pattern, log_data)

    rssfeed_urls = set(rssfeed_urls)
    unprocessed_urls = list(set(raw_urls) - rssfeed_urls)
    
    # Deduplicate using slugs
    finder = ArticleFinder(API_KEY, SEARCH_ENGINE_ID)
    seen_slugs = set()
    deduped_urls = []

    for url in unprocessed_urls:
        slug, _ = finder.extract_slug(url)
        if slug and slug not in seen_slugs:
            seen_slugs.add(slug)
            deduped_urls.append(url)

    # Take the desired slice
    urls = deduped_urls
    print(len(urls))

    # Now process
    df = finder.process_rss_urls(urls)

    print("\nResults Summary:")
    print(f"Total URLs processed: {len(df)}")
    print(f"Successfully retrieved: {df['success'].sum()}")
    print(f"Failed: {len(df) - df['success'].sum()}")
        
    # Show the DataFrame
    print("\nDataFrame Preview:")
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 1000)
    print(df[['slug', 'found_title', 'found_url', 'success', 'html_saved_path']].head())

2025-04-07 18:39:47,385 - INFO - HTML files will be saved to downloaded_articles
2025-04-07 18:39:47,392 - INFO - Processing all 1266 URLs
2025-04-07 18:39:47,508 - INFO - Processing 1/1266: http://rssfeeds.usatoday.com/~/723790304/0/usatodaycomnation-topstories~AP-Top-Stories-January-A/
2025-04-07 18:39:47,509 - INFO - Extracted slug: AP-Top-Stories-January-A
2025-04-07 18:39:47,509 - INFO - Search term: AP Top Stories January A
2025-04-07 18:39:47,509 - INFO - Searching for: site:usatoday.com AP Top Stories January A


1266


2025-04-07 18:39:48,316 - INFO - Found article: AP Top Stories January 25 A at https://www.usatoday.com/videos/news/nation/2024/01/25/ap-top-stories-january-25-a/72349433007/
2025-04-07 18:39:48,319 - INFO - Downloading article from: https://www.usatoday.com/videos/news/nation/2024/01/25/ap-top-stories-january-25-a/72349433007/
2025-04-07 18:39:49,602 - ERROR - Error downloading/parsing https://www.usatoday.com/videos/news/nation/2024/01/25/ap-top-stories-january-25-a/72349433007/: Article `download()` failed with 404 Client Error: OK for url: https://www.usatoday.com/videos/news/nation/2024/01/25/ap-top-stories-january-25-a/72349433007/ on URL https://www.usatoday.com/videos/news/nation/2024/01/25/ap-top-stories-january-25-a/72349433007/
2025-04-07 18:39:51,610 - INFO - Processing 2/1266: http://rssfeeds.usatoday.com/~/724774097/0/usatodaycomnation-topstories~Today-in-History-for-January-th/
2025-04-07 18:39:51,612 - INFO - Extracted slug: Today-in-History-for-January-th
2025-04-07 18

KeyboardInterrupt: 

In [12]:
with open('usat_urls.json', 'r', encoding='utf-8') as f:
        raw_urls = json.load(f)
    

log_path = "article_search.log"

with open(log_path, "r") as f:
    log_data = f.read()

pattern = r"http[s]?://rssfeeds[^\s]+"
rssfeed_urls = re.findall(pattern, log_data)

rssfeed_urls = list(set(rssfeed_urls))
unprocessed_urls =  list(set(raw_urls) - set(rssfeed_urls))
    
finder = ArticleFinder(API_KEY, SEARCH_ENGINE_ID)
seen_slugs = set()
deduped_urls = []

for url in unprocessed_urls:
    slug, _ = finder.extract_slug(url)
    if slug and slug not in seen_slugs:
        seen_slugs.add(slug)
        deduped_urls.append(url)

len(deduped_urls)

2025-04-04 10:14:53,858 - INFO - HTML files will be saved to downloaded_articles


22166

In [19]:
API_KEY = "AIzaSyBaC0E3rPFtQlfccgJJetBLB-abFMStqaA"
SEARCH_ENGINE_ID = "e3cd371a06a394127"

with open('usat_urls.json', 'r', encoding='utf-8') as f:
        raw_urls = json.load(f)

# Deduplicate using slugs
parser =  ArticleFinder(API_KEY, SEARCH_ENGINE_ID)

seen_slugs = set()
deduped_urls = []

for url in raw_urls:
    slug, _ = parser.extract_slug(url)
    if slug and slug not in seen_slugs:
        seen_slugs.add(slug)
        deduped_urls.append(url)
len(deduped_urls)

2025-04-02 11:09:04,377 - INFO - HTML files will be saved to downloaded_articles


38062

In [6]:
import os
import csv
import logging
from typing import List, Dict
from tqdm import tqdm
import pandas as pd
from bs4 import BeautifulSoup

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s: %(message)s',
    handlers=[
        logging.FileHandler('usa_today_extraction.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

def extract_metadata_from_filename(filename: str) -> dict:
    """
    Extract metadata from the filename
    
    Args:
        filename (str): Name of the HTML file
    
    Returns:
        dict: Extracted metadata from filename
    """
    # Remove file extension
    name_without_ext = os.path.splitext(filename)[0]
    
    # Split the filename by underscores
    parts = name_without_ext.split('_')
    
    # Try to extract meaningful parts
    metadata = {
        'domain': parts[0] if len(parts) > 0 else '',
        'source': ' '.join(parts[1:3]) if len(parts) > 2 else '',
        'date_in_filename': parts[-2] if len(parts) > 1 else '',
        'timestamp': parts[-1] if len(parts) > 0 else ''
    }
    
    return metadata

def extract_text_from_html(html_path: str) -> dict:
    """
    Extract text and metadata from an HTML file
    
    Args:
        html_path (str): Path to the HTML file
    
    Returns:
        Dict: Extracted information
    """
    try:
        # Read the HTML file
        with open(html_path, 'r', encoding='utf-8') as file:
            html_content = file.read()
        
        # Parse with BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')
        
        # Extract filename metadata
        filename = os.path.basename(html_path)
        filename_metadata = extract_metadata_from_filename(filename)
        
        # Try to extract text from various elements
        text_candidates = []
        
        # Try different text extraction methods
        if soup.body:
            # Extract text from paragraphs
            paragraphs = soup.find_all(['p', 'div'])
            text_candidates = [p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)]
        
        # Combine text candidates
        full_text = ' '.join(text_candidates)
        
        # Get title from title tag or first heading
        title = soup.title.string if soup.title else (soup.find(['h1', 'h2']) or {}).get_text(strip=True) or ''
        
        # Look for any potential meta tags
        meta_description = ''
        meta_tags = soup.find_all('meta')
        for tag in meta_tags:
            if tag.get('name', '').lower() in ['description', 'og:description']:
                meta_description = tag.get('content', '')
                break
        
        return {
            'filename': filename,
            'full_text': full_text,
            'title': title,
            'meta_description': meta_description,
            'domain': filename_metadata.get('domain', ''),
            'source': filename_metadata.get('source', ''),
            'date_in_filename': filename_metadata.get('date_in_filename', ''),
            'timestamp': filename_metadata.get('timestamp', ''),
            'file_path': html_path
        }
    except Exception as e:
        logger.error(f"Error processing {html_path}: {e}")
        return {
            'filename': os.path.basename(html_path),
            'error': str(e)
        }

def extract_articles_to_csv(folder_path: str, output_csv: str) -> None:
    """
    Extract information from all HTML files in a folder and save to CSV
    
    Args:
        folder_path (str): Path to the folder containing HTML files
        output_csv (str): Path to save the output CSV file
    """
    # Get all HTML files in the folder
    html_files = [
        os.path.join(folder_path, f) 
        for f in os.listdir(folder_path) 
        if f.endswith('.html')
    ]
    
    logger.info(f"Found {len(html_files)} HTML files to process")
    
    # Extract article info with progress bar
    articles_data = []
    skipped_files = 0
    
    for file in tqdm(html_files, desc="Extracting Articles", unit="file"):
        article = extract_text_from_html(file)
        
        # Filter out entries with errors or empty text
        if 'error' not in article and article.get('full_text', '').strip():
            articles_data.append(article)
        else:
            skipped_files += 1
    
    # Convert to DataFrame for easy CSV export
    df = pd.DataFrame(articles_data)
    
    # Save to CSV
    df.to_csv(output_csv, index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')
    
    # Log summary
    logger.info(f"Extracted {len(articles_data)} articles to {output_csv}")
    logger.info(f"Skipped {skipped_files} files due to errors or empty content")

# Example usage
if __name__ == "__main__":
    # Replace these with your actual paths
    input_folder = "downloaded_articles/"
    output_file = "usa_today_articles.csv"
    
    extract_articles_to_csv(input_folder, output_file)

2025-04-07 19:45:50,493 - INFO - Found 24624 HTML files to process
Extracting Articles:   1%|                | 176/24624 [00:02<05:07, 79.60file/s]2025-04-07 19:45:53,017 - ERROR - Error processing downloaded_articles/www_usatoday_com_videos_news_nation_2022_09_28_international-space-station-gets-new-commander_10454984002_20250331_200013.html: 'dict' object has no attribute 'get_text'
2025-04-07 19:45:53,029 - ERROR - Error processing downloaded_articles/www_usatoday_com_videos_news_nation_2022_07_04_heavy-rains-and-floods-prompt-sydney-evacuations_7801839001_20250331_143633.html: 'dict' object has no attribute 'get_text'
Extracting Articles:   1%|▏               | 195/24624 [00:02<04:47, 84.93file/s]2025-04-07 19:45:53,299 - ERROR - Error processing downloaded_articles/www_usatoday_com_videos_news_nation_2022_10_27_canine-cafe-offers-75-tasting-menu-pampered-pups_10612869002_20250401_193143.html: 'dict' object has no attribute 'get_text'
Extracting Articles:   1%|▏               | 227