### Get transcript URLs for Fox News Channel (FNC)

In [8]:
import requests
import time
import logging
import pandas as pd

# Configure logging
# Records at INFO level and above are appended (filemode='a') to
# 'fox_news_transcript_scraper.log', a path relative to the working directory,
# so repeated runs accumulate in the same file.
# NOTE(review): logging.basicConfig does nothing when the root logger already
# has handlers - confirm the file handler is actually installed in this kernel.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='fox_news_transcript_scraper.log',
    filemode='a'
)
# Module-level logger used by the functions defined in the cells below
logger = logging.getLogger(__name__)

In [2]:
def fetch_fox_news_category_data(page_size=10, max_retries=3, delay=1):
    """
    Fetch article metadata from Fox News API for the transcript category.

    Pages through the article-search endpoint `page_size` items at a time,
    advancing the `from` offset, until an empty page is returned, a page
    cannot be fetched within the allowed attempts, or a response cannot be
    used (invalid JSON or a payload that is not a list).

    Args:
        page_size: Number of results per page (must be at least 1)
        max_retries: Maximum number of attempts per page; values below 1 are
            treated as 1 so that every page is requested at least once
        delay: Time to wait between pages in seconds; a failed attempt waits
            `delay * 2` seconds before the next attempt

    Returns:
        List of article metadata items from the transcript category. If
        collection stops early, the items gathered so far are returned.

    Raises:
        ValueError: If page_size is less than 1.
    """
    if page_size < 1:
        # A zero/negative page size would never advance the offset (and would
        # divide by zero when computing the page number for the log).
        raise ValueError(f"page_size must be at least 1, got {page_size}")

    # Guarantee at least one attempt so `response` is always defined below.
    attempts = max(1, max_retries)

    category_data = []
    start = 0

    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"
    }

    logger.info(f"Starting Fox News category API data collection (page size: {page_size})")

    while True:
        params = {
            'searchBy': 'categories',
            'values': 'fox-news/transcript',
            'size': page_size,
            'from': start
        }

        # Attempt request with retry logic; `response` stays None on failure
        response = None
        for attempt in range(attempts):
            try:
                response = requests.get(
                    'https://www.foxnews.com/api/article-search',
                    params=params,
                    headers=headers,
                    timeout=10
                )
                response.raise_for_status()  # Raise exception for bad status codes
                break
            except requests.exceptions.RequestException as e:
                # Discard a response that arrived with an error status code
                response = None
                logger.error(f"Request failed (attempt {attempt+1}/{attempts}): {e}")
                if attempt < attempts - 1:
                    time.sleep(delay * 2)  # Wait longer between retries

        if response is None:
            logger.error("Max retries reached. Exiting.")
            return category_data

        # Process successful response
        logger.info(f"Fetched page {start//page_size + 1} (from: {start}, status: {response.status_code})")

        try:
            data = response.json()
        except ValueError as e:
            logger.error(f"Failed to parse JSON response: {e}")
            break

        if not data:
            logger.info("No more results. Ending pagination.")
            break

        if not isinstance(data, list):
            # e.g. an error object: extending the list with a dict would
            # silently add its keys as if they were articles.
            logger.error(
                f"Unexpected response payload of type {type(data).__name__}; "
                "expected a list. Ending pagination."
            )
            break

        category_data.extend(data)
        logger.info(f"Added {len(data)} items from current page. Total: {len(category_data)}")

        # Prepare for next page
        start += page_size
        time.sleep(delay)  # Be polite to the server

    logger.info(f"Completed category data collection. Total items: {len(category_data)}")
    return category_data

In [None]:
# Page through the transcript category with the function defaults
# (page_size=10, max_retries=3, delay=1); the result is a list of metadata items.
categories = fetch_fox_news_category_data()

In [5]:
def process_category_data(categories):
    """
    Process raw category data into a clean DataFrame.

    Builds a DataFrame from the raw records and derives two columns from each
    record's 'category' entry:

    * cat_value: 'fox-news' followed by the part of the category URL after
      '/category' (e.g. '/category/shows/hannity/transcript' becomes
      'fox-news/shows/hannity/transcript')
    * cat_type: the category's 'name'

    Both derived values are None when the 'category' entry is not a dict or
    lacks the needed key. If the input is empty or has no 'category' field at
    all, the two columns are still created (all None) instead of raising.

    Args:
        categories: List of category data from the Fox News API

    Returns:
        DataFrame with processed category information
    """
    # Create initial DataFrame
    cat_df = pd.DataFrame(categories)

    if 'category' in cat_df.columns:
        category_col = cat_df['category']
    else:
        # Empty fetch result (or records without a category): keep the output
        # schema stable rather than failing on a missing column.
        logger.warning("No 'category' field found in input; cat_value and cat_type will be empty")
        category_col = pd.Series([None] * len(cat_df), index=cat_df.index, dtype=object)

    def _category_value(c):
        """Return the 'fox-news/...' value for one category dict, or None."""
        if isinstance(c, dict) and isinstance(c.get('url'), str):
            return 'fox-news' + c['url'].split('/category')[-1]
        return None

    def _category_type(c):
        """Return the category name for one category dict, or None."""
        if isinstance(c, dict) and 'name' in c:
            return c['name']
        return None

    # Extract category value and type
    cat_df['cat_value'] = category_col.apply(_category_value)
    cat_df['cat_type'] = category_col.apply(_category_type)

    # Log some information about the categories
    logger.info(f"Processed {len(cat_df)} items into {cat_df['cat_value'].nunique()} unique categories")

    return cat_df

In [9]:
# Build the category DataFrame from the metadata list fetched earlier
# (`categories`), then list the distinct category values found in it.
cat_df = process_category_data(categories)
cat_df.cat_value.unique()

array(['fox-news/transcript', 'fox-news/shows/fox-news-sunday/transcript',
       'fox-news/shows/sunday-morning-futures/transcript',
       'fox-news/shows/media-buzz/transcript',
       'fox-news/shows/life-liberty-levin/life-liberty-levin-transcript',
       'fox-news/shows/gutfeld/transcript-gutfeld',
       'fox-news/shows/ingraham-angle/transcript',
       'fox-news/shows/hannity/transcript',
       'fox-news/shows/tucker-carlson-tonight/transcript',
       'fox-news/shows/special-report/transcript',
       'fox-news/shows/the-five/transcript',
       'fox-news/shows/your-world/transcript',
       'fox-news/shows/fox-news-sunday',
       'fox-news/shows/cost-of-freedom/transcript',
       'fox-news/media/fox-news-flash', 'fox-news/shows/gutfeld',
       'fox-news/shows/on-the-record/transcript',
       'fox-news/shows/justice-with-judge-jeanine/justice-with-judge-jeanine-transcript',
       'fox-news/shows/watters-world/watters-world-transcript',
       'fox-news/shows/hannity', 

In [10]:
# Show rows whose category name is not exactly 'TRANSCRIPT'. The comparison is
# case-sensitive, so names such as 'transcript' and 'Transcript' are listed too.
cat_df[cat_df.cat_type != 'TRANSCRIPT']

Unnamed: 0,imageUrl,title,description,url,publicationDate,lastPublishedDate,category,isBreaking,isLive,duration,authors,cat_value,cat_type
0,https://a57.foxnews.com///static.foxnews.com/s...,Fox News Sunday Local Air Times,Please check back for our updated Fox News Sun...,/transcript/fox-news-sunday-local-air-times,2018-01-16T15:17:00-05:00,2024-11-25T10:16:16-05:00,"{'name': 'transcript', 'url': '/category/trans...",False,False,,[],fox-news/transcript,transcript
52,https://a57.foxnews.com/cf-images.us-east-1.pr...,'Sunday Morning Futures' on Democrats facing a...,"Guests: Ted Budd, Robert Cahaly, Mike Lee, Her...",/transcript/sunday-morning-futures-democrats-f...,2022-11-06T10:00:41-05:00,2022-11-07T10:41:41-05:00,"{'name': 'Transcript', 'url': '/category/shows...",False,False,,[],fox-news/shows/sunday-morning-futures/transcript,Transcript
54,https://a57.foxnews.com/cf-images.us-east-1.pr...,"'Life, Liberty & Levin' on what's at stake in ...","Guests: Mehmet Oz, Blake Masters, Lee Zeldin, ...",/transcript/life-liberty-levin-stake-midterm-e...,2022-11-06T20:00:15-05:00,2022-11-07T09:58:43-05:00,"{'name': 'Life Liberty & Levin Transcript', 'u...",False,False,,[],fox-news/shows/life-liberty-levin/life-liberty...,Life Liberty & Levin Transcript
56,https://a57.foxnews.com/cf-images.us-east-1.pr...,"'Gutfeld!' on Paul Pelosi attack reaction, DHS...","Guests: Tom Cotton, Emily Compagno, Joe DeVito...",/transcript/gutfeld-paul-pelosi-attack-reactio...,2022-11-01T23:00:58-04:00,2022-11-03T08:37:55-04:00,"{'name': 'Gutfeld Transcript', 'url': '/catego...",False,False,,[],fox-news/shows/gutfeld/transcript-gutfeld,Gutfeld Transcript
68,https://a57.foxnews.com/cf-images.us-east-1.pr...,'Gutfeld!' on NYC crime and subway murder rates,"Guests: Rob Schneider, Michelle Tafoya, Kat Ti...",/transcript/gutfeld-on-nyc-crime-and-subway-mu...,2022-10-12T23:00:16-04:00,2022-10-14T15:30:22-04:00,"{'name': 'Gutfeld Transcript', 'url': '/catego...",False,False,,[],fox-news/shows/gutfeld/transcript-gutfeld,Gutfeld Transcript
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,https://a57.foxnews.com///static.foxnews.com/s...,Pork Barrel Spending in Pennsylvania?,Show Me the Money Centers named for two Pennsy...,/transcript/pork-barrel-spending-in-pennsylvania,2012-04-05T17:40:10-04:00,2017-05-08T16:59:03-04:00,"{'name': 'SPECIAL REPORT', 'url': '/category/s...",False,False,,[],fox-news/shows/special-report,SPECIAL REPORT
9996,https://a57.foxnews.com///static.foxnews.com/s...,Another Politician Gets Caught With a Resume P...,"Friday Follow-Up In today's Friday Follow-Up, ...",/transcript/another-politician-gets-caught-wit...,2012-04-05T17:40:10-04:00,2017-05-08T16:59:01-04:00,"{'name': 'SPECIAL REPORT', 'url': '/category/s...",False,False,,[],fox-news/shows/special-report,SPECIAL REPORT
9997,https://a57.foxnews.com///static.foxnews.com/s...,'Special Report' Panel on Mexican President's ...,PRESIDENT BARACK OBAMA: I think the Arizona la...,/transcript/special-report-panel-on-mexican-pr...,2012-04-05T17:40:10-04:00,2017-05-08T16:58:59-04:00,"{'name': 'SPECIAL REPORT', 'url': '/category/s...",False,False,,[],fox-news/shows/special-report,SPECIAL REPORT
9998,https://a57.foxnews.com///static.foxnews.com/s...,AllStar Panelist Interviews,BAIER: Let me insert this.,/transcript/allstar-panelist-interviews,2012-04-05T17:40:10-04:00,2017-05-08T16:58:58-04:00,"{'name': 'SPECIAL REPORT', 'url': '/category/s...",False,False,,[],fox-news/shows/special-report,SPECIAL REPORT


In [11]:
def fetch_fox_news_show_data(cat_df, page_size=30, delay=1, max_retries=1):
    """
    Fetch article data from Fox News API for show tags.

    Every distinct string in `cat_df.cat_value` that contains 'shows' is used
    as a tag value. For each tag the article-search endpoint is paged through
    `page_size` items at a time until an empty page is returned. If a page
    cannot be fetched within the allowed attempts, or its body cannot be used
    (invalid JSON or a payload that is not a list), collection for that tag
    stops and processing moves on to the next tag.

    Args:
        cat_df: DataFrame containing category values in a 'cat_value' column;
            missing (non-string) values are ignored
        page_size: Number of results per page (must be at least 1)
        delay: Time to wait between requests in seconds; a failed attempt
            waits `delay * 2` seconds before the next attempt
        max_retries: Maximum attempts per page; values below 1 are treated
            as 1 so that every page is requested at least once

    Returns:
        List of article data from shows

    Raises:
        ValueError: If page_size is less than 1.
    """
    if page_size < 1:
        # A zero/negative page size would never advance the offset.
        raise ValueError(f"page_size must be at least 1, got {page_size}")

    # Guarantee at least one attempt per page.
    attempts = max(1, max_retries)

    all_data = []
    show_count = 0

    # Get unique show categories; skip None/NaN entries, which are not
    # iterable and would make the substring test raise TypeError.
    show_categories = [
        val for val in cat_df.cat_value.unique()
        if isinstance(val, str) and 'shows' in val
    ]
    logger.info(f"Found {len(show_categories)} show categories to process")

    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"
    }

    # Process each show category
    for i, values in enumerate(show_categories):
        logger.info(f"Processing category {i+1}/{len(show_categories)}: {values}")

        start = 0
        category_items = 0

        # Paginate through all results for this category
        while True:
            params = {
                'searchBy': 'tags',
                'values': values,
                'size': page_size,
                'from': start,
                'excludeBy': 'tags',
                'excludeValues': ''
            }

            # Try the request with retries. `response` is reset for every page
            # so a failure can never fall through to the previous page's body.
            response = None
            for attempt in range(attempts):
                try:
                    response = requests.get(
                        'https://www.foxnews.com/api/article-search',
                        params=params,
                        headers=headers,
                        timeout=10
                    )
                    response.raise_for_status()
                    break
                except requests.exceptions.RequestException as e:
                    # Discard a response that arrived with an error status code
                    response = None
                    logger.error(f"Request failed (attempt {attempt+1}/{attempts}): {e}")
                    if attempt < attempts - 1:
                        time.sleep(delay * 2)

            if response is None:
                # Leave the pagination loop, not just the retry loop.
                logger.error(f"Max retries reached for {values}. Moving to next category.")
                break

            try:
                results = response.json()
            except ValueError as e:
                logger.error(f"Failed to parse JSON response: {e}")
                break

            if results and not isinstance(results, list):
                # e.g. an error object: extending the list with a dict would
                # silently add its keys as if they were articles.
                logger.error(
                    f"Unexpected response payload of type {type(results).__name__} "
                    f"for {values}; expected a list. Moving to next category."
                )
                break

            # If no results, we've reached the end of this category
            if not results:
                logger.info(f"Page {start//page_size + 1}: Retrieved 0 items (offset {start})")
                logger.info(f"No more results for {values}")
                break

            logger.info(f"Page {start//page_size + 1}: Retrieved {len(results)} items (offset {start})")

            # Add results to our data collection
            all_data.extend(results)
            category_items += len(results)

            # Prepare for next page
            start += page_size
            time.sleep(delay)

        show_count += 1
        logger.info(f"Completed {values}: {category_items} items")

    logger.info(f"Finished processing {show_count} show categories. Total items: {len(all_data)}")
    return all_data

In [None]:
# Collect articles for every show category found in cat_df, using the function
# defaults (page_size=30, delay=1, max_retries=1).
data = fetch_fox_news_show_data(cat_df)

In [None]:
# Build one frame from all fetched items, then keep only the first row for each
# article URL (the same article can be returned under more than one show tag).
# NOTE(review): raises KeyError if `data` is empty, since there is no 'url' column.
df = pd.DataFrame(data)
df.drop_duplicates(subset=['url'], inplace = True)

In [None]:
# (rows, columns) remaining after de-duplicating by URL
df.shape

In [None]:
# Parse publication timestamps and convert them to UTC. The sample rows shown
# earlier carry mixed offsets (-05:00 and -04:00), so utc=True gives a single
# timezone-aware datetime column that can be compared and sorted.
df['publicationDate'] = pd.to_datetime(df['publicationDate'], utc = True)

In [None]:
# Earliest publication timestamp in the collected data (UTC)
df['publicationDate'].min()

In [None]:
# Latest publication timestamp in the collected data (UTC)
df['publicationDate'].max()

In [None]:
# Save the de-duplicated article table as a gzip-compressed CSV in the working
# directory; index=False omits the DataFrame index from the file.
df.to_csv('foxnews-transcript-urls-2025.csv.gz', index=False, compression='gzip')