In [4]:
import os
import gzip
from bs4 import BeautifulSoup

In [5]:
# Paths
input_dir = '../data/html/'
output_dir = '../data/text/'

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# List all gzipped HTML files in the directory
html_files = [f for f in os.listdir(input_dir) if f.endswith('.html.gz')]

# Process each file
for i, html_file in enumerate(html_files):
    try:
        # Open and decompress the gzipped file
        with gzip.open(os.path.join(input_dir, html_file), 'rt', encoding='utf-8') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')
        
        texts = []
        elem = soup.find('p', {'class': 'speakable'})
        
        if elem:
            texts.extend(elem.get_text(strip=True, separator='\n').splitlines())
            for elem in elem.next_siblings:
                if hasattr(elem, 'get_text'):  # Check if element has get_text method
                    texts.extend(elem.get_text(strip=True, separator='\n').splitlines())
        
        # Create output filename by removing .gz extension
        txt_file = html_file.replace('.html.gz', '.txt')
        with open(os.path.join(output_dir, txt_file), 'w', encoding='utf-8') as f:
            f.write('\n'.join(texts))
    
    except Exception as e:
        print(e, html_file)
    
    if i % 1000 == 0:
        print(i, html_file)

print(f"Processed {len(html_files)} HTML files")

0 duck-dynasty-star-phil-robertson-on-aiding-trump-i-pointed-him-to-jesus.html.gz
1000 ted-cruz-on-his-campaign-ahead-of-the-iowa-caucuses.html.gz
2000 6260618066001.html.gz
3000 6368939575112.html.gz
4000 buchanan-obama-exposed-as-utterly-naive.html.gz
'utf-8' codec can't decode bytes in position 123140-123141: invalid continuation byte alan-dershowitz-on-white-house-dinner-cohen-raid.html.gz
5000 sen-rand-paul-budget-deal-brings-back-obama-era-deficits.html.gz
6000 6331210464112.html.gz
7000 taxes-forcing-businesses-residents-to-move.html.gz
8000 bill-oreilly-the-big-republican-debate-and-reaction-to-donald-trump-on-the-factor.html.gz
9000 6359616734112.html.gz
10000 big-smile-bigger-heart.html.gz
11000 6319509061112.html.gz
12000 6369575762112.html.gz
13000 6366572815112.html.gz
14000 sen-lindsey-graham-orlando-is-a-symptom-of-obamas-failing-isis-policy.html.gz
15000 archbishop-pelosi-says-walls-are-immoral-so-what-does-she-think-about-israel-and-its-massive-new-wall.html.gz
16000 t

In [6]:
import tarfile

# Specify the folder to compress and the output filename
folder_to_compress = "../data/text/"  # Replace with your folder path
output_filename = "../data/fnc_transcripts_text_2025.tar.gz"          # Name of the output archive

def create_tarball(source_dir, output_filename):
    with tarfile.open(output_filename, "w:gz") as tar:
        # Add the directory to the archive
        tar.add(source_dir, arcname=os.path.basename(source_dir))
    
    print(f"Created archive: {output_filename}")
    print(f"Archive size: {os.path.getsize(output_filename) / (1024*1024):.2f} MB")

# Run the function
create_tarball(folder_to_compress, output_filename)

Created archive: ../data/fnc_transcripts_text_2025.tar.gz
Archive size: 198.04 MB


In [7]:
urls = pd.read_csv("../data/foxnews-transcript-urls-2025.csv")

In [8]:
urls.head()

Unnamed: 0,imageUrl,title,description,url,publicationDate,lastPublishedDate,category,isBreaking,isLive,duration,authors
0,https://a57.foxnews.com/cf-images.us-east-1.pr...,"'Fox News Sunday' on October 20, 2024",Maryland Gov. Wes Moore weighs in on Harris' p...,/transcript/fox-news-sunday-october-20-2024,2024-10-27 18:14:43+00:00,2024-10-27T14:14:43-04:00,"{'name': 'TRANSCRIPT', 'url': '/category/shows...",False,False,,[{'name': 'Fox News Staff'}]
1,https://a57.foxnews.com/cf-images.us-east-1.pr...,"'Fox News Sunday' on September 15, 2024",‘Fox News Sunday’ anchor Shannon Bream welcome...,/transcript/fox-news-sunday-september-15-2024,2024-10-07 13:01:55+00:00,2024-10-07T09:01:55-04:00,"{'name': 'TRANSCRIPT', 'url': '/category/shows...",False,False,,[{'name': 'Fox News Staff'}]
2,https://a57.foxnews.com/cf-images.us-east-1.pr...,"'Fox News Sunday' on September 8, 2024","This week on 'Fox News Sunday,' host Shannon B...",/transcript/fox-news-sunday-september-8-2024,2024-09-10 22:04:22+00:00,2024-09-10T18:04:22-04:00,"{'name': 'TRANSCRIPT', 'url': '/category/shows...",False,False,,[{'name': 'Fox News Staff'}]
3,https://a57.foxnews.com/cf-images.us-east-1.pr...,"'Fox News Sunday' on August 25, 2024","This week on ‘Fox News Sunday,’ host Shannon B...",/media/fox-news-sunday-august-25-2024,2024-08-25 17:40:15+00:00,2024-08-25T13:40:15-04:00,"{'name': 'TRANSCRIPT', 'url': '/category/shows...",False,False,,[{'name': 'kayla bailey'}]
4,https://a57.foxnews.com/cf-images.us-east-1.pr...,"'Fox News Sunday' on July 21, 2024","This week on ‘Fox News Sunday,’ Shannon Bream ...",/transcript/fox-news-sunday-july-21-2024,2024-07-21 17:53:48+00:00,2024-07-21T13:53:48-04:00,"{'name': 'TRANSCRIPT', 'url': '/category/shows...",False,False,,[{'name': 'Fox News Staff'}]


In [15]:
urls['publicationDate'] = pd.to_datetime(urls['publicationDate'])
urls['year'] = urls['publicationDate'].dt.year
yearly_counts = urls.groupby('year').size().reset_index(name='count')

markdown_table = "| Year | Count |\n| --- | --- |\n"
for _, row in yearly_counts.iterrows():
    markdown_table += f"| {row['year']} | {row['count']} |\n"

In [16]:
markdown_table

'| Year | Count |\n| --- | --- |\n| 2003 | 450 |\n| 2004 | 365 |\n| 2005 | 431 |\n| 2006 | 411 |\n| 2007 | 304 |\n| 2008 | 418 |\n| 2009 | 425 |\n| 2010 | 314 |\n| 2011 | 523 |\n| 2012 | 1019 |\n| 2013 | 777 |\n| 2014 | 866 |\n| 2015 | 890 |\n| 2016 | 821 |\n| 2017 | 1259 |\n| 2018 | 1752 |\n| 2019 | 5865 |\n| 2020 | 5995 |\n| 2021 | 5400 |\n| 2022 | 6782 |\n| 2023 | 9585 |\n| 2024 | 8256 |\n| 2025 | 1474 |\n'