Author: Naomi Baes and ChatGPT //
Source: https://osf.io/uya29/ ; Authors: Salvatore Giorgi, Daniel Roy Sadek Habib, Douglas Bellew, Garrick Sherman, and Brenda Curtis//
Aim: This script downloads articles from The New York Times website (from lists of URLs) and parses the saved HTML files. It extracts metadata fields such as the article title, ID, publication date, section, keywords, and paragraph text, and organizes them into a structured, tabular format (e.g., for export to CSV or TSV). It also handles cases where an HTML file does not contain an actual article, logging these cases as errors.

In [1]:
import numpy as np 
import os
import pandas as pd
import glob
import csv
import regex
from bs4 import BeautifulSoup
import json
import datetime
import requests  # Import requests to download HTML content

# Default folder used by main(): it is scanned for "*.txt" URL lists and is
# also where the downloaded HTML files are written.
DEFAULT_DATA_DIRECTORY = "C:/Users/naomi/OneDrive/COMP80004_PhDResearch/RESEARCH/DATA/CORPORA/MEDIA/NYT/output"

def download_html(url, output_dir, timeout=30):
    """Download one page and save it as an HTML file inside ``output_dir``.

    The file is named after the last path segment of ``url`` with ``.html``
    appended. A URL that already ends in ``.html`` therefore produces
    ``name.html.html``; this naming is kept so files match earlier runs.

    Args:
        url: Address of the page to fetch. Surrounding whitespace is ignored.
        output_dir: Existing directory the HTML file is written into.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole run.

    Returns:
        The path of the written file, or ``None`` when ``url`` is blank or
        the request fails (the failure is printed, not raised).
    """
    url = url.strip()
    if not url:
        # Nothing to fetch (e.g. an empty line in a URL list).
        return None

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")
        return None

    # Drop trailing slashes first so "https://host/path/" is saved as
    # "path.html" instead of every such URL overwriting a file named ".html".
    url_filename = url.rstrip("/").split("/")[-1]
    file_path = os.path.join(output_dir, f"{url_filename}.html")
    with open(file_path, "w", encoding='utf-8') as file:
        file.write(response.text)
    return file_path

def _non_article_category(html_tree):
    """Return the skip category of a page that is not a regular article, else None."""
    if not html_tree.find_all("article"):
        return "NoArticle"
    if html_tree.find_all("meta", attrs={"content": "Word of the Day"}):
        return "WordofDay"
    if html_tree.find_all("nav", attrs={"aria-labelledby": "showcontrols"}):
        return "Slideshow"
    if html_tree.find_all("meta", attrs={"content": "interactive"}):
        return "InteractivePoll"
    if html_tree.find_all("div", attrs={"class": "nytint-discussion-content"}):
        return "Discussion"
    if html_tree.find_all("meta", attrs={"property": "article:tag", "content": "Picture Prompt"}):
        return "PicturePrompt"
    if (html_tree.find_all("meta", attrs={"property": "article:tag", "content": "In Our Pages"})
            or html_tree.find_all("meta", attrs={"name": "SCG", "content": "iht-retrospective"})):
        return "HistoricalArticle"
    if (html_tree.find_all("meta", attrs={"property": "og:url"}, content=regex.compile("/audio/"))
            or html_tree.find_all("meta", attrs={"property": "og:url"}, content=regex.compile("/video/"))):
        return "AVFile"
    return None


def _meta_content(html_tree, attrs):
    """Return the ``content`` value of the first matching <meta> tag, or None."""
    tag = html_tree.find("meta", attrs, content=True)
    return None if tag is None else tag["content"]


def parse_html_file(filename, field_list, error_list):
    """Parse one saved article page into a DataFrame with one row per paragraph.

    Args:
        filename: Path of the HTML file to read (UTF-8).
        field_list: Names of the fields to extract. Recognised names are
            "title", "article_id", "publish_date", "section", "keyword_list"
            and "paragraph_text". Rows are only produced when
            "paragraph_text" is requested; each row repeats the article-level
            fields and carries a 1-based "paragraph_num".
        error_list: Dict mapping a category name to a list of file names.
            It is updated in place when the file is skipped or fails:
            "NoArticle", "WordofDay", "Slideshow", "InteractivePoll",
            "Discussion", "PicturePrompt", "HistoricalArticle" and "AVFile"
            for pages that are not regular articles, "NoArticleId" when no
            article/blog-post id is found, and "ParseError" when extraction
            raises.

    Returns:
        A DataFrame whose columns are ``field_list``. It is empty for skipped
        pages and holds the rows collected so far if extraction fails midway.
    """
    # os.path.basename also strips directories from Windows-style paths,
    # which splitting on "/" alone does not.
    base_name = os.path.basename(filename)

    with open(filename, "r", encoding='utf-8') as in_file:
        html_tree = BeautifulSoup(in_file, "lxml")

    row_dict = {}
    rows = []
    try:
        category = _non_article_category(html_tree)
        if category is not None:
            error_list.setdefault(category, []).append(base_name)
            return pd.DataFrame(columns=field_list)

        if "title" in field_list:
            title = _meta_content(html_tree, {"property": "og:title"})
            if title is None:
                print("No title found in file:" + filename)
            else:
                # Archive pages append "(Published <year>)" to the title.
                cut = title.find("(Published")
                if cut >= 0:
                    title = title[:cut].rstrip()
            row_dict["title"] = title

        if "article_id" in field_list:
            article_id = _meta_content(html_tree, {"name": "articleid"})
            if article_id is None:
                article_id = _meta_content(html_tree, {"name": "blogpostid"})
            if article_id is None:
                article_id = ""
                # Recorded directly: the helper previously called here is not
                # defined in this file, so the call could only raise NameError.
                error_list.setdefault("NoArticleId", []).append(base_name)
            row_dict["article_id"] = article_id

        if "publish_date" in field_list:
            publish_date = _meta_content(html_tree, {"property": "article:published_time"})
            if publish_date is None:
                publish_date = ""
            else:
                try:
                    publish_date = datetime.datetime.strptime(
                        publish_date, "%Y-%m-%dT%H:%M:%S.%fZ"
                    ).strftime("%Y-%m-%d")
                except ValueError:
                    # Unrecognised timestamp layout: keep the raw value.
                    pass
            row_dict["publish_date"] = publish_date

        if "section" in field_list:
            section = _meta_content(html_tree, {"property": "article:section"})
            row_dict["section"] = "" if section is None else section

        if "keyword_list" in field_list:
            keywords = _meta_content(html_tree, {"name": "news_keywords"})
            row_dict["keyword_list"] = [] if keywords is None else keywords.split(",")

        if "paragraph_text" in field_list:
            # Collect plain dicts and build the frame once at the end;
            # DataFrame.append was removed in pandas 2.0 and copied the whole
            # frame on every call.
            for number, paragraph in enumerate(html_tree.find_all("p"), start=1):
                rows.append({
                    **row_dict,
                    "paragraph_num": number,
                    "paragraph_text": paragraph.get_text(),
                })

    except Exception as e:
        # Best-effort per-file boundary: report, record, and keep whatever
        # rows were collected so one bad page does not stop a batch run.
        print(f"Error processing file {filename}: {e}")
        error_list.setdefault("ParseError", []).append(base_name)

    columns = list(field_list)
    if rows and "paragraph_num" not in columns:
        # Keep the paragraph counter even when the caller did not list it.
        columns.append("paragraph_num")
    return pd.DataFrame(rows, columns=columns)

def main(data_directory=DEFAULT_DATA_DIRECTORY):
    """Download and parse every URL listed in the ``*.txt`` files of a folder.

    Each ``*.txt`` file in ``data_directory`` is read as a list of URLs, one
    per line. Every URL is downloaded into ``data_directory`` and the saved
    page is parsed into a per-paragraph DataFrame.

    Args:
        data_directory: Folder holding the URL lists; downloaded HTML files
            are written to the same folder.

    Returns:
        The dict of skip/error categories filled in by ``parse_html_file``
        (category name -> list of file names).
    """
    error_list = {}
    field_list = ["title", "article_id", "paragraph_num", "publish_date", "section", "paragraph_text", "keyword_list"]

    for url_list_path in glob.glob(os.path.join(data_directory, "*.txt")):
        # "utf-8-sig" drops a byte-order mark if the list was saved with one,
        # so it cannot end up glued to the first URL.
        with open(url_list_path, "r", encoding="utf-8-sig") as url_file:
            for line in url_file:
                url = line.strip()
                if not url:
                    # Blank lines are not URLs; skip them instead of
                    # attempting a download that can only fail.
                    continue
                html_filename = download_html(url, data_directory)
                if html_filename:
                    df = parse_html_file(html_filename, field_list, error_list)
                    # You can save or process the df as needed
                    # Example: df.to_csv(f"{html_filename}.csv", index=False)

    return error_list

# Run the pipeline on the default data directory when executed as a script.
if __name__ == "__main__":
    main()


Error downloading https://www.nytimes.com/1930/01/01/archives/chamberlin-to-open-byrd-exhibit.html: 403 Client Error: Forbidden for url: https://www.nytimes.com/1930/01/01/archives/chamberlin-to-open-byrd-exhibit.html
Error downloading https://www.nytimes.com/1930/01/01/archives/pugsley-to-drop-bank-duties.html: 403 Client Error: Forbidden for url: https://www.nytimes.com/1930/01/01/archives/pugsley-to-drop-bank-duties.html
Error downloading https://www.nytimes.com/1930/01/01/archives/divided-as-to-future-here-majority-opinion-in-german-financial.html: 403 Client Error: Forbidden for url: https://www.nytimes.com/1930/01/01/archives/divided-as-to-future-here-majority-opinion-in-german-financial.html
Error downloading https://www.nytimes.com/1930/01/01/archives/live-stock-in-chicago.html: 403 Client Error: Forbidden for url: https://www.nytimes.com/1930/01/01/archives/live-stock-in-chicago.html
Error downloading https://www.nytimes.com/1930/01/01/archives/jibes-at-prince-of-wales-new-lon

KeyboardInterrupt: 