# Author: Naomi Baes and ChatGPT
# Source: https://osf.io/uya29/ ; Authors: Salvatore Giorgi, Daniel Roy Sadek Habib, Douglas Bellew, Garrick Sherman, and Brenda Curtis
# Aim: This script is designed to parse HTML files containing articles from The New York Times website. It extracts various metadata fields such as the article title, ID, publication date, section, and paragraph text from HTML files and organizes them into a structured format (e.g., CSV or TSV). Additionally, it handles certain cases where the HTML files do not contain actual articles, logging these cases as errors.

# In [2]:
# Setup

import numpy as np 
import os
import pandas as pd
import argparse
import glob
import csv
import regex
from bs4 import BeautifulSoup
import json
import datetime
import requests  # Import requests to download HTML content

# Default directory searched for HTML files; main() uses it as the default of --data_dir.
# NOTE(review): this is a machine-specific absolute path - pass --data_dir when running elsewhere.
DEFAULT_DATA_DIRECTORY = "C:/Users/naomi/OneDrive/COMP80004_PhDResearch/RESEARCH/DATA/CORPORA/MEDIA/NYT/output"

# In [None]:
def check_for_no_article(error_list, filename):
    """Record *filename* under "ErrorWithArticle" unless it is already a "NoArticle" entry.

    Args:
        error_list: dict mapping an error category name to a list of filenames.
            It is modified in place.
        filename: the name to look up and, if needed, record.

    A missing "NoArticle" category is treated as an empty list, so the function
    is safe to call before any "NoArticle" entry has been recorded.
    """
    if filename not in error_list.get("NoArticle", ()):
        error_list.setdefault("ErrorWithArticle", []).append(filename)

def _skip_category(html_tree):
    """Return the error_list category of a page that must be skipped, or None.

    The checks run in a fixed order and the first match wins.
    """
    def has(tag_name, attrs):
        return html_tree.find(tag_name, attrs=attrs) is not None

    # Pages without any <article> tag carry no article text at all.
    if html_tree.find("article") is None:
        return "NoArticle"
    # <meta property="article:tag" content="Word of the Day">
    if has("meta", {"content": "Word of the Day"}):
        return "WordofDay"
    # <nav class="css-k38v3p" aria-labelledby="showcontrols">
    if has("nav", {"aria-labelledby": "showcontrols"}):
        return "Slideshow"
    # <meta name="applicationName" content="interactive" id="applicationName">
    if has("meta", {"content": "interactive"}):
        return "InteractivePoll"
    # <div class="nytint-discussion-content">
    if has("div", {"class": "nytint-discussion-content"}):
        return "Discussion"
    # <meta property="article:tag" content="Picture Prompt">
    if has("meta", {"property": "article:tag", "content": "Picture Prompt"}):
        return "PicturePrompt"
    # Snippet references to older articles:
    # <meta property="article:tag" content="In Our Pages">
    # <meta name="SCG" content="iht-retrospective">
    if (has("meta", {"property": "article:tag", "content": "In Our Pages"})
            or has("meta", {"name": "SCG", "content": "iht-retrospective"})):
        return "HistoricalArticle"
    # Pages that are only an audio or video file:
    # <meta data-rh="true" property="og:url" content="https://www.nytimes.com/audio/2016/09/13/insider/13-Insider-Pete-Audio.html">
    # <meta data-rh="true" property="og:url" content="https://www.nytimes.com/video/world/middleeast/100000004643525/postcards-from-the-hajj-the-crowds.html">
    if (html_tree.find("meta", attrs={"property": "og:url"}, content=regex.compile("/audio/")) is not None
            or html_tree.find("meta", attrs={"property": "og:url"}, content=regex.compile("/video/")) is not None):
        return "AVFile"
    return None


def _meta_content(html_tree, attrs):
    """Return the content attribute of the first <meta> tag matching *attrs*, or None."""
    tag = html_tree.find("meta", attrs=attrs, content=True)
    if tag is None:
        return None
    return tag["content"]


def _normalize_publish_date(raw_date):
    """Convert a 10-digit epoch timestamp to an ISO string; return anything else unchanged.

    Possible inputs: 1474096173, 2016-09-17T05:00:29.000Z
    """
    if len(raw_date) != 10:
        return raw_date
    try:
        epoch_seconds = int(raw_date)
    except ValueError:
        # Not a number - just leave the date as whatever came in.
        return raw_date
    # The result is labelled "Z", so the conversion has to be done in UTC rather
    # than in the local timezone of the machine running the script.
    moment = datetime.datetime.fromtimestamp(epoch_seconds, tz=datetime.timezone.utc)
    return moment.replace(tzinfo=None).isoformat(timespec="milliseconds") + "Z"


def _find_paragraphs(html_tree):
    """Return the article's <p> tags, trying each known page layout in turn."""
    # <p class="css-axufdj evys1bk0">President Trump fired the first round ...</p>
    paragraphs = html_tree.find_all("p", class_="css-axufdj evys1bk0")
    if paragraphs:
        return paragraphs
    # <p class="story-body-text" itemprop="articleBody">...</p>
    paragraphs = html_tree.find_all("p", class_="story-body-text", itemprop="articleBody")
    if paragraphs:
        return paragraphs
    # <div class="listy_body"><p><span>...</span> </p> - first <p> of each div only.
    paragraphs = []
    for container in html_tree.find_all("div", class_="listy_body"):
        first_p = container.find("p")
        if first_p is not None:
            paragraphs.append(first_p)
    if paragraphs:
        return paragraphs
    # <section name="articleBody" class="meteredContent css-1r7ky0e"> - every <p> inside.
    for section in html_tree.find_all("section", attrs={"name": "articleBody"}):
        paragraphs.extend(section.find_all("p"))
    return paragraphs


def parse_html_file(filename, field_list, error_list):
    """Parse one saved NYT HTML page into a DataFrame with one row per paragraph.

    Args:
        filename: path of the HTML file to parse.
        field_list: names of the columns to extract. Recognised names are
            "title", "article_id", "publish_date", "section", "keyword_list",
            "paragraph_num" and "paragraph_text".
        error_list: dict mapping a skip category ("NoArticle", "WordofDay",
            "Slideshow", "InteractivePoll", "Discussion", "PicturePrompt",
            "HistoricalArticle", "AVFile") to a list of file base names.
            It is updated in place when the page is skipped.

    Returns:
        A DataFrame whose columns are field_list. It is empty when the page is
        skipped or has no non-empty paragraph. Rows are only produced when
        "paragraph_text" or "paragraph_num" is requested.

    A TypeError raised while extracting fields is printed and the rows
    collected so far are returned.
    """
    rows = []
    # Binary mode lets BeautifulSoup detect the document encoding itself instead
    # of depending on the platform's default text encoding.
    with open(filename, "rb") as in_file:
        html_tree = BeautifulSoup(in_file, "lxml")

    try:
        category = _skip_category(html_tree)
        if category is not None:
            # basename handles both "/" and, on Windows, "\" separators.
            error_list.setdefault(category, []).append(os.path.basename(filename))
            return pd.DataFrame(columns=field_list)

        #
        # Should be Valid File - Parse HTML by Tag listed in the field list
        # Ordering doesn't matter: the DataFrame columns follow field_list.
        #
        row_dict = {}

        if "title" in field_list:
            # <meta data-rh="true" property="og:title" content="Opinion | Trump Picks Wall Street Over Main Street (Published 2017)">
            # <meta property="og:title" content="Introducing kyt — Our Web App Configuration Toolkit">
            title = _meta_content(html_tree, {"property": "og:title"})
            if title is None:
                print("No title found in file:"+filename)
            else:
                published_at = title.find("(Published")
                if published_at >= 0:
                    title = title[:published_at].rstrip()
            row_dict["title"] = title

        if "article_id" in field_list:
            # <meta data-rh="true" name="articleid" content="100000004912753">
            # <meta itemprop="identifier" name="blogpostid" content="100000004644202">
            # <article class="post-7322 post type-post ..." id="post-7322">
            article_id = _meta_content(html_tree, {"name": "articleid"})
            if article_id is None:
                article_id = _meta_content(html_tree, {"itemprop": "identifier"})
            if article_id is None:
                # Use the first <article> that actually has an id; the first
                # <article> in the page is not guaranteed to carry one.
                article_tag = html_tree.find("article", id=True)
                if article_tag is not None:
                    article_id = article_tag["id"]
                else:
                    print("No article_id found in file:"+filename)
            row_dict["article_id"] = article_id

        if "publish_date" in field_list:
            # <meta data-rh="true" property="article:published_time" content="2017-02-04T17:30:57.000Z">
            publish_date = _meta_content(html_tree, {"property": "article:published_time"})
            if publish_date is None:
                print("No publish_date found in file:"+filename)
            else:
                publish_date = _normalize_publish_date(publish_date)
            row_dict["publish_date"] = publish_date

        if "section" in field_list:
            # <meta data-rh="true" property="article:section" content="Opinion">
            section = _meta_content(html_tree, {"property": "article:section"})
            if section is None:
                print("No section found in file:"+filename)
            elif section == "":
                # We found a section tag, but it was blank in the data.
                section = "No Section"
            row_dict["section"] = section

        if "keyword_list" in field_list:
            # <meta data-rh="true" name="news_keywords" content="Banking and Finance,Donald Trump,...">
            # <meta name="keywords" content="Computers and the Internet,Open-Source Software,...">
            keywords = _meta_content(html_tree, {"name": "news_keywords"})
            if keywords is None:
                keywords = _meta_content(html_tree, {"name": "keywords"})
            if keywords is None:
                print("No keyword_list found in file:"+filename)
            row_dict["keyword_list"] = keywords

        want_num = "paragraph_num" in field_list
        want_text = "paragraph_text" in field_list
        if want_num or want_text:
            paragraphs = _find_paragraphs(html_tree)
            if not paragraphs:
                print("No paragraphs found for file:"+filename)
            for index, paragraph in enumerate(paragraphs):
                # Take out newlines, tabs and multiple spaces from wordwraps in the html file
                text = paragraph.text.strip().replace("\n", "").replace("\t", "")
                text = regex.sub("[ ]+", " ", text)
                # Don't include empty paragraphs; the numbering still counts them.
                if text == "":
                    continue
                if want_num:
                    row_dict["paragraph_num"] = index
                if want_text:
                    row_dict["paragraph_text"] = text
                # Copy: row_dict is reused for every paragraph of this article.
                rows.append(dict(row_dict))
    except TypeError as err:
        print("Following issue with file:"+filename)
        print(err)
    return pd.DataFrame(rows, columns=field_list)

def output_result_file(file, df, delimiter):
    """Write the parsed paragraphs to *file* and report what was written.

    Prints the target path and the number of rows in *df*, then saves *df*
    as delimited text using *delimiter*, without the index and without a
    header row.
    """
    paragraph_count = len(df)
    print(file)
    print(f"Paragraphs :{paragraph_count}")
    df.to_csv(file, sep=delimiter, header=False, index=False)

def output_error_file(file, error_list):
    """Print per-category error counts and save *error_list* to *file* as JSON.

    *error_list* maps an error category name to the list of filenames recorded
    for it. The JSON output is indented with tab characters.
    """
    print(file)
    print("Error Statistics:")
    for category in error_list:
        occurrences = len(error_list[category])
        print("\t" + category + " has been encountered: " + str(occurrences) + " times.")
    with open(file, "w") as json_out:
        json.dump(error_list, json_out, indent="\t")

def main():
    """Parse every matching HTML file and write the result and error files.

    Command-line options (all optional):
        --data_dir          directory containing the HTML files
        --data_files        glob pattern of the HTML files to parse
        --results_dir       directory the output files are written to
        --result_filename   name of the delimited result file
        --error_filename    name of the JSON error-statistics file
        --field_delimiter   inter-field separator of the result file
                            (the historical misspelling --field_delimieter
                            is still accepted)

    NOTE(review): the --data_dir and --results_dir defaults are machine-specific
    absolute paths; pass both explicitly when running on another machine.
    """
    print("parse_NYT_HTML.py started at :" + datetime.datetime.now().isoformat())
    parser = argparse.ArgumentParser(add_help=True)
    parser.add_argument("--data_dir", action="store", dest="data_dir", required=False,
                        default=DEFAULT_DATA_DIRECTORY,
                        help="Full Path to location of HTML files to parse")
    parser.add_argument("--data_files", action="store", dest="data_files", required=False,
                        default="*.html",
                        help="Names of HTML files to parse")
    parser.add_argument("--results_dir", action="store", dest="results_dir", required=False,
                        default="/home/douglasvbellew/Workspaces/NYT_HTML_Parse/results_second",
                        help="Full Path to location to store results")
    parser.add_argument("--result_filename", action='store', dest='result_filename', required=False,
                        default="nyt_html_data_2.tsv", help='Name of results file')
    parser.add_argument("--error_filename", action='store', dest='error_filename', required=False,
                        default="nyt_html_error_2.json", help='Name of error file')
    # The original flag was misspelled; keep it as an alias so existing
    # invocations continue to work.
    parser.add_argument("--field_delimiter", "--field_delimieter", action='store',
                        dest='field_delimiter', required=False,
                        default="\t", help="inter-field separator (generally \"\\t\" or \",\")")

    in_args = parser.parse_args()
    output_filename = os.path.join(in_args.results_dir, in_args.result_filename)
    error_filename = os.path.join(in_args.results_dir, in_args.error_filename)
    # Sorted so that the row order of the output does not depend on the
    # arbitrary order in which glob lists the directory.
    data_files = sorted(glob.glob(os.path.join(in_args.data_dir, in_args.data_files)))

    field_list = ["title", "article_id", "paragraph_num", "publish_date", "section", "paragraph_text"]
    error_list = {}
    if data_files:
        # Create the output directory up front so a missing directory does not
        # make the run fail only after every file has been parsed.
        os.makedirs(in_args.results_dir, exist_ok=True)

        frames = []
        for num, filename in enumerate(data_files):
            if num % 500 == 0:
                print(datetime.datetime.now().isoformat() + " " + filename)
            html_df = parse_html_file(filename, field_list, error_list)
            if not html_df.empty:
                frames.append(html_df)
        # Concatenate once at the end instead of re-copying the accumulated
        # result for every input file.
        if frames:
            result_df = pd.concat(frames, ignore_index=True)
        else:
            result_df = pd.DataFrame(columns=field_list)

        output_result_file(output_filename, result_df, in_args.field_delimiter)
        output_error_file(error_filename, error_list)
        print("parse_NYT_HTML.py finsihed - status nominal")
    else:
        print("Filename: "+in_args.data_files+" not found in directory:"+in_args.data_dir)
        print("parse_NYT_HTML.py finsihed - status failure")
    print("Finished at: " + datetime.datetime.now().isoformat())


if __name__ == "__main__":
    main()