Author: Hugo Lyons Keenan

Aim: This script downloads New York Times article data for every year in a user-specified range, saves it to files that use a custom multi-character delimiter, concatenates those files into a single corpus file, and then cleans the resulting file.

Note: To run this script successfully, you need to obtain an API key from the NYT Developer site and save it in a file called "nyt_api_key.txt".

In [None]:
# pip install tqdm if not already installed
%pip install tqdm 

In [None]:
from get_corpus import get_corpus, concat_data
import logging

In [None]:
# Configure logging first, then grab the named logger used by the download helpers
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('NYT downloader')

# Example parameters for the get_corpus function
start_year, end_year = 2000, 2001
# Multi-character field separator and the placeholder used for absent values
delimiter, missing_value = '|||||', 'NA'

In [None]:
# Run the download for the configured year range, passing the logger, the field
# delimiter and the missing-value placeholder defined above.
# NOTE(review): presumably this writes its output files into the 'corpus' directory
# that concat_data reads later - confirm in get_corpus.py.
get_corpus(start_year, end_year, logger, delimiter, missing_value)

In [None]:
# Combine the files in the 'corpus' directory into one output file.
# There should be 1126 input files: two were deleted because they contained no data
# (nyt_data_1978_9.csv and nyt_data_1978_10.csv).
out_file = 'nyt_corpus.csv'
concat_data(out_file=out_file, dir_path='corpus', logger=logger)

In [20]:
# Clean the large data file
import re

def read_csv_multi_char_delimiter(file_path, delimiter, chunk_size=1000):
    """
    Read a large delimited text file whose delimiter may span several characters,
    yielding the parsed rows in chunks.

    The first line is treated as the header and fixes the expected column count.
    A record is considered complete once its last field starts with 'http' (the
    URL column). Lines that do not complete a record are counted as malformed and
    glued (without their newline) onto the following line(s) until the record is
    complete.

    Parameters:
        file_path: path of the file to read.
        delimiter: field separator; matched literally, may be several characters long.
        chunk_size: number of rows per yielded chunk (the final chunk may be shorter).

    Yields:
        Lists of rows, where each row is a list of field strings.

    Raises:
        ValueError: if a completed record does not have the same number of columns
            as the header, since that most likely means data is missing.

    A summary of the malformed line count is printed when the file is exhausted.
    If the file ends in the middle of a record, the unfinished text cannot be
    returned as a row; a warning is printed so the loss is not silent.
    """
    # Build the (literal) splitter once instead of re-processing the pattern per line
    split_fields = re.compile(re.escape(delimiter)).split
    with open(file_path, 'r') as f:
        # Read and split the header
        header = split_fields(f.readline().strip())
        print(f"Header: {header}")
        expected_column_count = len(header)
        print(f"Header has {expected_column_count} columns")
        n_malformed_rows = 0
        chunk = []
        working_line = ''
        for row_num, line in enumerate(f, start=2):
            working_line += line.strip('\n')
            row = split_fields(working_line)
            # If the last element of the row is not a URL, the record is incomplete:
            # keep the text and look for the rest of the record on the next line
            if not row[-1].startswith('http'):
                n_malformed_rows += 1
                continue

            # The record ends with a URL, so it is complete; a wrong column count at
            # this point means data is likely missing and must not pass silently
            if len(row) != expected_column_count:
                raise ValueError(f"Row {row_num} has {len(row)} columns, expected {expected_column_count}")

            chunk.append(row)
            working_line = ''
            if len(chunk) == chunk_size:
                yield chunk
                chunk = []
        print(f"Number of malformed rows: {n_malformed_rows}")
        if working_line:
            # The file ended before the pending record reached its URL column
            print(f"Warning: discarded {len(working_line)} trailing characters that never formed a complete row")
        if chunk:
            yield chunk

# Usage: stream the raw corpus through the chunked reader, drop rows whose year is
# not a plausible 4-digit value, and write the surviving rows to a new file.
file_path = 'nyt_corpus.csv'
delimiter = '|||||'
chunk_size = 1000
new_file_path = 'nyt_corpus_cleaned.csv'
header = ['title', 'section_name', 'snippet', 'lead_paragraph', 'year', 'month', 'web_url']
invalid_rows = []

# Derived once instead of being recomputed (or hard-coded) for every row
expected_columns = len(header)
year_index = header.index('year')
# Inclusive range of years that are accepted
min_year, max_year = 1930, 2023

# Mode 'w' empties any previous output; the file stays open for the whole run
# rather than being reopened in append mode for every chunk
with open(new_file_path, 'w') as file:
    # write the header to the new file
    file.write(delimiter.join(header) + '\n')
    for chunk_num, chunk in enumerate(read_csv_multi_char_delimiter(file_path, delimiter, chunk_size), start=1):
        if chunk_num % 1000 == 0:
            print(f"Processing chunk {chunk_num}")
        # check that each row has the correct number of columns before writing it
        to_write = []
        for row in chunk:
            if len(row) != expected_columns:
                raise ValueError(f"Row has {len(row)} columns, expected {expected_columns}")
            # make sure that the year is:
            # 1. a 4-digit number
            # 2. in the accepted range
            year = row[year_index]
            if (not year.isdigit()) or len(year) != 4 or not (min_year <= int(year) <= max_year):
                # keep the rejected row so the number of removals can be reported
                invalid_rows.append(row)
                continue
            to_write.append(row)

        # Skip the write when the whole chunk was rejected; otherwise a lone newline
        # would be emitted and later be read back as a malformed row
        if to_write:
            file.write('\n'.join(delimiter.join(row) for row in to_write) + '\n')
print(f"Number of removed rows: {len(invalid_rows)}")

Header: ['title', 'section_name', 'snippet', 'lead_paragraph', 'year', 'month', 'web_url']
Header has 7 columns
Processing chunk 1000
Processing chunk 2000
Processing chunk 3000
Processing chunk 4000
Processing chunk 5000
Processing chunk 6000
Processing chunk 7000
Processing chunk 8000
Processing chunk 9000
Processing chunk 10000
Number of malformed rows: 116799
Number of removed rows: 79


In [21]:
# Validate the cleaned file -> the reader should report 0 malformed rows
# While scanning, tally how often each section name, year and month occurs
from collections import defaultdict
counter = defaultdict(lambda: defaultdict(int))
tracked_columns = ['section_name', 'year', 'month']
cleaned_chunks = read_csv_multi_char_delimiter(new_file_path, delimiter, chunk_size)
for chunk_num, chunk in enumerate(cleaned_chunks, start=1):
    if chunk_num % 1000 == 0:
        print(f"Processing chunk {chunk_num}")
    for row in chunk:
        # Every row must have the full set of columns
        if len(row) != 7:
            raise ValueError(f"Row has {len(row)} columns, expected 7")
        # The year must be exactly four digits and lie within 1930-2023
        year_text = row[header.index('year')]
        looks_like_year = year_text.isdigit() and len(year_text) == 4
        if not looks_like_year or int(year_text) < 1930 or int(year_text) > 2023:
            raise ValueError(f"Row has an invalid year: {row[header.index('year')]}")
        # Count the value seen in each tracked column
        for column in tracked_columns:
            counter[column][row[header.index(column)]] += 1
        

Header: ['title', 'section_name', 'snippet', 'lead_paragraph', 'year', 'month', 'web_url']
Header has 7 columns
Processing chunk 1000
Processing chunk 2000
Processing chunk 3000
Processing chunk 4000
Processing chunk 5000
Processing chunk 6000
Processing chunk 7000
Processing chunk 8000
Processing chunk 9000
Processing chunk 10000
Number of malformed rows: 0


In [22]:
# Report, per tracked column, how many distinct values were seen plus up to five examples
for column in counter:
    values = counter[column]
    print(f"Number of unique values for {column}: {len(values)} e.g. {list(values.keys())[:5]}")

Number of unique values for section_name: 91 e.g. ['Archives', '|Archives', '||Archives', 'Books', 'Business Day']
Number of unique values for year: 94 e.g. ['1930', '1931', '1932', '1933', '1934']
Number of unique values for month: 12 e.g. ['01', '02', '03', '04', '05']
