In [13]:
# @title
# This script is a tool to help you build a better correction
# dictionary for the 'kdn_cleaner.py' script.
#
# It does NOT clean any files.
# It reads all the GARBLED files in your organized directory
# and creates a new file called 'word_frequency.txt' that
# lists every word and how many times it appears,
# sorted from most frequent to least.
#
# You can then look at this 'word_frequency.txt' file,
# easily spot the misspelled words (like 'fubfcriber'),
# and add them to the OCR_ERROR_MAP in 'kdn_cleaner.py'.

import os
import re
import time
from google.colab import drive
from collections import Counter

# ------------------------------------------------------------------
# Cell 1: Mount your Google Drive
# ------------------------------------------------------------------
print("--- Running Step 1: Mount Google Drive ---")
try:
    drive.mount('/content/drive')
    print("Google Drive mounted successfully.")
except Exception as e:
    print(f"Error mounting drive: {e}")

# ------------------------------------------------------------------
# Cell 2: Define Directories
# ------------------------------------------------------------------
print("\n--- Running Step 2: Configuring Paths ---")

# This is the directory with your ORGANIZED but GARBLED files
SOURCE_DIRECTORY = '/content/drive/MyDrive/KDN_Archive_Downloads/Lexington_Subset'

# This is the file where the word count will be saved
OUTPUT_FILE = '/content/drive/MyDrive/KDN_Archive_Downloads/word_frequency.txt'

print(f"Reading from: {SOURCE_DIRECTORY}")
print(f"Will write count to: {OUTPUT_FILE}")

# ------------------------------------------------------------------
# Cell 3: Word Counting Function
# ------------------------------------------------------------------
print("\n--- Running Step 3: Initializing Word Counter ---")

# A regex to find words: sequences of alphabetic characters
# It will ignore numbers and simple punctuation.
word_regex = re.compile(r'\b[a-z]+\b')

def count_words(source_dir):
    """
    Walks all subdirectories, reads all .txt files, counts all words.
    """
    word_counter = Counter()
    file_count = 0

    print(f"Starting scan of {source_dir}...")

    for dirpath, dirnames, filenames in os.walk(source_dir):
        for filename in filenames:
            if filename.endswith(".txt"):
                file_path = os.path.join(dirpath, filename)
                try:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        text_content = f.read()

                    # Find all words, convert to lowercase, and update count
                    words = word_regex.findall(text_content.lower())
                    word_counter.update(words)
                    file_count += 1

                    if file_count % 500 == 0:
                        print(f"  ...scanned {file_count} files...")

                except Exception as e:
                    print(f"  ERROR: Could not process file {filename}. {e}")

    print(f"Scan complete. Found {len(word_counter)} unique words in {file_count} files.")
    return word_counter

# ------------------------------------------------------------------
# Cell 4: Main Processing Loop
# ------------------------------------------------------------------
print("\n--- Running Step 4: Starting Word Count Process ---")
start_time = time.time()

word_counts = count_words(SOURCE_DIRECTORY)

print(f"\nWriting word list to {OUTPUT_FILE}...")
try:
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        f.write("Word Frequency List\n")
        f.write("---------------------\n")
        f.write(f"Source: {SOURCE_DIRECTORY}\n")
        f.write("---------------------\n\n")

        # Sort words by frequency, from most common to least
        for word, count in word_counts.most_common():
            f.write(f"{count: <10}{word}\n")

    end_time = time.time()
    print(f"Successfully wrote {len(word_counts)} unique words.")
    print(f"Total time: {end_time - start_time:.2f} seconds.")

except Exception as e:
    print(f"  ERROR: Failed to write output file. {e}")

print("\n--- All tasks finished ---")
print(f"You can now open '{OUTPUT_FILE}' in your Google Drive,")
print("find the misspelled words, and add them to 'kdn_cleaner.py'.")

--- Running Step 1: Mount Google Drive ---
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully.

--- Running Step 2: Configuring Paths ---
Reading from: /content/drive/MyDrive/KDN_Archive_Downloads/Lexington_Subset
Will write count to: /content/drive/MyDrive/KDN_Archive_Downloads/word_frequency.txt

--- Running Step 3: Initializing Word Counter ---

--- Running Step 4: Starting Word Count Process ---
Starting scan of /content/drive/MyDrive/KDN_Archive_Downloads/Lexington_Subset...
  ...scanned 500 files...
  ...scanned 1000 files...
  ...scanned 1500 files...
  ...scanned 2000 files...
Scan complete. Found 2163205 unique words in 2474 files.

Writing word list to /content/drive/MyDrive/KDN_Archive_Downloads/word_frequency.txt...
Successfully wrote 2163205 unique words.
Total time: 560.85 seconds.

--- All tasks finished ---
You can now open '/content/drive/MyDrive/KDN_Archive_

In [None]:
# @title
import subprocess
import time
import os
import requests # Import the requests library for making API calls

# Define the output directory and glob pattern as used previously
output_directory = '/content/drive/MyDrive/KDN_Archive_Downloads/Lexington_Subset'
glob_pattern = "*_djvu.txt"

# Add explicit checks and print statements for the output directory
print(f"Checking directory: {output_directory}")
if os.path.exists(output_directory):
    print("Output directory exists.")
    # Optional: Add a small delay to allow file system to sync
    # time.sleep(2)

    # Get a list of downloaded files to identify the last one
    try:
        downloaded_files = []
        # List all entries in the main download directory (expected to be item ID directories)
        all_entries = os.listdir(output_directory)
        print(f"Entries found in main directory (expected item IDs): {len(all_entries)}")

        for entry in all_entries:
            item_dir = os.path.join(output_directory, entry)
            # Check if the entry is a directory
            if os.path.isdir(item_dir):
                # List files within the item ID directory
                files_in_item_dir = os.listdir(item_dir)
                # Filter for the expected glob pattern
                for file_name in files_in_item_dir:
                    if file_name.endswith('_djvu.txt'): # More specific check than just .txt
                        downloaded_files.append(os.path.join(item_dir, file_name))

        # Sort the found files by modification time to find the last downloaded
        downloaded_files.sort(key=os.path.getmtime)

        print(f"Filtered downloaded files (.txt): {len(downloaded_files)}")
        if downloaded_files:
            last_downloaded_file = downloaded_files[-1]
            # Extract the item ID from the last downloaded file name
            last_downloaded_item_id = os.path.basename(last_downloaded_file).split('_', 1)[0]
            print(f"Last downloaded file identified: {last_downloaded_file}")
            print(f"Last downloaded item ID: {last_downloaded_item_id}")
        else:
            last_downloaded_item_id = None
            print("No previous .txt downloads found in the directory or its subdirectories.")

    except Exception as e:
        print(f"Error identifying last downloaded file: {e}")
        last_downloaded_item_id = None


# Find the index of the last downloaded item in the full item list
start_index = 0
if last_downloaded_item_id and item_identifiers:
    try:
        start_index = item_identifiers.index(last_downloaded_item_id) + 1
        print(f"Resuming download from item index: {start_index} (after {last_downloaded_item_id})")
    except ValueError:
        print(f"Last downloaded item ID '{last_downloaded_item_id}' not found in the full item list. Starting from the beginning.")
        start_index = 0

# Get the list of items remaining to download starting from the identified index
remaining_item_ids_for_download = item_identifiers[start_index:]

# Determine the total number of items to download in this session
total_items_to_download_this_session = len(remaining_item_ids_for_download)
print(f"Starting download of {total_items_to_download_this_session} remaining items...")

# Define the batch size for processing
batch_size = 10 # You can adjust this number

# Initialize download counter to keep track of overall progress
download_counter = start_index

# Iterate through the remaining item IDs in batches
for i in range(0, total_items_to_download_this_session, batch_size):
    batch_item_ids = remaining_item_ids_for_download[i:i + batch_size]
    current_batch_num = (i // batch_size) + 1
    total_batches = (total_items_to_download_this_session + batch_size - 1) // batch_size

    print(f"\n--- Processing Batch {current_batch_num}/{total_batches} ({len(batch_item_ids)} items) ---")

    # Get and display newspaper/issue info for the first item in the batch
    if batch_item_ids: # Ensure batch is not empty
        first_item_id_in_batch = batch_item_ids[0]
        metadata_url = f"https://archive.org/metadata/{first_item_id_in_batch}"
        try:
            response = requests.get(metadata_url)
            response.raise_for_status()
            metadata = response.json()

            title = metadata.get('metadata', {}).get('title', 'N/A')
            date = metadata.get('metadata', {}).get('date', 'N/A')

            print(f"  - Currently processing: Item ID: {first_item_id_in_batch}, Title: {title}, Date: {date}")

        except requests.exceptions.RequestException as e:
            print(f"  - Error fetching metadata for {first_item_id_in_batch}: {e}")


    # Prepare the batch item IDs for piping to standard input
    batch_items_input = "\n".join(batch_item_ids)

    # Construct the ia download command for the current batch
    download_command = [
        'ia',
        'download',
        '--itemlist=-',
        f'--destdir={output_directory}',
        f'--glob={glob_pattern}',
        '--retries', '5'
    ]

    try:
        start_time = time.time()
        result = subprocess.run(
            download_command,
            input=batch_items_input,
            text=True,
            capture_output=True,
            check=True,
        )
        end_time = time.time()
        batch_duration = end_time - start_time

        print(f"Batch {current_batch_num} completed in {batch_duration:.2f} seconds.")
        # Optional: Print stdout/stderr if needed for debugging
        # if result.stdout:
        #     print("Batch Stdout:")
        #     print(result.stdout)
        # if result.stderr:
        #     print("Batch Stderr:")
        #     print(result.stderr)

    except FileNotFoundError:
        print(f"Error: 'ia' command not found during batch {current_batch_num}. Ensure internetarchive is installed and in your PATH.")
        break
    except subprocess.CalledProcessError as e:
        print(f"Error executing ia download for batch {current_batch_num}: {e}")
        print(f"Stdout: {e.stdout}")
        print(f"Stderr: {e.stderr}")
    except subprocess.TimeoutExpired:
        print(f"Error: ia download command timed out for batch {current_batch_num}.")
    except Exception as e:
        print(f"An unexpected error occurred during batch {current_batch_num}: {e}")

    # Update download_counter based on the number of items processed in the batch
    download_counter = start_index + i + len(batch_item_ids)


print("\n--- All Batches Processed ---")
print(f"Check the folder '{output_directory}' in your Google Drive.")

In [9]:
# @title
# This script is designed to be run in a single Google Colab cell.
# It will ANALYZE the 2,478 items from the Lexington-specific
# search query and create a list of all the unique newspaper
# titles (not the individual issue titles) found in that search.
# It does NOT download any files.

# ------------------------------------------------------------------
# Cell 1: Install 'internetarchive' (if needed)
# ------------------------------------------------------------------
print("\n--- Running Step 1: Check/Install 'internetarchive' library ---")
try:
    import internetarchive
    print("internetarchive library is already installed.")
except ImportError:
    print("internetarchive library not found. Installing...")
    !pip install internetarchive
    print("Installation complete.")

# ------------------------------------------------------------------
# Cell 2: Mount your Google Drive
# ------------------------------------------------------------------
print("\n--- Running Step 2: Mount Google Drive ---")
try:
    from google.colab import drive
    drive.mount('/content/drive')
    print("Google Drive mounted successfully.")
except Exception as e:
    print(f"Error mounting drive: {e}")

# ------------------------------------------------------------------
# Cell 3: Set up File Paths
# ------------------------------------------------------------------
import os
output_directory = '/content/drive/MyDrive/KDN_Archive_Downloads'
os.makedirs(output_directory, exist_ok=True)

# This is the file where the final list will be saved.
unique_titles_list = f"{output_directory}/Lexington_Newspaper_Titles.txt"

print(f"List of unique newspaper titles will be saved to: {unique_titles_list}")

# ------------------------------------------------------------------
# Cell 4: Get All Unique Newspaper Titles from the Lexington Query
# ------------------------------------------------------------------
print("\n--- Running Step 4: Get All Unique Titles (Python Method) ---")
import internetarchive
import sys

# This is the same query from your downloader script.
subset_query = 'collection:kentuckynewspapers AND (place_of_publication:"Lexington" OR title:"Lexington")'
print(f"Running search for: {subset_query}")
print("This will take a few minutes...")

# A 'set' will automatically store only unique entries.
unique_newspaper_titles = set()

try:
    # We only need the 'title' field from the search
    search_results = internetarchive.search_items(subset_query, fields=['title'])

    # Using enumerate to show progress
    for i, item in enumerate(search_results):
        if 'title' in item:
            full_title = item['title']

            # This logic splits the title at the colon before the date
            # e.g., "Kentucky gazette (Lexington, Ky. : 1789): 1799-01-02"
            # becomes "Kentucky gazette (Lexington, Ky. : 1789)"
            parts = full_title.split(':')
            if len(parts) > 1:
                # Join all parts except the last one (the date)
                newspaper_name = ":".join(parts[:-1]).strip()
            else:
                # Use the full title if no colon is found
                newspaper_name = full_title.strip()

            # Add the cleaned newspaper name to the set
            unique_newspaper_titles.add(newspaper_name)

        if (i + 1) % 500 == 0:
            print(f"Processed {i+1} of {search_results.num_found} items...")

    print(f"Search complete. Processed {i+1} total items.")

except Exception as e:
    print(f"An error occurred during search: {e}")

# ------------------------------------------------------------------
# Cell 5: Save and Count Unique Titles
# ------------------------------------------------------------------
print("\n--- Running Step 5: Save and Count Unique Titles ---")

try:
    # Save the list of unique titles to the file
    print(f"Saving unique titles list to {unique_titles_list}...")
    with open(unique_titles_list, 'w', encoding='utf-8') as f:
        # Sort the set for a clean, alphabetical list
        for title in sorted(unique_newspaper_titles):
            f.write(f"{title}\n")
    print("Save complete.")

    print("\n--- FINAL COUNT ---")
    # Print the count by getting the length of the set
    print(f"Total unique newspaper titles found: {len(unique_newspaper_titles)}")

except Exception as e:
    print(f"An error occurred while writing the file: {e}")

print("\n--- All tasks finished ---")


--- Running Step 1: Check/Install 'internetarchive' library ---
internetarchive library is already installed.

--- Running Step 2: Mount Google Drive ---
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully.
List of unique newspaper titles will be saved to: /content/drive/MyDrive/KDN_Archive_Downloads/Lexington_Newspaper_Titles.txt

--- Running Step 4: Get All Unique Titles (Python Method) ---
Running search for: collection:kentuckynewspapers AND (place_of_publication:"Lexington" OR title:"Lexington")
This will take a few minutes...
Processed 500 of 2478 items...
Processed 1000 of 2478 items...
Processed 1500 of 2478 items...
Processed 2000 of 2478 items...
Search complete. Processed 2478 total items.

--- Running Step 5: Save and Count Unique Titles ---
Saving unique titles list to /content/drive/MyDrive/KDN_Archive_Downloads/Lexington_Newspaper_Titles.txt...
Save complete.



In [None]:
# @title
# This script is designed to be run in a single Google Colab cell.
# It AUTOMATES the entire cleaning process. It will:
# 1. Scan all your GARBLED files to find common words.
# 2. Automatically build a correction dictionary (e.g., "fubfcriber" -> "subscriber").
# 3. Use that new dictionary to clean all your files and save them
#    to a new "Cleaned" directory.
#
# This script REPLACES both kdn_word_counter.py and kdn_cleaner.py.

import os
import re
import time
from google.colab import drive
from collections import Counter
import shutil
import nltk
from nltk.corpus import words

# ------------------------------------------------------------------
# Cell 1: Mount your Google Drive
# ------------------------------------------------------------------
print("--- Running Step 1: Mount Google Drive ---")
try:
    drive.mount('/content/drive')
    print("Google Drive mounted successfully.")
except Exception as e:
    print(f"Error mounting drive: {e}")

# ------------------------------------------------------------------
# Cell 2: Install and Download NLTK Dictionary
# ------------------------------------------------------------------
print("\n--- Running Step 2: Downloading English Dictionary ---")
print("This is required for the script to 'guess' corrections.")
try:
    nltk.download('words')
    english_words = set(words.words())
    print("English dictionary loaded.")
except Exception as e:
    print(f"Error downloading NLTK dictionary: {e}")
    # We can't continue without the dictionary
    raise e

# ------------------------------------------------------------------
# Cell 3: Define Directories and Word Counter
# ------------------------------------------------------------------
print("\n--- Running Step 3: Configuring Paths & Counter ---")

# This is the directory with your ORGANIZED but GARBLED files
SOURCE_DIRECTORY = '/content/drive/MyDrive/KDN_Archive_Downloads/Lexington_Subset'
# This is the new directory where the CLEANED files will be saved
CLEANED_DIRECTORY = '/content/drive/MyDrive/KDN_Kernel_Downloads_Cleaned'
# This is where the auto-generated correction list will be saved
CORRECTION_MAP_FILE = '/content/drive/MyDrive/KDN_Archive_Downloads/auto_correction_map.txt'

print(f"Reading from: {SOURCE_DIRECTORY}")
print(f"Writing clean files to: {CLEANED_DIRECTORY}")
print(f"Writing correction map to: {CORRECTION_MAP_FILE}")
os.makedirs(CLEANED_DIRECTORY, exist_ok=True)

# A regex to find words: sequences of alphabetic characters
word_regex = re.compile(r'\b[a-z]+\b')

def count_words(source_dir):
    """
    Walks all subdirectories, reads all .txt files, counts all words.
    """
    word_counter = Counter()
    file_count = 0
    print(f"Starting scan of {source_dir} to find errors...")

    for dirpath, dirnames, filenames in os.walk(source_dir):
        for filename in filenames:
            if filename.endswith(".txt"):
                file_path = os.path.join(dirpath, filename)
                try:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        text_content = f.read()

                    words = word_regex.findall(text_content.lower())
                    word_counter.update(words)
                    file_count += 1

                    if file_count % 500 == 0:
                        print(f"  ...scanned {file_count} files...")

                except Exception as e:
                    print(f"  ERROR: Could not process file {filename}. {e}")

    print(f"Scan complete. Found {len(word_counter)} unique words in {file_count} files.")
    return word_counter

# ------------------------------------------------------------------
# Cell 4: Phase 1 - Run Word Count
# ------------------------------------------------------------------
print("\n--- Running Phase 1: Counting All Words ---")
start_time = time.time()
word_counts = count_words(SOURCE_DIRECTORY)
print(f"Word count finished in {time.time() - start_time:.2f} seconds.")

# ------------------------------------------------------------------
# Cell 5: Phase 2 - Auto-Build Correction Map
# ------------------------------------------------------------------
print("\n--- Running Phase 2: Automatically Building Correction Map ---")
# This is the map we will build
OCR_ERROR_MAP = {}
# This is the minimum number of times a word must appear to be checked
FREQUENCY_THRESHOLD = 5

for word, count in word_counts.items():
    # Only check words that appear often enough
    if count < FREQUENCY_THRESHOLD:
        continue

    # Check for the "long s" (f) error
    if 'f' in word and word not in english_words:
        # Guess the correction by replacing 'f' with 's'
        guessed_word = word.replace('f', 's')

        # Check if our guess is a real English word
        if guessed_word in english_words:
            # We found a match! Add it to our map.
            # We use regex to be case-insensitive later
            regex_pattern = r'\b' + re.escape(word) + r'\b'
            OCR_ERROR_MAP[regex_pattern] = guessed_word

print(f"Auto-generated {len(OCR_ERROR_MAP)} correction rules.")

# Save the map to a file so you can inspect it
try:
    with open(CORRECTION_MAP_FILE, 'w', encoding='utf-8') as f:
        f.write("Auto-Generated OCR Correction Map\n")
        f.write("----------------------------------\n")
        for key, value in OCR_ERROR_MAP.items():
            f.write(f"'{key}'  ->  '{value}'\n")
    print(f"Saved correction map to {CORRECTION_MAP_FILE}")
except Exception as e:
    print(f"Warning: Could not save correction map. {e}")

# ------------------------------------------------------------------
# Cell 6: Phase 3 - Clean and Write Files
# ------------------------------------------------------------------
print("\n--- Running Phase 3: Cleaning All Files ---")
start_time = time.time()
cleaned_count = 0
skipped_count = 0

# Pre-compile the regex rules for speed
# We create one giant regex that finds any of the error words
# This is MUCH faster than looping through the dictionary for every file
if OCR_ERROR_MAP:
    regex_pattern = re.compile(
        "|".join(OCR_ERROR_MAP.keys()),
        re.IGNORECASE # Make it case-insensitive
    )

    # This function will be called by re.sub for every match
    def find_replacement(match):
        # Look up the lowercase version of the found word in our map
        key = r'\b' + re.escape(match.group(0).lower()) + r'\b'
        replacement = OCR_ERROR_MAP.get(key)

        # This handles capitalization (e.g., "Congrefs" -> "Congress")
        if replacement:
            if match.group(0).isupper():
                return replacement.upper()
            elif match.group(0)[0].isupper():
                return replacement.capitalize()
            else:
                return replacement
        return match.group(0) # Should not happen, but a good fallback
else:
    print("No correction rules were generated. Files will be copied as-is.")
    regex_pattern = None

# Now, walk the SOURCE directory and apply the cleaning
for dirpath, dirnames, filenames in os.walk(SOURCE_DIRECTORY):
    for filename in filenames:
        if filename.endswith(".txt"):
            source_file_path = os.path.join(dirpath, filename)

            # Create the matching directory structure in the CLEANED folder
            relative_path = os.path.relpath(dirpath, SOURCE_DIRECTORY)
            target_dir = os.path.join(CLEANED_DIRECTORY, relative_path)
            os.makedirs(target_dir, exist_ok=True)
            target_file_path = os.path.join(target_dir, filename)

            # --- Resumability: Skip files we've already cleaned ---
            if os.path.exists(target_file_path):
                skipped_count += 1
                continue

            try:
                # Read the garbled text
                with open(source_file_path, 'r', encoding='utf-8', errors='ignore') as f_in:
                    garbled_text = f_in.read()

                # Clean the text
                if regex_pattern:
                    cleaned_text = regex_pattern.sub(find_replacement, garbled_text)
                else:
                    cleaned_text = garbled_text # No rules, just copy

                # Write the new clean text
                with open(target_file_path, 'w', encoding='utf-8') as f_out:
                    f_out.write(cleaned_text)

                cleaned_count += 1

                if cleaned_count % 500 == 0:
                    print(f"  ...cleaned {cleaned_count} files...")

            except Exception as e:
                print(f"  ERROR: Could not clean file {filename}. {e}")

end_time = time.time()
print(f"Cleaning complete in {end_time - start_time:.2f} seconds.")
print(f"Successfully cleaned and wrote: {cleaned_count} files.")
print(f"Skipped (already clean): {skipped_count} files.")

print("\n--- All tasks finished ---")
print(f"Your clean data is ready in: {CLEANED_DIRECTORY}")

--- Running Step 1: Mount Google Drive ---
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully.

--- Running Step 2: Downloading English Dictionary ---
This is required for the script to 'guess' corrections.
English dictionary loaded.

--- Running Step 3: Configuring Paths & Counter ---
Reading from: /content/drive/MyDrive/KDN_Archive_Downloads/Lexington_Subset
Writing clean files to: /content/drive/MyDrive/KDN_Kernel_Downloads_Cleaned
Writing correction map to: /content/drive/MyDrive/KDN_Archive_Downloads/auto_correction_map.txt

--- Running Phase 1: Counting All Words ---
Starting scan of /content/drive/MyDrive/KDN_Archive_Downloads/Lexington_Subset to find errors...


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


  ...scanned 500 files...
  ...scanned 1000 files...
  ...scanned 1500 files...
  ...scanned 2000 files...
Scan complete. Found 2163205 unique words in 2474 files.
Word count finished in 57.02 seconds.

--- Running Phase 2: Automatically Building Correction Map ---
Auto-generated 2493 correction rules.
Saved correction map to /content/drive/MyDrive/KDN_Archive_Downloads/auto_correction_map.txt

--- Running Phase 3: Cleaning All Files ---
  ...cleaned 500 files...
  ...cleaned 1000 files...
  ...cleaned 1500 files...
  ...cleaned 2000 files...
Cleaning complete in 3979.44 seconds.
Successfully cleaned and wrote: 2474 files.
Skipped (already clean): 0 files.

--- All tasks finished ---
Your clean data is ready in: /content/drive/MyDrive/KDN_Kernel_Downloads_Cleaned


In [None]:
# @title
# This script is designed to be run in a single Google Colab cell.
# It will download a subset of the 'kentuckynewspapers' collection
# for items related to "Lexington".

# ------------------------------------------------------------------
# Cell 1: Mount your Google Drive
# ------------------------------------------------------------------
print("--- Running Step 1: Mount Google Drive ---")
try:
    from google.colab import drive
    drive.mount('/content/drive')
    print("Google Drive mounted successfully.")
except Exception as e:
    print(f"Error mounting drive: {e}")
    print("Please ensure you are running this in Google Colab.")


# ------------------------------------------------------------------
# Cell 2: Install 'internetarchive' (if needed)
# ------------------------------------------------------------------
print("\n--- Running Step 2: Check/Install 'internetarchive' library ---")
try:
    import internetarchive
    print("internetarchive library is already installed.")
except ImportError:
    print("internetarchive library not found. Installing...")
    # The '!' character runs this as a shell command in Colab
    !pip install internetarchive
    print("Installation complete.")


# ------------------------------------------------------------------
# Cell 3: Set up your download directory
# ------------------------------------------------------------------
print("\n--- Running Step 3: Set up Download Directory ---")
import os

# We will create a new, specific folder for this subset
output_directory = '/content/drive/MyDrive/KDN_Archive_Downloads/Lexington_Subset'

os.makedirs(output_directory, exist_ok=True)
print(f"Files will be saved to: {output_directory}")

# This is the correct file format for "DjVuTXT"
glob_pattern = "*_djvu.txt"


# ------------------------------------------------------------------
# Cell 4: Download the "Lexington" Subset
# ------------------------------------------------------------------
print("\n--- Running Step 4: Download Lexington Subset ---")

print("Starting download for a subset: 'Lexington'...")

# This query searches for items in the collection that have
# "Lexington" in their title or in the 'place_of_publication' field.
# Using single quotes for the shell command is safer.
subset_query = 'collection:kentuckynewspapers AND (place_of_publication:"Lexington" OR title:"Lexington")'


# Run the download command for the subset
# This command first searches for all items matching the query,
# generates a list of them (--itemlist), and then pipes (|)
# that list to 'ia download' which reads the list from stdin (--itemlist=-).
print(f"Starting search... this may take a moment. Will download files matching {glob_pattern}")

# Note: The 'ia' tool is resumable. If this is interrupted,
# you can just run it again to continue.
!ia search '{subset_query}' --itemlist | ia download --itemlist=- --destdir="{output_directory}" --glob="{glob_pattern}"

print("\n--- Lexington Subset Download Complete ---")
print(f"Check the folder '{output_directory}' in your Google Drive.")
print("\n--- All tasks finished ---")

--- Running Step 1: Mount Google Drive ---
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully.

--- Running Step 2: Check/Install 'internetarchive' library ---
internetarchive library is already installed.

--- Running Step 3: Set up Download Directory ---
Files will be saved to: /content/drive/MyDrive/KDN_Archive_Downloads/Lexington_Subset

--- Running Step 4: Download Lexington Subset ---
Starting download for a subset: 'Lexington'...
Starting search... this may take a moment. Will download files matching *_djvu.txt
kd9x639z9k7x (1/2478):
 skipping /content/drive/MyDrive/KDN_Archive_Downloads/Lexington_Subset/kd9x639z9k7x/kd9x639z9k7x_djvu.txt, file already exists based on length and date.
xt700000095d (2/2478):
 skipping /content/drive/MyDrive/KDN_Archive_Downloads/Lexington_Subset/xt700000095d/xt700000095d_djvu.txt, file already exists based on length and date.
xt70000009

In [None]:
import os

# The output_directory is already defined in the previous cell.
# For clarity and to make this cell runnable independently if needed,
# I will redefine it here, assuming the previous cell has been run.
output_directory = '/content/drive/MyDrive/KDN_Archive_Downloads/Lexington_Subset'

# Get all entries in the directory
all_entries = os.listdir(output_directory)

# Filter for files ending with '.txt'
downloaded_files = [entry for entry in all_entries if entry.endswith('.txt')]

# Print the list of downloaded text files
print("Downloaded text files:")
for file in downloaded_files:
    print(file)

Downloaded text files:


## Get the full item list

### Subtask:
Obtain the complete list of items that should be downloaded based on the search query.


**Reasoning**:
Define the search query and use the `ia search` command to get the list of item identifiers into a Python variable.



In [None]:
import subprocess

subset_query = 'collection:kentuckynewspapers AND (place_of_publication:"Lexington" OR title:"Lexington")'

# Use subprocess to run the ia search command and capture its output
try:
    # The output is likely large, so using capture_output=True and text=True is suitable.
    # Setting a timeout to prevent hanging indefinitely.
    result = subprocess.run(
        ['ia', 'search', subset_query, '--itemlist'],
        capture_output=True,
        text=True,
        check=True, # Raise CalledProcessError for bad exit codes
        timeout=600 # 10 minutes timeout
    )
    item_list_output = result.stdout
    print("Successfully obtained item list from ia search.")
except FileNotFoundError:
    print("Error: 'ia' command not found. Ensure internetarchive is installed and in your PATH.")
    item_list_output = "" # Set to empty to avoid errors in the next step
except subprocess.CalledProcessError as e:
    print(f"Error executing ia search: {e}")
    print(f"Stderr: {e.stderr}")
    item_list_output = "" # Set to empty to avoid errors in the next step
except subprocess.TimeoutExpired:
    print("Error: ia search command timed out.")
    item_list_output = "" # Set to empty to avoid errors in the next step
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    item_list_output = "" # Set to empty to avoid errors in the next step

# Split the captured output into a list of item identifiers
# Strip any leading/trailing whitespace from each item
item_identifiers = [item.strip() for item in item_list_output.splitlines() if item.strip()]

print(f"Found {len(item_identifiers)} items matching the query.")
# Display the first few identifiers as a check
print("First 10 item identifiers:")
display(item_identifiers[:10])

Successfully obtained item list from ia search.
Found 2478 items matching the query.
First 10 item identifiers:


['kd9x639z9k7x',
 'xt700000095d',
 'xt7000000960',
 'xt700000097k',
 'xt700000099r',
 'xt7000000b1m',
 'xt7000000p9t',
 'xt7000000q03',
 'xt7000000q28',
 'xt7000002c3c']

## Identify remaining items

### Subtask:
Compare the list of downloaded files with the full item list to determine which items still need to be downloaded.


**Reasoning**:
Implement steps 1-8 of the instructions to compare the full item list with the downloaded files and identify the remaining items.



In [None]:
# 1. Create an empty list called downloaded_item_ids
downloaded_item_ids = []

# 2. Iterate through the downloaded_files list
# 3. For each file name, extract the item ID and append to downloaded_item_ids
for file_name in downloaded_files:
    # Split the filename at the first underscore
    item_id = file_name.split('_', 1)[0]
    downloaded_item_ids.append(item_id)

# 4. Create a set of downloaded_item_ids for efficient lookup
downloaded_item_ids_set = set(downloaded_item_ids)

# 5. Create a list called remaining_item_ids
remaining_item_ids = []

# 6. Iterate through the item_identifiers list
# 7. For each item ID in item_identifiers, check if it is present in the set
# 8. If the item ID is NOT in the set, append it to the remaining_item_ids list
for item_id in item_identifiers:
    if item_id not in downloaded_item_ids_set:
        remaining_item_ids.append(item_id)

# 9. Print the number of remaining items and the first few items
print(f"Number of remaining items to download: {len(remaining_item_ids)}")
print("First 10 remaining item identifiers:")
display(remaining_item_ids[:10])

Number of remaining items to download: 2478
First 10 remaining item identifiers:


['kd9x639z9k7x',
 'xt700000095d',
 'xt7000000960',
 'xt700000097k',
 'xt700000099r',
 'xt7000000b1m',
 'xt7000000p9t',
 'xt7000000q03',
 'xt7000000q28',
 'xt7000002c3c']

## Resume download

### Subtask:
Use the identified list of remaining items to resume the download using `ia download`.


**Reasoning**:
Execute the `ia download` command using `subprocess.run` to download the remaining items, incorporating necessary arguments for destination, glob pattern, retries, and error handling.



In [None]:
import subprocess
import time
import os
import requests # Import the requests library for making API calls

# Define the output directory and glob pattern as used previously
output_directory = '/content/drive/MyDrive/KDN_Archive_Downloads/Lexington_Subset'
glob_pattern = "*_djvu.txt"

# Get a list of downloaded files to identify the last one
try:
    all_entries = os.listdir(output_directory)
    downloaded_files = sorted([os.path.join(output_directory, entry) for entry in all_entries if entry.endswith('.txt')], key=os.path.getmtime)

    last_downloaded_file = downloaded_files[-1] if downloaded_files else None

    if last_downloaded_file:
        last_downloaded_item_id = os.path.basename(last_downloaded_file).split('_', 1)[0]
        print(f"Last downloaded file identified: {last_downloaded_file}")
        print(f"Last downloaded item ID: {last_downloaded_item_id}")
    else:
        last_downloaded_item_id = None
        print("No previous downloads found in the directory.")

except Exception as e:
    print(f"Error identifying last downloaded file: {e}")
    last_downloaded_item_id = None

# Find the index of the last downloaded item in the full item list
start_index = 0
if last_downloaded_item_id and item_identifiers:
    try:
        start_index = item_identifiers.index(last_downloaded_item_id) + 1
        print(f"Resuming download from item index: {start_index} (after {last_downloaded_item_id})")
    except ValueError:
        print(f"Last downloaded item ID '{last_downloaded_item_id}' not found in the full item list. Starting from the beginning.")
        start_index = 0

remaining_item_ids_for_download = item_identifiers[start_index:]
total_items_to_download_this_session = len(remaining_item_ids_for_download)
print(f"Starting download of {total_items_to_download_this_session} remaining items...")

batch_size = 10
download_counter = start_index # Initialize counter based on where we are resuming

for i in range(0, total_items_to_download_this_session, batch_size):
    batch_item_ids = remaining_item_ids_for_download[i:i + batch_size]
    current_batch_num = (i // batch_size) + 1
    total_batches = (total_items_to_download_this_session + batch_size - 1) // batch_size

    print(f"\n--- Processing Batch {current_batch_num}/{total_batches} ({len(batch_item_ids)} items) ---")

    # Check if we should get newspaper/issue info for this batch
    if (download_counter + len(batch_item_ids)) // 50 > download_counter // 50:
        print("Fetching newspaper and issue information...")
        for item_id in batch_item_ids:
            # Construct the API URL for metadata
            metadata_url = f"https://archive.org/metadata/{item_id}"
            try:
                response = requests.get(metadata_url)
                response.raise_for_status() # Raise an exception for bad status codes
                metadata = response.json()

                title = metadata.get('metadata', {}).get('title', 'N/A')
                date = metadata.get('metadata', {}).get('date', 'N/A')

                print(f"  - Item ID: {item_id}, Title: {title}, Date: {date}")

            except requests.exceptions.RequestException as e:
                print(f"  - Error fetching metadata for {item_id}: {e}")

    # Prepare the batch item IDs for piping to standard input
    batch_items_input = "\n".join(batch_item_ids)

    # Construct the ia download command for the current batch
    download_command = [
        'ia',
        'download',
        '--itemlist=-',
        f'--destdir={output_directory}',
        f'--glob={glob_pattern}',
        '--retries', '5'
    ]

    try:
        start_time = time.time()
        result = subprocess.run(
            download_command,
            input=batch_items_input,
            text=True,
            capture_output=True,
            check=True,
        )
        end_time = time.time()
        batch_duration = end_time - start_time

        print(f"Batch {current_batch_num} completed in {batch_duration:.2f} seconds.")
        # Optional: Print stdout/stderr if needed for debugging
        # if result.stdout:
        #     print("Batch Stdout:")
        #     print(result.stdout)
        # if result.stderr:
        #     print("Batch Stderr:")
        #     print(result.stderr)

    except FileNotFoundError:
        print(f"Error: 'ia' command not found during batch {current_batch_num}. Ensure internetarchive is installed and in your PATH.")
        break
    except subprocess.CalledProcessError as e:
        print(f"Error executing ia download for batch {current_batch_num}: {e}")
        print(f"Stdout: {e.stdout}")
        print(f"Stderr: {e.stderr}")
    except subprocess.TimeoutExpired:
        print(f"Error: ia download command timed out for batch {current_batch_num}.")
    except Exception as e:
        print(f"An unexpected error occurred during batch {current_batch_num}: {e}")

    download_counter += len(batch_item_ids) # Increment counter after processing the batch

print("\n--- All Batches Processed ---")
print(f"Check the folder '{output_directory}' in your Google Drive.")

No previous downloads found in the directory.
Starting download of 2478 remaining items...

--- Processing Batch 1/248 (10 items) ---


KeyboardInterrupt: 

In [None]:
import subprocess
import time
import os
import requests # Import the requests library for making API calls

# Define the output directory and glob pattern as used previously
output_directory = '/content/drive/MyDrive/KDN_Archive_Downloads/Lexington_Subset'
glob_pattern = "*_djvu.txt"

# Add explicit checks and print statements for the output directory
print(f"Checking directory: {output_directory}")
if os.path.exists(output_directory):
    print("Output directory exists.")
    # Optional: Add a small delay to allow file system to sync
    # time.sleep(2)

    # Get a list of downloaded files to identify the last one
    try:
        downloaded_files = []
        # List all entries in the main download directory (expected to be item ID directories)
        all_entries = os.listdir(output_directory)
        print(f"Entries found in main directory (expected item IDs): {len(all_entries)}")

        for entry in all_entries:
            item_dir = os.path.join(output_directory, entry)
            # Check if the entry is a directory
            if os.path.isdir(item_dir):
                # List files within the item ID directory
                files_in_item_dir = os.listdir(item_dir)
                # Filter for the expected glob pattern
                for file_name in files_in_item_dir:
                    if file_name.endswith('_djvu.txt'): # More specific check than just .txt
                        downloaded_files.append(os.path.join(item_dir, file_name))

        # Sort the found files by modification time to find the last downloaded
        downloaded_files.sort(key=os.path.getmtime)

        print(f"Filtered downloaded files (.txt): {len(downloaded_files)}")
        if downloaded_files:
            last_downloaded_file = downloaded_files[-1]
            # Extract the item ID from the last downloaded file name
            last_downloaded_item_id = os.path.basename(last_downloaded_file).split('_', 1)[0]
            print(f"Last downloaded file identified: {last_downloaded_file}")
            print(f"Last downloaded item ID: {last_downloaded_item_id}")
        else:
            last_downloaded_item_id = None
            print("No previous .txt downloads found in the directory or its subdirectories.")

    except Exception as e:
        print(f"Error identifying last downloaded file: {e}")
        last_downloaded_item_id = None


# Find the index of the last downloaded item in the full item list
start_index = 0
if last_downloaded_item_id and item_identifiers:
    try:
        start_index = item_identifiers.index(last_downloaded_item_id) + 1
        print(f"Resuming download from item index: {start_index} (after {last_downloaded_item_id})")
    except ValueError:
        print(f"Last downloaded item ID '{last_downloaded_item_id}' not found in the full item list. Starting from the beginning.")
        start_index = 0

# Get the list of items remaining to download starting from the identified index
remaining_item_ids_for_download = item_identifiers[start_index:]

# Determine the total number of items to download in this session
total_items_to_download_this_session = len(remaining_item_ids_for_download)
print(f"Starting download of {total_items_to_download_this_session} remaining items...")

# Define the batch size for processing
batch_size = 10 # You can adjust this number

# Initialize download counter to keep track of overall progress
download_counter = start_index

# Iterate through the remaining item IDs in batches
for i in range(0, total_items_to_download_this_session, batch_size):
    batch_item_ids = remaining_item_ids_for_download[i:i + batch_size]
    current_batch_num = (i // batch_size) + 1
    total_batches = (total_items_to_download_this_session + batch_size - 1) // batch_size

    print(f"\n--- Processing Batch {current_batch_num}/{total_batches} ({len(batch_item_ids)} items) ---")

    # Get and display newspaper/issue info for the first item in the batch
    if batch_item_ids: # Ensure batch is not empty
        first_item_id_in_batch = batch_item_ids[0]
        metadata_url = f"https://archive.org/metadata/{first_item_id_in_batch}"
        try:
            response = requests.get(metadata_url)
            response.raise_for_status()
            metadata = response.json()

            title = metadata.get('metadata', {}).get('title', 'N/A')
            date = metadata.get('metadata', {}).get('date', 'N/A')

            print(f"  - Currently processing: Item ID: {first_item_id_in_batch}, Title: {title}, Date: {date}")

        except requests.exceptions.RequestException as e:
            print(f"  - Error fetching metadata for {first_item_id_in_batch}: {e}")


    # Prepare the batch item IDs for piping to standard input
    batch_items_input = "\n".join(batch_item_ids)

    # Construct the ia download command for the current batch
    download_command = [
        'ia',
        'download',
        '--itemlist=-',
        f'--destdir={output_directory}',
        f'--glob={glob_pattern}',
        '--retries', '5'
    ]

    try:
        start_time = time.time()
        result = subprocess.run(
            download_command,
            input=batch_items_input,
            text=True,
            capture_output=True,
            check=True,
        )
        end_time = time.time()
        batch_duration = end_time - start_time

        print(f"Batch {current_batch_num} completed in {batch_duration:.2f} seconds.")
        # Optional: Print stdout/stderr if needed for debugging
        # if result.stdout:
        #     print("Batch Stdout:")
        #     print(result.stdout)
        # if result.stderr:
        #     print("Batch Stderr:")
        #     print(result.stderr)

    except FileNotFoundError:
        print(f"Error: 'ia' command not found during batch {current_batch_num}. Ensure internetarchive is installed and in your PATH.")
        break
    except subprocess.CalledProcessError as e:
        print(f"Error executing ia download for batch {current_batch_num}: {e}")
        print(f"Stdout: {e.stdout}")
        print(f"Stderr: {e.stderr}")
    except subprocess.TimeoutExpired:
        print(f"Error: ia download command timed out for batch {current_batch_num}.")
    except Exception as e:
        print(f"An unexpected error occurred during batch {current_batch_num}: {e}")

    # Update download_counter based on the number of items processed in the batch
    download_counter = start_index + i + len(batch_item_ids)


print("\n--- All Batches Processed ---")
print(f"Check the folder '{output_directory}' in your Google Drive.")

Checking directory: /content/drive/MyDrive/KDN_Archive_Downloads/Lexington_Subset
Output directory exists.
Entries found in main directory (expected item IDs): 2478
Filtered downloaded files (.txt): 2474
Last downloaded file identified: /content/drive/MyDrive/KDN_Archive_Downloads/Lexington_Subset/xt7wdb7vnb2w/xt7wdb7vnb2w_djvu.txt
Last downloaded item ID: xt7wdb7vnb2w
Resuming download from item index: 2283 (after xt7wdb7vnb2w)
Starting download of 195 remaining items...

--- Processing Batch 1/20 (10 items) ---
  - Currently processing: Item ID: xt7wdb7vnb3g, Title: Kentucky gazette (Lexington, Ky. : 1789): 1799-01-02, Date: 1799-01-02
Batch 1 completed in 18.15 seconds.

--- Processing Batch 2/20 (10 items) ---
  - Currently processing: Item ID: xt7wm32n6r3m, Title: Kentucky gazette (Lexington, Ky. : 1809): 1836-12-29, Date: 1836-12-29
Batch 2 completed in 21.22 seconds.

--- Processing Batch 3/20 (10 items) ---
  - Currently processing: Item ID: xt7wpz51gz6k, Title: Kentucky gazett

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Reasoning**:
The previous `subprocess.run` command failed with a non-zero exit status (1), but the stderr was None, which is not helpful. To understand the cause of the error, I need to execute the command again, but this time capture the stderr output.

