In [1]:
from bs4 import BeautifulSoup
import re
import os

In [2]:
def get_file_names(directory):
    """Return the names of the regular files directly inside `directory`.

    Entries that are not files (for example subdirectories) are left out.
    Names are returned without the directory prefix, in whatever order
    `os.listdir` yields them.
    """
    return [
        entry
        for entry in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, entry))
    ]

# Example usage: list every file in the 'hearings html' folder.
# `directory_path` and `file_names_list` are read again by the conversion loop below.
directory_path = 'hearings html'
file_names_list = get_file_names(directory_path)
print(file_names_list)

['CHRG-118shrg55851.htm', 'CHRG-118hhrg56705.htm', 'CHRG-118hhrg56063.htm', 'CHRG-118hhrg57369.htm', 'CHRG-118shrg57061.htm', 'CHRG-118hhrg55973.htm', 'CHRG-118shrg55299.htm', 'CHRG-118hhrg55634.htm', 'CHRG-118hhrg54502.htm', 'CHRG-118jhrg56240.htm', 'CHRG-118shrg56753.htm', 'CHRG-118hhrg55185.htm', 'CHRG-118hhrg57786.htm', 'CHRG-118jhrg56297.htm', 'CHRG-119shrg58427.htm', 'CHRG-118jhrg55563.htm', 'CHRG-118hhrg57221.htm', 'CHRG-118hhrg55436.htm', 'CHRG-118hhrg55378.htm', 'CHRG-118hhrg54927.htm', 'CHRG-118hhrg54926.htm', 'CHRG-118hhrg56664.htm', 'CHRG-119hhrg58538.htm', 'CHRG-118hhrg57220.htm', 'CHRG-118shrg56752.htm', 'CHRG-118jhrg56241.htm', 'CHRG-118hhrg56300.htm', 'CHRG-118hhrg56472.htm', 'CHRG-118hhrg55972.htm', 'CHRG-118hhrg56506.htm', 'CHRG-118shrg57060.htm', 'CHRG-118shrg55878.htm', 'CHRG-118hhrg59423.htm', 'CHRG-118hhrg56704.htm', 'CHRG-118hhrg57432.htm', 'CHRG-118hhrg55231.htm', 'CHRG-118shrg55850.htm', 'CHRG-118shrg55852.htm', 'CHRG-118hhrg55233.htm', 'CHRG-118hhrg57418.htm',

In [3]:
import shutil

# Make sure an empty "hearings txt" output directory is available:
# create it when missing, otherwise clear out the files it already holds.
directory_name = "hearings txt"
if not os.path.exists(directory_name):
    os.makedirs(directory_name)
    print(f"Directory '{directory_name}' created successfully.")
else:
    # Only regular files are removed; any subdirectories are left in place
    for filename in os.listdir(directory_name):
        file_path = os.path.join(directory_name, filename)
        if not os.path.isfile(file_path):
            continue
        os.unlink(file_path)
    print(f"All files in the directory '{directory_name}' have been deleted.")

All files in the directory 'hearings txt' have been deleted.


In [4]:
def clean_pre_content(pre_content):
    """Flatten multi-line text into one single-spaced line.

    Each line is stripped, blank lines are dropped, the remaining lines are
    joined with a single space, and every run of whitespace is collapsed to
    one space. The result carries no leading or trailing whitespace.
    """
    # Strip every line lazily, then keep only the ones with content left
    stripped_lines = (raw_line.strip() for raw_line in pre_content.splitlines())
    single_line = ' '.join(piece for piece in stripped_lines if piece)

    # Collapse any whitespace runs that remain inside the lines
    return re.sub(r'\s+', ' ', single_line).strip()

In [5]:
def find_second_occurrence(text, search_text):
    """Return the tail of `text` starting at the second occurrence of `search_text`.

    Occurrences are non-overlapping: the search for the second one resumes
    just past the end of the first.

    Args:
        text: String to search in.
        search_text: Substring to look for.

    Returns:
        `text[second_index:]` when `search_text` occurs at least twice.
        Otherwise one of two fixed message strings, which callers compare
        against verbatim:
        - "The search text does not occur in the given text."
        - "The search text does not occur twice in the given text."
        An empty `search_text` is reported as not occurring.
    """
    # An empty needle "matches" at every position, so str.find would report
    # index 0 twice and the whole text would come back as a bogus match.
    if not search_text:
        return "The search text does not occur in the given text."

    # Find the first occurrence
    first_occurrence = text.find(search_text)
    if first_occurrence == -1:
        return "The search text does not occur in the given text."

    # Find the second occurrence, starting right after the first one ends
    second_occurrence = text.find(search_text, first_occurrence + len(search_text))
    if second_occurrence == -1:
        return "The search text does not occur twice in the given text."

    # Return everything from the second occurrence onwards
    return text[second_occurrence:]

In [6]:
def remove_square_brackets(text):
    """Strip `[...]` and `<...>` spans from `text`.

    Square-bracketed spans are removed first, then angle-bracketed spans are
    removed from what remains. Both matches are non-greedy and stay within a
    single line, because `.` does not match a newline.
    """
    for bracket_pattern in (r'\[.*?\]', r'\<.*?\>'):
        text = re.sub(bracket_pattern, '', text)
    return text

In [7]:
# Convert every hearing HTML file into a plain-text file.
#
# For each file: read the text of its first <pre> element, build a heading
# from the upper-case lines that precede the "=====" rule, and keep only the
# text from the heading's second occurrence onwards. When the heading cannot
# be found twice, the whole text is written to "<name>_unprocessed.txt"
# instead, and the file is tallied in `count` / `unprocessed_files`.

# Messages find_second_occurrence returns when the heading is not found twice
NOT_FOUND_MESSAGES = (
    "The search text does not occur twice in the given text.",
    "The search text does not occur in the given text.",
)
# Rule line that ends the heading section of a hearing
HEADING_RULE = "======================================================================="
OUTPUT_DIRECTORY = "hearings txt"

count = 0
unprocessed_files = []
for file_name in file_names_list:
    file_path = os.path.join(directory_path, file_name)
    with open(file_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    # find() returns None when there is no <pre>; report the file and carry
    # on rather than letting an AttributeError abort the whole batch.
    pre_tag = soup.find('pre')
    if pre_tag is None:
        count += 1
        unprocessed_files.append(file_name)
        print(f"File: {file_name} - No <pre> element found; file skipped.")
        continue
    pre_text = pre_tag.get_text()

    # Collect the upper-case lines that appear before the rule line
    heading_lines = []
    for line in pre_text.splitlines():
        stripped_line = line.strip()
        if stripped_line == HEADING_RULE:
            break
        if stripped_line.isupper():
            heading_lines.append(stripped_line)
    # Normalise the heading's whitespace the same way as the content, so runs
    # of spaces inside a heading line cannot cause a spurious mismatch.
    heading = clean_pre_content(" ".join(heading_lines))

    content = clean_pre_content(pre_text)
    second_occurrence = find_second_occurrence(content, heading)

    if second_occurrence in NOT_FOUND_MESSAGES:
        # Heading not located twice: keep the full text, flagged by file name
        count += 1
        print(f"File: {file_name} - {second_occurrence}")
        unprocessed_files.append(file_name)
        text_to_clean = content
        output_suffix = '_unprocessed.txt'
    else:
        text_to_clean = second_occurrence
        output_suffix = '.txt'

    cleaned_text = remove_square_brackets(text_to_clean)

    output_file_path = os.path.join(OUTPUT_DIRECTORY, file_name.replace('.htm', output_suffix))
    with open(output_file_path, 'w', encoding='utf-8') as f:
        f.write(cleaned_text)
    print(f"Content has been written to {output_file_path}")

Content has been written to hearings txt/CHRG-118shrg55851.txt
Content has been written to hearings txt/CHRG-118hhrg56705.txt
Content has been written to hearings txt/CHRG-118hhrg56063.txt
Content has been written to hearings txt/CHRG-118hhrg57369.txt
Content has been written to hearings txt/CHRG-118shrg57061.txt
Content has been written to hearings txt/CHRG-118hhrg55973.txt
Content has been written to hearings txt/CHRG-118shrg55299.txt
Content has been written to hearings txt/CHRG-118hhrg55634.txt
Content has been written to hearings txt/CHRG-118hhrg54502.txt
Content has been written to hearings txt/CHRG-118jhrg56240.txt
Content has been written to hearings txt/CHRG-118shrg56753.txt
Content has been written to hearings txt/CHRG-118hhrg55185.txt
File: CHRG-118hhrg57786.htm - The search text does not occur in the given text.
Content has been written to hearings txt/CHRG-118hhrg57786_unprocessed.txt
File: CHRG-118jhrg56297.htm - The search text does not occur in the given text.
Content h

In [8]:
# Number of files the conversion loop flagged as unprocessed
# (their heading was not found twice in the text).
count

123

In [9]:
import os

def get_number_of_files(directory):
    """Count the regular files directly inside `directory`.

    Subdirectories and other non-file entries are not counted.
    """
    # Tally one for every directory entry that is a regular file
    return sum(
        1
        for entry in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, entry))
    )

# Example usage: count the files in the 'hearings html' input folder
directory_path = 'hearings html'
number_of_files = get_number_of_files(directory_path)
print(f"Number of files in the directory '{directory_path}': {number_of_files}")

Number of files in the directory 'hearings html': 521


In [10]:
# Count the converted text files. A dedicated name is used here instead of
# reassigning `directory_path`: the conversion loop reads `directory_path` as
# the HTML input folder, so overwriting it would make a re-run of that loop
# read from the output folder.
output_directory_path = 'hearings txt'
number_of_files = get_number_of_files(output_directory_path)
print(f"Number of files in the directory '{output_directory_path}': {number_of_files}")

Number of files in the directory 'hearings txt': 521
