In [1]:
# Importing and installing packages
!pip install PyPDF2
from PyPDF2 import PdfReader
import re

Collecting PyPDF2
  Using cached pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [2]:
# Functions
def extract_text_from_pdf(pdf_path, start, end):
    """Return the text of a range of pages from a PDF as one string.

    Args:
        pdf_path: Path (or other source accepted by ``PdfReader``) of the PDF.
        start: Index of the first page to include, used as the lower bound
            of the slice ``pages[start:end]``.
        end: Upper bound of that slice; the page at this index is excluded.

    Returns:
        The text extracted from each selected page, in page order, with a
        newline appended after every page. An empty selection yields ``""``.
    """
    selected_pages = PdfReader(pdf_path).pages[start:end]
    # Every page contributes its text followed by exactly one "\n".
    return "".join(page.extract_text() + "\n" for page in selected_pages)

def save_text_to_file(text, output_path, encoding='utf-8'):
    """Write ``text`` to ``output_path``, replacing any existing file.

    Args:
        text: The string to write.
        output_path: Destination file path; it is opened in ``'w'`` mode, so
            previous contents are overwritten.
        encoding: Text encoding used for the file. Defaults to UTF-8 so the
            result does not depend on the platform's locale encoding and
            non-ASCII characters (curly quotes, dashes, ligatures) can be
            written.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # An explicit encoding avoids UnicodeEncodeError on platforms whose
    # default text encoding cannot represent every character in the text.
    with open(output_path, 'w', encoding=encoding) as file:
        file.write(text)

In [3]:
# TB Flats transcripts
def remove_tbf(text):
    """Strip line numbers and page boilerplate from TB Flats transcript text.

    Processing is line by line (split on ``'\\n'``):

    * a line made up solely of digits, with optional surrounding whitespace,
      is dropped entirely;
    * on every other line the leading whitespace/digits/whitespace run is
      removed, then the hearing header and the reporting-service footer are
      deleted wherever they occur in the line.

    A line that held only a header or footer is kept as an empty string.

    Args:
        text: Raw transcript text.

    Returns:
        The cleaned lines joined back together with ``'\\n'``.
    """
    bare_number = re.compile(r'^\s*\d+\s*$')
    leading_number = re.compile(r'^\s*\d+\s*')
    # NOTE: the dots in the footer pattern are unescaped, so each one matches
    # any single character, not just a literal ".".
    boilerplate = (
        re.compile(r'TB Flats Hearing DEQ/ISC 18-01'),
        re.compile(r'.800.444.2826Wyoming Reporting Service, Inc.\d+'),
    )

    def scrub(raw_line):
        # Line number first, then header/footer, in that fixed order.
        stripped = leading_number.sub('', raw_line)
        for pattern in boilerplate:
            stripped = pattern.sub('', stripped)
        return stripped

    return '\n'.join(
        scrub(raw_line)
        for raw_line in text.split('\n')
        if not bare_number.fullmatch(raw_line)
    )

# Source PDFs for the two TB Flats volumes and the text files that receive
# their cleaned contents.
input_pdf, input_pdf_2 = "TB Flats Part 1.pdf", "TB Flats Part 2.pdf"
output_txt, output_txt_2 = 'tbf1_cleaned.txt', 'tbf2_cleaned.txt'

# Extract only the wanted page range of each volume. The numbers are 0-based
# slice bounds (pages[start:end], end excluded).
# NOTE(review): presumably chosen to skip front/back matter - confirm
# against the PDFs.
raw_text, raw_text_2 = (
    extract_text_from_pdf(input_pdf, 6, 343),
    extract_text_from_pdf(input_pdf_2, 5, 202),
)

# Strip line numbers, the hearing header and the reporter footer.
cleaned_text, cleaned_text_2 = remove_tbf(raw_text), remove_tbf(raw_text_2)

# Persist each cleaned volume to its own file.
save_text_to_file(cleaned_text, output_txt)
save_text_to_file(cleaned_text_2, output_txt_2)

print(f"Line numbers, headers, and unwanted pages removed. Cleaned transcripts saved to {output_txt} and {output_txt_2}.")

Line numbers, headers, and unwanted pages removed. Cleaned transcripts saved to tbf1_cleaned.txt and tbf2_cleaned.txt.


In [7]:
# Boswell transcripts
def remove_boswell(text):
    only_numbers = re.compile(r'^\s*\d+\s*$')
    cleaned_lines = []
    for line in text.split('\n'):
        if only_numbers.fullmatch(line):
            continue
        # Using regex to remove line numbers at the beginning of each line
        cleaned_line = re.sub(r'^\s*\d+\s*', '', line)
        cleaned_line = re.sub(r'(Page)\s*\d+', '', cleaned_line)
        cleaned_lines.append(cleaned_line)
    return '\n'.join(cleaned_lines)

# Hearing transcript volumes and the text files that receive their cleaned
# contents.
# NOTE: output_txt_2 is also assigned in the TB Flats cell; this assignment
# rebinds it to the Boswell file name.
boswell_pdf, boswell_pdf_2, boswell_pdf_3 = (
    "15-05 Hearing Transcript Vol 1.pdf",
    "15-05 Hearing Transcript Vol 2.pdf",
    "15-05 Hearing Transcript Vol 3.pdf",
)
output_txt_1, output_txt_2, output_txt_3 = (
    'boswell_cleaned_1.txt',
    'boswell_cleaned_2.txt',
    'boswell_cleaned_3.txt',
)

# Extract only the wanted page range of each volume. The numbers are 0-based
# slice bounds (pages[start:end], end excluded).
# NOTE(review): presumably chosen to skip front/back matter - confirm
# against the PDFs.
boswell_raw, boswell_raw_2, boswell_raw_3 = (
    extract_text_from_pdf(boswell_pdf, 5, 237),
    extract_text_from_pdf(boswell_pdf_2, 5, 358),
    extract_text_from_pdf(boswell_pdf_3, 1, 157),
)

# Strip line numbers and "Page N" markers from each volume.
cleaned_boswell, cleaned_2, cleaned_3 = (
    remove_boswell(boswell_raw),
    remove_boswell(boswell_raw_2),
    remove_boswell(boswell_raw_3),
)

# Persist each cleaned volume to its own file.
save_text_to_file(cleaned_boswell, output_txt_1)
save_text_to_file(cleaned_2, output_txt_2)
save_text_to_file(cleaned_3, output_txt_3)

print(f"Line numbers, headers, and unwanted pages removed. Cleaned transcript saved to {output_txt_1}, {output_txt_2}, and {output_txt_3}.")

Line numbers, headers, and unwanted pages removed. Cleaned transcript saved to boswell_cleaned_1.txt, boswell_cleaned_2.txt, and boswell_cleaned_3.txt.
