In [1]:
import os
import re
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Ensure the NLTK stopwords corpus is available; if it has not been
# downloaded yet, run the following once:
# import nltk
# nltk.download('stopwords')

# Set of extremely common English words that clean_text() drops in its
# step (iv). Held as a set so the per-token membership test is cheap;
# adjust or extend this list as needed.
stop_words = set(stopwords.words('english'))

# English SnowballStemmer (Porter2), created once here and reused by
# clean_text() for every token in its step (v).
stemmer = SnowballStemmer("english")

def clean_text(text):
    """Normalise one speech into a space-separated string of stemmed tokens.

    The steps run in this order:

    (i)   Hyphens and apostrophes are deleted outright, joining the
          characters on either side.
    (ii)  Parenthesised passages (non-spoken insertions) are removed.
    (iii) Every remaining character that is neither a word character nor
          whitespace is replaced by a space.
    (iv)  The text is lowercased, split on whitespace, and tokens found in
          the module-level ``stop_words`` set are dropped.
    (v)   Surviving tokens are stemmed with the module-level ``stemmer``.

    Note: because apostrophes are deleted in step (i), no token can contain
    one, so any stop-list entry that contains an apostrophe never matches
    in step (iv).

    Args:
        text: Raw speech text.

    Returns:
        The stemmed tokens joined by single spaces; an empty string when
        no token survives.
    """
    # (i) Strip hyphens and apostrophes without leaving a gap behind.
    without_joiners = text.translate(str.maketrans("", "", "-'"))

    # (ii) Drop anything enclosed in a pair of parentheses.
    spoken_only = re.sub(r'\([^)]*\)', '', without_joiners)

    # (iii) Turn all other punctuation into token separators.
    depunctuated = re.sub(r'[^\w\s]', ' ', spoken_only)

    stems = []
    for word in depunctuated.lower().split():
        # (iv) Extremely common words carry no signal; skip them.
        if word in stop_words:
            continue
        # (v) Reduce the remaining word to its Porter2 stem.
        stems.append(stemmer.stem(word))

    return " ".join(stems)

def clean_speeches_file(input_file, output_file):
    """Clean every speech in a pipe-delimited file and write the results.

    The input is read as latin-1 and its first line is skipped as a header.
    Each remaining non-blank line is expected to look like
    ``speech_id|speech_text``; only the first ``|`` is treated as the
    delimiter, so the speech text may itself contain ``|``. The text is
    passed through ``clean_text`` and written to the output file (UTF-8) as
    ``speech_id|cleaned_text``, one speech per line.

    Blank lines are ignored. Lines without a ``|`` are reported on stdout
    and skipped. An input file with no lines at all yields an empty output
    file instead of raising ``StopIteration``.

    Args:
        input_file: Path of the raw speeches file to read.
        output_file: Path of the cleaned file to create or overwrite.
    """
    with open(input_file, 'r', encoding='latin-1') as f_in, \
            open(output_file, 'w', encoding='utf-8') as f_out:
        # Skip the header/sample line. Supplying a default keeps a
        # completely empty input file from raising StopIteration here.
        next(f_in, None)

        for raw_line in f_in:
            line = raw_line.strip()
            if not line:
                continue

            # Split on the first '|' only; an empty separator means the
            # line has no delimiter at all.
            speech_id, delimiter, speech_text = line.partition('|')
            if not delimiter:
                print(f"Skipping line with unexpected format: {line}")
                continue

            # Write the cleaned data preserving the original speech_id.
            f_out.write(f"{speech_id}|{clean_text(speech_text)}\n")

if __name__ == "__main__":
    # Directory that holds the raw speech files and receives the cleaned ones.
    data_dir = "C:\\Users\\Maxfield Evers\\Desktop\\Thesis\\Data\\gentzkow_work\\hein-daily\\hein-daily"

    # Walk sessions 097 through 114 inclusive.
    for session in range(97, 115):
        session_str = format(session, "03d")  # zero-padded, e.g. "097"
        input_filename = f"{data_dir}\\speeches_{session_str}.txt"
        output_filename = f"{data_dir}\\cleaned_speeches_{session_str}.txt"

        # Report a missing session file and move on to the next one.
        if not os.path.exists(input_filename):
            print(f"File {input_filename} not found.")
            continue

        print(f"Cleaning {input_filename}...")
        clean_speeches_file(input_filename, output_filename)
        print(f"Saved cleaned file to {output_filename}")

Cleaning C:\Users\Maxfield Evers\Desktop\Thesis\Data\gentzkow_work\hein-daily\hein-daily\speeches_097.txt...
Saved cleaned file to C:\Users\Maxfield Evers\Desktop\Thesis\Data\gentzkow_work\hein-daily\hein-daily\cleaned_speeches_097.txt
Cleaning C:\Users\Maxfield Evers\Desktop\Thesis\Data\gentzkow_work\hein-daily\hein-daily\speeches_098.txt...
Saved cleaned file to C:\Users\Maxfield Evers\Desktop\Thesis\Data\gentzkow_work\hein-daily\hein-daily\cleaned_speeches_098.txt
Cleaning C:\Users\Maxfield Evers\Desktop\Thesis\Data\gentzkow_work\hein-daily\hein-daily\speeches_099.txt...
Saved cleaned file to C:\Users\Maxfield Evers\Desktop\Thesis\Data\gentzkow_work\hein-daily\hein-daily\cleaned_speeches_099.txt
Cleaning C:\Users\Maxfield Evers\Desktop\Thesis\Data\gentzkow_work\hein-daily\hein-daily\speeches_100.txt...
Saved cleaned file to C:\Users\Maxfield Evers\Desktop\Thesis\Data\gentzkow_work\hein-daily\hein-daily\cleaned_speeches_100.txt
Cleaning C:\Users\Maxfield Evers\Desktop\Thesis\Data\gen