### Comp 479, Fall 2023, Project 1

#### Goal: text preprocessing with NLTK, proofreading results
#### Data: Reuters corpus (Reuters-21578)
#### http://www.daviddlewis.com/resources/testcollections/reuters21578/

#### 1. read the Reuters collection and extract the raw text of each Reuters news item (these are your documents) from the corpus
#### 2. tokenize
#### 3. make all text lowercase
#### 4. apply Porter stemmer
#### 5. given a list of stop words, remove those stop words from the text. Note that your code has to accept the stop word list as a parameter; do not hardcode a particular list

In [36]:
import os
import nltk
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer

# Initialize NLTK: fetch the 'punkt' tokenizer data, which nltk.word_tokenize
# relies on. The call only reports "already up-to-date" when the package is
# present locally.
# NOTE(review): recent NLTK releases reportedly look for 'punkt_tab' instead -
# confirm against the installed NLTK version.
nltk.download('punkt')

# Function to read Reuters collection and extract raw text
def extract_raw_text_from_reuters(folder_path, num_documents):
    """Collect the raw text of at most ``num_documents`` news items.

    Every ``.sgm`` file directly inside ``folder_path`` is read (decoded as
    latin-1) and parsed; the text of each ``<body>`` element found is one
    document.

    Args:
        folder_path: Directory containing the ``.sgm`` files.
        num_documents: Maximum number of documents to return.

    Returns:
        A list with the text of the first ``num_documents`` body elements,
        taken file by file in sorted filename order. The list is shorter when
        the folder holds fewer documents and empty when ``num_documents`` is
        zero or negative.

    Raises:
        OSError: If ``folder_path`` cannot be listed or a file cannot be read.
    """
    raw_text_collection = []

    # os.listdir() gives no ordering guarantee; sorting makes the selected
    # documents the same on every run and every platform.
    for filename in sorted(os.listdir(folder_path)):
        if len(raw_text_collection) >= num_documents:
            break
        if not filename.endswith('.sgm'):
            continue

        sgml_file = os.path.join(folder_path, filename)
        with open(sgml_file, 'r', encoding='latin-1') as file:
            content = file.read()

        soup = BeautifulSoup(content, 'html.parser')

        # A single file may hold many <body> elements, so the limit has to be
        # enforced per document, not only once per file.
        for body_element in soup.find_all('body'):
            raw_text_collection.append(body_element.get_text())
            if len(raw_text_collection) >= num_documents:
                break

    return raw_text_collection

# Function to tokenize the text
def tokenize_text(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``nltk.word_tokenize`` produces for ``text``.
    """
    tokens = nltk.word_tokenize(text)
    return tokens

# Function to make text lowercase
def make_lowercase(text):
    """Return a lowercased copy of ``text``.

    Args:
        text: The string to convert.

    Returns:
        The result of ``text.lower()``.
    """
    lowered = text.lower()
    return lowered

# Function to apply Porter stemmer
def apply_porter_stemmer(tokens):
    """Stem every token with a freshly created NLTK ``PorterStemmer``.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A list holding the stem of each token, in the original order.
    """
    stem = PorterStemmer().stem
    return list(map(stem, tokens))

# Function to remove stop words
def remove_stop_words(tokens, stop_words):
    """Drop every token that is a stop word, ignoring case.

    Args:
        tokens: Iterable of word strings to filter.
        stop_words: Iterable of stop word strings (set, list, generator, ...).

    Returns:
        A list of the tokens whose lowercase form is not a stop word, in
        their original order and original casing.
    """
    # Build the lookup set once: this gives O(1) membership tests for list
    # input, keeps a one-shot iterable from being exhausted by the first
    # lookups, and lowercases the stop words so both sides of the comparison
    # are normalized the same way.
    stop_word_set = {stop_word.lower() for stop_word in stop_words}
    return [word for word in tokens if word.lower() not in stop_word_set]

# Main pipeline function
def text_processing_pipeline(folder_path, stop_words, num_documents):
    """Run the preprocessing steps and write each step's result to a file.

    For every extracted document the steps are chained, each one consuming
    the previous step's output: tokenize -> lowercase -> Porter stem ->
    remove stop words. Five UTF-8 text files are written to the current
    working directory:

    * ``Tokenizer-output.txt``
    * ``Lowercased-output.txt``
    * ``Stemmed-output.txt``
    * ``No-stopword-output.txt``
    * ``Stopwords-used-for-output.txt`` (one stop word per line)

    Args:
        folder_path: Directory passed to ``extract_raw_text_from_reuters``.
        stop_words: Iterable of stop word strings; never hardcoded here.
        num_documents: Document limit passed to
            ``extract_raw_text_from_reuters``.

    Returns:
        None. The results are the files listed above.
    """
    from contextlib import ExitStack

    # Materialize once: the stop words are consumed twice (written to a file
    # and used for filtering), which would exhaust a one-shot iterable.
    stop_words = list(stop_words)

    # Step 1: Extract raw text from Reuters collection
    raw_text_collection = extract_raw_text_from_reuters(folder_path, num_documents)

    # ExitStack closes every output file even when a later step raises.
    with ExitStack() as stack:

        def open_output(name):
            # The corpus text may contain non-ASCII characters, so do not
            # depend on the platform's default encoding for the output.
            return stack.enter_context(open(name, 'w', encoding='utf-8'))

        tokenizer_output = open_output('Tokenizer-output.txt')
        lowercased_output = open_output('Lowercased-output.txt')
        stemmed_output = open_output('Stemmed-output.txt')
        no_stopword_output = open_output('No-stopword-output.txt')

        stopwords_used_output = open_output('Stopwords-used-for-output.txt')
        stopwords_used_output.write('\n'.join(stop_words))

        for i, raw_text in enumerate(raw_text_collection):
            # Step 2: Tokenize the raw text
            tokens = tokenize_text(raw_text)
            tokenizer_output.write(f"Document {i + 1}:\n{' '.join(tokens)}\n\n")

            # Step 3: Lowercase the tokens produced by step 2
            lowercase_tokens = [make_lowercase(token) for token in tokens]
            lowercased_output.write(f"Document {i + 1}:\n{' '.join(lowercase_tokens)}\n\n")

            # Step 4: Apply Porter stemmer to the lowercased tokens
            stemmed_tokens = apply_porter_stemmer(lowercase_tokens)
            stemmed_output.write(f"Document {i + 1}:\n{' '.join(stemmed_tokens)}\n\n")

            # Step 5: Remove stop words from the stemmed tokens.
            # NOTE: stemming runs first, so a stop word whose stem differs
            # from its surface form can survive this step.
            no_stopwords_tokens = remove_stop_words(stemmed_tokens, stop_words)
            no_stopword_output.write(f"Document {i + 1}:\n{' '.join(no_stopwords_tokens)}\n\n")

# Location of the corpus folder, relative to the current working directory
folder_path = 'reuters21578/'

# Upper bound on the number of documents to process
num_documents = 5

# Example stop word list: NLTK's English stop words, fetched on demand
nltk.download('stopwords')
english_stop_words = nltk.corpus.stopwords.words('english')
stop_words = set(english_stop_words)
del english_stop_words

# Run the pipeline
text_processing_pipeline(
    folder_path=folder_path,
    stop_words=stop_words,
    num_documents=num_documents,
)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nazliensafi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nazliensafi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
