#### Goal: text preprocessing with NLTK, proofreading results
#### Data: Reuters Corpus Reuters-21578
#### http://www.daviddlewis.com/resources/testcollections/reuters21578/

In [1]:
import os
import nltk
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer

# Initialize NLTK: fetch the tokenizer models that nltk.word_tokenize loads.
# Older NLTK releases read the 'punkt' package; newer releases read
# 'punkt_tab' instead, so both are requested to keep the notebook runnable
# on either. NOTE(review): a package missing from the index is expected to
# be reported by the downloader rather than raised - confirm on the NLTK
# version in use.
nltk.download('punkt')
nltk.download('punkt_tab')
os.getcwd()

[nltk_data] Downloading package punkt to /home/kaveh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


'/home/kaveh/nazli/inforet/InfoRet/P1'

#### 1. Read the Reuters collection and extract the raw text of each Reuters news item (these are your documents) from the corpus

In [2]:
# Function to read Reuters collection and extract raw text
def extract_raw_text_from_reuters(folder_path, num_documents):
    """Write the body text of up to `num_documents` Reuters items to disk.

    Each ``.sgm`` file in `folder_path` is parsed with BeautifulSoup and its
    ``<reuters>`` elements are visited in document order. For every visited
    element, the text of each of its ``<body>`` elements is written, followed
    by a newline, to ``raw_text_<newid>.txt`` in the current working
    directory.

    Args:
        folder_path: Directory that contains the ``.sgm`` files.
        num_documents: Maximum number of items to extract across all files.

    Returns:
        None. The result is the set of files written.
    """
    document_count = 0  # items written so far, across all files

    # Sorted so the same documents are selected on every run; os.listdir
    # returns names in arbitrary order.
    for filename in sorted(os.listdir(folder_path)):
        if document_count >= num_documents:
            break  # quota reached: do not read or parse any further file
        if not filename.endswith('.sgm'):
            continue

        sgml_file = os.path.join(folder_path, filename)
        with open(sgml_file, 'r', encoding='latin-1') as file:
            content = file.read()

        file_content = BeautifulSoup(content, 'html.parser')

        # find_all returns an empty result for a file without <reuters>
        # elements and simply ends when the file is exhausted, so extraction
        # continues with the next file instead of subscripting None.
        # `remaining` is always > 0 here because of the guard above.
        remaining = num_documents - document_count
        for doc in file_content.find_all('reuters', limit=remaining):
            doc_id = doc['newid']

            with open(f"raw_text_{doc_id}.txt", 'w') as output:
                for body_element in doc.find_all('body'):
                    output.write(body_element.get_text())
                    output.write("\n")

            document_count += 1




In [3]:
# Location of the reuters21578/ folder, given relative to the working directory
folder_path = 'reuters21578/'

# Upper bound on the number of news items to extract
num_documents = 5

extract_raw_text_from_reuters(
    folder_path=folder_path,
    num_documents=num_documents,
)


In [4]:
# Function to tokenize the text
def tokenize_text(text):
    """Return the tokens that ``nltk.word_tokenize`` produces for `text`."""
    tokens = nltk.word_tokenize(text)
    return tokens

In [5]:

# tokenizer

# Tokenize every raw_text_<id>.* file in the working directory and write the
# tokens, one per line, to Tokenizer-output_<id>.txt.
raw_prefix = 'raw_text_'
all_files = sorted(os.listdir(os.getcwd())) #list of files in the current directory
for each_file in all_files:
    if not each_file.startswith(raw_prefix):
        continue

    # The id is the text between the prefix and the first '.'. partition()
    # yields the whole remainder when there is no '.', whereas slicing with
    # find() == -1 would silently drop the last character.
    doc_id = int(each_file[len(raw_prefix):].partition('.')[0])

    # Both handles are managed by `with`, so they are closed even if
    # tokenization raises. The names avoid rebinding the builtins
    # `input` and `id` for the rest of the kernel session.
    with open(each_file, 'r') as raw_file, \
            open(f"Tokenizer-output_{doc_id}.txt", 'w') as token_file:
        for line in raw_file:
            for token in tokenize_text(line):
                token_file.write(token)
                token_file.write("\n")


In [6]:
# Function to make text lowercase
def make_lowercase(text):
    """Return `text` with its characters converted to lowercase."""
    lowered = text.lower()
    return lowered

In [7]:

# lowercase

# Lowercase every Tokenizer-output_<id>.* file in the working directory and
# write the result to Lowercased-output_<id>.txt.
tokenized_prefix = 'Tokenizer-output_'
all_files = sorted(os.listdir(os.getcwd())) #list of files in the current directory
for each_file in all_files:
    if not each_file.startswith(tokenized_prefix):
        continue

    # The id is the text between the prefix and the first '.'. partition()
    # yields the whole remainder when there is no '.', whereas slicing with
    # find() == -1 would silently drop the last character.
    doc_id = int(each_file[len(tokenized_prefix):].partition('.')[0])

    # Both handles are managed by `with`, so they are closed even if an
    # error occurs. The names avoid rebinding the builtins `input` and `id`
    # for the rest of the kernel session.
    with open(each_file, 'r') as token_file, \
            open(f"Lowercased-output_{doc_id}.txt", 'w') as lowered_file:
        for line in token_file:
            # Each line keeps its own trailing newline, so none is added.
            lowered_file.write(make_lowercase(line))

In [20]:
# Function to apply Porter stemmer
def apply_porter_stemmer(token):
    """Return the Porter stem of `token` after stripping surrounding whitespace.

    The PorterStemmer instance is created on first use and stored on the
    function object, so repeated calls reuse it instead of constructing a new
    stemmer for every token.
    """
    stemmer = getattr(apply_porter_stemmer, '_stemmer', None)
    if stemmer is None:
        stemmer = PorterStemmer()
        apply_porter_stemmer._stemmer = stemmer
    return stemmer.stem(token.strip())

In [23]:
# stemmer

# Stem every Lowercased-output_<id>.* file in the working directory, one
# token per line, and write the stems to Stemmed-output_<id>.txt.
lowered_prefix = 'Lowercased-output_'
all_files = sorted(os.listdir(os.getcwd())) #list of files in the current directory
for each_file in all_files:
    if not each_file.startswith(lowered_prefix):
        continue

    # The id is the text between the prefix and the first '.'. partition()
    # yields the whole remainder when there is no '.', whereas slicing with
    # find() == -1 would silently drop the last character.
    doc_id = int(each_file[len(lowered_prefix):].partition('.')[0])

    # Both handles are managed by `with`, so they are closed even if
    # stemming raises. The names avoid rebinding the builtins `input` and
    # `id` for the rest of the kernel session.
    with open(each_file, 'r') as lowered_file, \
            open(f"Stemmed-output_{doc_id}.txt", 'w') as stemmed_file:
        for line in lowered_file:
            # apply_porter_stemmer strips the line, so the newline is re-added.
            stemmed_file.write(apply_porter_stemmer(line))
            stemmed_file.write("\n")

In [24]:
#make a stopwords file

# Get a list of NLTK English stopwords
nltk.download('stopwords')
stop_words = set(nltk.corpus.stopwords.words('english'))

# Save the stopwords used to a file. They are written in sorted order because
# set iteration order is not stable between runs; the with-block guarantees
# the file is closed even if the write fails.
with open('Stopwords-used-for-output.txt', 'w') as stopwords_used_output:
    stopwords_used_output.write('\n'.join(sorted(stop_words)))


[nltk_data] Downloading package stopwords to /home/kaveh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
# stop words

# Drop every line of Stemmed-output_<id>.* whose stripped content is in
# `stop_words`, and write the surviving lines to No-stopword-output_<id>.txt.
# NOTE(review): the input tokens have already been stemmed while `stop_words`
# holds unstemmed words, so stop words whose stem differs from the word itself
# presumably slip through - confirm whether filtering should run before
# stemming.
stemmed_prefix = 'Stemmed-output_'
all_files = sorted(os.listdir(os.getcwd())) #list of files in the current directory
for each_file in all_files:
    if not each_file.startswith(stemmed_prefix):
        continue

    # The id is the text between the prefix and the first '.'. partition()
    # yields the whole remainder when there is no '.', whereas slicing with
    # find() == -1 would silently drop the last character.
    doc_id = int(each_file[len(stemmed_prefix):].partition('.')[0])

    # Both handles are managed by `with`, so they are closed even if an
    # error occurs. The names avoid rebinding the builtins `input` and `id`
    # for the rest of the kernel session.
    with open(each_file, 'r') as stemmed_file, \
            open(f"No-stopword-output_{doc_id}.txt", 'w') as filtered_file:
        for line in stemmed_file:
            if line.strip() in stop_words:
                print( f"removed {line}" )
            else:
                # The line keeps its own trailing newline, so none is added.
                filtered_file.write(line)

removed did

removed hasn't

removed was

removed hadn

removed it

removed any

removed o

removed after

removed of

removed if

removed as

removed yourselves

removed can

removed ours

removed further

removed few

removed before
removed and

removed they

removed to

removed a

removed to

removed the

removed and

removed of

removed both

removed is

removed a

removed of

removed which

removed a

removed in

removed the

removed will

removed be

removed and

removed will

removed be

removed by

removed under

removed the

removed of

removed a

removed it

removed an

removed with

removed the

removed of

removed the

removed in

removed an

removed to

removed the

removed in

removed the

removed the

removed having

removed in

removed and

removed in

removed is

removed not

removed under

removed to

removed on

removed its

removed and

removed do

removed to

removed it

removed because

removed of

removed the

removed some

removed they

removed have

removed its