In [83]:
import os.path
import re
import html
import string

In [85]:
# Location of the corpus; os.path.join picks the path separator of the host OS.
source_folder = os.path.join('.', 'reuters21578')

# Shared accumulator of parsed articles; getDocuments() appends one
# [doc_id, text] entry per article to it.
documents = list()


# get every article as a document where the id of the document is NEWID parameter of the article
def getDocuments(folder=None, store=None, encoding='latin-1'):
    """Parse every ``.sgm`` file of a folder into ``[doc_id, text]`` entries.

    Each file is split on the ``</REUTERS>`` closing tag. For every article the
    NEWID attribute becomes the integer document id, HTML character references
    are unescaped, and the text is built as the lower-cased title followed by
    a space and the body (only the first character of the body is lower-cased).
    Articles with neither <TITLE> nor <BODY> fall back to the raw content of
    <TEXT>.

    Parameters
    ----------
    folder : str, optional
        Directory to scan. Defaults to the module-level ``source_folder``.
    store : list, optional
        List that receives the ``[doc_id, text]`` entries. Defaults to the
        module-level ``documents``.
    encoding : str, optional
        Encoding used to read the files. ``latin-1`` maps every byte to a
        character, so reading never fails on non-ASCII bytes and does not
        depend on the locale of the machine running the notebook.

    Returns
    -------
    None
        Results are appended to ``store``. Nothing is returned so that calling
        the function as the last statement of a cell does not echo the corpus.

    Notes
    -----
    If an article has no <TITLE>, no <BODY> and no usable <TEXT>, the article
    is printed followed by ``error`` and loading stops at that point.
    An article chunk without a NEWID attribute raises AttributeError.
    """
    if folder is None:
        folder = source_folder
    if store is None:
        store = documents

    # re.DOTALL makes "." match newlines too, so these patterns match exactly
    # what the "(.|\n)*" idiom matches, without capturing a group per character.
    newid_pattern = re.compile(r"NEWID=\"([0-9]+)\">")
    title_pattern = re.compile(r'<TEXT.*<TITLE>(.*)</TITLE', re.DOTALL)
    body_pattern = re.compile(r'<TEXT.*<BODY>(.*)</BODY>', re.DOTALL)
    # "[^\n]+\n" consumes the rest of the <TEXT ...> line, as ".+\n" did.
    text_pattern = re.compile(r'<TEXT[^\n]+\n(.+)</TEXT>', re.DOTALL)

    # os.listdir returns names in arbitrary order; sorting keeps the order of
    # the collected documents reproducible between runs and machines.
    for file in sorted(os.listdir(folder)):

        # only files with an extension .sgm hold articles
        if not file.endswith('.sgm'):
            continue

        # the with-statement closes the file; no explicit close() is needed
        with open(os.path.join(folder, file), 'r', encoding=encoding) as f:
            file_as_string = f.read()

        # Splitting on </REUTERS> yields (# of articles) + 1 chunks: the last
        # chunk is whatever follows the final closing tag and holds no article.
        reuters = re.split('</REUTERS>', file_as_string)

        for reuter in reuters[:-1]:

            # the document id is the number inside NEWID="[number]">
            doc_id = int(newid_pattern.search(reuter).group(1))

            # get rid of html escape characters like '&lt;', '&#3;'
            reuter = html.unescape(reuter)
            doc = ""

            # title: "<TEXT...<TITLE>[title_text]</TITLE"
            title_search = title_pattern.search(reuter)
            if title_search is not None:
                doc += title_search.group(1).lower()

            # body: "<TEXT...<BODY>[body_text]</BODY>"
            body_search = body_pattern.search(reuter)
            if body_search is not None:
                body = body_search.group(1)
                # body[:1] instead of body[0]: an empty <BODY></BODY> must not
                # raise IndexError
                doc += " " + body[:1].lower() + body[1:]

            # Neither title nor body: the article is in UNPROC format and only
            # carries a <TEXT> element.
            if title_search is None and body_search is None:
                text_search = text_pattern.search(reuter)
                if text_search is None:
                    # same reaction as before (report and stop), but without a
                    # bare except that would also hide unrelated errors
                    print(reuter, '\nerror')
                    return
                body = text_search.group(1)
                doc = body[:1].lower() + body[1:]

            # gather the documents in the output list
            store.append([doc_id, doc])


# Run the loader once so that `documents` is filled for the cells that follow.
getDocuments()

In [32]:
def getClitics(clitic_path=None):
    """Load the clitic list, one entry per line, into a set.

    Parameters
    ----------
    clitic_path : str, optional
        File to read. Defaults to ``clitics.txt`` in the current directory.

    Returns
    -------
    set of str
        The non-empty lines of the file with their line terminator removed.
    """
    if clitic_path is None:
        clitic_path = os.path.join('.', 'clitics.txt')

    clitics = set()
    with open(clitic_path) as clitic_file:
        for line in clitic_file:
            # rstrip('\n') rather than line[:-1]: the last line of a file may
            # have no trailing newline, and slicing would cut a real character
            clitic = line.rstrip('\n')
            # blank lines would put '' in the set, which is not a clitic
            if clitic:
                clitics.add(clitic)

    return clitics

In [116]:
# Tokenize the first loaded document.
clitics = getClitics()

# Character class with every ASCII punctuation mark. re.escape keeps "\", "]",
# "^" and "-" literal instead of relying on how they happen to be ordered in
# string.punctuation.
_punctuation_class = f'[{re.escape(string.punctuation)}]'

# Two alternatives, tried in this order at each position of a word:
#   group 1 - a number that may contain / : . , inside but must start and end
#             with a digit, so "1987." or "155,221," lose the trailing mark;
#   group 2 - a run of word characters and hyphens.
# Punctuation before a number and after a word is shaved off.
shave_string = (_punctuation_class + r'*(\d(?:[\d/:.,]*\d)?)|([\w-]+)'
                + _punctuation_class + '*')

# A word closes a sentence when it ENDS with . ? or ! (optionally followed by
# closing quotes/brackets). Anchoring at the end keeps decimals such as "5.93"
# from being taken for a sentence end.
end_of_sentence = r'[.?!]+["\')\]]*$'


def _tokenize(text, clitics):
    """Split ``text`` on whitespace and normalise each word into tokens.

    Rules, per whitespace-separated word:
    * a word whose lower-case form is in ``clitics`` is emitted lower-cased
      and otherwise untouched;
    * any other word is shaved with ``shave_string``; words with no match
      (bare punctuation) emit nothing;
    * a hyphenated token starting with an upper-case letter is split on the
      hyphens; any other hyphenated token has its hyphens removed;
    * a piece keeps its case only if it starts with an upper-case letter and
      the previous word did not end a sentence; otherwise it is lower-cased.

    Returns the list of tokens in reading order.
    """
    shave_pattern = re.compile(shave_string)
    sentence_end_pattern = re.compile(end_of_sentence)

    tokens = []
    sentence_beginning = False

    # str.split() never yields empty strings, unlike re.split(r'\s+', ...)
    # on text that starts or ends with whitespace.
    for word in text.split():

        if word.lower() in clitics:
            tokens.append(word.lower())
        else:
            shaved = shave_pattern.search(word)
            if shaved is not None:
                token = shaved.group(1) if shaved.group(1) is not None else shaved.group(2)

                if '-' in token:
                    pieces = token.split('-') if token[0].isupper() else [token.replace('-', '')]
                else:
                    pieces = [token]

                for piece in pieces:
                    # "--" or "A-" leave empty pieces behind; indexing them
                    # would raise IndexError, and they carry no content
                    if not piece:
                        continue
                    if sentence_beginning or not piece[0].isupper():
                        piece = piece.lower()
                    tokens.append(piece)

        # refreshed for every word, clitic or not, so a stale flag never leaks
        # onto the word after a clitic
        sentence_beginning = sentence_end_pattern.search(word) is not None

    return tokens


tokens = _tokenize(documents[0][1], clitics)
print(tokens)


['bahia', 'cocoa', 'review', 'showers', 'continued', 'throughout', 'the', 'week', 'in', 'the', 'Bahia', 'cocoa', 'zone', 'alleviating', 'the', 'drought', 'since', 'early', 'January', 'and', 'improving', 'prospects', 'for', 'the', 'coming', 'temporao', 'although', 'normal', 'humidity', 'levels', 'have', 'not', 'been', 'restored', 'Comissaria', 'Smith', 'said', 'in', 'its', 'weekly', 'review', 'the', 'dry', 'period', 'means', 'the', 'temporao', 'will', 'be', 'late', 'this', 'year', 'arrivals', 'for', 'the', 'week', 'ended', 'February', '22', 'were', '155,221', 'bags', 'of', '60', 'kilos', 'making', 'a', 'cumulative', 'total', 'for', 'the', 'season', 'of', '5.93', 'mln', 'against', '5.81', 'at', 'the', 'same', 'stage', 'last', 'year', 'again', 'it', 'seems', 'that', 'cocoa', 'delivered', 'earlier', 'on', 'consignment', 'was', 'included', 'in', 'the', 'arrivals', 'figures', 'comissaria', 'Smith', 'said', 'there', 'is', 'still', 'some', 'doubt', 'as', 'to', 'how', 'much', 'old', 'crop', 'co

In [109]:
# Scratch check of the hyphen rule: a word starting with an upper-case letter
# is split on the pattern, any other word simply has the pattern removed.
end = 'As-As'
end_of_sentence = r'-'

if end[0].isupper():
    search = re.split(end_of_sentence, end)
else:
    search = re.sub(end_of_sentence, '', end)

print(search)


['As', 'As']
