In [169]:
import os.path
import re
import html
import string
from heapq import heapify, heappop, heappush
from math import log10
import pickle

In [5]:
documents_initial = []
source_folder = os.path.join('.', 'reuters21578')   # join two strings with path specifier independent from the OS


# get every article as a document where the id of the document is NEWID parameter of the article
def getDocuments():


    for file in os.listdir(source_folder):
        
        # get all files with an extension .sgm
        if file.endswith('.sgm'):
            with open(os.path.join(source_folder, file), 'r') as f:
                file_as_string = f.read()

            # while splitting based on </REUTERS> tag, we will have (# of articles) + 1 elements in the list named reuters.
            # the last element in the list does not contain any information because regular expression finds the closing tag
            # for an article and splits from that point, resulting in that the last element is below the last article.
            reuters = re.split('</REUTERS>', file_as_string)

            # traverse articles one by one
            for reuter in reuters[:-1]:

                ### get the docid
                # find the pattern NEWID="[number]"> and get the number from the string.
                # then, convert the string to an integer
                doc_id_search = re.search(r"NEWID=\"([0-9]+)\">", reuter)
                doc_id = int(doc_id_search.group(1))
                

                # get rid of html escape characters like '&lt;', '&#3;'
                reuter = html.unescape(reuter)
                doc = ""

                ### get the title
                # find the pattern "<TEXT...<TITLE>[title_text]</TITLE>..." and get the title_text from the string
                title_search = re.search(r'<TEXT(.|\n)*<TITLE>((.|\n)*)</TITLE', reuter)
                if title_search is not None: # if title is found
                    title = title_search.group(2)
                    doc += title.lower()

                ### get the body
                # find the pattern "<TEXT...<BODY>[body_text]</BODY>..." and get the body_text from the string
                body_search = re.search(r'<TEXT(.|\n)*<BODY>((.|\n)*)</BODY>', reuter)
                if body_search is not None: # if the body is found
                    body = body_search.group(2)
                    doc += " " + body[0].lower() + body[1:]

                # if both title and body cannot be found, that means that the article is in UNPROC format and it contains
                # only <TEXT> parameter, not <TITLE> or <BODY>
                if (title_search is None) and (body_search is None):
                    try:
                        ### get the text
                        # find the pattern "<TEXT...[body_text]</TEXT>" and get the body_text from the string
                        text_search = re.search(r'<TEXT.+\n((.|\n)+)</TEXT>', reuter)
                        body = text_search.group(1)
                        doc = body[0].lower() + body[1:]
                    except:
                        print(reuter, '\nerror')
                        return

                ### get rid of "reuter" at the end
                # if the document contains "Reuter" or "REUTER" with some whitespace characters at the end, cut that part away from the document 
                # reuter_finish_search = re.search('((.|\n)+)\s*\Z', doc)
                # doc = reuter_finish_search.group(1) if reuter_finish_search is not None else doc

                # gather the documents in an array
                documents_initial.append([doc_id, doc])

            f.close()


getDocuments()

In [6]:
def getClitics():

    clitics = set()
    clitic_path = os.path.join('.', 'clitics.txt')

    with open(clitic_path) as stop_file:
        stop_lines = stop_file.readlines()

    for line in stop_lines:
        stopword = line[:-1]
        clitics.add(stopword)

    return clitics

In [7]:
def normalize(token_list, clitics_set):

    # regular expression that will detect either a word, words with hyphens included, or a token composing of numbers
    # with special characters('/', ':', '.', ',') like 16/20, 19.02.2021, etc.
    shave_string = f'[{string.punctuation}]*([\d/:.,]+\d+)|([\w-]+)[{string.punctuation}]*'

    # a sentence will probably end with one of the following punctuation marks: '.', '!', '?', '...'
    end_of_sentence = r'[.?!]|[...]'
    # a flag that will show that the previous word is the last word of the sentence, which means the current word is the beginning of the sentence
    sentence_beginning = False

    # final document as a list of its tokens
    final_document = []

    tokens_after_normalization = 0

    for word in token_list:
            
        if word.lower() not in clitics_set:

            # shave the token
            shaved = re.search(shave_string, word)
            if shaved is not None:
                # as described in the definition of shave_string, token can be in one of the 2 different forms
                token = shaved.group(1) if shaved.group(1) is not None else shaved.group(2)

                # # keep terms before the normalization
                # dictionary_before_normalization.add(token)

                # detect hyphenated words
                dash_search = re.search('-', token)

                # after this conditional block, we will have a list of tokens named as token_splitted
                if dash_search:
                    # if the first character of the hyphenated word is upper, like Hewett-Pickard or New York-San Fransisco, then they should be 
                    # splitted and taken as different strings. if the first character is lower, then all hyphens should be deleted and the result will be
                    # one word('know-how' to 'knowhow')
                    tokens_splitted = re.split('-', token) if token[0].isupper() else [re.sub('-', '', token)]
                else:
                    # if no hyphen detected, then the token is taken directly
                    tokens_splitted = [token]

                for splitted in tokens_splitted:
                    if splitted:
                        # if the first character of the token is upper and it is not the beginning of the sentence, then keep the token with uppercase letters.
                        # otherwise, lower the letters
                        splitted = splitted if (splitted[0].isupper() and (not sentence_beginning)) else splitted.lower()
                        
                        # # keep terms after normalization
                        # dictionary_after_normalization.add(splitted)
                        # keep the number of tokens after normalization
                        tokens_after_normalization += 1

                        final_document.append(splitted)

            end_of_sentence_search = re.search(end_of_sentence, word)
            sentence_beginning = end_of_sentence_search is not None
        else:
            # # keep terms before the normalization
            # dictionary_before_normalization.add(word)

            token = word.lower()

            # # keep terms after normalization
            # dictionary_after_normalization.add(word)
            # keep the number of tokens after normalization
            tokens_after_normalization += 1
            
            final_document.append(splitted)

    return final_document, tokens_after_normalization

In [8]:
def getDictionary(documents):
    dictionary = set()

    for document_pair in documents:
        document = document_pair[1]
        
        for token in document:
            dictionary.add(token) if token else None

    return dictionary

In [9]:
def splitTokens(documents, splitted):
    
    for document_pair in documents:
        token_list = re.split('\s+', document_pair[1])
        splitted.append([document_pair[0], token_list])

In [10]:
def normalizeDocuments(documents):

    # get clitics: we're, i'm, etc.
    clitics = getClitics()
    
    tokens_before_normalization = 0
    tokens_after_normalization = 0

    normalized_documents = []
    splitted_documents = []
    splitTokens(documents, splitted_documents)

    dictionary_before_normalization = getDictionary(splitted_documents)

    for document_pair_index in range(len(splitted_documents)):
        
        word_list = splitted_documents[document_pair_index][1]
        tokens_before_normalization += len(word_list)

        normalized_document, normalization_tokens = normalize(word_list, clitics)
        tokens_after_normalization += normalization_tokens

        normalized_documents.append((documents[document_pair_index][0], normalized_document))
    
    return normalized_documents, tokens_before_normalization, tokens_after_normalization, len(dictionary_before_normalization)


In [11]:
normalized_documents, num_of_tokens_before_normalization, num_of_tokens_after_normalization, num_of_terms_before_normalization = normalizeDocuments(documents_initial)
dictionary_after_normalization = getDictionary(normalized_documents)

In [12]:
print('Number of tokens before normalizaion:', num_of_tokens_before_normalization)
print('Number of tokens after normalization:', num_of_tokens_after_normalization)
print('Number of terms before normalization:', num_of_terms_before_normalization)
print('Number of terms after normalization:', len(dictionary_after_normalization))

Number of tokens before normalizaion: 2781148
Number of tokens after normalization: 2760537
Number of terms before normalization: 130957
Number of terms after normalization: 85209


In [13]:
final_dictionary = list(dictionary_after_normalization)
final_dictionary.sort()

In [14]:
## this algorithm is taken from https://www.geeksforgeeks.org/python-program-for-binary-search/

# Iterative Binary Search Function
# It returns index of search_element in given dictionary arr if present,
# else returns -1
def binarySearch(dictionary, search_element):
    low = 0
    high = len(dictionary) - 1
    mid = 0
 
    while low <= high:
 
        mid = (high + low) // 2
 
        # If search_element is greater, ignore left half
        if dictionary[mid] < search_element:
            low = mid + 1
 
        # If search_element is smaller, ignore right half
        elif dictionary[mid] > search_element:
            high = mid - 1
 
        # means search_element is present at mid
        else:
            return mid
 
    # If we reach here, then the element was not present
    return -1

In [57]:
def createInvertedIndex(term_list, normalized_documents):
    final_inverted_index = []
    for _ in range(len(term_list)):
        final_inverted_index.append([0, {}])
    
    for document_pair in normalized_documents:

        document_id = document_pair[0]
        document = document_pair[1]

        for token_index in range(len(document)):
            token = document[token_index]

            if token:

                position = binarySearch(term_list, token)

                if position != -1:
                    final_inverted_index[position][0] += 1
                    
                    try:
                        final_inverted_index[position][1][document_id].append(token_index)
                    except:
                        final_inverted_index[position][1][document_id] = [token_index]
                    
    return final_inverted_index
    # print(term_list[20000], final_inverted_index[20000])
    # print('hello world')

In [84]:
def returnTopK(elements, k):
    
    if k >= len(elements):
        return [return_element for return_element in elements]
    else:
        max_heap = [(-sort_base, return_element) for return_element, sort_base in elements]
        heapify(max_heap)

        result = []

        while k > 0:
            sort_base, return_element = heappop(max_heap)
            result.append((return_element, -sort_base))
            k -= 1

        return result

In [76]:
def getFrequencies(inverted_index):

    result = []
    for index in range(len(inverted_index)):
        result.append((index, inverted_index[index][0]))

    return result

In [56]:
final_inverted_index = createInvertedIndex(final_dictionary, normalized_documents)

In [92]:
frequencies = getFrequencies(final_inverted_index)
top_k_terms = returnTopK(frequencies, 100)

with open('top_100_frequent_terms.txt', 'w') as f:
    for index, frequency in top_k_terms:
        f.write(f'{final_dictionary[index]}: {frequency} times appeared in documents\n')

In [143]:
def calculateScores(inverted_index, num_of_documents):
    tf_idf_scores = []
    document_frequencies = []

    for term_id in range(len(inverted_index)):
        idf = num_of_documents / len(inverted_index[term_id][1])
        score = inverted_index[term_id][0] * log10(idf)
        tf_idf_scores.append((term_id, score))

        document_frequencies.append(1 / idf)

    return tf_idf_scores, document_frequencies

In [170]:
def determineStopWords(tf_idf_scores, threshold, document_frequencies):

    total_score = 0
    for id, score in tf_idf_scores:
        total_score += score 

    stopword_scores = 0
    stopword_list = []
    for id, score in tf_idf_scores:
        document_frequency = document_frequencies[id]

        if document_frequency < 0.4:
            continue

        stopword_scores += score
        stopword_list.append(id)
       
        if stopword_scores >= total_score * threshold:
            break

    return stopword_list

In [168]:
tf_idf_list, document_frequencies = calculateScores(final_inverted_index, len(normalized_documents))
top_k_tf_idf = returnTopK(tf_idf_list, 1000)
stop_word_list = determineStopWords(top_k_tf_idf, 0.1, document_frequencies)

# result = ""
# for stop_word_id in stop_word_list:
#     result += " " + final_dictionary[stop_word_id]

# print(result)

 the to in a of and said for it reuter


In [173]:
with open('dictionary.pickle', 'wb') as dictionary_file:
    pickle.dump(final_dictionary, dictionary_file)

with open('index.pickle', 'wb') as index_file:
    pickle.dump(final_inverted_index,index_file)

with open('stopwords.pickle', 'wb') as stopword_file:
    pickle.dump(stop_word_list, stopword_file)