# **INITIALIZATION**

Import libraries.

In [25]:
# Library for reading and writing data to and from files
import os
# Library for getting dictionaries from data structures
from collections import defaultdict

Define paths.

In [26]:
# Folder the corpus documents are read from
corpus_directory = '../Dataset/corpus2mw'

To read the documents one by one, first get the names of all the files in the dataset directory.

In [27]:
# Get a list with the names of the documents.
# os.listdir returns the entries in arbitrary order, so sort them to make the
# order of the documents (and everything derived from it) reproducible.
texts_names = sorted(os.listdir(corpus_directory))

# **GET TOKENS**

First, define the special characters to be separated from each word.

In [28]:
# List of characters to detach from the beginning or the end of a word.
# Every entry must be exactly one character long: entries are compared against
# single characters of a word, so a longer entry (e.g. '$ ') would never match.
specialchars = [';', ':', '!', '?', '<', '>', '&', ')', '(', ']', '[', ',', '.', '"', '%', '$', '=', '}', '{', '-']

Next, define the function that separates special characters from a word, assuming they can only appear at the beginning or at the end of the word.

In [29]:
def token(w):
    """Split a word into tokens by detaching special characters at its ends.

    At most one leading and one trailing character found in the module-level
    ``specialchars`` list are detached and returned as separate tokens.
    Special characters inside the word are left untouched.

    Args:
        w: The word to tokenize (a string without whitespace is expected).

    Returns:
        A list of tokens in their original order. An empty string yields an
        empty list; a single character is always returned as is.
    """
    # Nothing to tokenize; this also avoids indexing into an empty string below
    if not w:
        return []

    # A single character is kept whatever it is
    if len(w) == 1:
        return [w]

    res = []

    # Detach a leading special character, if any
    if w[0] in specialchars:
        res.append(w[0])
        w = w[1:]

    # If only one character is left, keep it without checking whether it is special
    if len(w) == 1:
        res.append(w)
    # Otherwise (at least two characters left) detach a trailing special character
    elif w[-1] in specialchars:
        res.append(w[:-1])
        res.append(w[-1])
    # or keep the remaining word whole
    else:
        res.append(w)

    return res

For each document, and for each word in each of its lines, split the word into multiple tokens if its first or last character is a special character.

In [30]:
# One list of lowercase tokens per document, and the size of each of those lists
corpus = []
corpus_lens = []

# Process the documents one at a time
for text in texts_names:

    # Tokens collected from the current document
    words = []

    # Read the document line by line (undecodable bytes are ignored)
    with open(f"{corpus_directory}/{text}", 'r', errors='ignore') as file:
        for line in file:
            # Split the line on whitespace, tokenize each word and store the lowercase tokens
            for word in line.split():
                words.extend(piece.lower() for piece in token(word))

    # Store the tokens of the document and how many there are
    corpus.append(words)
    corpus_lens.append(len(words))

# **STOP WORDS**

Define the function to find the stop words.

In [31]:
# Returns the extracted stop words
def get_stop_words(documents=None, special=None, param=4, ratio=1):
    """Extract the stop words as the top-ranked tokens before an elbow point.

    Every token occurrence is scored by its position in its document: 1 for
    the first and for the last token, 2 for every token in between. Tokens
    are ranked by total score in descending order, special characters are
    dropped, and the ranking is cut at the first position whose score is
    close to the score found ``param`` positions further down.

    Args:
        documents: Iterable of token lists. Defaults to the module-level
            ``corpus``.
        special: Tokens to exclude from the ranking. Defaults to the
            module-level ``specialchars``.
        param: Both the look-ahead distance in the ranking and the threshold
            the (scaled) score difference is compared against.
        ratio: Factor applied to the score difference before comparing it
            with ``param``.

    Returns:
        The list of stop words, highest score first. If no elbow is found
        only the top-ranked token is returned; if the elbow is found at the
        very first position the list is empty.
    """
    if documents is None:
        documents = corpus
    if special is None:
        special = specialchars
    # Set for constant-time exclusion checks
    excluded = set(special)

    # Initialize a dictionary for ints with 0 as default value
    neighbour_counts = defaultdict(int)
    # Count how many neighbours each word has across the corpus
    for doc in documents:
        # An empty document has no tokens to count (and no first/last token to index)
        if not doc:
            continue
        # Increment by 1 the count for the first and last word of the current doc
        # (a one-token document therefore adds 2 to its only token)
        neighbour_counts[doc[0]] += 1
        neighbour_counts[doc[-1]] += 1
        # From second to second last word increment by 2
        for idx in range(1, len(doc) - 1):
            neighbour_counts[doc[idx]] += 2

    # Rank the (word, count) pairs by count, highest first, leaving out the special chars.
    # The sort is stable, so words with equal counts keep their insertion order.
    ranked = [
        (word, count)
        for word, count in sorted(neighbour_counts.items(), key=lambda item: item[1], reverse=True)
        if word not in excluded
    ]

    # Now look for the elbow point in the ranking
    elbow_point_index = 0
    for index in range(len(ranked) - 1 - param):
        # Compare the count of the current word with the one `param` positions ahead
        if abs(ranked[index][1] - ranked[index + param][1]) * ratio < param:
            elbow_point_index = index - 1
            break

    # Keep the words up to the elbow (highest counts) and return them
    return [word for word, _ in ranked[:elbow_point_index + 1]]

And compute them.

In [32]:
# Extract the stop words, then show how many there are and which ones
stop_words = get_stop_words()
print(len(stop_words))
print(stop_words)

# Save them to file, each one wrapped in double quotes and followed by ', '
with open('../Output/stop_words.txt', 'w') as f:
    for word in stop_words:
        f.write(f'"{word}", ')
# Drop the reference to the tokenized corpus now that the stop words are extracted
del corpus

242
['the', 'of', 'and', 'in', 'a', 'to', 'was', 'is', 'for', 'on', 'as', 'with', 'by', 'he', 'that', 'at', 'from', 'his', 'it', 'an', 'were', 'are', 'which', 'doc', 'this', 'also', 'be', 'or', 'has', 'had', 'first', 'their', 'one', 'but', 'its', 'after', 'not', 'new', 'they', 'who', 'have', 'two', 'her', 'she', 'been', 'other', 'when', 'during', 'there', 'into', 'all', 'time', 'more', 'only', 'may', 'most', 'school', 'years', 'would', 'over', 'some', 'out', 'such', 'national', 'up', 'him', 'later', 'about', 'used', 'where', 'between', 'world', 'then', 'city', 'many', 'can', 'made', 'three', 'while', 'state', 'year', 'under', 'known', 'part', 'these', 'united', 'than', 'university', 'second', 'being', 'became', 'no', 'american', 'season', 'before', 'both', 'team', 'states', 'through', 'however', 'war', 'including', 'early', 'born', 'film', 'them', 'against', 'well', 'family', 'since', 'will', 'until', 'history', 'area', 'series', 'high', 'south', 'album', 'name', 'number', 'group', 'pe