# **INITIALIZATION**

Import libraries.

In [1]:
# Library for reading and writing data to and from files
import os
# Library for numerical computing
import numpy as np
# Library for mathematical functions
import math
# Library for getting dictionaries from data structures
from collections import defaultdict

Define paths.

In [2]:
# Dataset directory
corpus_directory='../Dataset/corpus2mw'

To iterate the reading procedure, get the names of the documents in the dataset.

In [3]:
# Get a list with the names of the documents
texts_names = os.listdir(corpus_directory)

Set the seed for reproducibility.

In [4]:
np.random.seed(55555555)

# **GET TOKENS**

First, define the special characters to be separated from each word.

In [5]:
# List of characters
specialchars = [';', ':', '!', '?', '<', '>', '&', ')', '(', ']', '[', ',', '.', '"', '%', '$ ', '=', '}', '{', '-', '_', '+', '*', '#', '@']

And define the function which separates the special characters from each word, assuming they can only be before or after each word.

In [6]:
def token(w):
    # Init the empty list of tokens
    res = []

    # If the length is 1, add the character whatever it is
    if len(w) == 1:
        res.append(w)
    
    # Otherwise (if it's at least two characters)
    else:
        # If the first character is special, add it to the list and remove it from the word
        if w[0] in specialchars:
            res.append(w[0])
            w = w[1:]
        
        # Now, if the length became 1 because of that, for the same reason as before, add the character whatever it is
        if len(w) == 1:
            res.append(w)
        # Otherwise (if it's at least two characters), both if I had removed the first or not
        # Check whether the last character is special
        elif w[-1] in specialchars:
            res.append(w[:-1])
            res.append(w[-1])
        # or not
        else:
            res.append(w)
        
    # Return the list of tokens
    return res

For each document, for each word (in each line), if either the last or the first character are specialchars, then split in multiple tokens.

In [7]:
corpus = []
corpus_lens = []

# For each document in the directory
for text in texts_names:

    # Init a temp empty list for the words in the current document
    words = []

    # Open the document
    with open(corpus_directory + '/' + text, 'r', errors='ignore') as file:

        # For each line
        for line in file:
            # For each word in the line
            for word in line.split():
                # Tokenize it
                aux = token(word)
                # And add each token to the list of words
                for t in aux:
                    words.append(t.lower())
    # Then append the list of tokens for the document in the corpus list
    corpus.append(words)
    corpus_lens.append(len(words))

corpus_len = len(corpus)

This way, the corpus is a list of documents, which are lists of tokens.

In [8]:
# Print length for each document
print(corpus_lens)
# And compute the overall number of words
num_of_words_in_corpus = sum(corpus_lens)
print("There are ", num_of_words_in_corpus, " words in the corpus")

[5955, 2028, 312, 680, 351, 2773, 1253, 286, 856, 510, 442, 1484, 445, 1246, 341, 619, 823, 529, 620, 749, 1145, 1014, 214, 165, 354, 524, 391, 1343, 289, 1228, 452, 225, 160, 374, 722, 1275, 846, 464, 748, 213, 155, 494, 246, 956, 402, 1482, 795, 510, 364, 1355, 154, 200, 1492, 233, 3206, 261, 619, 218, 910, 417, 657, 762, 219, 1305, 412, 929, 343, 314, 1676, 1315, 284, 1211, 1638, 769, 888, 1272, 1118, 456, 1541, 215, 404, 175, 414, 1056, 772, 919, 860, 1215, 753, 249, 4310, 139, 197, 256, 163, 290, 659, 1920, 548, 881, 1722, 333, 292, 2153, 233, 1501, 731, 268, 259, 1166, 401, 1044, 1540, 355, 3404, 498, 247, 818, 3268, 733, 373, 178, 704, 685, 1547, 280, 343, 206, 364, 2181, 643, 480, 783, 232, 619, 130, 194, 615, 559, 760, 417, 305, 252, 836, 176, 1402, 519, 706, 1233, 675, 489, 276, 202, 550, 163, 755, 4634, 277, 251, 355, 260, 442, 822, 338, 1857, 1677, 250, 584, 310, 472, 643, 723, 499, 229, 218, 194, 224, 180, 178, 334, 293, 462, 522, 245, 244, 137, 431, 566, 1371, 679, 426, 4

Define a function to get a list of words from a string with words separated with a space.

In [9]:
# Given a list of strings, it returns a string
# in which the substrings will be separated by ' '
def list_to_str(strings):
    # Init res as an empty string
    res = ""
    # For each character/string in the list
    for i in range(len(strings)):
        # Concatenate the string plus a space to res
        res += strings[i] + ' '
    # Then return everything besides the last space
    return res[:-1]

Define a function to do the opposite.

In [10]:
# Given a string which contains substrings separated by ' ',
# it returns a list of words
def str_to_list(s):
    # Init res as an empty list
    res = []
    # Split the string by ' ' and for each substring
    for word in s.split():
        # Append the substring to res
        res.append(word)
    # Return the list of substrings
    return res

Declare a function which returns a list of n dictionaries, one for each n up to a fixed max, which contain the information about the frequencies for each n-gram.

In [11]:
# Create a list of n dictionaries, one for all the possible n-grams, with n in [1, max]
# Each n-dictionary will map each n-gram to a list with the absolute frequency in [0]
# and then the index of the documents in which it was found once at least,
# followed by the relative frequency (e.g. [25, 1, 20, 2, 1, 3, 4])
def create_list_of_dict_global(max):
    # Init an empty list of dictionaries
    list_dict=[]

    # For each n in [1, max], append an empty dictionary to the list
    for n in range(max):
        list_dict.append({})

    # For each index of a document in the corpus
    for i in range(corpus_len):
        # For each index of a token in the document
        for t in range(len(corpus[i])):
            # For each n in [1, max]
            for n in range(1, len(list_dict)+1):
                # If the document is not over (there is still space for an n-gram)
                if ( t + n ) <= len(corpus[i]) :
                    # If the n-gram is not yet in the n-dictionary
                    if not ( list_to_str(corpus[i][t : t+n]) in list_dict[n-1].keys() ) :
                        # Associate to the new n-gram an empty list
                        list_dict[n-1][list_to_str(corpus[i][t : t+n])] = []
                        # Set the frequency to 1 in position [0]
                        list_dict[n-1][list_to_str(corpus[i][t : t+n])].append(1)
                        # And then append the index of the current document
                        list_dict[n-1][list_to_str(corpus[i][t : t+n])].append(i)
                        # and set the relative frequency to 1
                        list_dict[n-1][list_to_str(corpus[i][t : t+n])].append(1)
                    else:
                        # Add one to the frequency of the n-gram
                        list_dict[n-1][list_to_str(corpus[i][t : t+n])][0] += 1
                        # And if the last document in which this n-gram was found
                        # is still the current one 
                        if list_dict[n-1][list_to_str(corpus[i][t : t+n])][-2] == i :
                            # Just increment the relative frequency
                            list_dict[n-1][list_to_str(corpus[i][t : t+n])][-1] += 1
                        # Otherwise
                        else:
                            # Append the new (current) document
                            list_dict[n-1][list_to_str(corpus[i][t : t+n])].append(i)
                            # and set the relative frequency to 1
                            list_dict[n-1][list_to_str(corpus[i][t : t+n])].append(1)
    # Then return the list of dictionaries
    return list_dict

Pick a max value for n. Remember that if you care about the 7-grams you need the 8-grams to be computed.

In [12]:
max_n = 8

Create a dictionary with the above function.

In [13]:
list_of_ngrams_info_dictionaries = create_list_of_dict_global(max_n)

Show how many n-grams for each value of n.

In [14]:
for n in range(len(list_of_ngrams_info_dictionaries)):
    print("N-grams of length", n+1)
    print(len(list_of_ngrams_info_dictionaries[n]))

N-grams of length 1
149865
N-grams of length 2
830902
N-grams of length 3
1555746
N-grams of length 4
1876677
N-grams of length 5
1967364
N-grams of length 6
1989746
N-grams of length 7
1994778
N-grams of length 8
1995258


# **STOP WORDS**

Define the function to find the stop words.

In [15]:
# Returns the extracted stop words
def get_stop_words(param=4, d=1):

    # Initialize a dictionary for sets
    neighbour_counts = defaultdict(set)

    # For each document in the corpus
    for doc in corpus:
        # For each word in the document
        for idx in range(len(doc)):
            # If the word is not the first word, add the previous word to its set of neighbours
            if idx > 0:
                neighbour_counts[doc[idx]].add(doc[idx - 1])
            # If the word is not the last word, add the next word to its set of neighbours
            if idx < len(doc) - 1:
                neighbour_counts[doc[idx]].add(doc[idx + 1])

    # Convert the sets of neighbours to counts of unique neighbours
    for word, neighbours in neighbour_counts.items():
        neighbour_counts[word] = len(neighbours)

    # Sort the words by their counts of unique neighbours
    neighbour_counts = sorted(neighbour_counts.items(), key=lambda item: item[1], reverse=True)

    # Remove from the neighbour_counts list the words that are special chars
    for special in specialchars:
        neighbour_counts = [tuple for tuple in neighbour_counts if tuple[0] != special]
    
    # Now look for the elbow point in the list
    elbow_point_index = 0
    # Iterate over the list (besides last word) to find the elbow point
    for index in range(len(neighbour_counts) - 1 - param):
        # Get the difference between the count for the current word and the next
        if abs(neighbour_counts[index][1] - neighbour_counts[index + param][1]) * d < param:
            elbow_point_index = index - 1
            break
    
    # Get the couples corresponding to all the words up to the elbow (highest num of neighbours)
    stop_word_counts = neighbour_counts[:elbow_point_index+1]
    # Get the stop words
    stop_words = [tuple[0] for tuple in stop_word_counts]
    
    # Return the stop words and the filtered expressions
    return stop_words

And compute them.

In [16]:
# Get stop words
stop_words = get_stop_words()
print(len(stop_words))
print(stop_words)

# And write them all on file
with open('../Output/stop_words.txt', 'w') as f:
    for word in stop_words:
        f.write("\"")
        f.write(word)
        f.write("\"")
        f.write(', ')
del corpus

154
['the', 'and', 'of', 'in', 'to', 'a', 'was', 'is', 'for', 'by', 'with', 'as', 'on', 'from', 'at', 'that', 'doc', 'or', 'were', 'are', 'his', 'an', 'their', 'has', 'had', 'which', 'its', 'also', 'he', 'this', 'first', 'it', 'after', 'her', 'have', 'be', 'when', 'who', 'two', 'into', 'new', 'other', 'between', 'but', 'not', 'de', 'during', 'one', 'all', 'about', 'more', 'over', 'would', 'through', 'only', 'some', 'against', 'may', 'they', 'can', 'under', 'being', 'been', 'â€“', 'while', 'then', 'national', 'many', 'three', 'before', 'until', 'both', 'these', 'will', 'including', 'became', 'most', 'than', 'john', 'out', 'where', 'no', 'him', 'she', 'state', 'school', 'american', 'several', 'later', 'such', 'made', 'international', 'since', 'up', 'around', 'group', 'county', 'people', 'called', 'high', 'team', 'any', 'family', 'each', 'them', 'four', 'like', 'second', 'include', 'another', 'city', 'system', 'began', 'early', 'world', 'local', 'river', 'could', 'season', 'time', 'compan

# **EXPLORING GLUES**

Define a function which computes the glues.

In [17]:
def compute_glues(gluename):
    # Initialize an empty list for storing the glue dictionaries
    glue_dicts = []

    # For each n in [1, len(d))
    for n in range(1, len(list_of_ngrams_info_dictionaries)):

        # Initialize the glue dictionary as a dictionary with the same keys as list_of_ngrams_info_dictionaries[n-1] associating 0 to each
        glue_dict = dict.fromkeys(list_of_ngrams_info_dictionaries[n - 1], 0)

        # For each n-gram with its frequencies list
        for key, value in list_of_ngrams_info_dictionaries[n - 1].items():
            # Only if n is at least 2 or if it is a monogram of length 3
            if ( n != 1 ) or ( len(key) >= 3 ):
                # Store the absolute frequency for the n-gram
                abs_freq = value[0]

                # If n is at least 2
                if n != 1:
                    # Convert the n-gram into a list instead of a string
                    key_list = str_to_list(key)

                    # Initialize the sum to zero
                    s = 0
                    
                    # Compute the right GLUE coefficient

                    # Do Dice
                    if gluename == 'Dice':

                        # Dividing the n-gram into two parts w1...wi and wi+1...wn
                        for i in range(len(key_list) - 1):
                            # Get the absolute frequencies of the two sub-n-grams
                            f1 = list_of_ngrams_info_dictionaries[i][list_to_str(key_list[:i+1])][0]
                            f2 = list_of_ngrams_info_dictionaries[n-i-2][list_to_str(key_list[i+1:])][0]
                            # And add to the sum the partial sum
                            s += (f1 + f2) / (n - 1)

                        gl = (2 * abs_freq) / s

                    # Do SCP
                    elif gluename == 'SCP':

                        # Dividing the n-gram into two parts w1...wi and wi+1...wn
                        for i in range(len(key_list) - 1):
                            # Get the absolute frequencies of the two sub-n-grams
                            f1 = list_of_ngrams_info_dictionaries[i][list_to_str(key_list[:i+1])][0]
                            f2 = list_of_ngrams_info_dictionaries[n-i-2][list_to_str(key_list[i+1:])][0]
                            # And add to the sum the partial sum
                            s += (f1 * f2) / (n - 1)

                        gl = (abs_freq**2) / s

                    # Compute MI
                    elif gluename == 'MI':

                        # Dividing the n-gram into two parts w1...wi and wi+1...wn
                        for i in range(len(key_list) - 1):
                            # Get the absolute frequencies of the two sub-n-grams
                            f1 = list_of_ngrams_info_dictionaries[i][list_to_str(key_list[:i+1])][0]
                            f2 = list_of_ngrams_info_dictionaries[n-i-2][list_to_str(key_list[i+1:])][0]
                            # And add to the sum the partial sum
                            s += (f1 * f2) / (n - 1)

                        gl = math.log(abs_freq * num_of_words_in_corpus / s)

                    else:
                        gl = 0

                    # Add the glue to the list of the current n-gram
                    glue_dict[key] = gl

            # Otherwise if we have a monogram which is not long enough (3 letters at least)
            else:
                # Remove the list related to such n-gram from the glue dictionary
                glue_dict.pop(key)

        # Append the n-th glue dictionary to the list of glue dictionaries
        glue_dicts.append(glue_dict)

    # Return the list of glue dictionaries
    return glue_dicts

Compute the glues for each n-gram.

In [18]:
print("Computing GLUEs with SCP...")
SCP_glues = compute_glues('SCP')
print("Computing GLUEs with Dice...")
dice_glues = compute_glues('Dice')
print("Computing GLUEs with MI...")
MI_glues = compute_glues('MI')

Computing GLUEs with SCP...
Computing GLUEs with Dice...
Computing GLUEs with MI...


# **REGULAR EXPRESSIONS**

Compute a list with n dictionaries, each associating to each n-gram all the (n+1)-grams which have one more word on the left/right.

In [19]:
# List of dictionaries, one for each value of n
fathers = []

# Notice n < max_n obviously
for n in range(1, max_n):

    # Get a dictionary with all the keys of the n-th dictionary in list_of_ngrams_info_dictionaries, and empty lists as values
    f = {key: [] for key in dict(list_of_ngrams_info_dictionaries[n - 1]).keys()}

    # For each (n+1)-gram in the (n+1)-th dictionary
    for key, value in list_of_ngrams_info_dictionaries[n].items():
        # Get the (n+1)-gram as list of words
        key_list = str_to_list(key)
        # Get the two n-grams
        subkey1 = list_to_str(key_list[1:])
        subkey2 = list_to_str(key_list[:-1])
        # Add them in the temp dictionary
        f[subkey1].append(key)
        f[subkey2].append(key)
        
    # And finally append the dictionary to the list fathers
    fathers.append(f)

Now define the function which returns a dictionary containing all the Multiword Expressions (REs).

In [20]:
# Auxiliary function (for readability) which checks whether the "key_string" n-gram is a RE
def check_MWE(key_string, glues, REglues):
    # Get n as the number of spaces in key_string + 1
    n = key_string.count(' ')
    
    # Get the glue for the n-gram
    glue = glues[n][key_string]

    # Get key_string as list to easily remove the first/last word
    key_list = str_to_list(key_string)

    # Get the set of glues for (n-1)-grams
    omega_n_minus = set()
    # If it's a 2-gram do not check for the glues of 1-grams
    if n > 1:
        omega_n_minus.add(glues[n - 1][list_to_str(key_list[1:])])
        omega_n_minus.add(glues[n - 1][list_to_str(key_list[:-1])])

    # Get the set of glues for (n+1)-grams
    omega_n_plus = set([glues[n + 1][fath] for fath in fathers[n][key_string]])
    # If the glue is bigger than the glue for all the sons and fathers
    if all( ( glue > g ) for g in omega_n_minus.union(omega_n_plus) ):
        # Add the info to the dict
        REglues[key_string] = glues[n][key_string]

# Returns two dictionaries, only containing REs as keys
def find_RE(glues):
    # Init the new dictionary containing only the REs
    REglues = {}

    # For each n in [1, max_n)
    for n in range(1, len(glues)-1):
        # For each n-gram with their list of tfidfs
        for key, _ in glues[n].items() :
            # Process the n-gram to decide whether it is a RE
            check_MWE(key, glues, REglues)

    # Return the two dictionaries
    return REglues

Then compute the REs information for each glue.

In [21]:
print("Finding REs with SCP...")
RE_SCP_glues = find_RE(SCP_glues)
print("Finding REs with Dice...")
RE_dice_glues = find_RE(dice_glues)
print("Finding REs with MI...")
RE_MI_glues = find_RE(MI_glues)

del SCP_glues
del dice_glues
del MI_glues
del fathers

Finding REs with SCP...
Finding REs with Dice...
Finding REs with MI...


# **FILTERING**

Define a function to delete REs containing special characters.

In [22]:
# Gets a string and returns false if must be deleted
def no_special(key_string):
    for i in range(len(key_string)):
        if (key_string[i] in specialchars):
            return False
    return True

Define a function to delete REs contained in one only document.

In [23]:
# Gets a string and returns false if must be deleted
def more_documents(key_list):
    if (list_of_ngrams_info_dictionaries[len(str_to_list(key_list)) - 1][key_list][0] > 1):
        return True
    else:
        return False

And then filter.

In [24]:
# Remember REs datastructures are now just dictionaries, not lists of dictionaries
RE_SCP_glues_filtered = {}
RE_dice_glues_filtered = {}
RE_MI_glues_filtered = {}

# Iterate through REs and filter SCP REs
for key, value in RE_SCP_glues.items():
    if no_special(key) and more_documents(key) :
        RE_SCP_glues_filtered[key] = value

# Iterate through REs and filter Dice REs
for key, value in RE_dice_glues.items():
    if no_special(key) and more_documents(key) :
        RE_dice_glues_filtered[key] = value

# Iterate through REs and filter MI REs
for key, value in RE_MI_glues.items():
    if no_special(key) and more_documents(key) :
        RE_MI_glues_filtered[key] = value
        
del RE_SCP_glues
del RE_dice_glues
del RE_MI_glues

How many REs did we get overall?

In [25]:
# Print
print(str(len(RE_SCP_glues_filtered)) + ' with SCP')
print(str(len(RE_dice_glues_filtered)) + ' with Dice')
print(str(len(RE_MI_glues_filtered)) + ' with MI')

16409 with SCP
35968 with Dice
12165 with MI


And now filter out the expressions containing stop words.

In [26]:
# All the REs in the corresponding list of n-grams if the first and the last word are not stop-words
RE_SCP_glues_filtered = {re: glue for re, glue in RE_SCP_glues_filtered.items() if str_to_list(re)[0] not in stop_words and str_to_list(re)[-1] not in stop_words}
RE_dice_glues_filtered = {re: glue for re, glue in RE_dice_glues_filtered.items() if str_to_list(re)[0] not in stop_words and str_to_list(re)[-1] not in stop_words}
RE_MI_glues_filtered = {re: glue for re, glue in RE_MI_glues_filtered.items() if str_to_list(re)[0] not in stop_words and str_to_list(re)[-1] not in stop_words}
del stop_words

Now print top 100 REs by glue values.

In [27]:
# Get 200 top REs for each and write them all on file
with open('../Output/100_top_REs.txt', 'w') as f:

    f.write('SCP\n')
    for line, value in sorted(RE_SCP_glues_filtered.items(), key=lambda x: x[1], reverse=True)[:100]:
        f.write(line + '\n')
    f.write('\n')

    f.write('Dice\n')
    for line, value in sorted(RE_dice_glues_filtered.items(), key=lambda x: x[1], reverse=True)[:100]:
        f.write(line + '\n')
    f.write('\n')

    f.write('MI\n')
    for line, value in sorted(RE_MI_glues_filtered.items(), key=lambda x: x[1], reverse=True)[:100]:
        f.write(line + '\n')
    f.write('\n')

And print 200 random REs on file, for each glue.

In [28]:
# Get 200 random REs for each and write them all on file
with open('../Output/200_random_REs.txt', 'w') as f:

    f.write('SCP\n')
    for line in np.random.choice(list(RE_SCP_glues_filtered.keys()), 200):
        f.write(line + '\n')
    f.write('\n')

    f.write('Dice\n')
    for line in np.random.choice(list(RE_dice_glues_filtered.keys()), 200):
        f.write(line + '\n')
    f.write('\n')

    f.write('MI\n')
    for line in np.random.choice(list(RE_MI_glues_filtered.keys()), 200):
        f.write(line + '\n')
    f.write('\n')

How many REs did we get overall?

In [29]:
# Print
print(str(len(RE_SCP_glues_filtered)) + ' with SCP')
print(str(len(RE_dice_glues_filtered)) + ' with Dice')
print(str(len(RE_MI_glues_filtered)) + ' with MI')

13055 with SCP
24237 with Dice
10269 with MI


# **EXPLICIT KEYWORDS**

Prepare REs for the second part of the project.

In [30]:
# Get the REs obtained with SCP
RE_MI_glues_filtered = RE_MI_glues_filtered.keys()

del RE_dice_glues_filtered
del RE_SCP_glues_filtered

Write a function which computes the _tfidf_ values (for the explicit keywords) and the _probabilities_ (to compute the correlations for the implicit keywords) for each RE.

In [31]:
# Given the list of dictionaries and a required glue
# Returns two lists containing dictionaries with tfidfs and probs
def create_tfidf_and_probs():

    # Init two empty lists for list_of_tfdidf_dict and list_of_probs_dict
    list_of_tfidf_dicts = []
    list_of_probs_dicts = []

    # For each n in [1, len(d))
    for n in range(1, len(list_of_ngrams_info_dictionaries)):

        # Init the two dictionaries
        temp_tfidf_dict = dict(list_of_ngrams_info_dictionaries[n - 1])
        temp_probs_dict = dict(list_of_ngrams_info_dictionaries[n - 1])

        # For each n-gram with its frequencies list
        for key, value in list_of_ngrams_info_dictionaries[n - 1].items():
            
            # Only if n is at least 2 or if it is a monogram of length 3
            # Re-compute
            if ( n != 1 ) or ( len(key) >= 3 ) :

                # Init empty lists for tfidf and probs
                tfidf_mod = []
                probs = []

                # Get the number of documents in which the n-gram was actually found
                num_non_zero_doc = ( len(value) - 1 ) / 2

                # And for each document in which it was found
                # Compute all the tfidf and probs
                for doc_idx in range(1, len(value), 2):

                    # Get the lengths of the words in the n-gram
                    words_lens=[]
                    for w in str_to_list(key):
                        words_lens.append(len(w))

                    # Store the number of the current document
                    # and the relative frequency
                    num = value[doc_idx]
                    rel_freq = value[doc_idx+1]

                    # And append in tfidf list the number of the current document
                    tfidf_mod.append(num)
                    # Followed by its tdidf
                    tfidf_mod.append( np.mean(words_lens) * rel_freq * np.log(corpus_len/num_non_zero_doc) / corpus_lens[num] ) 
                    
                    # And append in probs list the number of the current document
                    probs.append(num)
                    # Followed by its prob
                    probs.append( rel_freq / corpus_lens[num] )

                # Compute the average probability of the n-gram
                medprob = sum(probs[1::2]) / corpus_len

                # And subtract it from each probability
                probs[1::2] = [l1 - medprob for l1 in probs[1::2]]

                # Initialize an empty list for the current n-gram twice
                temp_tfidf_dict[key] = []
                temp_probs_dict[key] = []

                # Append the values docnum and tfidf to the list for the current n-gram in the list in the dict
                for tfidf in tfidf_mod:
                    temp_tfidf_dict[key].append(tfidf)
                for p in probs:
                    temp_probs_dict[key].append(p)
            
            # Otherwise if we have a monogram which is not long enough (3 letters at least)
            else:
                # Remove the list related to such n-gram from both the two dictionaries
                temp_tfidf_dict.pop(key)
                temp_probs_dict.pop(key)

        # Append the two n-th dictionaries to the corresponding list of dictionaries
        list_of_tfidf_dicts.append(temp_tfidf_dict)
        list_of_probs_dicts.append(temp_probs_dict)

    # Return the two lists of dictionaries (the glues for each n-gram are in the first position of their own list, in both)
    return list_of_tfidf_dicts, list_of_probs_dicts

Compute _tfidfs_ and _probs_ for each document in which they appear.

In [32]:
tfidfs, probs = create_tfidf_and_probs()

And only take the information about the filtered REs.

In [33]:
# Init all the dictionaries for the filtered REs
RE_MI_tfidfs_filtered = {}
RE_MI_probs_filtered = {}

# Take the REs from the keys of the previously filtered dictionaries
for key in RE_MI_glues_filtered:
    n = len(str_to_list(key))
    RE_MI_tfidfs_filtered[key] = tfidfs[n-1][key]
    RE_MI_probs_filtered[key] = probs[n-1][key]
    
del RE_MI_glues_filtered
del probs

Select _tfidfs_ for the 1-grams.

In [34]:
# Init empty dictionary
uni_tfidfs_filtered = {}
# For each n in [1, max_n)
for key, value in tfidfs[0].items():
    if no_special(key) and more_documents(key) :
        uni_tfidfs_filtered[key] = value
print(len(uni_tfidfs_filtered))

del list_of_ngrams_info_dictionaries
del tfidfs

46342


Define the function to find explicit keywords.

In [35]:
# Return a dictionary which associates to each document its explicit keywords
# Also return a dictionary which associates to each document the REs with at least two words
def find_explicit_keywords(REs_tfidfs, uni_tfidfs, uni_max, multi_max):

    #### Initialize dictionaries for unigrams and n-grams matches
    uni_REs_per_doc = {}
    multi_REs_per_doc = {}
    for k in range(corpus_len):
        uni_REs_per_doc['doc' + str(k)] = []
        uni_REs_per_doc['tfidf' + str(k)] = []
        multi_REs_per_doc['doc' + str(k)] = []
        multi_REs_per_doc['tfidf' + str(k)] = []

    #### Populate the unigram dictionary
    for key, value in uni_tfidfs.items():
        for index in range(0, len(value), 2):
            doc = value[index]
            uni_REs_per_doc['doc' + str(doc)].append(key)
            uni_REs_per_doc['tfidf' + str(doc)].append(value[index + 1])
    # Sort the unigrams by their tfidf scores and limit the number of unigrams
    uni_explks_per_doc = {}
    for k in range(corpus_len):
        sorted_uni = sorted(zip(uni_REs_per_doc['tfidf' + str(k)], uni_REs_per_doc['doc' + str(k)]), reverse=True)
        uni_explks_per_doc['doc' + str(k)] = [x for _, x in sorted_uni][:uni_max]

    #### Populate the REs match dictionary
    for key, value in REs_tfidfs.items():
        for index in range(0, len(value), 2):
            doc = value[index]
            multi_REs_per_doc['doc' + str(int(doc))].append(key)
            multi_REs_per_doc['tfidf' + str(int(doc))].append(value[index + 1])
    # Get the best multi-explicit keywords
    multi_explks_per_doc = {}
    for k in range(corpus_len):
        # By sorting for the current document the REs
        sorted_re = sorted(zip(multi_REs_per_doc['tfidf' + str(k)], multi_REs_per_doc['doc' + str(k)]), reverse=True)
        # ...storing the sorted REs
        multi_REs_per_doc['doc' + str(k)] = [x for _, x in sorted_re]
        # ...and choosing the explicit keywords which are multi-words
        multi_explks_per_doc['doc' + str(k)] = [x for _, x in sorted_re][:multi_max]

    ##### Combine the unigram and REs match dictionaries
    explks_per_doc = dict(uni_explks_per_doc)
    for key, value in uni_explks_per_doc.items():
        explks_per_doc[key].extend(multi_explks_per_doc[key])

    return explks_per_doc, multi_REs_per_doc

And compute keywords for both 1-grams and not.

In [36]:
# Choose number of keywords
uni_max = 5
multi_max = 10

# Get the explicit keywords
print("Computing explicit keywords with SCP")
MI_explicit_keywords, MI_REs_per_doc = find_explicit_keywords(RE_MI_tfidfs_filtered, uni_tfidfs_filtered, uni_max, multi_max)
print([len(MI_explicit_keywords[key]) for key in MI_explicit_keywords.keys()])
print([len(MI_REs_per_doc[key]) for key in MI_REs_per_doc.keys()])

del RE_MI_tfidfs_filtered
del uni_tfidfs_filtered

Computing explicit keywords with SCP
[15, 15, 15, 8, 8, 15, 15, 13, 12, 7, 9, 15, 14, 15, 7, 11, 14, 10, 15, 7, 15, 15, 5, 10, 8, 8, 10, 15, 8, 15, 12, 6, 8, 13, 9, 15, 15, 15, 14, 7, 6, 15, 9, 15, 9, 15, 12, 10, 7, 15, 5, 6, 15, 5, 15, 5, 8, 5, 15, 6, 10, 8, 10, 9, 6, 15, 11, 9, 15, 14, 10, 15, 15, 12, 12, 15, 14, 9, 15, 8, 10, 7, 6, 10, 11, 14, 10, 15, 14, 7, 15, 5, 6, 7, 6, 13, 10, 15, 10, 15, 15, 8, 9, 15, 6, 15, 11, 8, 7, 15, 9, 10, 15, 9, 15, 7, 9, 6, 15, 7, 9, 8, 11, 13, 15, 11, 14, 5, 7, 15, 11, 6, 9, 6, 8, 6, 8, 11, 9, 8, 11, 8, 5, 9, 6, 14, 10, 15, 15, 13, 9, 8, 9, 11, 5, 11, 15, 8, 7, 7, 5, 10, 14, 7, 15, 15, 7, 11, 12, 8, 9, 11, 10, 5, 6, 9, 7, 5, 7, 13, 9, 11, 8, 12, 7, 5, 7, 15, 15, 12, 9, 15, 8, 6, 14, 9, 15, 6, 15, 5, 15, 13, 9, 5, 10, 8, 10, 6, 9, 14, 8, 9, 8, 5, 7, 15, 10, 10, 7, 15, 15, 12, 9, 10, 15, 9, 15, 12, 5, 13, 15, 11, 10, 10, 7, 13, 7, 7, 9, 11, 7, 15, 6, 11, 14, 7, 7, 7, 15, 8, 15, 15, 12, 15, 13, 14, 7, 5, 8, 15, 15, 15, 8, 15, 15, 6, 10, 9, 10, 9, 15, 15,

Now print them on file.

In [37]:
# Write them all on file
with open('../Output/explicit_keywords.txt', 'w') as f:

    # Write keywords for each document using MI
    for key, value in MI_explicit_keywords.items():
        index = int(key.lstrip('doc'))
        f.write(texts_names[index])
        f.write(': ')
        for v in value:
            f.write(v)
            f.write('; ')
        f.write('\n')
    f.write('\n')

# **IMPLICIT KEYWORDS**

Define the function to get the dictionary with correlation values.

In [38]:
def cov(a, b):
    # Initialize indices and sum
    i = 0
    j = 0
    s = 0
    # Loop over the elements of a and b
    while j < len(b[::2]) and i < len(a[::2]):
        while j < len(b[::2]) and a[::2][i] >= b[::2][j]:
            # If the elements are equal, add their product to the sum
            if a[::2][i] == b[::2][j]:
                s += a[2*i+1] * b[2*j+1]
            j += 1
        i += 1
    # Return the score
    return 1000000 * s / (corpus_len - 1)

def create_corr_dict(REs_probs):
    # Copy the input dictionary and initialize new dictionaries
    temp_REs_probs = dict(REs_probs)
    temp_corr_scores = {}
    correlation_dict = {}

    # For each RE
    for key1, value1 in REs_probs.items():
        key1_connections = {}
        # For each RE (cartesian product)
        for key2, value2 in temp_REs_probs.items():
            # Calculate the score among the two 
            cv = cov(value1, value2)
            # If the score is non-zero, add it to the dictionary
            if abs(cv) > 0:
                key1_connections[key2] = cv
        # Add the dictionary to the output dictionary at key1
        temp_corr_scores[key1] = key1_connections
        # Remove the item from the copied dictionary so that it won't be computed twice
        temp_REs_probs.pop(key1)

    # For each of the dictionaries which were just computed
    for key1, value1 in temp_corr_scores.items():
        temp_correlations = {}
        # For each word in the connections dictionary for key3
        for key2, value2 in value1.items():
            # Compute the corr
            corr = value2 / (np.sqrt(value1[key1]) * np.sqrt(temp_corr_scores[key2][key2]))
            # If the correlation is non-zero, add it to the dictionary
            if abs(corr) > 0.0000000001:
                temp_correlations[key2] = corr
        # Add the dictionary to the final dictionary
        correlation_dict[key1] = temp_correlations

    return correlation_dict

Then compute them.

In [39]:
RE_MI_dict_corr = create_corr_dict(RE_MI_probs_filtered)
del RE_MI_probs_filtered

Define the function which computes the implicit keywords. We use Corr instead of SemProx for simplicity.

In [40]:
def find_implicit_keywords(dict_corr_re, explks, re_in_doc, num_implks, max_explks_to_consider, how_many_unis):
    # Initialize scores dictionary
    scores = {}
    for k in range(corpus_len):
        scores['doc' + str(k)] = []
        scores['scores' + str(k)] = []

    # Loop over each document in the corpus
    cont = 0
    for doc in range(corpus_len):
        # Loop over each document again for comparison
        for doc1 in range(corpus_len):
            if cont % ( round( corpus_len * corpus_len / 1000 ) ) == 0:             
                print(round( cont / (corpus_len * corpus_len) * 100 ), "%")
            # Skip if the documents are the same
            if doc != doc1:
                # Loop over each re in the second document
                for re in re_in_doc['doc' + str(doc1)]:
                    # Skip if the keyword is already in the scores or in the first document
                    if re not in scores['doc' + str(doc)] and re not in re_in_doc['doc' + str(doc)]:
                        somma = 0
                        # Loop over each explicit keyword in the first document
                        rank = 1
                        for j in range(min(max_explks_to_consider, len(explks["doc" + str(doc)][how_many_unis:]))):
                            keyword = explks["doc" + str(doc)][how_many_unis + j]
                            # Add the covariance score if the keyword is in the covariance dictionary
                            if keyword in dict_corr_re[re]:
                                somma += dict_corr_re[re][keyword] / rank
                            # Add the covariance score if the re is in the covariance dictionary
                            elif re in dict_corr_re[keyword]:
                                somma += dict_corr_re[keyword][re] / rank
                            # Else do nothing because the correlation was 0
                            else:
                                None
                            rank += 1
                        # Append the keyword and its score to the scores dictionary
                        scores['doc' + str(doc)].append(re)
                        scores['scores' + str(doc)].append(somma)
            cont += 1

    # Initialize the final dictionary for implicit keywords
    dict_implkey_re_final = {}
    for k in range(corpus_len):
        # Sort the keywords by their scores and add them to the final dictionary
        dict_implkey_re_final['doc' + str(k)] = [x for _, x in sorted(zip(scores['scores' + str(k)], scores['doc' + str(k)]), reverse=True)]
        # Limit the number of keywords to the specified number
        if len(dict_implkey_re_final['doc' + str(k)]) >= num_implks:
            dict_implkey_re_final['doc' + str(k)] = dict_implkey_re_final['doc' + str(k)][:num_implks]

    return dict_implkey_re_final

Then compute everything.

In [41]:
# Set parameters
how_many = 5
max_explks_to_consider = 10

# Compute
MI_implicit_keywords = find_implicit_keywords(RE_MI_dict_corr, MI_explicit_keywords, MI_REs_per_doc, how_many, max_explks_to_consider, uni_max)

0 %
0 %
0 %
0 %
0 %
1 %
1 %
1 %
1 %
1 %
1 %
1 %
1 %
1 %
1 %
2 %
2 %
2 %
2 %
2 %
2 %
2 %
2 %
2 %
2 %
3 %
3 %
3 %
3 %
3 %
3 %
3 %
3 %
3 %
3 %
4 %
4 %
4 %
4 %
4 %
4 %
4 %
4 %
4 %
4 %
5 %
5 %
5 %
5 %
5 %
5 %
5 %
5 %
5 %
5 %
6 %
6 %
6 %
6 %
6 %
6 %
6 %
6 %
6 %
6 %
7 %
7 %
7 %
7 %
7 %
7 %
7 %
7 %
7 %
7 %
8 %
8 %
8 %
8 %
8 %
8 %
8 %
8 %
8 %
8 %
9 %
9 %
9 %
9 %
9 %
9 %
9 %
9 %
9 %
9 %
10 %
10 %
10 %
10 %
10 %
10 %
10 %
10 %
10 %
10 %
11 %
11 %
11 %
11 %
11 %
11 %
11 %
11 %
11 %
11 %
12 %
12 %
12 %
12 %
12 %
12 %
12 %
12 %
12 %
12 %
13 %
13 %
13 %
13 %
13 %
13 %
13 %
13 %
13 %
13 %
14 %
14 %
14 %
14 %
14 %
14 %
14 %
14 %
14 %
14 %
15 %
15 %
15 %
15 %
15 %
15 %
15 %
15 %
15 %
15 %
16 %
16 %
16 %
16 %
16 %
16 %
16 %
16 %
16 %
16 %
17 %
17 %
17 %
17 %
17 %
17 %
17 %
17 %
17 %
17 %
18 %
18 %
18 %
18 %
18 %
18 %
18 %
18 %
18 %
18 %
19 %
19 %
19 %
19 %
19 %
19 %
19 %
19 %
19 %
19 %
20 %
20 %
20 %
20 %
20 %
20 %
20 %
20 %
20 %
20 %
21 %
21 %
21 %
21 %
21 %
21 %
21 %
21 %
21 %
21 %
22 %
22 %
22 %
22 %


And finally print on file.

In [42]:
# Print
with open('../Output/implicit_keywords.txt', 'w') as f:
    # Print using MI
    for key,value in MI_implicit_keywords.items():
        index = int(key.lstrip('doc'))
        f.write(texts_names[index])
        f.write(': ')
        for v in value:
            f.write(v)
            f.write('; ')
        f.write('\n')
    f.write('\n')