# **INITIALIZATION**

Import libraries.

In [1]:
# Library for reading and writing data to and from files
import os
# Library for numerical computing
import numpy as np
# Library for mathematical functions
import math
# Library for getting dictionaries from data structures
from collections import defaultdict

Define paths.

In [2]:
# Dataset directory
corpus_directory='../Dataset/corpus2mw'

To iterate the reading procedure, get the names of the documents in the dataset.

In [3]:
# Get a list with the names of the documents
texts_names = os.listdir(corpus_directory)

Set the seed for reproducibility.

In [4]:
np.random.seed(55555555)

# **GET TOKENS**

First, define the special characters to be separated from each word.

In [5]:
# List of characters
specialchars = [';', ':', '!', '?', '<', '>', '&', ')', '(', ']', '[', ',', '.', '"', '%', '$ ', '=', '}', '{', '-']

And define the function which separates the special characters from each word, assuming they can only be before or after each word.

In [6]:
def token(w):
    # Init the empty list of tokens
    res = []

    # If the length is 1, add the character whatever it is
    if len(w) == 1:
        res.append(w)
    
    # Otherwise (if it's at least two characters)
    else:
        # If the first character is special, add it to the list and remove it from the word
        if w[0] in specialchars:
            res.append(w[0])
            w = w[1:]
        
        # Now, if the length became 1 because of that, for the same reason as before, add the character whatever it is
        if len(w) == 1:
            res.append(w)
        # Otherwise (if it's at least two characters), both if I had removed the first or not
        # Check whether the last character is special
        elif w[-1] in specialchars:
            res.append(w[:-1])
            res.append(w[-1])
        # or not
        else:
            res.append(w)
        
    # Return the list of tokens
    return res

For each document, for each word (in each line), if either the last or the first character are specialchars, then split in multiple tokens.

In [7]:
corpus = []
corpus_lens = []

# For each document in the directory
for text in texts_names:

    # Init a temp empty list for the words in the current document
    words = []

    # Open the document
    with open(corpus_directory + '/' + text, 'r', errors='ignore') as file:

        # For each line
        for line in file:
            # For each word in the line
            for word in line.split():
                # Tokenize it
                aux = token(word)
                # And add each token to the list of words
                for t in aux:
                    words.append(t)
    # Then append the list of tokens for the document in the corpus list
    corpus.append(words)
    corpus_lens.append(len(words))

corpus_len = len(corpus)
del texts_names

This way, the corpus is a list of documents, which are lists of tokens.

In [8]:
# Visualize first 10 tokens in the second document corpus[1]
print(corpus[1][:10])

['Koalas', 'have', 'few', 'predators', ';', 'dingos', 'and', 'large', 'pythons', 'may']


Define a function to get a list of words from a string with words separated with a space.

In [9]:
# Given a list of strings, it returns a string
# in which the substrings will be separated by ' '
def list_to_str(strings):
    # Init res as an empty string
    res = ""
    # For each character/string in the list
    for i in range(len(strings)):
        # Concatenate the string plus a space to res
        res += strings[i] + ' '
    # Then return everything besides the last space
    return res[:-1]

Define a function to do the opposite.

In [10]:
# Given a string which contains substrings separated by ' ',
# it returns a list of words
def str_to_list(s):
    # Init res as an empty list
    res = []
    # Split the string by ' ' and for each substring
    for word in s.split():
        # Append the substring to res
        res.append(word)
    # Return the list of substrings
    return res

Declare a function which returns a list of n dictionaries, one for each n up to a fixed max, which contain the information about the frequencies for each n-gram.

In [11]:
# Create a list of n dictionaries, one for all the possible n-grams, with n in [1, max]
# Each n-dictionary will map each n-gram to a list with the absolute frequency in [0]
# and then the index of the documents in which it was found once at least,
# followed by the relative frequency (e.g. [25, 1, 20, 2, 1, 3, 4])
def create_list_of_dict_global(max):
    # Init an empty list of dictionaries
    list_dict=[]

    # For each n in [1, max], append an empty dictionary to the list
    for n in range(max):
        list_dict.append({})

    # For each index of a document in the corpus
    for i in range(corpus_len):
        # For each index of a token in the document
        for t in range(len(corpus[i])):
            # For each n in [1, max]
            for n in range(1, len(list_dict)+1):
                # If the document is not over (there is still space for an n-gram)
                if ( t + n ) <= len(corpus[i]) :
                    # If the n-gram is not yet in the n-dictionary
                    if not ( list_to_str(corpus[i][t : t+n]) in list_dict[n-1].keys() ) :
                        # Associate to the new n-gram an empty list
                        list_dict[n-1][list_to_str(corpus[i][t : t+n])] = []
                        # Set the frequency to 1 in position [0]
                        list_dict[n-1][list_to_str(corpus[i][t : t+n])].append(1)
                        # And then append the index of the current document
                        list_dict[n-1][list_to_str(corpus[i][t : t+n])].append(i)
                        # and set the relative frequency to 1
                        list_dict[n-1][list_to_str(corpus[i][t : t+n])].append(1)
                    else:
                        # Add one to the frequency of the n-gram
                        list_dict[n-1][list_to_str(corpus[i][t : t+n])][0] += 1
                        # And if the last document in which this n-gram was found
                        # is the current one
                        if list_dict[n-1][list_to_str(corpus[i][t : t+n])][-2] == i :
                            # Just increment the relative frequency
                            list_dict[n-1][list_to_str(corpus[i][t : t+n])][-1] += 1
                        # Otherwise
                        else:
                            # Append the new (current) document
                            list_dict[n-1][list_to_str(corpus[i][t : t+n])].append(i)
                            # and set the relative frequency to 1
                            list_dict[n-1][list_to_str(corpus[i][t : t+n])].append(1)
    # Then return the list of dictionaries
    return list_dict

Define a function to find all the indices at which a given n-gram occurs in a given document.

In [12]:
# Create a list of n dictionaries, one for all the possible n-grams, with n in [1, max]
def find_indices_ngram_doc(ngram_string, docnum):
    # Init an empty list for the indices
    indices = []
    # Get the document as a list of tokens 
    doc = corpus[docnum]
    # Get the n-gram as a list of words
    ngram_list = str_to_list(ngram_string)
    # For each index of a token in the document, up to the last possible n-gram starter
    for i in range(len(doc) - len(ngram_list) + 1):
        # If the current token is the first word of the n-gram
        if doc[i] == ngram_list[0] :
                # Init counter of words to 1
                c = 1
                # While document is not over and still checking for the n-gram
                while ( c+i < len(doc) ) and (c < len(ngram_list) ) :
                    # If it was not found, break
                    if doc[c+i] != ngram_list[c]:
                        break
                    # Otherwise, increment the counter and go on
                    else:
                        c += 1
                # If all the n words were found, append i as an index for the n_gram
                if c == len(ngram_list): 
                    indices.append(i)

    # Then return the list of indices
    return indices

Pick a max value for n. Remember that if you care about the 7-grams you need the 8-grams to be computed.

In [13]:
max_n = 8

Create a dictionary with the above function.

In [14]:
dizzo = create_list_of_dict_global(max_n)

# **EXPLORING GLUES**

Define a function which computes the glues.

In [15]:
def compute_glues(gluename):
    # Initialize an empty list for storing the glue dictionaries
    glue_dicts = []

    # For each n in [1, len(d))
    for n in range(1, len(dizzo)):

        # Initialize the glue dictionary as a dictionary with the same keys as dizzo[n-1] associating 0 to each
        glue_dict = dict.fromkeys(dizzo[n - 1], 0)

        # For each n-gram with its frequencies list
        for key, value in dizzo[n - 1].items():
            # Only if n is at least 2 or if it is a monogram of length 3
            if ( n != 1 ) or ( len(key) >= 3 ):
                # Store the absolute frequency for the n-gram
                abs_freq = value[0]

                # If n is at least 2
                if n != 1:
                    # Convert the n-gram into a list instead of a string
                    key_list = str_to_list(key)

                    # Initialize the sum to zero
                    s = 0
                    
                    # Compute the right GLUE coefficient

                    # Do Dice
                    if gluename == 'Dice':

                        # Dividing the n-gram into two parts w1...wi and wi+1...wn
                        for i in range(len(key_list) - 1):
                            # Get the absolute frequencies of the two sub-n-grams
                            f1 = dizzo[i][list_to_str(key_list[:i+1])][0]
                            f2 = dizzo[n-i-2][list_to_str(key_list[i+1:])][0]
                            # And add to the sum the partial sum
                            s += (f1 + f2) / (n - 1)

                        gl = (2 * abs_freq) / s

                    # Do SCP
                    elif gluename == 'SCP':

                        # Dividing the n-gram into two parts w1...wi and wi+1...wn
                        for i in range(len(key_list) - 1):
                            # Get the absolute frequencies of the two sub-n-grams
                            f1 = dizzo[i][list_to_str(key_list[:i+1])][0]
                            f2 = dizzo[n-i-2][list_to_str(key_list[i+1:])][0]
                            # And add to the sum the partial sum
                            s += (f1 * f2) / (n - 1)

                        gl = (abs_freq**2) / s


                    elif gluename == 'MI':
                        
                        # Dividing the n-gram into two parts w1...wi and wi+1...wn
                        for i in range(len(key_list) - 1):
                            # Get the absolute frequencies of the two sub-n-grams
                            f1 = dizzo[i][list_to_str(key_list[:i+1])][0]
                            f2 = dizzo[n-i-2][list_to_str(key_list[i+1:])][0]
                            # And add to the sum the partial sum
                            s += (f1 * f2) / (n - 1)

                        gl = np.log(abs_freq / s)
                    else:
                        gl = 0

                    # Add the glue to the list of the current n-gram
                    glue_dict[key] = gl

            # Otherwise if we have a monogram which is not long enough (3 letters at least)
            else:
                # Remove the list related to such n-gram from the glue dictionary
                glue_dict.pop(key)

        # Append the n-th glue dictionary to the list of glue dictionaries
        glue_dicts.append(glue_dict)

    # Return the list of glue dictionaries
    return glue_dicts

Compute the glues for each n-gram.

In [16]:
SCP_glues = compute_glues('SCP')
dice_glues = compute_glues('Dice')
MI_glues = compute_glues('MI')

Visualize the glues for the 3-grams.

In [17]:
# Print the first 10 items
for key in list(SCP_glues[2].keys())[:10]:
    print(f"SCP: {key} [{SCP_glues[2][key]:.4f}]")
    print(f"Dice: {key} [{dice_glues[2][key]:.4f}]")
    print(f"MI: {key} [{MI_glues[2][key]:.4f}]")
    print()

SCP: Greek Christian scribes [0.0093]
Dice: Greek Christian scribes [0.0185]
MI: Greek Christian scribes [-4.6728]

SCP: Christian scribes played [0.0016]
Dice: Christian scribes played [0.0031]
MI: Christian scribes played [-6.4552]

SCP: scribes played a [0.0001]
Dice: scribes played a [0.0001]
MI: scribes played a [-9.7615]

SCP: played a crucial [0.0012]
Dice: played a crucial [0.0105]
MI: played a crucial [-7.7929]

SCP: a crucial role [0.0002]
Dice: a crucial role [0.0006]
MI: a crucial role [-9.9685]

SCP: crucial role in [0.0002]
Dice: crucial role in [0.0005]
MI: crucial role in [-10.1860]

SCP: role in the [0.0003]
Dice: role in the [0.0020]
MI: role in the [-12.0791]

SCP: in the preservation [0.0000]
Dice: in the preservation [0.0001]
MI: in the preservation [-12.2640]

SCP: the preservation of [0.0001]
Dice: the preservation of [0.0002]
MI: the preservation of [-11.7386]

SCP: preservation of Aristotle [0.0067]
Dice: preservation of Aristotle [0.0615]
MI: preservation of A

# **REGULAR EXPRESSIONS**

Compute a list with n dictionaries, each associating to each n-gram all the (n+1)-grams which have one more word on the left/right.

In [18]:
# List of dictionaries, one for each value of n
fathers = []

# Notice n < max_n obviously
for n in range(1, max_n):

    # Get a dictionary with all the keys of the n-th dictionary in dizzo, and empty lists as values
    f = {key: [] for key in dict(dizzo[n - 1]).keys()}

    # For each (n+1)-gram in the (n+1)-th dictionary
    for key, value in dizzo[n].items():
        # Get the (n+1)-gram as list of words
        key_list = str_to_list(key)
        # Get the two n-grams
        subkey1 = list_to_str(key_list[1:])
        subkey2 = list_to_str(key_list[:-1])
        # Add them in the temp dictionary
        f[subkey1].append(key)
        f[subkey2].append(key)
        
    # And finally append the dictionary to the list fathers
    fathers.append(f)

Now define the function which returns a dictionary containing all the Multiword Expressions (REs).

In [19]:
# Auxiliary function (for readability) which checks whether the "key_string" n-gram is a RE
def process_keys_2(key_string, glues, REglues):
    # Get n as the number of spaces in key_string + 1
    n = key_string.count(' ')
    
    # Get the glue for the n-gram
    glue = glues[n][key_string]

    # Get key_string as list to easily remove the first/last word
    key_list = str_to_list(key_string)

    # Get the set of glues for (n-1)-grams
    omega_n_minus = set()
    # If it's a 2-gram do not check for the glues of 1-grams
    if n > 1:
        omega_n_minus.add(glues[n - 1][list_to_str(key_list[1:])])
        omega_n_minus.add(glues[n - 1][list_to_str(key_list[:-1])])

    # Get the set of glues for (n+1)-grams
    omega_n_plus = set([glues[n + 1][fath] for fath in fathers[n][key_string]])
    # If the glue is bigger than the glue for all the sons and fathers
    if all( ( glue > g ) for g in omega_n_minus.union(omega_n_plus) ):
        # Add the info to the dict
        REglues[key_string] = glues[n][key_string]

# Returns two dictionaries, only containing REs as keys
def find_RE(glues):
    # Init the new dictionary containing only the REs
    REglues = {}

    # For each n in [1, max_n)
    for n in range(1, len(glues)-1):
        # For each n-gram with their list of tfidfs
        for key, _ in glues[n].items() :
            # Process the n-gram to decide whether it is a RE
            process_keys_2(key, glues, REglues)

    # Return the two dictionaries
    return REglues

Then compute the REs information for each glue.

In [20]:
RE_SCP_glues = find_RE(SCP_glues)
RE_dice_glues = find_RE(dice_glues)
RE_MI_glues = find_RE(MI_glues)
del SCP_glues
del dice_glues
del MI_glues
del fathers

And visualize the REs with their glues.

In [21]:
# Print the first 10 items

for key in list(RE_SCP_glues.keys())[:10]:
    print(f"SCP: {key} [{RE_SCP_glues[key]:.4f}]")
print()

for key in list(RE_dice_glues.keys())[:10]:
    print(f"Dice: {key} [{RE_dice_glues[key]:.4f}]")
print() 

for key in list(RE_MI_glues.keys())[:10]:    
    print(f"MI: {key} [{RE_MI_glues[key]:.4f}]")
    

SCP: crucial role [0.0024]
SCP: in the [0.0281]
SCP: of the [0.0483]
SCP: . The [0.1130]
SCP: comment extensively [0.0012]
SCP: John Philoponus [0.0021]
SCP: , and [0.0204]
SCP: Philoponus stands [0.0071]
SCP: fundamental critique [0.0035]
SCP: Aristotle's views [0.0077]

Dice: crucial role [0.0240]
Dice: in the [0.1516]
Dice: copying all [0.0011]
Dice: manuscripts of [0.0001]
Dice: of the [0.2105]
Dice: . The [0.2503]
Dice: The first [0.0307]
Dice: Greek Christians [0.0080]
Dice: comment extensively [0.0317]
Dice: John Philoponus [0.0043]

MI: crucial role [-7.8224]
MI: comment extensively [-6.7569]
MI: Philoponus stands [-4.9416]
MI: fundamental critique [-5.6630]
MI: Aristotle's views [-4.8675]
MI: Aristotelian thought [-6.7044]
MI: formal commentary [-7.5756]
MI: Ephesus reappears [-3.6889]
MI: late eleventh [-9.0007]
MI: apparently sponsored [-8.2449]


# **FILTERING**

Define a function to delete REs containing special characters.

In [22]:
# Gets a string and returns false if must be deleted
def no_special(key_string):
    for i in range(len(key_string)):
        if (key_string[i] in specialchars):
            return False
    return True

Define a function to delete REs contained in one only document.

In [23]:
# Gets a string and returns false if must be deleted
def more_documents(key_list):
    if (dizzo[len(str_to_list(key_list)) - 1][key_list][0] > 1):
        return True
    else:
        return False

And then filter.

In [24]:
# Remember REs datastructures are now just dictionaries, not lists of dictionaries
RE_SCP_glues_filtered = {}
RE_dice_glues_filtered = {}
RE_MI_glues_filtered = {}

# Iterate through REs and filter SCP REs
for key, value in RE_SCP_glues.items():
    if no_special(key) and more_documents(key) :
        RE_SCP_glues_filtered[key] = value

# Iterate through REs and filter Dice REs
for key, value in RE_dice_glues.items():
    if no_special(key) and more_documents(key) :
        RE_dice_glues_filtered[key] = value

# Iterate through REs and filter MI REs
for key, value in RE_MI_glues.items():
    if no_special(key) and more_documents(key) :
        RE_MI_glues_filtered[key] = value

del RE_SCP_glues
del RE_dice_glues
del RE_MI_glues

How many REs did we get overall?

In [25]:
# Print
print(str(len(RE_SCP_glues_filtered)) + ' with SCP')
print(str(len(RE_dice_glues_filtered)) + ' with Dice')
print(str(len(RE_MI_glues_filtered)) + ' with MI')

17308 with SCP
37229 with Dice
13166 with MI


# **STOP WORDS**

Define the function to find the stop words.

In [26]:
# Returns the extracted stop words
def get_stop_words():

    # Initialize a dictionary for ints with 0 as default value
    neighbour_counts = defaultdict(int)
    # Count how many neightbours each word has across the corpus
    for doc in corpus:
        # Increment by 1 the count for the first and last word of the current doc
        neighbour_counts[doc[0]] += 1
        neighbour_counts[doc[-1]] += 1
        # From second to second last word increment by 2
        for idx in range(1, len(doc) - 1):
            neighbour_counts[doc[idx]] += 2
    # And get the keys sorted by value
    neighbour_counts = sorted(neighbour_counts.items(), key=lambda item: item[1], reverse=True)
    
    # Now look for the elbow point in the list
    elbow_point_index = 0
    max_tangens = 0
    # Iterate over the list (besides last word) to find the elbow point
    for index in range(len(neighbour_counts) - 1):
        # Get the difference between the count for the current word and the next
        neighbour_diff = neighbour_counts[index][1] - neighbour_counts[index + 1][1]
        # Apply the rule (find the tangents difference)
        tangents_diff = abs(math.tan(neighbour_counts[index][1] + neighbour_diff) - math.tan(neighbour_counts[index][1]))
        # Update the elbow point if the current difference is greater than the maximum difference
        if tangents_diff > max_tangens:
            elbow_point_index = index
            max_tangens = tangents_diff
    
    # Get the couples corresponding to all the words up to the elbow (highest num of neighbours)
    stop_word_counts = neighbour_counts[:elbow_point_index+1]
    # Get the stop words
    stop_words = [tuple[0] for tuple in stop_word_counts]
    
    # Return the stop words and the filtered expressions
    return stop_words

And compute them.

In [27]:
# Get stop words
stop_words = get_stop_words()

# And write them all on file
with open('../Output/stop_words.txt', 'w') as f:
    for word in stop_words:
        f.write("\"")
        f.write(word)
        f.write("\"")
        f.write(', ')
del corpus

And now filter out the expressions containing stop words.

In [28]:
# All the REs in the corresponding list of n-grams if the first and the last word are not stop-words
RE_SCP_glues_filtered = [re for re in RE_SCP_glues_filtered.keys() if str_to_list(re)[0] not in stop_words and str_to_list(re)[-1] not in stop_words]
RE_dice_glues_filtered = [re for re in RE_dice_glues_filtered.keys() if str_to_list(re)[0] not in stop_words and str_to_list(re)[-1] not in stop_words]
RE_MI_glues_filtered = [re for re in RE_MI_glues_filtered.keys() if str_to_list(re)[0] not in stop_words and str_to_list(re)[-1] not in stop_words]
del stop_words

How many REs did we get overall?

In [29]:
# Print
print(str(len(RE_SCP_glues_filtered)) + ' with SCP')
print(str(len(RE_dice_glues_filtered)) + ' with Dice')
print(str(len(RE_MI_glues_filtered)) + ' with MI')

15281 with SCP
29294 with Dice
12103 with MI


Now print 200 first REs on file, for each glue.

In [30]:
# Get 200 random REs for each and write them all on file
with open('../Output/200_random_REs.txt', 'w') as f:

    f.write('SCP\n')
    for line in np.array(RE_SCP_glues_filtered)[:200]:
        f.write(line + '\n')
    f.write('\n')

    f.write('Dice\n')
    for line in np.array(RE_dice_glues_filtered)[:200]:
        f.write(line + '\n')
    f.write('\n')

    f.write('MI\n')
    for line in np.array(RE_MI_glues_filtered)[:200]:
        f.write(line + '\n')

# **EXPLICIT KEYWORDS**

Write a function which computes the _tfidf_ values (for the explicit keywords) and the _probabilities_ (to compute the correlations for the implicit keywords) for each RE.

In [31]:
# Given the list of dictionaries and a required glue
# Returns two lists containing dictionaries with tfidfs and probs
def create_tfidf_and_probs():

    # Init two empty lists for list_of_tfdidf_dict and list_of_probs_dict
    list_of_tfidf_dicts = []
    list_of_probs_dicts = []

    # For each n in [1, len(d))
    for n in range(1, len(dizzo)):

        # Init the two dictionaries
        temp_tfidf_dict = dict(dizzo[n - 1])
        temp_probs_dict = dict(dizzo[n - 1])

        # For each n-gram with its frequencies list
        for key, value in dizzo[n - 1].items():
            
            # Only if n is at least 2 or if it is a monogram of length 3
            # Re-compute
            if ( n != 1 ) or ( len(key) >= 3 ) :

                # Init empty lists for tfidf and probs
                tfidf_mod = []
                probs = []

                # Get the number of documents in which the n-gram was actually found
                num_non_zero_doc = ( len(value) - 1 ) / 2

                # And for each document in which it was found
                # Compute all the tfidf and probs
                for doc_idx in range(1, len(value), 2):

                    # Get the lengths of the words in the n-gram
                    words_lens=[]
                    for w in str_to_list(key):
                        words_lens.append(len(w))

                    # Store the number of the current document
                    # and the relative frequency
                    num = value[doc_idx]
                    rel_freq = value[doc_idx+1]

                    # And append in tfidf list the number of the current document
                    tfidf_mod.append(num)
                    # Followed by its tdidf
                    tfidf_mod.append( np.mean(words_lens) * rel_freq * np.log(corpus_len/num_non_zero_doc) / corpus_lens[num] ) 
                    
                    # And append in probs list the number of the current document
                    probs.append(num)
                    # Followed by its prob
                    probs.append( rel_freq / corpus_lens[num] )

                # Compute the average probability of the n-gram
                medprob = sum(probs[1::2]) / corpus_len

                # And subtract it from each probability
                probs[1::2] = [l1 - medprob for l1 in probs[1::2]]

                # Initialize an empty list for the current n-gram twice
                temp_tfidf_dict[key] = []
                temp_probs_dict[key] = []

                # Append the values docnum and tfidf to the list for the current n-gram in the list in the dict
                for tfidf in tfidf_mod:
                    temp_tfidf_dict[key].append(tfidf)
                for p in probs:
                    temp_probs_dict[key].append(p)
            
            # Otherwise if we have a monogram which is not long enough (3 letters at least)
            else:
                # Remove the list related to such n-gram from both the two dictionaries
                temp_tfidf_dict.pop(key)
                temp_probs_dict.pop(key)

        # Append the two n-th dictionaries to the corresponding list of dictionaries
        list_of_tfidf_dicts.append(temp_tfidf_dict)
        list_of_probs_dicts.append(temp_probs_dict)

    # Return the two lists of dictionaries (the glues for each n-gram are in the first position of their own list, in both)
    return list_of_tfidf_dicts, list_of_probs_dicts

Compute _tfidfs_ and _probs_ for each document in which they appear.

In [32]:
tfidfs, probs = create_tfidf_and_probs()

And only take the information about the filtered REs.

In [33]:
# Init all the dictionaries for the filtered REs
RE_SCP_tfidfs_filtered = {}
RE_SCP_probs_filtered = {}

# Take the REs from the keys of the previously filtered dictionaries
for key in RE_SCP_glues_filtered:
    n = len(str_to_list(key))
    RE_SCP_tfidfs_filtered[key] = tfidfs[n-1][key]
    RE_SCP_probs_filtered[key] = probs[n-1][key]
del RE_SCP_glues_filtered
del RE_dice_glues_filtered
del RE_MI_glues_filtered
del probs

Select _tfidfs_ for the 1-grams.

In [34]:
# Init empty dictionary
uni_tfidfs_filtered = {}
# For each n in [1, max_n)
for key, value in tfidfs[0].items():
    if no_special(key) and more_documents(key) :
        uni_tfidfs_filtered[key] = value
del dizzo
del tfidfs

Define the function to find explicit keywords.

In [35]:
# Return a dictionary which associates to each document its explicit keywords
# Also return a dictionary which associates to each document the REs with at least two words
def findExplicit_Keywords(REs_tfidfs, uni_tfidfs, uni_max, multi_max):

    #### Initialize dictionaries for unigrams and n-grams matches
    uni_REs_per_doc = {}
    multi_REs_per_doc = {}
    for k in range(corpus_len):
        uni_REs_per_doc['doc' + str(k)] = []
        uni_REs_per_doc['tfidf' + str(k)] = []
        multi_REs_per_doc['doc' + str(k)] = []
        multi_REs_per_doc['tfidf' + str(k)] = []

    #### Populate the unigram dictionary
    for key, value in uni_tfidfs.items():
        for index in range(0, len(value), 2):
            doc = value[index]
            uni_REs_per_doc['doc' + str(doc)].append(key)
            uni_REs_per_doc['tfidf' + str(doc)].append(value[index + 1])
    # Sort the unigrams by their tfidf scores and limit the number of unigrams
    uni_expks_per_doc = {}
    for k in range(corpus_len):
        sorted_uni = sorted(zip(uni_REs_per_doc['tfidf' + str(k)], uni_REs_per_doc['doc' + str(k)]), reverse=True)
        uni_expks_per_doc['doc' + str(k)] = [x for _, x in sorted_uni][:uni_max]

    #### Populate the REs match dictionary
    for key, value in REs_tfidfs.items():
        for index in range(0, len(value), 2):
            doc = value[index]
            multi_REs_per_doc['doc' + str(int(doc))].append(key)
            multi_REs_per_doc['tfidf' + str(int(doc))].append(value[index + 1])
    # Sort the REs matches by their tfidf scores and limit the number of matches
    multi_expks_per_doc = {}
    for k in range(corpus_len):
        sorted_re = sorted(zip(multi_REs_per_doc['tfidf' + str(k)], multi_REs_per_doc['doc' + str(k)]), reverse=True)
        multi_expks_per_doc['doc' + str(k)] = [x for _, x in sorted_re][:multi_max]

    ##### Combine the unigram and REs match dictionaries
    expks_per_doc = dict(uni_expks_per_doc)
    for key, value in uni_expks_per_doc.items():
        expks_per_doc[key].extend(multi_expks_per_doc[key])

    return expks_per_doc, multi_REs_per_doc

And compute keywords for both 1-grams and not.

In [36]:
# Choose number of keywords
uni_max = 5
multi_max = 10

# Get the explicit keywords
SCP_explicit_keywords, SCP_REs_per_doc = findExplicit_Keywords(RE_SCP_tfidfs_filtered, uni_tfidfs_filtered, uni_max, multi_max)
del RE_SCP_tfidfs_filtered
del uni_tfidfs_filtered

Now print them on file, for each glue.

In [37]:
# Write them all on file
with open('../Output/explicit_keywords.txt', 'w') as f:

    # Write keywords for each document using SCP
    f.write('SCP\n')
    for key, value in SCP_explicit_keywords.items():
        f.write(key)
        f.write(': ')
        for v in value:
            f.write(v)
            f.write('; ')
        f.write('\n')
    f.write('\n')

    '''

    # Write keywords for each document using Dice
    f.write('Dice\n')
    for key, value in dice_explicit_keywords.items():
        f.write(key)
        f.write(': ')
        for v in value:
            f.write(v)
            f.write('; ')
        f.write('\n')
    f.write('\n')

    # Write keywords for each document using MI
    f.write('MI\n')
    for key, value in MI_explicit_keywords.items():
        f.write(key)
        f.write(': ')
        for v in value:
            f.write(v)
            f.write('; ')
        f.write('\n')
    f.write('\n')

    '''

# **IMPLICIT KEYWORDS**

Define the function to get the dictionary with correlation values.

In [38]:
def score(a, b):
    # Initialize indices and sum
    i = 0
    j = 0
    s = 0
    # Loop over the elements of a and b
    while j < len(b[::2]) and i < len(a[::2]):
        while j < len(b[::2]) and a[::2][i] >= b[::2][j]:
            # If the elements are equal, add their product to the sum
            if a[::2][i] == b[::2][j]:
                s += a[2*i+1] * b[2*j+1]
            j += 1
        i += 1
    # Return the score
    return 1000000 * s / (corpus_len - 1)

def create_corr_dict(REs_probs):
    # Copy the input dictionary and initialize new dictionaries
    temp_REs_probs = dict(REs_probs)
    temp_corr_scores = {}
    correlation_dict = {}

    # For each RE
    for key1, value1 in REs_probs.items():
        key1_connections = {}
        # For each RE (cartesian product)
        for key2, value2 in temp_REs_probs.items():
            # Calculate the score among the two 
            cv = score(value1, value2)
            # If the score is non-zero, add it to the dictionary
            if abs(cv) > 0:
                key1_connections[key2] = cv
        # Add the dictionary to the output dictionary at key1
        temp_corr_scores[key1] = key1_connections
        # Remove the item from the copied dictionary so that it won't be computed twice
        temp_REs_probs.pop(key1)

    # For each of the dictionaries which were just computed
    for key1, value1 in temp_corr_scores.items():
        temp_correlations = {}
        # Get the score of key3 with key3 itself
        covkey3 = value1[key1]
        # For each word in the connections dictionary for key3
        for key2, value2 in value1.items():
            # Compute the corr
            corr = value2 / (np.sqrt(covkey3) * np.sqrt(temp_corr_scores[key2][key2]))
            # If the correlation is non-zero, add it to the dictionary
            if abs(corr) > 0.0000000001:
                temp_correlations[key2] = corr
        # Add the dictionary to the final dictionary
        correlation_dict[key1] = temp_correlations

    return correlation_dict

Then compute them, for each glue.

In [39]:
# SCP
RE_SCP_dict_corr = create_corr_dict(RE_SCP_probs_filtered)
del RE_SCP_probs_filtered

In [40]:
# Dice
# RE_dice_dict_corr = create_corr_dict(RE_dice_probs_filtered)
# del RE_dice_probs_filtered

In [41]:
# MI
# RE_MI_dict_corr = create_corr_dict(RE_MI_probs_filtered)
# del RE_MI_probs_filtered

Define the function which computes the implicit keywords.

In [42]:
def findImplicit_Keywords(dict_cov_re, Explicit_Keywords, re_in_doc, numberImplicitKeywords, firstexplmultikeyword, numkeyscore):
    # Initialize scores dictionary
    scores = {}
    for k in range(corpus_len):
        scores['doc' + str(k)] = []
        scores['scores' + str(k)] = []

    # Loop over each document in the corpus
    for doc in range(corpus_len):
        # Loop over each document again for comparison
        for doc1 in range(corpus_len):
            # Skip if the documents are the same
            if doc != doc1:
                print("#########", doc, doc1, "#########")
                # Loop over each keyword in the second document
                cont = 0
                for re in re_in_doc['doc' + str(doc1)]:
                    # Skip if the keyword is already in the scores or in the first document
                    if re not in scores['doc' + str(doc)] and re not in re_in_doc['doc' + str(doc)]:
                        # Print the current document and the document being compared
                        # print each integer percentage of this loop
                        if cont % (len(re_in_doc['doc' + str(doc1)]) / 1000) == 0:
                            print("Percentage: ", cont / len(re_in_doc['doc' + str(doc1)] * 100), "%")
                        somma = 0
                        # Loop over each explicit keyword in the first document
                        for j in range(min(numkeyscore, len(Explicit_Keywords["doc" + str(doc)][firstexplmultikeyword:]))):
                            keyword = Explicit_Keywords["doc" + str(doc)][j + firstexplmultikeyword]
                            # Add the covariance score if the keyword is in the covariance dictionary
                            if keyword in dict_cov_re[re]:
                                somma += dict_cov_re[re][keyword]
                            # Add the covariance score if the keyword is in the covariance dictionary
                            elif re in dict_cov_re[keyword]:
                                somma += dict_cov_re[keyword][re]
                        # Append the keyword and its score to the scores dictionary
                        scores['doc' + str(doc)].append(re)
                        scores['scores' + str(doc)].append(somma / numkeyscore)
                    cont += 1

    # Initialize the final dictionary for implicit keywords
    dict_implkey_re_final = {}
    for k in range(corpus_len):
        # Sort the keywords by their scores and add them to the final dictionary
        dict_implkey_re_final['doc' + str(k)] = [x for _, x in sorted(zip(scores['scores' + str(k)], scores['doc' + str(k)]), reverse=True)]
        # Limit the number of keywords to the specified number
        if len(dict_implkey_re_final['doc' + str(k)]) >= numberImplicitKeywords:
            dict_implkey_re_final['doc' + str(k)] = dict_implkey_re_final['doc' + str(k)][:numberImplicitKeywords]

    return dict_implkey_re_final

Then compute everything.

In [43]:
# Set parameters
max = 5
first = 5
many = 10

# Compute
SCP_implicit_keywords = findImplicit_Keywords(RE_SCP_dict_corr, SCP_explicit_keywords, SCP_REs_per_doc, max, first, many)

######### 0 1 #########
######### 0 2 #########
######### 0 3 #########
Percentage:  0.0 %
######### 0 4 #########
######### 0 5 #########
######### 0 6 #########
######### 0 7 #########
######### 0 8 #########
######### 0 9 #########
######### 0 10 #########
######### 0 11 #########
######### 0 12 #########
######### 0 13 #########
######### 0 14 #########
######### 0 15 #########
######### 0 16 #########
######### 0 17 #########
######### 0 18 #########
######### 0 19 #########
######### 0 20 #########
######### 0 21 #########
######### 0 22 #########
Percentage:  0.0 %
######### 0 23 #########
Percentage:  0.0 %
######### 0 24 #########
######### 0 25 #########
######### 0 26 #########
######### 0 27 #########
######### 0 28 #########
Percentage:  0.0 %
######### 0 29 #########
######### 0 30 #########
Percentage:  0.0 %
######### 0 31 #########
######### 0 32 #########
######### 0 33 #########
######### 0 34 #########
######### 0 35 #########
######### 0 36 #########
######### 0 37

In [None]:
# Compute
# dice_implicit_keywords = findImplicit_Keywords(RE_dice_dict_corr, dice_explicit_keywords, dice_REs_per_doc, max, first, many)

In [None]:
# Compute
# MI_implicit_keywords = findImplicit_Keywords(RE_MI_dict_corr, MI_explicit_keywords, MI_REs_per_doc, max, first, many)

And finally print on file, for each glue.

In [None]:
# Print
with open('../Output/implicit_keywords.txt', 'w') as f:

    # Print using SCP
    f.write('SCP\n')
    for key,value in SCP_implicit_keywords.items():
        f.write(key)
        f.write(': ')
        for v in value:
            f.write(v)
            f.write('; ')
        f.write('\n')
    f.write('\n')

    '''

    # Print using Dice
    f.write('Dice\n')
    for key,value in dice_implicit_keywords.items():
        f.write(key)
        f.write(': ')
        for v in value:
            f.write(v)
            f.write('; ')
        f.write('\n')
    f.write('\n')

    # Print using MI
    f.write('MI\n')
    for key,value in MI_implicit_keywords.items():
        f.write(key)
        f.write(': ')
        for v in value:
            f.write(v)
            f.write('; ')
        f.write('\n')
    f.write('\n')

    '''