# **INITIALIZATION**

Import libraries.

In [28]:
# Library for reading and writing data to and from files
import os
# Library for numerical computing
import numpy as np
# 
import math
# 
from collections import defaultdict

Define paths.

In [29]:
# Dataset directory
corpus_directory='../Dataset/corpus2mw'

To iterate the reading procedure, get the names of the documents in the dataset.

In [30]:
# Get a list with the names of the documents
texts_names = os.listdir(corpus_directory)

# **GET TOKENS**

First, define the special characters to be separated from each word.

In [31]:
# List of characters
specialchars = [';', ':', '!', '?', '<', '>', '&', ')', '(', ']', '[', ',', '.', '"', '%', '$ ', '=', '}', '{', '-']

And define the function which separates the special characters from each word, assuming they can only be before or after each word.

In [32]:
def token(w):
    # Init the empty list of tokens
    res = []

    # If the length is 1, add the character whatever it is
    if len(w) == 1:
        res.append(w)
    
    # Otherwise (if it's at least two characters)
    else:
        # If the first character is special, add it to the list and remove it from the word
        if w[0] in specialchars:
            res.append(w[0])
            w = w[1:]
        
        # Now, if the length became 1 because of that, for the same reason as before, add the character whatever it is
        if len(w) == 1:
            res.append(w)
        # Otherwise (if it's at least two characters), both if I had removed the first or not
        # Check whether the last character is special
        elif w[-1] in specialchars:
            res.append(w[:-1])
            res.append(w[-1])
        # or not
        else:
            res.append(w)
        
    # Return the list of tokens
    return res

For each document, for each word (in each line), if either the last or the first character are specialchars, then split in multiple tokens.

In [33]:
corpus = []

# For each document in the directory
for text in texts_names:

    # Init a temp empty list for the words in the current document
    words = []

    # Open the document
    with open(corpus_directory + '/' + text, 'r', errors='ignore') as file:

        # For each line
        for line in file:
            # For each word in the line
            for word in line.split():
                # Tokenize it
                aux = token(word)
                # And add each token to the list of words
                for t in aux:
                    words.append(t)
    # Then append the list of tokens for the document in the corpus list
    corpus.append(words)

This way, the corpus is a list of documents, which are lists of tokens.

In [34]:
# Visualize first 10 tokens in the second document corpus[1]
print(corpus[1][:10])

['Koalas', 'have', 'few', 'predators', ';', 'dingos', 'and', 'large', 'pythons', 'may']


Define a function to get a list of words from a string with words separated with a space.

In [35]:
# Given a list of strings, it returns a string
# in which the substrings will be separated by ' '
def list_to_str(strings):
    # Init res as an empty string
    res = ""
    # For each character/string in the list
    for i in range(len(strings)):
        # Concatenate the string plus a space to res
        res += strings[i] + ' '
    # Then return everything besides the last space
    return res[:-1]

Define a function to do the opposite.

In [36]:
# Given a string which contains substrings separated by ' ',
# it returns a list of words
def str_to_list(s):
    # Init res as an empty list
    res = []
    # Split the string by ' ' and for each substring
    for word in s.split():
        # Append the substring to res
        res.append(word)
    # Return the list of substrings
    return res

Declare a function which returns a list of n dictionaries, one for each n up to a fixed max, which contain the information about the frequencies for each n-gram.

In [37]:
# Create a list of n dictionaries, one for all the possible n-grams, with n in [1, max]
# Each n-dictionary will map each n-gram to a list with the absolute frequency in [0]
# and then the index of the documents in which it was found once at least,
# followed by the relative frequency (e.g. [25, 1, 20, 2, 1, 3, 4])
def create_list_of_dict_global(max):
    # Init an empty list of dictionaries
    list_dict=[]

    # For each n in [1, max], append an empty dictionary to the list
    for n in range(max):
        list_dict.append({})

    # For each index of a document in the corpus
    for i in range(len(corpus)):
        # For each index of a token in the document
        for t in range(len(corpus[i])):
            # For each n in [1, max]
            for n in range(1, len(list_dict)+1):
                # If the document is not over (there is still space for an n-gram)
                if ( t + n ) <= len(corpus[i]) :
                    # If the n-gram is not yet in the n-dictionary
                    if not ( list_to_str(corpus[i][t : t+n]) in list_dict[n-1].keys() ) :
                        # Associate to the new n-gram an empty list
                        list_dict[n-1][list_to_str(corpus[i][t : t+n])] = []
                        # Set the frequency to 1 in position [0]
                        list_dict[n-1][list_to_str(corpus[i][t : t+n])].append(1)
                        # And then append the index of the current document
                        list_dict[n-1][list_to_str(corpus[i][t : t+n])].append(i)
                        # and set the relative frequency to 1
                        list_dict[n-1][list_to_str(corpus[i][t : t+n])].append(1)
                    else:
                        # Add one to the frequency of the n-gram
                        list_dict[n-1][list_to_str(corpus[i][t : t+n])][0] += 1
                        # And if the last document in which this n-gram was found
                        # is the current one
                        if list_dict[n-1][list_to_str(corpus[i][t : t+n])][-2] == i :
                            # Just increment the relative frequency
                            list_dict[n-1][list_to_str(corpus[i][t : t+n])][-1] += 1
                        # Otherwise
                        else:
                            # Append the new (current) document
                            list_dict[n-1][list_to_str(corpus[i][t : t+n])].append(i)
                            # and set the relative frequency to 1
                            list_dict[n-1][list_to_str(corpus[i][t : t+n])].append(1)
    # Then return the list of dictionaries
    return list_dict

Define a function to find all the indeces at which a given n-gram occurs in a given document.

In [38]:
# Create a list of n dictionaries, one for all the possible n-grams, with n in [1, max]
def find_indices_ngram_doc(ngram_string, docnum):
    # Init an empty list for the indices
    indices = []
    # Get the document as a list of tokens 
    doc = corpus[docnum]
    # Get the n-gram as a list of words
    ngram_list = str_to_list(ngram_string)
    # For each index of a token in the document, up to the last possible n-gram starter
    for i in range(len(doc) - len(ngram_list) + 1):
        # If the current token is the first word of the n-gram
        if doc[i] == ngram_list[0] :
                # Init counter of words to 1
                c = 1
                # While document is not over and still checking for the n-gram
                while ( c+i < len(doc) ) and (c < len(ngram_list) ) :
                    # If it was not found, break
                    if doc[c+i] != ngram_list[c]:
                        break
                    # Otherwise, increment the counter and go on
                    else:
                        c += 1
                # If all the n words were found, append i as an index for the n_gram
                if c == len(ngram_list): 
                    indices.append(i)

    # Then return the list of indices
    return indices

Pick a max value for n. Remember that if you care about the 7-grams you need the 8-grams to be computed.

In [39]:
max_n = 8

Create a dictionary with the above function.

In [40]:
dizzo = create_list_of_dict_global(max_n)

See some frequent 7-grams. Remember that the 7-grams will be in the dictionary in position [6].

In [41]:
# Set parameters
num_to_print = 5
min_freq = 10

# Print some key-value couples in dizzo[8] with some relevant absolute frequency
c = 0
for key, value in dizzo[max_n-1].items():
    if value[0] >= min_freq:
        c += 1
        print(key, value)
    if c == num_to_print:
        break

. The median income for a household in [30, 2, 1, 51, 3, 308, 1, 322, 1, 410, 4, 439, 1, 453, 1, 464, 1, 481, 1, 492, 1, 503, 3, 527, 1, 560, 1, 569, 1, 586, 2, 632, 2, 656, 2, 681, 1, 688, 1, 718, 1]
The median income for a household in the [34, 2, 1, 51, 3, 308, 1, 322, 1, 410, 4, 439, 1, 453, 1, 464, 2, 481, 1, 492, 1, 503, 3, 527, 1, 543, 1, 560, 1, 569, 1, 586, 2, 632, 3, 656, 3, 681, 1, 688, 1, 718, 1]
, and the median income for a family [35, 2, 1, 51, 3, 308, 1, 322, 1, 410, 4, 439, 1, 453, 1, 464, 2, 481, 1, 492, 1, 503, 3, 527, 1, 543, 1, 560, 1, 569, 1, 586, 2, 593, 1, 632, 3, 656, 3, 681, 1, 688, 1, 718, 1]
and the median income for a family was [36, 2, 1, 51, 3, 308, 2, 322, 1, 410, 4, 439, 1, 453, 1, 464, 2, 481, 1, 492, 1, 503, 3, 527, 1, 543, 1, 560, 1, 569, 1, 586, 2, 593, 1, 632, 3, 656, 3, 681, 1, 688, 1, 718, 1]
for females . The per capita income for [37, 2, 1, 51, 3, 308, 2, 322, 1, 410, 4, 439, 1, 453, 1, 464, 2, 481, 1, 492, 1, 503, 3, 527, 1, 543, 2, 560, 1, 56

# **EXPLORING GLUES**

Write a function which computes a given glue, plus all the _tfidf_ values and the _probabilities_ for each document, for each n-gram in the dictionaries. Doing both let us recycle the same iterations to compute both.

In [42]:
# Given a list of dictionaries and a required glue
# Returns a list with the glue value for each n-gram and their relative tfidf and probs
def create_tfidf_and_probs():

    # Init two empty lists for list_of_tfdidf_dict and list_of_probs_dict
    list_of_tfidf_dicts = []
    list_of_probs_dicts = []

    # For each n in [1, len(d))
    for n in range(1, len(dizzo)):

        # Init the two dictionaries
        temp_tfidf_dict = dict(dizzo[n - 1])
        temp_probs_dict = dict(dizzo[n - 1])

        # For each n-gram with its frequencies list
        for key, value in dizzo[n - 1].items():
            
            # Only if n is at least 2 or if it is a monogram of length 3
            # Re-compute
            if ( n != 1 ) or ( len(key) >= 3 ) :

                # Init empty lists for tfidf and probs
                tfidf_mod = []
                probs = []

                # Get the number of documents in which the n-gram was actually found
                num_non_zero_doc = ( len(value) - 1 ) / 2

                # And for each document in which it was found
                # Compute all the tfidf and probs
                for doc_idx in range(1, len(value), 2):

                    # Get the lengths of the words in the n-gram
                    words_lens=[]
                    for w in str_to_list(key):
                        words_lens.append(len(w))

                    # Store the number of the current document
                    # and the relative frequency
                    num = value[doc_idx]
                    rel_freq = value[doc_idx+1]

                    # And append in tfidf list the number of the current document
                    tfidf_mod.append(num)
                    # Followed by its tdidf
                    tfidf_mod.append( np.mean(words_lens) * rel_freq * np.log(len(corpus)/num_non_zero_doc) / len(corpus[num]) ) 
                    
                    # And append in probs list the number of the current document
                    probs.append(num)
                    # Followed by its prob
                    probs.append( rel_freq / len(corpus[num]) )

                # Compute the average probability of the n-gram
                medprob = sum(probs[1::2]) / len(corpus)

                # And subtract it from each probability
                probs[1::2] = [l1 - medprob for l1 in probs[1::2]]

                # Initialize an empty list for the current n-gram twice
                temp_tfidf_dict[key] = []
                temp_probs_dict[key] = []

                # Append the values docnum and tfidf to the list for the current n-gram in the list in the dict
                for tfidf in tfidf_mod:
                    temp_tfidf_dict[key].append(tfidf)
                for p in probs:
                    temp_probs_dict[key].append(p)
            
            # Otherwise if we have a monogram which is not long enough (3 letters at least)
            else:
                # Remove the list related to such n-gram from both the two dictionaries
                temp_tfidf_dict.pop(key)
                temp_probs_dict.pop(key)

        # Append the two n-th dictionaries to the corresponding list of dictionaries
        list_of_tfidf_dicts.append(temp_tfidf_dict)
        list_of_probs_dicts.append(temp_probs_dict)

    # Return the two lists of dictionaries (the glues for each n-gram are in the first position of their own list, in both)
    return list_of_tfidf_dicts, list_of_probs_dicts

And also write a function which only computes the glues.

In [43]:
def compute_glues(gluename):
    # Initialize an empty list for storing the glue dictionaries
    glue_dicts = []

    # For each n in [1, len(d))
    for n in range(1, len(dizzo)):

        # Initialize the glue dictionary as a dictionary with the same keys as dizzo[n-1] associating 0 to each
        glue_dict = dict.fromkeys(dizzo[n - 1], 0)

        # For each n-gram with its frequencies list
        for key, value in dizzo[n - 1].items():
            # Only if n is at least 2 or if it is a monogram of length 3
            if ( n != 1 ) or ( len(key) >= 3 ):
                # Store the absolute frequency for the n-gram
                abs_freq = value[0]

                # If n is at least 2
                if n != 1:
                    # Convert the n-gram into a list instead of a string
                    key_list = str_to_list(key)

                    # Initialize the sum to zero
                    s = 0
                    
                    # Compute the right GLUE coefficient

                    # Do Dice
                    if gluename == 'Dice':

                        # Dividing the n-gram into two parts w1...wi and wi+1...wn
                        for i in range(len(key_list) - 1):
                            # Get the absolute frequencies of the two sub-n-grams
                            f1 = dizzo[i][list_to_str(key_list[:i+1])][0]
                            f2 = dizzo[n-i-2][list_to_str(key_list[i+1:])][0]
                            # And add to the sum the partial sum
                            s += (f1 + f2) / (n - 1)

                        gl = (2 * abs_freq) / s

                    # Do SCP
                    elif gluename == 'SCP':

                        # Dividing the n-gram into two parts w1...wi and wi+1...wn
                        for i in range(len(key_list) - 1):
                            # Get the absolute frequencies of the two sub-n-grams
                            f1 = dizzo[i][list_to_str(key_list[:i+1])][0]
                            f2 = dizzo[n-i-2][list_to_str(key_list[i+1:])][0]
                            # And add to the sum the partial sum
                            s += (f1 * f2) / (n - 1)

                        gl = (abs_freq**2) / s


                    elif gluename == 'MI':
                        
                        # Dividing the n-gram into two parts w1...wi and wi+1...wn
                        for i in range(len(key_list) - 1):
                            # Get the absolute frequencies of the two sub-n-grams
                            f1 = dizzo[i][list_to_str(key_list[:i+1])][0]
                            f2 = dizzo[n-i-2][list_to_str(key_list[i+1:])][0]
                            # And add to the sum the partial sum
                            s += (f1 * f2) / (n - 1)

                        gl = np.log(abs_freq / s)
                    else:
                        gl = 0

                    # Add the glue to the list of the current n-gram
                    glue_dict[key] = gl

            # Otherwise if we have a monogram which is not long enough (3 letters at least)
            else:
                # Remove the list related to such n-gram from the glue dictionary
                glue_dict.pop(key)

        # Append the n-th glue dictionary to the list of glue dictionaries
        glue_dicts.append(glue_dict)

    # Return the list of glue dictionaries
    return glue_dicts

Compute the glues for each n-gram.

In [44]:
SCP_glues = compute_glues('SCP')
dice_glues = compute_glues('Dice')
MI_glues = compute_glues('MI')

Compute _tfidfs_ and _probs_ for each document in which they appear.

In [45]:
tfidfs, probs = create_tfidf_and_probs()

Visualize the glues for the 3-grams.

In [46]:
# Print the first 10 items
for key in list(SCP_glues[2].keys())[:10]:
    print(f"SCP: {key} [{SCP_glues[2][key]:.4f}]")
    print(f"Dice: {key} [{dice_glues[2][key]:.4f}]")
    print(f"MI: {key} [{MI_glues[2][key]:.4f}]")
    print()

SCP: Greek Christian scribes [0.0211]
Dice: Greek Christian scribes [0.0412]
MI: Greek Christian scribes [-3.8607]

SCP: Christian scribes played [0.0052]
Dice: Christian scribes played [0.0104]
MI: Christian scribes played [-5.2575]

SCP: scribes played a [0.0002]
Dice: scribes played a [0.0003]
MI: scribes played a [-8.7579]

SCP: played a crucial [0.0026]
Dice: played a crucial [0.0235]
MI: played a crucial [-6.6399]

SCP: a crucial role [0.0003]
Dice: a crucial role [0.0006]
MI: a crucial role [-8.8203]

SCP: crucial role in [0.0002]
Dice: crucial role in [0.0005]
MI: crucial role in [-8.9991]

SCP: role in the [0.0005]
Dice: role in the [0.0027]
MI: role in the [-11.0263]

SCP: in the preservation [0.0000]
Dice: in the preservation [0.0002]
MI: in the preservation [-11.2575]

SCP: the preservation of [0.0001]
Dice: the preservation of [0.0003]
MI: the preservation of [-10.5682]

SCP: preservation of Aristotle [0.0392]
Dice: preservation of Aristotle [0.1429]
MI: preservation of Ar

Visualize the tfidfs dictionary for the 3-grams. Remember, the even indeces are document numbers and the odd ones are the _tfidf_.

In [47]:
print("### TFIDF ###")

# Print the first 10 items
for key in list(tfidfs[2].keys())[:10]:
    # Empty temp list
    formatted_elements = []
    # Format numbers and append them to the temp list
    for i, num in enumerate(tfidfs[2][key]):
        if i % 2 == 0:
            # Even index (integer), print normally
            formatted_elements.append(str(num))
        else:
            # Odd index (double), print with four digits after the decimal point
            formatted_elements.append(f"{num:.4f}")
    # Print the key and the formatted string
    print(f"{key}: {formatted_elements}")
    

### TFIDF ###
Greek Christian scribes: ['0', '0.0078']
Christian scribes played: ['0', '0.0081']
scribes played a: ['0', '0.0052']
played a crucial: ['0', '0.0046', '79', '0.0063']
a crucial role: ['0', '0.0040', '79', '0.0054']
crucial role in: ['0', '0.0043', '79', '0.0059']
role in the: ['0', '0.0017', '19', '0.0066', '36', '0.0126', '55', '0.0039', '73', '0.0051', '132', '0.0029', '133', '0.0050', '138', '0.0057', '147', '0.0021', '154', '0.0024', '164', '0.0044', '212', '0.0136', '223', '0.0367', '247', '0.0137', '256', '0.0051', '272', '0.0112', '318', '0.0070', '323', '0.0242', '357', '0.0189', '360', '0.0112', '391', '0.0133', '460', '0.0388', '530', '0.0419', '643', '0.0201', '709', '0.0103']
in the preservation: ['0', '0.0063']
the preservation of: ['0', '0.0050', '65', '0.0710', '188', '0.0178', '628', '0.0979']
preservation of Aristotle: ['0', '0.0085']


Visualize the probs dictionary for the 3-grams. Remember, the even indeces are document numbers and the odd ones are the _prob_.

In [48]:
print("### PROBS ###")

# Print the first 10 items
for key in list(probs[2].keys())[:10]:
    # Empty temp list
    formatted_elements = []
    # Format numbers and append them to the temp list
    for i, num in enumerate(probs[2][key]):
        if i % 2 == 0:
            # Even index (integer), print normally
            formatted_elements.append(str(num))
        else:
            # Odd index (double), print with four digits after the decimal point
            formatted_elements.append(f"{num:.4f}")
    # Print the key and the formatted string
    print(f"{key}: {formatted_elements}")
    

### PROBS ###
Greek Christian scribes: ['0', '0.0002']
Christian scribes played: ['0', '0.0002']
scribes played a: ['0', '0.0002']
played a crucial: ['0', '0.0002', '79', '0.0002']
a crucial role: ['0', '0.0002', '79', '0.0002']
crucial role in: ['0', '0.0002', '79', '0.0002']
role in the: ['0', '0.0001', '19', '0.0006', '36', '0.0012', '55', '0.0003', '73', '0.0005', '132', '0.0002', '133', '0.0004', '138', '0.0005', '147', '0.0002', '154', '0.0002', '164', '0.0004', '212', '0.0013', '223', '0.0036', '247', '0.0013', '256', '0.0005', '272', '0.0011', '318', '0.0006', '323', '0.0023', '357', '0.0018', '360', '0.0011', '391', '0.0013', '460', '0.0038', '530', '0.0041', '643', '0.0019', '709', '0.0010']
in the preservation: ['0', '0.0002']
the preservation of: ['0', '0.0002', '65', '0.0024', '188', '0.0006', '628', '0.0033']
preservation of Aristotle: ['0', '0.0002']


# **REGULAR EXPRESSIONS**

Compute a list with n dictionaries, each associating to each n-gram all the (n+1)-grams which have one more word on the left/right.

In [49]:
# List of dictionaries, one for each value of n
fathers = []

# Notice n < max_n obviously
for n in range(1, max_n):

    # Get a dictionary with all the keys of the n-th dictionary in dizzo, and empty lists as values
    f = {key: [] for key in dict(dizzo[n - 1]).keys()}

    # For each (n+1)-gram in the (n+1)-th dictionary
    for key, value in dizzo[n].items():
        # Get the (n+1)-gram as list of words
        key_list = str_to_list(key)
        # Get the two n-grams
        subkey1 = list_to_str(key_list[1:])
        subkey2 = list_to_str(key_list[:-1])
        # Add them in the temp dictionary
        f[subkey1].append(key)
        f[subkey2].append(key)
        
    # And finally append the dictionary to the list fathers
    fathers.append(f)

Visualize first ten items for n = 1.

In [50]:
# Print the first 10 items
for key in list(fathers[0].keys())[:10]:
    # For each key print the list
    print(f"{key}: {fathers[0][key]}") 

Greek: ['Greek Christian', 'extant Greek', 'Greek language', 'first Greek', 'Greek Christians', 'the Greek', 'Greek word', 'In Greek', 'Greek practice', 'Greek Ptolemaic', 'in Greek', 'Greek ,', ', Greek', 'Dateable Greek', 'Greek pottery', 'Byzantine Greek', 'Greek manuscripts', 'a Greek', 'Greek manuscript', 'Greek and', 'from Greek', 'Greek into', 'to Greek', 'Greek gods', 'Greek mythology', 'for Greek', 'Greek ethnographer', 'their Greek', 'Greek counterparts', 'Greek City', 'Hearst Greek', 'Greek Theatre', 'delays Greek', 'Greek Recruitment', 'Greek "', '. Greek', 'Greek life', 'Greek People."', 'Greek Island', '( Greek', 'Greek :', 'by Greek', 'Greek letters', 'Greek Î¼Î¿Î½Î¿Î¸ÎµÎ»Î·Ï„Î¹ÏƒÎ¼ÏŒÏ‚', 'ancient Greek', 'Greek art', 'Ancient Greek', 'Greek Septuagint', 'Greek genocide', 'classical Greek', 'Greek historian', 'of Greek', 'Greek rhetoric', 'Greek cuisine', 'Greek literature', 'Greek usage', 'Greek nature', 'The Greek', 'Greek National', 'Greek soldiers', 'young Greek', 'G

Now define the function which returns a dictionary containing the _tfidfs_ and _probs_ lists only for the REs (regardless of n).

In [51]:
# Auxiliary function (for readability) which checks whether the "key_string" n-gram is a RE
def process_keys_2(key_string, glues, tdfids, probs, REglues, REtfidfs, REprobs):
    # Get n as the number of spaces in key_string + 1
    n = key_string.count(' ')
    
    # Get the glue for the n-gram
    glue = glues[n][key_string]

    # Get key_string as list to easily remove the first/last word
    key_list = str_to_list(key_string)

    # Get the set of glues for (n-1)-grams
    omega_n_minus = set()
    # If it's a 2-gram do not check for the glues of 1-grams
    if n > 1:
        omega_n_minus.add(glues[n - 1][list_to_str(key_list[1:])])
        omega_n_minus.add(glues[n - 1][list_to_str(key_list[:-1])])

    # Get the set of glues for (n+1)-grams
    omega_n_plus = set([glues[n + 1][fath] for fath in fathers[n][key_string]])
    # If the glue is bigger than the glue for all the sons and fathers
    if all( ( glue > g ) for g in omega_n_minus.union(omega_n_plus) ):
        # Add the info to the two dictionaries
        REglues[key_string] = glues[n][key_string]
        REtfidfs[key_string] = tdfids[n][key_string]
        REprobs[key_string] = probs[n][key_string]

# Returns two dictionaries, only containing REs as keys
def find_RE(tfidfs, probs, glues):
    # Init the new dictionaries containing only the REs
    REtfidfs = {}
    REprobs = {}
    REglues = {}

    # For each n in [1, max_n)
    for n in range(1, len(glues)-1):
        # For each n-gram with their list of tfidfs
        for key, value in glues[n].items() :
            # Process the n-gram to decide whether it is a RE
            process_keys_2(key, glues, tfidfs, probs, REglues, REtfidfs, REprobs)

    # Return the two dictionaries
    return REtfidfs, REprobs, REglues

Then compute the REsn information using the three glues.

In [52]:
RE_SCP_tfidfs, RE_SCP_probs, RE_SCP_glues = find_RE(tfidfs, probs, SCP_glues)
RE_dice_tfidfs, RE_dice_probs, RE_dice_glues = find_RE(tfidfs, probs, dice_glues)
RE_MI_tfidfs, RE_MI_probs, RE_MI_glues = find_RE(tfidfs, probs, MI_glues)

len(RE_SCP_tfidfs), len(RE_dice_tfidfs), len(RE_MI_tfidfs)

(411954, 433572, 419043)

And visualize the REs with their glues.

In [53]:
# Print the first 10 items

for key in list(RE_SCP_glues.keys())[:10]:
    print(f"SCP: {key} [{RE_SCP_glues[key]:.4f}]")
print()

for key in list(RE_dice_glues.keys())[:10]:
    print(f"Dice: {key} [{RE_dice_glues[key]:.4f}]")
print() 

for key in list(RE_MI_glues.keys())[:10]:    
    print(f"MI: {key} [{RE_MI_glues[key]:.4f}]")
    

SCP: crucial role [0.0012]
SCP: in the [0.0262]
SCP: of the [0.0488]
SCP: . The [0.1247]
SCP: comment extensively [0.0071]
SCP: John Philoponus [0.0060]
SCP: , and [0.0231]
SCP: Philoponus stands [0.0135]
SCP: fundamental critique [0.0333]
SCP: Aristotle's views [0.0192]

Dice: crucial role [0.0194]
Dice: in the [0.1440]
Dice: copying all [0.0027]
Dice: of the [0.2118]
Dice: . The [0.2643]
Dice: The first [0.0348]
Dice: Greek Christians [0.0175]
Dice: comment extensively [0.0741]
Dice: John Philoponus [0.0120]
Dice: , and [0.1433]

MI: crucial role [-7.4337]
MI: comment extensively [-4.9416]
MI: Philoponus stands [-4.3041]
MI: fundamental critique [-3.4012]
MI: Aristotle's views [-3.9512]
MI: eternity of [-10.0512]
MI: Aristotelian thought [-5.9814]
MI: formal commentary [-5.3375]
MI: Ephesus reappears [-2.4849]
MI: late eleventh [-7.3212]


# **FILTERING**

Define a function to delete REs containing special characters.

In [54]:
# Gets a string and returns false if must be deleted
def no_special(key_string):
    for i in range(len(key_string)):
        if (key_string[i] in specialchars):
            return False
    return True

Define a function to delete REs contained in one only document.

In [55]:
# Gets a string and returns false if must be deleted
def more_documents(key_list):
    if (dizzo[len(str_to_list(key_list)) - 1][key_list][0] > 1):
        return True
    else:
        return False

And then filter.

In [57]:
# Remember REs datastructures are now just dictionaries, not lists of dictionaries
RE_SCP_glues_filtered = {}
RE_SCP_tfidfs_filtered = {}
RE_SCP_probs_filtered = {}

RE_dice_glues_filtered = {}
RE_dice_tfidfs_filtered = {}
RE_dice_probs_filtered = {}

RE_MI_glues_filtered = {}
RE_MI_tfidfs_filtered = {}
RE_MI_probs_filtered = {}

# Iterate through REs and filter SCP REs
for key, value in RE_SCP_glues.items():
    if no_special(key) and more_documents(key) :
        RE_SCP_glues_filtered[key] = value
        RE_SCP_tfidfs_filtered[key] = RE_SCP_tfidfs[key]
        RE_SCP_probs_filtered[key] = RE_SCP_probs[key]

# Iterate through REs and filter Dice REs
for key, value in RE_dice_glues.items():
    if no_special(key) and more_documents(key) and no_stop_words(key) :
        RE_dice_glues_filtered[key] = value
        RE_dice_tfidfs_filtered[key] = RE_dice_tfidfs[key]
        RE_dice_probs_filtered[key] = RE_dice_probs[key]

# Iterate through REs and filter MI REs
for key, value in RE_MI_glues.items():
    if no_special(key) and more_documents(key) and no_stop_words(key) :
        RE_MI_glues_filtered[key] = value
        RE_MI_tfidfs_filtered[key] = RE_MI_tfidfs[key]
        RE_MI_probs_filtered[key] = RE_MI_probs[key]

Print the glues for some filtered REs.

In [61]:
# Print 10 items

# Find the intersection of keys
intersection_keys = set(RE_SCP_glues_filtered.keys()).intersection(set(RE_dice_glues_filtered.keys()), set(RE_MI_glues_filtered.keys()))

# Print
for key in list(intersection_keys)[:10]:
    print(f"SCP: {key} [{RE_SCP_glues_filtered[key]:.4f}]")
    print(f"Dice: {key} [{RE_dice_glues_filtered[key]:.4f}]")
    print(f"MI: {key} [{RE_MI_glues_filtered[key]:.4f}]")
    print()

SCP: Tel Aviv [0.5000]
Dice: Tel Aviv [0.6667]
MI: Tel Aviv [-1.3863]

SCP: masterâ€™s programs [0.0107]
Dice: masterâ€™s programs [0.0500]
MI: masterâ€™s programs [-5.2338]

SCP: Khun Sa [0.4000]
Dice: Khun Sa [0.5714]
MI: Khun Sa [-1.6094]

SCP: southern part [0.0033]
Dice: southern part [0.0423]
MI: southern part [-8.2681]

SCP: Puget Sound [0.0952]
Dice: Puget Sound [0.1739]
MI: Puget Sound [-3.0445]

SCP: Courier typeface [0.6667]
Dice: Courier typeface [0.8000]
MI: Courier typeface [-1.0986]

SCP: work hard [0.0002]
Dice: work hard [0.0088]
MI: work hard [-9.3750]

SCP: NiÅ¡ was [0.0001]
Dice: NiÅ¡ was [0.0005]
MI: NiÅ¡ was [-9.6306]

SCP: affectionately referred [0.0068]
Dice: affectionately referred [0.0385]
MI: affectionately referred [-5.6836]

SCP: Santa Rosa [0.0147]
Dice: Santa Rosa [0.0870]
MI: Santa Rosa [-4.9163]



How many REs did we get?

In [58]:
# Print
print(str(len(RE_SCP_glues_filtered)) + ' with SCP')
print(str(len(RE_dice_glues_filtered)) + ' with Dice')
print(str(len(RE_MI_glues_filtered)) + ' with MI')

4548 with SCP
9105 with Dice
3302 with MI


Get the REs with maximum glue.

In [62]:
max(RE_SCP_glues_filtered, key=RE_SCP_glues_filtered.get), max(RE_dice_glues_filtered, key=RE_dice_glues_filtered.get), max(RE_MI_glues_filtered, key=RE_MI_glues_filtered.get)

("CÃ´te d'Ivoire", "CÃ´te d'Ivoire", 'Haugesunds Avis')

# **STOP WORDS**