# **Initialization**

Import libraries.

In [68]:
# Library for reading and writing data to and from files
import os
# Library for numerical computing
import numpy as np

Define paths.

In [69]:
# Dataset directory: relative path of the folder that contains the corpus documents
corpus_directory='../Dataset/corpus2mw'

To iterate the reading procedure, get the names of the documents in the dataset.

In [70]:
# Get a list with the names of the documents
# NOTE: os.listdir returns the entries in arbitrary order, so the position of a
# document in this list is not guaranteed to be the same on every run/machine
texts_names = os.listdir(corpus_directory)

# Number of entries found in the dataset directory
len(texts_names)

3170

# **Get tokens**

First, define the special characters to be separated from each word.

In [71]:
# Characters that are split off from the start/end of a word as separate tokens.
# Every entry must be exactly one character: token() compares the entries against
# a single character (w[0] / w[-1]), so a longer string such as '$ ' would never match
specialchars = [';', ':', '!', '?', ')', '(', ']', '[', ',', '.', '"', '%', '$', '=', '}', '{', '-']
# Alternative list which also separates '<', '>' and '&':
# specialchars = [';', ':', '!', '?', '<', '>', '&', ')', '(', ']', '[', ',', '.', '"', '%', '$', '=', '}', '{', '-']

And define the function which separates the special characters from each word, assuming they can only be before or after each word.

In [72]:
def token(w):
    """Split a word into tokens, detaching one leading/trailing special character.

    At most one character is detached from the start and one from the end of
    the word, and only if it is listed in the module-level ``specialchars``.
    Special characters in the middle of the word are left untouched.

    Args:
        w: The word to tokenize (a string without whitespace).

    Returns:
        The list of tokens, in the order in which they appear in ``w``.
        An empty word yields an empty list.
    """
    # Nothing to tokenize: avoid indexing into an empty string
    if not w:
        return []

    # A single character is a token on its own, whatever it is
    if len(w) == 1:
        return [w]

    res = []

    # Detach a leading special character, if any
    if w[0] in specialchars:
        res.append(w[0])
        w = w[1:]

    # Detach a trailing special character, but only if something is left
    # in front of it (a lone remaining character is kept as it is)
    if len(w) > 1 and w[-1] in specialchars:
        res.append(w[:-1])
        res.append(w[-1])
    else:
        res.append(w)

    return res

For each document, for each word (in each line), if either its first or its last character is a special character, split the word into multiple tokens.

In [73]:
# The corpus: one list of tokens per document, in the same order as texts_names
corpus = []

for text in texts_names:

    # Tokens collected for the current document
    words = []

    # Read the document; bytes that cannot be decoded are dropped (errors='ignore').
    # No encoding is given, so the platform default is used
    with open(corpus_directory + '/' + text, 'r', errors='ignore') as file:

        for line in file:
            # Words are the whitespace-separated chunks of the line
            for word in line.split():
                # Each word may expand into several tokens: add them all
                words.extend(token(word))

    # Store the tokens of the document in the corpus
    corpus.append(words)

This way, the corpus is a list of documents, which are lists of tokens.

In [74]:
# Visualize first 10 tokens in the second document corpus[1]
print(corpus[1][:10])

['Koalas', 'have', 'few', 'predators', ';', 'dingos', 'and', 'large', 'pythons', 'may']


Define a function to join a list of words into a single string, with the words separated by a space.

In [75]:
def list_to_str(strings):
    """Join a list of strings into one string, separated by single spaces.

    Args:
        strings: The list of strings (e.g. the words of an n-gram).

    Returns:
        The strings concatenated in order with ' ' between them;
        an empty list gives the empty string.
    """
    # str.join builds the result in one pass, without the trailing
    # separator that a manual concatenation loop would have to strip
    return ' '.join(strings)

Define a function to do the opposite.

In [76]:
def str_to_list(s):
    """Split a string into the list of its whitespace-separated words.

    Args:
        s: The string to split (e.g. an n-gram with space-separated words).

    Returns:
        A new list with the words in order. Runs of whitespace count as one
        separator and leading/trailing whitespace is ignored, so an empty or
        blank string gives an empty list.
    """
    # split() without arguments already returns a fresh list of the words
    return s.split()

Declare a function which returns a list of n dictionaries, one for each n up to a fixed max, which contain the information about the frequencies for each n-gram.

In [77]:
def create_list_of_dict_global(max):
    """Count every n-gram of the corpus, for each n in [1, max].

    The module-level ``corpus`` (a list of documents, each a list of tokens)
    is scanned once. An n-gram is identified by its tokens joined with a
    single space (see ``list_to_str``).

    Args:
        max: The largest n-gram size to collect. (The name shadows the
            builtin ``max`` inside this function; it is kept as it is so
            that existing callers are not affected.)

    Returns:
        A list of ``max`` dictionaries; the dictionary at index ``n - 1``
        maps each n-gram to a flat list
        ``[total, doc, count, doc, count, ...]`` where ``total`` is the
        number of occurrences in the whole corpus and each following
        ``doc, count`` pair gives the index of a document containing the
        n-gram and its number of occurrences in that document
        (e.g. ``[25, 1, 20, 2, 1, 3, 4]``). Documents appear in increasing
        index order.
    """
    # One empty dictionary for each n in [1, max]
    list_dict = [{} for _ in range(max)]

    # For each document of the corpus
    for doc_idx, doc in enumerate(corpus):
        doc_len = len(doc)

        # For each position at which an n-gram can start
        for start in range(doc_len):
            # Largest n for which an n-gram starting here still fits in the document
            longest = min(len(list_dict), doc_len - start)

            for n in range(1, longest + 1):
                # Build the key only once per n-gram occurrence
                ngram = list_to_str(doc[start:start + n])
                ngram_dict = list_dict[n - 1]
                entry = ngram_dict.get(ngram)

                if entry is None:
                    # First occurrence ever: total 1, found once in this document
                    ngram_dict[ngram] = [1, doc_idx, 1]
                else:
                    # One more occurrence in the corpus
                    entry[0] += 1
                    # The last pair of the list always refers to the most
                    # recent document in which the n-gram was found
                    if entry[-2] == doc_idx:
                        # Same document: increment its count
                        entry[-1] += 1
                    else:
                        # New document: open a new (doc, count) pair
                        entry.append(doc_idx)
                        entry.append(1)

    return list_dict

Define a function to find all the indices at which a given n-gram occurs in a given document.

In [78]:
def find_indices_ngram_doc(ngram_string, docnum):
    """Find every position at which an n-gram occurs in a document.

    Args:
        ngram_string: The n-gram, as a string of words separated by spaces.
        docnum: The index of the document in the module-level ``corpus``.

    Returns:
        The list, in increasing order, of the token indices of the document
        at which the n-gram starts (overlapping occurrences included).
        An empty or blank n-gram has no occurrences and gives an empty list.
    """
    # An n-gram without words cannot be matched (and has no first word to compare)
    if not ngram_string.strip():
        return []

    # Get the n-gram as a list of words
    ngram_list = str_to_list(ngram_string)
    size = len(ngram_list)

    # Get the document as a list of tokens
    doc = corpus[docnum]

    # Compare the n-gram with each window of the same size, up to the last
    # position at which a whole n-gram still fits in the document
    return [
        i
        for i in range(len(doc) - size + 1)
        if doc[i:i + size] == ngram_list
    ]

Pick a max value for n.

In [79]:
max_n = 7

Create the list of n-gram dictionaries with the function `create_list_of_dict_global` defined above.

In [80]:
dizzo = create_list_of_dict_global(max_n)

KeyboardInterrupt: 

Remember that the 7-grams will be in the dictionary in position [6].

In [None]:
# Set parameters
num_to_print = 5

# Print the first num_to_print 7-grams (dictionary in position [6]) with their
# frequency lists, stopping early instead of copying all the keys into a list
for printed, (ngram, freqs) in enumerate(dizzo[6].items()):
    if printed == num_to_print:
        break
    print(ngram, freqs)

Greek Christian scribes played a crucial role [1, 0, 1]
Christian scribes played a crucial role in [1, 0, 1]
scribes played a crucial role in the [1, 0, 1]
played a crucial role in the preservation [1, 0, 1]
a crucial role in the preservation of [1, 0, 1]


In [None]:
# Set parameters
num_to_print = 5
min_freq = 10

# Print up to num_to_print key-value couples of the dictionary of the longest
# n-grams, dizzo[max_n-1], whose absolute frequency is at least min_freq
printed = 0
for key, value in dizzo[max_n - 1].items():
    # Stop before printing, so that num_to_print = 0 prints nothing
    if printed >= num_to_print:
        break
    # value[0] is the absolute frequency of the n-gram
    if value[0] >= min_freq:
        print(key, value)
        printed += 1

. The median income for a household [30, 2, 1, 369, 3, 2216, 1, 2326, 1, 2803, 4, 2832, 1, 2846, 1, 2857, 1, 2874, 1, 2885, 1, 2896, 3, 2920, 1, 2953, 1, 2962, 1, 2979, 2, 3025, 2, 3049, 2, 3074, 1, 3081, 1, 3111, 1]
The median income for a household in [34, 2, 1, 369, 3, 2216, 1, 2326, 1, 2803, 4, 2832, 1, 2846, 1, 2857, 2, 2874, 1, 2885, 1, 2896, 3, 2920, 1, 2936, 1, 2953, 1, 2962, 1, 2979, 2, 3025, 3, 3049, 3, 3074, 1, 3081, 1, 3111, 1]
median income for a household in the [36, 2, 1, 369, 3, 2216, 2, 2326, 1, 2803, 4, 2832, 1, 2846, 1, 2857, 2, 2874, 1, 2885, 1, 2896, 3, 2920, 1, 2936, 1, 2953, 1, 2962, 1, 2979, 2, 2986, 1, 3025, 3, 3049, 3, 3074, 1, 3081, 1, 3111, 1]
, and the median income for a [35, 2, 1, 369, 3, 2216, 1, 2326, 1, 2803, 4, 2832, 1, 2846, 1, 2857, 2, 2874, 1, 2885, 1, 2896, 3, 2920, 1, 2936, 1, 2953, 1, 2962, 1, 2979, 2, 2986, 1, 3025, 3, 3049, 3, 3074, 1, 3081, 1, 3111, 1]
and the median income for a family [36, 2, 1, 369, 3, 2216, 2, 2326, 1, 2803, 4, 2832, 1, 2

# **EXPLORING GLUES**

In [None]:
# Glue recipes: for each supported glue name, a pair of functions
#   combine(f1, f2)  -> contribution of one split  w1..wi | wi+1..wn  of the n-gram,
#                       given the absolute frequencies of its two parts
#   score(freq, avg) -> glue of the n-gram, given its absolute frequency and the
#                       average of the contributions over all the n-1 splits
_GLUE_RECIPES = {
    'Dice': (lambda f1, f2: f1 + f2, lambda freq, avg: (2 * freq) / avg),
    'SCP': (lambda f1, f2: f1 * f2, lambda freq, avg: (freq ** 2) / avg),
    'MI': (lambda f1, f2: f1 * f2, lambda freq, avg: np.log(freq / avg)),
}


def _ngram_doc_scores(key, value):
    """Return the flat (doc, tf-idf) and (doc, centred probability) lists of one n-gram."""
    # value is [abs_freq, doc, count, doc, count, ...]
    doc_ids = value[1::2]
    counts = value[2::2]
    num_non_zero_doc = len(doc_ids)

    # These two factors do not depend on the document: compute them once
    mean_word_len = np.mean([len(w) for w in str_to_list(key)])
    idf = np.log(len(corpus) / num_non_zero_doc) if num_non_zero_doc else 0.0

    tfidf_mod = []
    probs = []
    for doc, count in zip(doc_ids, counts):
        doc_len = len(corpus[doc])
        # Document index followed by its modified tf-idf
        tfidf_mod.append(doc)
        tfidf_mod.append(mean_word_len * count * idf / doc_len)
        # Document index followed by the probability of the n-gram in it
        probs.append(doc)
        probs.append(count / doc_len)

    # Average probability over the whole corpus (the documents which do not
    # contain the n-gram contribute zero), subtracted from each probability
    medprob = sum(probs[1::2]) / len(corpus)
    probs[1::2] = [p - medprob for p in probs[1::2]]

    return tfidf_mod, probs


def _ngram_glue(d, n, words, abs_freq, combine, score):
    """Return the glue of the n-gram made of ``words`` (n >= 2)."""
    avg = 0
    # Divide the n-gram into two parts w1...wi and wi+1...wn, in every possible way
    for i in range(len(words) - 1):
        # Absolute frequencies of the two parts, looked up by their words
        f1 = d[i][list_to_str(words[:i + 1])][0]
        f2 = d[n - i - 2][list_to_str(words[i + 1:])][0]
        avg += combine(f1, f2) / (n - 1)
    return score(abs_freq, avg)


def create_glue_and_tfidfmod_and_probs(d, gluename):
    """Compute glue, modified tf-idf and centred probabilities of every n-gram.

    Uses the module-level ``corpus`` for the number and the lengths of the
    documents.

    Args:
        d: List of dictionaries as returned by ``create_list_of_dict_global``:
            ``d[n - 1]`` maps each n-gram to
            ``[abs_freq, doc, count, doc, count, ...]``.
        gluename: ``'Dice'``, ``'SCP'`` or ``'MI'``. With f the absolute
            frequency of the n-gram and the average taken over its n-1 splits
            into w1..wi and wi+1..wn:
            Dice = 2*f / avg(f1 + f2), SCP = f**2 / avg(f1 * f2),
            MI = log(f / avg(f1 * f2)).

    Returns:
        A tuple ``(list_of_tfidf_dicts, list_of_probs_dicts)``. Both lists hold
        one dictionary for each n in [1, len(d)], for every glue. For n >= 2
        each n-gram maps to ``[glue, doc, x, doc, x, ...]``; for n == 1 the
        glue is absent and the list is ``[doc, x, doc, x, ...]``. ``x`` is the
        modified tf-idf (mean word length * count * log(number of documents /
        documents containing the n-gram) / document length) in the first list,
        and the probability count / document length minus its corpus average
        in the second. Unigrams shorter than 3 characters are left out.
        An unknown ``gluename`` gives two empty lists.
    """
    list_of_tfidf_dicts = []
    list_of_probs_dicts = []

    recipe = _GLUE_RECIPES.get(gluename)
    if recipe is None:
        # Unknown glue: nothing is computed
        return list_of_tfidf_dicts, list_of_probs_dicts
    combine, score = recipe

    # For each n in [1, len(d)]
    for n in range(1, len(d) + 1):
        tfidf_dict = {}
        probs_dict = {}

        # For each couple n-gram / list of frequencies
        for key, value in d[n - 1].items():
            # Skip the unigrams which are not long enough (3 characters at least)
            if n == 1 and len(key) < 3:
                continue

            tfidf_mod, probs = _ngram_doc_scores(key, value)

            if n == 1:
                # No glue for a single word
                tfidf_dict[key] = tfidf_mod
                probs_dict[key] = probs
            else:
                # The glue is the first value of both lists
                glue = _ngram_glue(d, n, str_to_list(key), value[0], combine, score)
                tfidf_dict[key] = [glue] + tfidf_mod
                probs_dict[key] = [glue] + probs

        list_of_tfidf_dicts.append(tfidf_dict)
        list_of_probs_dicts.append(probs_dict)

    return list_of_tfidf_dicts, list_of_probs_dicts