# Automatic Extraction of Explicit and Implicit Keywords

> 1) Extract Relevant Expressions (REs) from a set of several documents, by using LocalMaxs extractor you have implemented. Create adequate criteria to select the most informative REs.

Read 20 documents and create corpus out of it.

In [31]:
from pathlib import Path
from collections import defaultdict
from local_max_all_metrics import tokenize
import os
CORPUS_PATH = "./corpus2mw"
CORPUS_NAME = 'corpus2mw'

corpus = defaultdict(tuple)
doc_names = []
# open all files starting with "fil" in corpus path
doc_count = 1
for doc_path in Path(CORPUS_PATH).glob('fil_*'):
    # read whole document and strip new lines
    doc = Path(doc_path).read_text().replace('\n', '')
    # standardize space characters
    doc = doc.replace(u'\xa0', u' ').replace(u'\u3000', u' ').replace(u'\u2009', u' ')
    
    doc_name = os.path.basename(doc_path)
    doc_names.append(doc_name)
    corpus[doc_name] = tokenize(doc)
    doc_count = doc_count + 1
    if doc_count == 20: break

corpus_words = [word for doc_list in corpus.values() for word in doc_list]
corpus_size = len(corpus_words)

In [1]:
# until we finish LocalMaxs extractor use mock list of REs
res = ['John F. Kennedy', 'he said', 
       'in the United States of America', 
       'United States of America',
       'fun park']
# still might be useful to have the corpus 
# (maybe we can export creating it from the other notebook)
corpus = []

Filter-Implementation to select most informative REs:

In [21]:
# return only res which appear at least min_freq times in each document
def min_freq(res, corpus, min_freq):
    res_filtered = []
    for res_item in res:
        # count how often res_item appears in corpus
        count = 0
        for doc_tokens in corpus:
            if res_item in doc_tokens:
                count += 1
        # if count is greater than min_freq, add res_item to res_filtered
        if count >= min_freq:
            res_filtered.append(res_item)
    return res_filtered

In [22]:
# compute the tf_idf for each res inside of each document and return the res sorted from highest to lowest
def tf_idf(res, corpus):
    # create a dictionary with the res as keys and empty lists as values
    tf_idf_dict = {res_item: [] for res_item in res}
    # for each document in corpus
    for doc_tokens in corpus:
        # for each res_item in res
        for res_item in res:
            # count how often res_item appears in doc_tokens
            count = 0
            for token in doc_tokens:
                if res_item == token:
                    count += 1
            # if count is greater than 0, add count to tf_idf_dict
            if count > 0:
                tf_idf_dict[res_item].append(count)
    # for each res_item in res
    for res_item in res:
        # compute the tf_idf for res_item
        tf_idf_dict[res_item] = sum(tf_idf_dict[res_item]) / len(corpus)
    # sort tf_idf_dict by value (tf_idf)
    tf_idf_dict_sorted = sorted(tf_idf_dict.items(), key=lambda x: x[1], reverse=True)
    # return the res sorted by tf_idf
    return [res_item[0] for res_item in tf_idf_dict_sorted]
    

Compute relevant expressions for documents.

In [23]:
from local_max_all_metrics import local_max, calculate_n_gram_frequencies, minimum_frequency_filter, special_characters_filter, SPECIAL_CHARACTERS, mi_f

max_re_size = 7

n_gram_freq_dict = calculate_n_gram_frequencies(corpus, max_re_size + 1, corpus_size)

filtered_n_gram_freq_dict = minimum_frequency_filter(n_gram_freq_dict, 2)
filtered_n_gram_freq_dict = special_characters_filter(filtered_n_gram_freq_dict, SPECIAL_CHARACTERS)
relevant_expressions = local_max(corpus, max_re_size, filtered_n_gram_freq_dict, mi_f)

[('party', 'in'), ('doc', 'id'), ('the', 'global'), ('he', 'participated'), ('gwf', 'world'), ('championship', 'tournament'), ('in', 'june'), ('during', 'the'), ('he', 'would'), ('the', 'blue'), ('north', 'american'), ('the', 'opening'), ('feet', 'at'), ('the', 'chief'), ('called', 'for'), ('senate', 'investigation'), ('including', 'the'), ('council', 'of'), ('and', 'the'), ('missions', 'of'), ('for', 'an'), ('members', 'of'), ('from', 'the'), ('in', 'response'), ('all', 'the'), ('in', 'his'), ('minister', 'of'), ('of', 'india'), ('chief', 'minister'), ('the', 'state'), ('participates', 'in'), ('its', 'own'), ('the', 'song'), ('won', 'the'), ('three', 'times'), ('on', 'the'), ('with', 'a'), ('the', 'album'), ('embarked', 'on'), ('the', 'world'), ('for', 'which'), ('in', 'australia'), ('her', 'the'), ('and', 'released'), ('she', 'sang'), ('in', '2006'), ('sunday', 'night'), ('all', 'day'), ('which', 'is'), ('the', 'film'), ('her', 'name'), ('name', 'to'), ('a', 'special'), ('edition', '

In [None]:
#TODO:
# 1. calculate n-gram frequencies for every document
# 2. get relevant expressions for every document
# 3. calculate most relevant

In [3]:
#!python -m spacy download en_core_web_sm
import spacy

In [4]:
x = "John F. Kennedy said in the United States of America that the United States of America is a fun park"

def get_highest_word_similarity(words):
    #create a loop comparing each entry in x with every other entry in x
    similarities = []
    nlp = spacy.load('en_core_web_sm')
    tokens = nlp(words)
    for token1 in range(len(tokens)):
        for token2 in range(token1+1,len(tokens)):
            similarities.append([tokens[token1].similarity(tokens[token2]),tokens[token1],tokens[token2]])
    #return the similarities sorted by highest to lowest
    return sorted(similarities, key=lambda x: x[0], reverse=True)

    

In [5]:
print(get_highest_word_similarity(x))

[[1.0, the, the], [1.0, United, United], [1.0, States, States], [1.0, of, of], [1.0, America, America], [0.7141181826591492, F., United], [0.7137698531150818, F., United], [0.6824235916137695, John, F.], [0.6668247580528259, Kennedy, America], [0.6625233888626099, Kennedy, States], [0.653134286403656, Kennedy, States], [0.6440538167953491, in, of], [0.6406058669090271, John, United], [0.6386597156524658, in, of], [0.6230368614196777, John, United], [0.5909967422485352, States, America], [0.5669664144515991, States, America], [0.5455108880996704, States, America], [0.5390477776527405, America, States], [0.5074913501739502, the, a], [0.4755392372608185, United, States], [0.4671423137187958, Kennedy, America], [0.46241238713264465, States, United], [0.4611896872520447, United, States], [0.4545450806617737, F., Kennedy], [0.44691920280456543, United, States], [0.42313408851623535, the, a], [0.4203069508075714, United, fun], [0.420208215713501, Kennedy, United], [0.4031030237674713, Kennedy

  similarities.append([tokens[token1].similarity(tokens[token2]),tokens[token1],tokens[token2]])
