# Automatic Extraction of Explicit and Implicit Keywords

> 1) Extract Relevant Expressions (REs) from a set of several documents, by using LocalMaxs extractor you have implemented. Create adequate criteria to select the most informative REs.

Read 20 documents and create corpus out of it.

In [6]:
from pathlib import Path
from collections import defaultdict
from local_max_all_metrics import tokenize, local_max, calculate_n_gram_frequencies, minimum_frequency_filter, special_characters_filter, SPECIAL_CHARACTERS, mi_f
import os
import math
from copy import copy

max_re_size = 7

In [10]:
CORPUS_PATH = "./corpus2mw"
CORPUS_NAME = 'corpus2mw'

corpus_total = defaultdict(tuple)
n_gram_freq_dict_per_doc = defaultdict(tuple)
doc_names = []
# open all files starting with "fil" in corpus path
doc_count = 1
for doc_path in Path(CORPUS_PATH).glob('fil_*'):
    corpus_per_doc = defaultdict(tuple)
    # read whole document and strip new lines
    doc = Path(doc_path).read_text().replace('\n', '')
    # standardize space characters
    doc = doc.replace(u'\xa0', u' ').replace(u'\u3000', u' ').replace(u'\u2009', u' ')
    
    doc_name = os.path.basename(doc_path)
    doc_names.append(doc_name)
    corpus_total[doc_name] = tokenize(doc)
    corpus_per_doc[doc_name] = tokenize(doc)
    n_gram_freq_dict_doc = calculate_n_gram_frequencies(corpus_per_doc, max_re_size + 1, len(corpus_total[doc_name]))
    filtered_n_gram_freq_dict_doc = minimum_frequency_filter(n_gram_freq_dict_doc, 2)
    filtered_n_gram_freq_dict_doc = special_characters_filter(filtered_n_gram_freq_dict_doc, SPECIAL_CHARACTERS)
    n_gram_freq_dict_per_doc[doc_name] = filtered_n_gram_freq_dict_doc

    doc_count = doc_count + 1
    if doc_count == 20: break

corpus_words = [word for doc_list in corpus_total.values() for word in doc_list]
corpus_total_size = len(corpus_words)
n_gram_freq_dict_per_doc.keys()

dict_keys(['fil_2269', 'fil_191', 'fil_131', 'fil_3051', 'fil_688', 'fil_775', 'fil_2439', 'fil_1238', 'fil_919', 'fil_4561', 'fil_2036', 'fil_650', 'fil_3278', 'fil_569', 'fil_4568', 'fil_3289', 'fil_36', 'fil_4007', 'fil_1977'])

In [12]:
len(corpus_total['fil_2269'])

370

Filter-Implementation to select most informative REs:

In [13]:
# return only res which appear at least min_freq times in each document
def min_freq(re_per_doc, min_freq):
    best_re_per_doc = defaultdict(tuple)
    for doc_name in re_per_doc.keys():
        frequent_re = list(filter(lambda x:x['abs_freq'] >= min_freq, re_per_doc[doc_name]))[:10]
        best_re_per_doc[doc_name] = frequent_re
    return best_re_per_doc

In [14]:
# compute the tf_idf for each res inside of each document and return the res sorted from highest to lowest

def tf_idf_val(re_abs_freq, document_size, amount_of_docs, amount_of_docs_with_re):
    return (re_abs_freq / document_size) * math.log(amount_of_docs / amount_of_docs_with_re)

def get_amount_of_docs_with_re(re, re_per_doc):
    re_tup = re['re']
    count = 0
    for doc_name in re_per_doc.keys():
        re_tups = list(map(lambda x:x['re'], re_per_doc[doc_name]))
        if re_tup in re_tups: count += 1
    return count
    

def tf_idf(re_per_doc, doc_amount):
    re_per_doc_with_tfidf_val = copy(re_per_doc)
    best_re_per_doc = defaultdict(tuple)
    for doc_name in re_per_doc.keys():
        re_obs_with_tfidf_val = []
        for re_ob in re_per_doc[doc_name]:
            #calculate tf_idf value
            re_abs_freq = re_ob['abs_freq']
            document_size = len(corpus_total[doc_name])
            amount_of_docs_with_re = get_amount_of_docs_with_re(re_ob, re_per_doc)
            tf_idf_of_re = tf_idf_val(re_abs_freq, document_size, doc_amount, amount_of_docs_with_re)
            # update object(s)
            re_ob_with_tfidf_val = copy(re_ob)
            re_ob_with_tfidf_val['tf_idf'] = tf_idf_of_re
            re_obs_with_tfidf_val.append(re_ob_with_tfidf_val)
        re_per_doc_with_tfidf_val[doc_name] = re_obs_with_tfidf_val
    
    # sorting
    for doc_name in re_per_doc_with_tfidf_val.keys():
        re_per_doc_with_tfidf_val[doc_name] = sorted(re_per_doc_with_tfidf_val[doc_name], key=lambda x: x['tf_idf'], reverse=True)[:10]
    return re_per_doc_with_tfidf_val
    

Compute relevant expressions for documents.

In [17]:
n_gram_freq_dict = calculate_n_gram_frequencies(corpus_total, max_re_size + 1, corpus_total_size)

filtered_n_gram_freq_dict = minimum_frequency_filter(n_gram_freq_dict, 2)
filtered_n_gram_freq_dict = special_characters_filter(filtered_n_gram_freq_dict, SPECIAL_CHARACTERS)
relevant_expressions = local_max(corpus_total, max_re_size, filtered_n_gram_freq_dict, mi_f)

In [18]:
#TODO:
# calculate n-gram frequencies for every document
re_per_doc = defaultdict(tuple)
for doc_name in doc_names:
    re_per_doc[doc_name] = []
    for re_len in range(0, max_re_size):
        n_grams_with_specific_size_of_doc = n_gram_freq_dict_per_doc[doc_name][re_len+1].keys()
        # which relevant expressions extracted over the corpus are in our current document?
        re_with_specific_size_of_doc = [re for re in relevant_expressions if re in n_grams_with_specific_size_of_doc]
        for re in re_with_specific_size_of_doc:
            re_per_doc[doc_name].append({'re': re, 'abs_freq': n_gram_freq_dict_per_doc[doc_name][re_len+1][re]['abs_freq']})

In [19]:
# calculate most relevant relevant expressions
best_re_per_doc_minfreq = min_freq(re_per_doc,3)
best_re_per_doc_tfidf = tf_idf(re_per_doc, 20)
print(best_re_per_doc_tfidf)
print(best_re_per_doc_minfreq)

NameError: name 'corpus' is not defined

In [None]:
#!python -m spacy download en_core_web_sm
import spacy

In [None]:
x = "John F. Kennedy said in the United States of America that the United States of America is a fun park"

def get_highest_word_similarity(words):
    #create a loop comparing each entry in x with every other entry in x
    similarities = []
    nlp = spacy.load('en_core_web_sm')
    tokens = nlp(words)
    for token1 in range(len(tokens)):
        for token2 in range(token1+1,len(tokens)):
            similarities.append([tokens[token1].similarity(tokens[token2]),tokens[token1],tokens[token2]])
    #return the similarities sorted by highest to lowest
    return sorted(similarities, key=lambda x: x[0], reverse=True)

    

In [None]:
print(get_highest_word_similarity(x))