# Automatic Extraction of Explicit and Implicit Keywords

> 1) Extract Relevant Expressions (REs) from a set of several documents, by using LocalMaxs extractor you have implemented. Create adequate criteria to select the most informative REs.

Read 20 documents and create corpus out of it.

In [1]:
from pathlib import Path
from collections import defaultdict
from local_max_all_metrics import stop_words_filter, tokenize, local_max, calculate_n_gram_frequencies, minimum_frequency_filter, special_characters_filter, SPECIAL_CHARACTERS, mi_f, scp_f
import os
import math
from copy import copy

max_re_size = 7

In [2]:
CORPUS_PATH = "./corpus2mw"
CORPUS_NAME = 'corpus2mw'

corpus_total = defaultdict(tuple)
n_gram_freq_dict_per_doc = defaultdict(tuple)
doc_names = []
# open all files starting with "fil" in corpus path
doc_count = 1
for doc_path in Path(CORPUS_PATH).glob('fil_*'):
    corpus_per_doc = defaultdict(tuple)
    # read whole document and strip new lines
    doc = Path(doc_path).read_text().replace('\n', '')
    # standardize space characters
    doc = doc.replace(u'\xa0', u' ').replace(u'\u3000', u' ').replace(u'\u2009', u' ')
    
    doc_name = os.path.basename(doc_path)
    doc_names.append(doc_name)
    corpus_total[doc_name] = tokenize(doc)
    corpus_per_doc[doc_name] = tokenize(doc)
    n_gram_freq_dict_doc = calculate_n_gram_frequencies(corpus_per_doc, max_re_size + 1, len(corpus_total[doc_name]))
    #filtered_n_gram_freq_dict_doc = minimum_frequency_filter(n_gram_freq_dict_doc, 2)
    #filtered_n_gram_freq_dict_doc = special_characters_filter(filtered_n_gram_freq_dict_doc, SPECIAL_CHARACTERS)
    n_gram_freq_dict_per_doc[doc_name] = n_gram_freq_dict_doc

    doc_count = doc_count + 1
    if doc_count == 20: break

corpus_words = [word for doc_list in corpus_total.values() for word in doc_list]
corpus_total_size = len(corpus_words)
n_gram_freq_dict_per_doc.keys()

dict_keys(['fil_2269', 'fil_191', 'fil_131', 'fil_3051', 'fil_688', 'fil_775', 'fil_2439', 'fil_1238', 'fil_919', 'fil_4561', 'fil_2036', 'fil_650', 'fil_3278', 'fil_569', 'fil_4568', 'fil_3289', 'fil_36', 'fil_4007', 'fil_1977'])

In [3]:
len(corpus_total['fil_2269'])

370

Filter-Implementation to select most informative REs:

In [4]:
# return only res which appear at least min_freq times in each document
def min_freq(re_per_doc, min_freq):
    best_re_per_doc = defaultdict(tuple)
    for doc_name in re_per_doc.keys():
        frequent_re = list(filter(lambda x:x['abs_freq'] >= min_freq, re_per_doc[doc_name]))[:10]
        best_re_per_doc[doc_name] = frequent_re
    return best_re_per_doc

In [5]:
# compute the tf_idf for each res inside of each document and return the res sorted from highest to lowest

def tf_idf_val(re_abs_freq, document_size, amount_of_docs, amount_of_docs_with_re):
    return (re_abs_freq / document_size) * math.log(amount_of_docs / amount_of_docs_with_re)

def get_amount_of_docs_with_re(re, re_per_doc):
    re_tup = re['re']
    count = 0
    for doc_name in re_per_doc.keys():
        re_tups = list(map(lambda x:x['re'], re_per_doc[doc_name]))
        if re_tup in re_tups: count += 1
    return count
    

def tf_idf(re_per_doc, doc_amount):
    re_per_doc_with_tfidf_val = copy(re_per_doc)
    best_re_per_doc = defaultdict(tuple)
    for doc_name in re_per_doc.keys():
        re_obs_with_tfidf_val = []
        for re_ob in re_per_doc[doc_name]:
            #calculate tf_idf value
            re_abs_freq = re_ob['abs_freq']
            document_size = len(corpus_total[doc_name])
            amount_of_docs_with_re = get_amount_of_docs_with_re(re_ob, re_per_doc)
            tf_idf_of_re = tf_idf_val(re_abs_freq, document_size, doc_amount, amount_of_docs_with_re)
            # update object(s)
            re_ob_with_tfidf_val = copy(re_ob)
            re_ob_with_tfidf_val['tf_idf'] = tf_idf_of_re
            re_obs_with_tfidf_val.append(re_ob_with_tfidf_val)
        re_per_doc_with_tfidf_val[doc_name] = re_obs_with_tfidf_val
    
    # sorting
    for doc_name in re_per_doc_with_tfidf_val.keys():
        re_per_doc_with_tfidf_val[doc_name] = sorted(re_per_doc_with_tfidf_val[doc_name], key=lambda x: x['tf_idf'], reverse=True)[:10]
    return re_per_doc_with_tfidf_val
    

Compute relevant expressions for documents.

In [14]:
n_gram_freq_dict = calculate_n_gram_frequencies(corpus_total, max_re_size + 1, corpus_total_size)

#filtered_n_gram_freq_dict = minimum_frequency_filter(n_gram_freq_dict, 2)
filtered_n_gram_freq_dict = special_characters_filter(n_gram_freq_dict, SPECIAL_CHARACTERS)
relevant_expressions = local_max(corpus_total, max_re_size, filtered_n_gram_freq_dict, scp_f)
stop_words, relevant_expressions = stop_words_filter(corpus_total,relevant_expressions)

print(relevant_expressions)

[('badil', 'al'), ('civilizational', 'alternative'), ('doc', 'id'), ('yves', 'larock'), ('defeating', 'matt'), ('king', 'cobra'), ('wrestling', 'federation'), ('championship', 'tournament'), ('blue', 'avenger'), ('johnny', 'ace'), ('opening', 'rounds'), ('physically', 'tall'), ('813', 'feet'), ('540', 'khz'), ('612', 'feet'), ('1600', 'khz'), ('generally', 'restricted'), ('1400', 'khz'), ('1500', 'khz'), ('1530', 'khz'), ('1935', 'senator'), ('republican', 'specialist'), ('foreign', 'policy'), ('senate', 'investigation'), ('protestant', 'organizations'), ('episcopal', 'church'), ('foreign', 'missions'), ('methodist', 'church'), ('investigation', 'signed'), ('250', 'members'), ('thee', 'knights'), ('attacking', 'roosevelt'), ('turning', 'away'), ('calles', 'hard'), ('backstage', 'efforts'), ('roosevelt', 'easily'), ('catholic', 'strongholds'), ('1936', 'landslide'), ('twelfth', 'prime'), ('further', 'reading'), ('dansk', 'melodi'), ('prix', 'competition'), ('eurovision', 'contest'), ('j

In [15]:
# calculate n-gram frequencies for every document
re_per_doc = defaultdict(tuple)
for doc_name in doc_names:
    re_per_doc[doc_name] = []
    for re_len in range(0, max_re_size):
        n_grams_with_specific_size_of_doc = n_gram_freq_dict_per_doc[doc_name][re_len+1].keys()
        # which relevant expressions extracted over the corpus are in our current document?
        #print(n_grams_with_specific_size_of_doc)
        re_with_specific_size_of_doc = [re for re in relevant_expressions if re in n_grams_with_specific_size_of_doc]
        for re in re_with_specific_size_of_doc:
            re_per_doc[doc_name].append({'re': re, 'abs_freq': n_gram_freq_dict_per_doc[doc_name][re_len+1][re]['abs_freq']})

In [16]:
#calculate most relevant relevant expressions, choose metric
#best_re_per_doc = min_freq(re_per_doc,3)
best_re_per_doc = tf_idf(re_per_doc, 20)

In [17]:
best_re_per_doc

defaultdict(tuple,
            {'fil_2269': [{'re': ('championship', 'tournament'),
               'abs_freq': 3.0,
               'tf_idf': 0.02428972113692425},
              {'re': ('senate', 'investigation'),
               'abs_freq': 2.0,
               'tf_idf': 0.016193147424616167},
              {'re': ('badil', 'al'),
               'abs_freq': 1.0,
               'tf_idf': 0.008096573712308083},
              {'re': ('civilizational', 'alternative'),
               'abs_freq': 1.0,
               'tf_idf': 0.008096573712308083},
              {'re': ('yves', 'larock'),
               'abs_freq': 1.0,
               'tf_idf': 0.008096573712308083},
              {'re': ('defeating', 'matt'),
               'abs_freq': 1.0,
               'tf_idf': 0.008096573712308083},
              {'re': ('king', 'cobra'),
               'abs_freq': 1.0,
               'tf_idf': 0.008096573712308083},
              {'re': ('wrestling', 'federation'),
               'abs_freq': 1.0,
      

In [18]:
def get_absolute_frequency_from_dict(re_per_doc, re, doc_name):
    res_list = list(filter(lambda x:x['re'] == re, re_per_doc[doc_name]))
    if len(res_list) == 0: return 0
    else: return res_list[0]['abs_freq']

def calculate_average_over_documents(re, re_per_doc):
    rel_freq_in_current_doc_sum = 0
    for doc_name in re_per_doc.keys():
        rel_freq_in_current_doc = get_absolute_frequency_from_dict(re_per_doc, re, doc_name) / len(corpus_total[doc_name])
        rel_freq_in_current_doc_sum += rel_freq_in_current_doc

    return (1 / len(re_per_doc.keys())) * rel_freq_in_current_doc_sum

def corr(re1, re2, re_per_doc):
    return cov(re1, re2, re_per_doc) / (math.sqrt(cov(re1, re1, re_per_doc)) * math.sqrt(cov(re2, re2, re_per_doc)))

def cov(re1, re2, re_per_doc):
    sum_probs = 0
    rel_freq_in_all_docs_re1 = calculate_average_over_documents(re1, re_per_doc)
    rel_freq_in_all_docs_re2 = calculate_average_over_documents(re2, re_per_doc)
    for doc_name in re_per_doc.keys():
        rel_freq_in_current_doc_re1 = get_absolute_frequency_from_dict(re_per_doc, re1, doc_name) / len(corpus_total[doc_name])
        rel_freq_in_current_doc_re2 = get_absolute_frequency_from_dict(re_per_doc, re2, doc_name) / len(corpus_total[doc_name])
        sum_probs += ((rel_freq_in_current_doc_re1 - rel_freq_in_all_docs_re1) * (rel_freq_in_current_doc_re2 - rel_freq_in_all_docs_re2))
    return (1 / (len(re_per_doc.keys()) - 1)) * sum_probs

for doc_name in best_re_per_doc.keys():
    print(doc_name)
    re_tups_of_doc = list(map(lambda x:x['re'], best_re_per_doc[doc_name]))
    for relevant_expression_pos in range(0, len(best_re_per_doc[doc_name])):
        implicit_keywords = []
        re_doc = best_re_per_doc[doc_name][relevant_expression_pos]['re']
        for re_corpus in relevant_expressions:
            if re_corpus in re_tups_of_doc: continue
            correlation = corr(re_doc, re_corpus, re_per_doc)
            implicit_keywords.append({'re': re_corpus, 'corr': correlation})
        implicit_keywords_sorted = sorted(implicit_keywords, key=lambda x: x['corr'], reverse=True)[:10]
        best_re_per_doc[doc_name][relevant_expression_pos]['implicit'] = implicit_keywords_sorted

fil_2269


KeyboardInterrupt: 

In [None]:
best_re_per_doc['fil_131']