# Keywords Generator
Inspiration for using Fasttext for grouping simmilar concepts and candidate generation: 
    https://aclweb.org/anthology/N18-2100

First the train set is procressed to make assumptions. The actual evaluation will be done on text set at the end of this notebook.
        
##Data
The Semeval 2010 is the dataset. It is available in the folder data. For evaluation, please unpack it to any location and make sure the correct locations are set in the cell below

In [None]:
TRAINSET_LOCATION = "/tmp/keyword_gen/SemEval2010/train/"
TESTSET_LOCATION = "/tmp/keyword_gen/SemEval2010/test/"

In [2]:
from gensim.test.utils import datapath
from gensim.models import FastText

cap_path = datapath("/tmp/cc.en.300.bin")
fb_partial = FastText.load_fasttext_format(cap_path, full_model=False)


In [4]:
#Helper class for discovery of relevant POS sequences
acceptable_pos = set()
acceptable_pos.add("A")
acceptable_pos.add("N")

class Trie():
    def __init__(self, pos, parent=None, terminal = 0):
        self.pos = pos
        self.parent = parent
        self.children = {}
        self.terminal = terminal
        self.deph = 0
        par = parent
        while par is not None:
            self.deph += 1
            par = par.parent
            
    def add_node(self, pos):
        if pos not in self.children:
            self.children[pos] = Trie(pos, self)
        return self.children[pos]
        
    def print_me(self, indent=""):
        print(indent+ self.pos+": ")
        for p, chld in self.children.items():
            chld.print_me(indent+"-")
    
    def can_start(self, pos):
        if len(pos)>0 and pos[0] in acceptable_pos and pos[0] in self.children:
            return self.children[pos[0]]
        return self  
    
    def can_move(self, pos):
        if len(pos)>0 and pos[0] in acceptable_pos and pos[0] in self.children:
            return True
        return False 

### Read dataset: Semeval
Semeval is used for results comparison with Key2Vec paper (mentioned in the beginning).

In [5]:
import os

import spacy
nlp = spacy.load("en", disable=["parser", "textcat"])
nlp.add_pipe(nlp.create_pipe('sentencizer'))

class dataset_loader():
    
    def __init__(self, path):
        self.path = path
        self.documents = {}
        self.keywords = {}
        self.keywords_count = 0
        
    def get_all_files(self, extension):
        all_files = []
        for root, dirs, files in os.walk(self.path):
            for file in files:
                if file.endswith(extension):
                     all_files.append(os.path.join(root, file))
        return all_files
    
    def read_file(self, path):
        lines = []
        with open(path) as fp:  
            lines = fp.readlines()
        return lines
    
    # extract seuences word words which POS tags have the same sequence as keywords pos sequences
    def extract_keywords_candidates(self, root, minimal_frequency=2):
        candidates = {}
        doc_candidates = {}
        for doc_id, text in self.documents.items():
            print(doc_id+", ", end = '')
            if doc_id not in doc_candidates:
                doc_candidates[doc_id]={}
            processed = nlp(text)
            for sent in processed.sents:
                sent_toks = []
                tmp_currents = [root]
                tmp_currents2 = []
                local_candidates = []
                for token in sent:
                    if token.is_alpha and not token.is_stop and len(token.lemma_)>3: # keywords are mostly alfa and dont contain stop words
                        sent_toks.append(token.lemma_)
                    else:
                        sent_toks.append("--ommit--")
                    # print(token.lemma_+"  len curremt "+str(len(tmp_currents)))
                    for current in tmp_currents:
                        is_valid_keyword = True
                        new_current = current.can_start(token.tag_)
                        if current == new_current: #fail or start from scratch 
                            current = root
                            
                        else: # move forward
                            current = new_current
                        if current.terminal > minimal_frequency:
                            #avoid non words
                            for tok in sent_toks[-current.deph:]:
                                if tok=="--ommit--":
                                    is_valid_keyword = False
                            candidate_sent = " ".join(sent_toks[-current.deph:])
                            #avoid repetitions
                            if is_valid_keyword:
                                for lc in local_candidates:
                                    lcstr = " ".join(lc)
                                    if candidate_sent == lcstr:
                                        is_valid_keyword = False
                                        break
                            if is_valid_keyword:
                                local_candidates.append(sent_toks[-current.deph:])
                                #print("Add to local candidates "+" ".join(sent_toks[-current.deph:])+"   "+str(current.deph) )
                            
                        if current != root and is_valid_keyword:
                            tmp_currents2.append(current)
                            potential_start = root.can_start(token.tag_)
                            if root!=potential_start and current!= potential_start:
                                tmp_currents2.append(potential_start)
                    tmp_currents = tmp_currents2.copy()
                    if len(tmp_currents)==0:
                        if len(local_candidates)>0:# select best from local candidates
                            longest = None
                            for lc in local_candidates:
                                if longest is None or len(lc) > len(longest):
                                    longest=lc
                            candidate_str = " ".join(longest)
                            if candidate_str not in candidates:
                                candidates[candidate_str]= {}
                            if doc_id not in candidates[candidate_str]:
                                candidates[candidate_str][doc_id]=0
                            candidates[candidate_str][doc_id]+=1
                            if candidate_str not in doc_candidates[doc_id]:
                                doc_candidates[doc_id][candidate_str]=0
                            doc_candidates[doc_id][candidate_str]+=1
                        local_candidates.clear()
                        tmp_currents = [root]
                    tmp_currents2.clear()   
        return candidates, doc_candidates
    
class semelval_loader(dataset_loader):
    
    def __init__(self, path):
        super().__init__(path)
        
        self.keywords_reader = {}
        self.keywords_combined = {}
        self.abstracts = {}
        self.keywords_combined_count =0
        self.keywords_reader_count =0
        
    def _get_abstract(self, lines, all_document=False):
        ret = ""
        i = 0
        start = False
        for l in lines:
            l=l.strip()
            if not all_document and l == "1. INTRODUCTION":
                break
            if ". REFERENCES" in l:
                break
            if i==0:
                ret = l
            elif i == 1 or start:
                ret += " "+l  
            if l == "ABSTRACT":
                start = True
            i+=1
        return ret.replace("-", " ")
    
    def read_keywords(self, content):
        ret_keywords = {}
        total_keywords = 0
        for c in content:
            splitted = c.split(":")
            doc_id = splitted[0].strip()
            keywords = splitted[1].strip().replace("-", " ").split(",")
            ret_keywords[doc_id]=keywords
            total_keywords += len(keywords)
        return ret_keywords, total_keywords
    
    def load_data(self):
        files = self.get_all_files("final")
        for f in files:
            fname=f.split("/")[-1]
            if "txt.final" in fname:
                content = self.read_file(f)
                self.abstracts[fname[:4]]=self._get_abstract(content)
                self.documents[fname[:4]]=self._get_abstract(content, True)
            elif fname=="train.author.final":
                content = self.read_file(f)
                self.keywords, self.keywords_count = self.read_keywords(content)
            elif fname=="train.combined.final":
                content = self.read_file(f)
                self.keywords_combined, self.keywords_combined_count = self.read_keywords(content)
            elif fname=="train.reader.final":
                content = self.read_file(f)
                self.keywords_reader, self.keywords_reader_count = self.read_keywords(content)
                  
ds = semelval_loader(TRAINSET_LOCATION)
ds.load_data()

### What keywords are better to use?
Posisble options are: authors, readers and combine

In [11]:
#compare keywords coverage

def keyword_coverage(documents, keywords_corpora):
    all_keywords = 0
    covered_keywords = 0
    for doc_id, keywords in keywords_corpora.items():
        text  = documents[doc_id]
        all_keywords += len(keywords)
        for keyword in keywords:
            if keyword[:-1] in text: #simplified stemming just for stat purposes
                covered_keywords += 1
    return covered_keywords, all_keywords

print("--Abstracts--")
n , a = keyword_coverage(ds.abstracts, ds.keywords)
print("Authors keyword coverage "+str(n)+" "+str(a)+" = "+str(n/a))

n , a = keyword_coverage(ds.abstracts, ds.keywords_reader)
print("Readers keyword coverage "+str(n)+" "+str(a)+" = "+str(n/a))

n , a = keyword_coverage(ds.abstracts, ds.keywords_combined)
print("Combined keyword coverage "+str(n)+" "+str(a)+" = "+str(n/a))

print("--Full documents--")
n , a = keyword_coverage(ds.documents, ds.keywords)
print("Authors keyword coverage "+str(n)+" "+str(a)+" = "+str(n/a))

n , a = keyword_coverage(ds.documents, ds.keywords_reader)
print("Readers keyword coverage "+str(n)+" "+str(a)+" = "+str(n/a))

n , a = keyword_coverage(ds.documents, ds.keywords_combined)
print("Combined keyword coverage "+str(n)+" "+str(a)+" = "+str(n/a)+" \n")

print("Average keyword count for author "+str(ds.keywords_count / len(ds.keywords) ) )
print("Average keyword count for reader "+str(ds.keywords_reader_count / len(ds.keywords_reader) ) )
print("Average keyword count combined "+str(ds.keywords_combined_count / len(ds.keywords_combined) ) )

--Abstracts--
Authors keyword coverage 823 2223 = 0.3702204228520018
Readers keyword coverage 686 1824 = 0.37609649122807015
Combined keyword coverage 823 2223 = 0.3702204228520018
--Full documents--
Authors keyword coverage 1672 2223 = 0.7521367521367521
Readers keyword coverage 1429 1824 = 0.7834429824561403
Combined keyword coverage 1672 2223 = 0.7521367521367521 

Average keyword count for author 3.8819444444444446
Average keyword count for reader 12.666666666666666
Average keyword count combined 15.4375


### For further processing we'll use combined keywords as they contain largest number of words, with only slightly lower coverage

In [12]:
ds.keywords = ds.keywords_combined

We'll focus on combined readers and authors keywords are they are most popular with slightly smaller coverage than readers' keywords

## How do keywords look like?
What are most common POS sequences among them?

In [8]:
import operator
#what kind of POS'es are the keywords.

pos_frequency = {}
keywors_sequence_freqs = {}
keywords_freq = {}
root = Trie("")
for doc_id, keywords in ds.keywords_combined.items():
    for keyword in keywords:
        if keyword not in keywords_freq:
            keywords_freq[keyword] = 0
        keywords_freq[keyword] += 1
        
        processed = nlp(keyword)
        sequence = ""
        nnode = root
        for token in processed:
            nnode = nnode.add_node(token.tag_[0])
#             print(token.text, token.lemma_, token.pos_, token.tag_,
#                     token.shape_, token.is_alpha, token.is_stop)
            sequence+=token.tag_+" "
            if token.tag_ not in pos_frequency:
                pos_frequency[token.tag_] = 0
            pos_frequency[token.tag_] += 1
        nnode.terminal += 1
        if sequence not in keywors_sequence_freqs:
            keywors_sequence_freqs[sequence] = 0
        keywors_sequence_freqs[sequence] += 1
pos_frequency = sorted(pos_frequency.items(), key=operator.itemgetter(1))
keywors_sequence_freqs = sorted(keywors_sequence_freqs.items(), key=operator.itemgetter(1))
keywords_freq = sorted(keywords_freq.items(), key=operator.itemgetter(1))
            
print("Most popular POSes among keywords "+str(pos_frequency))
# print("Keywords POS sequences "+str(keywors_sequence_freqs))
# print("How uniformly keywords are spreaded across documents "+str(keywords_freq))

Most popular POSes among keywords [('SYM', 1), ('ADD', 1), ('.', 1), ('``', 1), ('VBZ', 1), ('RP', 1), ('POS', 2), ('JJS', 2), ('AFX', 2), ('NNP', 3), ('LS', 4), ('JJR', 4), ('XX', 5), ('DT', 6), ('UH', 8), ('TO', 10), ('CD', 11), ('FW', 12), ('VBD', 20), ('RB', 20), ('NNS', 31), ('CC', 38), ('VBP', 41), ('VBG', 89), ('VB', 101), ('IN', 102), ('VBN', 162), ('JJ', 868), ('NN', 3646)]


### Conclusion
Keyword candidates will be havying the structure of the target keywords, limitted to POS starting with *N* and *J* to cover most popular keyword structures

# Extract keyword candidates
Process documents and extract token sequences which reflect keywords POS sequences

In [9]:
print("Processed documents:")
keywords, doc_keywords = ds.extract_keywords_candidates(root)


C-57, H-52, I-66, I-51, H-44, J-40, I-45, J-50, C-45, H-48, H-50, I-38, I-37, I-54, H-41, J-67, J-51, H-92, H-38, H-53, H-81, J-36, H-88, C-77, H-84, H-62, C-46, J-44, C-44, H-73, J-47, H-96, I-58, C-75, I-73, C-52, I-60, J-37, I-47, J-55, I-59, H-69, H-79, J-66, J-45, C-56, H-64, J-42, I-57, H-47, I-43, I-55, I-65, J-73, C-55, C-81, H-49, C-61, J-62, J-38, C-50, C-78, H-46, J-70, I-77, J-59, I-75, I-64, J-71, I-42, C-62, I-56, H-97, J-49, J-39, C-74, C-67, H-45, J-61, J-56, I-48, J-74, H-85, C-71, C-72, H-54, H-35, H-87, C-80, C-69, I-74, H-98, J-60, C-66, I-49, H-60, C-54, I-53, J-58, C-76, I-68, C-48, J-57, J-34, J-53, I-62, I-76, I-46, C-41, C-83, J-41, C-42, J-69, I-71, C-58, I-52, H-43, H-90, J-33, J-72, J-65, H-61, H-83, I-61, J-63, J-52, I-50, H-42, C-65, C-49, H-77, C-68, H-37, I-63, H-82, C-53, H-63, I-70, H-40, I-72, H-95, J-35, C-79, C-84, 

### Gather embeddings
For all candidate keywords, gether their embeddings to avoid repetitive processing

In [13]:
def gather_embeddings(keywords):
    embeddings = {}
    for word, _ in keywords.items():
        embeddings[word] = fb_partial.wv[word]
    return embeddings

embeddings = gather_embeddings(keywords)

# Candidate filtering
To select best keyword candidates for each document, we'll use tf-idf. 

Since the target keywords often express same meanings with different words (for e.g. load-dependent resource failure, load-dependent failure), the tf-idf will be calculated in a fuzzy way for meanings instead of words. 

To group words into meanings we'll use simple threashold, assessed by experiemnts on trainset

The tf for a given word will be calculated by adding to the usual tf, the frequencies of the words having the same meaning, multiplied by the cosine simmilarity of their embeddings.

Idf is calculated in simmilar manner - by adding to usual idf, numbers of documents the "sibling" word appeared in, multiplied by their simmilarity.

In [None]:
import math
from multiprocessing import Pool
import itertools

PROCESS_COUNT = 7
simmilarity_threshold = 0.73 # assumption derived from manual simmilarity analysis 

def group_results(scores, groups):
    result = {}
    for word, score in scores:
        group_processed = False
        if word in groups:
            for w in groups[word]:
                if w in result:
                    group_processed = True
                    break
            if group_processed:
                continue
        result[word] = score
    return result
            

def calculate_tf(keywords, embeddings, simmilarity_threshold):
    tf_scores = {}
    groups = {}
    
    all_words = list(keywords.keys())
    word_embeddings=[]
    for word2, freq in keywords.items():
        word_embeddings.append(embeddings[word2])
    for word, freq in keywords.items():
        word_embedding = embeddings[word]
        dists = []
        
        dists = fb_partial.wv.cosine_similarities(word_embedding, word_embeddings)
        score = 0
        for d, w in zip(dists, all_words):
            if d > simmilarity_threshold:
                score += d * keywords[w]
                if w != word:
                    if word not in groups and w not in groups:
                        groups[word] = set()
                        groups[w] = groups[word]
                    elif word not in groups:
                        groups[word] = groups[w]
                    else:
                        groups[w] = groups[word]
                    groups[word].add(w)
                    groups[word].add(word)
                    groups[w].add(w)
                    groups[w].add(word)
                #print(word+"   <-  "+w+"  "+str(d)+"  "+str(keywords[w]))
        tf_scores[word] = score
    tf_scores = sorted(tf_scores.items(), key=lambda tup: tup[1], reverse=True)
    tf_scores = group_results(tf_scores, groups)
    return tf_scores, groups

#modified idf ?- calculates in how many documents a word appeared with what strenghts
def idf(word, embeddings, keywords, doclen, simmilarity_threshold, groups):
    best_score = 0
    best_group_word = ""
    if word not in groups:
        groups[word] = set()
        groups[word].add(word)
    for wg in groups[word]:
        doc_number = len(keywords[wg]) -1
        for w, _ in keywords.items():
            dist = fb_partial.wv.cosine_similarities(embeddings[wg], [embeddings[w]])
            if dist > simmilarity_threshold:
                doc_number += dist * (len(keywords[w]) -1 )
        score = math.log(doclen / (doc_number + 0.1))
        if score > best_score:
            best_score = score
            best_group_word = wg
    return best_score, best_group_word

def tfidf(tf_keywords, keywords, embeddings, doclen, simmilarity_threshold, groups):
    result = []
    for word, tf in tf_keywords.items():
        idf_score, best_word = idf(word, embeddings, keywords, doclen, simmilarity_threshold, groups)
        #print(word+" -> "+best_word+"   tf= "+str(tf)+"   idf =  "+str(idf_score) )
        result.append( (word, tf * idf_score ) )
    result = sorted(result, key=lambda tup: tup[1], reverse=True)
    return result

def predict(doc_id, doc_words, keywords, embeddings, simmilarity_threshold):
    predictions = {}
    groups_dict = {}
    print("start "+doc_id)
    tf_keywords, groups = calculate_tf(doc_words, embeddings, simmilarity_threshold)
    keyword_prediction = tfidf(tf_keywords, keywords, embeddings, len(doc_keywords), simmilarity_threshold, groups)
    predictions[doc_id] = keyword_prediction[:20]
    groups_dict[doc_id] = groups
    print("end "+doc_id+"  -> "+str(predictions))
    return predictions, groups_dict

docs = list(doc_keywords.keys())
keyword_lists = []
for d in docs:
    keyword_lists.append(doc_keywords[d])
pool = Pool(processes=PROCESS_COUNT)
results = pool.starmap(predict, zip(docs, keyword_lists, itertools.repeat(keywords), itertools.repeat(embeddings),itertools.repeat(simmilarity_threshold), ) )
groups_dict={}
predictions={}
for p,g in results:
    for docid, vals in p.items():
        predictions[docid] = vals
    for docid, vals in g.items():
        groups_dict[docid] = vals

['C-57', 'H-52', 'I-66', 'I-51', 'H-44', 'J-40', 'I-45', 'J-50', 'C-45', 'H-48', 'H-50', 'I-38', 'I-37', 'I-54', 'H-41', 'J-67', 'J-51', 'H-92', 'H-38', 'H-53', 'H-81', 'J-36', 'H-88', 'C-77', 'H-84', 'H-62', 'C-46', 'J-44', 'C-44', 'H-73', 'J-47', 'H-96', 'I-58', 'C-75', 'I-73', 'C-52', 'I-60', 'J-37', 'I-47', 'J-55', 'I-59', 'H-69', 'H-79', 'J-66', 'J-45', 'C-56', 'H-64', 'J-42', 'I-57', 'H-47', 'I-43', 'I-55', 'I-65', 'J-73', 'C-55', 'C-81', 'H-49', 'C-61', 'J-62', 'J-38', 'C-50', 'C-78', 'H-46', 'J-70', 'I-77', 'J-59', 'I-75', 'I-64', 'J-71', 'I-42', 'C-62', 'I-56', 'H-97', 'J-49', 'J-39', 'C-74', 'C-67', 'H-45', 'J-61', 'J-56', 'I-48', 'J-74', 'H-85', 'C-71', 'C-72', 'H-54', 'H-35', 'H-87', 'C-80', 'C-69', 'I-74', 'H-98', 'J-60', 'C-66', 'I-49', 'H-60', 'C-54', 'I-53', 'J-58', 'C-76', 'I-68', 'C-48', 'J-57', 'J-34', 'J-53', 'I-62', 'I-76', 'I-46', 'C-41', 'C-83', 'J-41', 'C-42', 'J-69', 'I-71', 'C-58', 'I-52', 'H-43', 'H-90', 'J-33', 'J-72', 'J-65', 'H-61', 'H-83', 'I-61', 'J-63',

start I-66
calculated tf I-66
end H-62  -> {'H-62': [('user model', 375.81468480531186), ('ucair', 298.1683163231543), ('search result', 176.37806788504543), ('search agent', 120.12148141580198), ('loss function', 108.0971535400064), ('query jaguar', 63.71579527985056), ('retrieval model', 61.451442432446164), ('search context', 55.030837895472764), ('indonesia', 46.506630833491194), ('relevance value', 45.99459850895729), ('information retrieval system', 44.59495530443761), ('feedback information', 44.40614908641339), ('information need', 43.79013385288307), ('document', 40.38034032005735), ('search engine', 39.67240182758094), ('google', 35.61048263304906), ('java', 34.15756240593844), ('query expansion', 33.93135651618756), ('next link', 29.089593570280186), ('user action', 28.345172504715887)]}
start C-46
calculated tf C-46
end I-54  -> {'I-54': [('issue negotiation problem', 137.85830337755772), ('knapsack problem', 111.59105464885535), ('negotiation', 105.46848727744583), ('packa

start H-44
calculated tf H-44
end H-50  -> {'H-50': [('input ranking', 251.06991647602393), ('consensus ranking', 133.82529036032722), ('rank aggregation strategy', 103.36344341501147), ('diσdi', 94.5411791034106), ('outranking relation', 79.99638231827052), ('outranking approach', 79.99637278195861), ('document', 63.62962717099946), ('combsum', 58.17918714056037), ('cmin', 58.17918020506081), ('discordance coalition', 49.653791487114844), ('input method', 43.63439035542028), ('veto threshold', 43.63438775460794), ('candidate document', 42.27875530317382), ('ranking', 41.1171263984195), ('discordance threshold', 38.81531874464971), ('datum fusion approach', 35.178087882522824), ('condorcet procedure', 32.16674423131081), ('combmnz', 29.089597038029968), ('smax', 21.81719517771014), ('concordance condition', 21.472856293649173)]}
start I-38
calculated tf I-38
end H-81  -> {'H-81': [('distance measure', 351.01360674364906), ('descriptor', 209.97075463274817), ('mpeg', 142.35307095811027)

start H-88
calculated tf H-88
end I-59  -> {'I-59': [('provider profile item', 203.58345917735397), ('relay entity', 140.003599825414), ('information filtering', 115.6287678742842), ('privacy', 93.00472458424666), ('relay agent', 79.99637278195861), ('recommendation', 65.74210650745437), ('user agent', 63.98032893073144), ('recommender systems', 62.7244541553287), ('provider', 61.915196190889475), ('smart event assistant', 58.17918714056037), ('jiac', 43.634385153795606), ('matchmaker module', 42.99494249709035), ('privacy preserving recommender systems', 41.136392898137245), ('relay', 41.09982402692459), ('filtering technique', 38.968108447660725), ('agent platform', 29.349153762945974), ('user privacy', 25.812680993495036), ('java security policy', 25.56073856852279), ('secure multi party computation', 21.817197778522477), ('establish control', 21.817197778522477)]}
start H-69
calculated tf H-69
end J-40  -> {'J-40': [('mutant', 356.34747875599743), ('fitness', 287.4956046058673), ('

start H-97
calculated tf H-97
end C-55  -> {'C-55': [('location model', 274.74636965106583), ('group interaction support', 170.4503011744248), ('messenger', 94.5411791034106), ('giss', 79.99638231827052), ('contact list', 79.99638231827052), ('context information', 70.17590066264555), ('group', 65.40942256323267), ('chapter', 64.3918019550115), ('sensor', 63.44701193957528), ('group member', 50.79417890035749), ('availability status', 49.10752683360562), ('proximity', 47.25824840357794), ('context framework', 43.634395557044954), ('visualisation', 43.332397533342515), ('location', 39.20010984229309), ('sensor encapsulation', 39.03810561957258), ('rfid', 36.36199196285023), ('giss core attribute', 36.36198979550662), ('post', 34.85457714544797), ('floor', 34.77677328785689)]}
start C-81
calculated tf C-81
end H-73  -> {'H-73': [('database', 264.0159938149755), ('resource selection problem', 244.71382859411844), ('information retrieval system', 233.19453816160143), ('document score', 173

start H-64
calculated tf H-64
end I-43  -> {'I-43': [('quarry', 167.2651430895498), ('target dynamic', 138.42832333725232), ('action selection procedure', 135.87320120432793), ('state space', 106.83801359385936), ('dynamics based control', 70.30388990075753), ('hunter', 65.45158553313041), ('user level', 61.850046846458596), ('environment model', 47.41950713258868), ('agent level', 44.30922287327334), ('environment design level', 38.587475330896), ('system dynamic', 38.0508888448942), ('extended markov tracking', 36.36198979550662), ('arena', 36.36198762816301), ('pomdp', 35.58826349707596), ('target system dynamic', 34.62985658141801), ('motion', 28.779489425030324), ('pomdps', 28.453718269613226), ('control approach', 25.497619453913092), ('pomdp policy', 21.81719517771014), ('control', 21.5375398316973)]}
start I-55
calculated tf I-55
end C-78  -> {'C-78': [('event channel', 328.76765197926267), ('event layer', 215.21442762468973), ('quality attribute', 127.82193407340911), ('commun

start J-70
calculated tf J-70
end J-39  -> {'J-39': [('exercise price', 186.06445576253444), ('proxy', 171.6097420107593), ('ebay', 135.2635841729629), ('price matching', 131.69618432250508), ('auction', 122.11593219523995), ('bidder', 116.68179639913798), ('stone', 116.35837428112075), ('buyer', 113.37935803652066), ('option', 110.73301036585072), ('nancy', 72.72398392570047), ('haile', 72.72397959101323), ('sand', 71.87389258343116), ('ebay winner', 65.8872623456589), ('tamer', 65.45159333556744), ('tues', 65.4515738294749), ('bidder population', 58.40971534368617), ('polly', 58.17918714056037), ('tuesday', 56.3588768138346), ('bundle', 55.99532642060245), ('alice', 42.278765383215784)]}
start C-74
calculated tf C-74
end I-42  -> {'I-42': [('pseudotree', 803.5518813426299), ('dcpop', 269.07867637172075), ('dcop instance', 165.30942020618215), ('dpop algorithm', 137.31917483939964), ('dpop', 131.06414925187136), ('utility propagation message', 122.92384760621158), ('branch parent bran

start C-71
calculated tf C-71
end C-62  -> {'C-62': [('accountability', 196.35473319208023), ('monitor', 192.82260213394858), ('path quality', 159.99276463654104), ('innovation', 135.94553011202132), ('path monitor', 128.42290117933638), ('cheater', 122.60839768723449), ('innovation game', 119.14965797669313), ('ropc', 101.81357749598065), ('contract', 97.06072215799324), ('datum path', 94.82979076397666), ('commoditization', 79.99638231827052), ('ropc monitor', 65.4515777306934), ('contracting system', 55.31969627450942), ('pathology', 54.962394998180514), ('punishment', 51.80309138468228), ('network model', 43.2716498571367), ('network behavior', 42.400594025092346), ('route coalition proof', 41.00576467472797), ('claim', 39.88797135776537), ('shopc', 36.36199196285023)]}
start I-56
calculated tf I-56
end I-74  -> {'I-74': [('dialogue', 342.4759318037049), ('argumentation graph', 116.81630311261856), ('undercut', 106.47026869917042), ('argument graph', 91.87508945391052), ('move', 83

start H-35
calculated tf H-35


In [21]:
def calculate_stats(predictions, groups_dict, gold_keywords, use_meanings=False, show_details = False):
    tp =0
    fp=0
    tn=0
    fn=0
    for doc_id, doc_gold_keywords in gold_keywords.items():
        if doc_id not in predictions:
            continue
        if show_details:
            print("Detailed results for document "+doc_id)
        preds = predictions[doc_id]
        groups = groups_dict[doc_id]
        if show_details and use_meanings:
            print("Stats are calculated based on meaning groups:")
            i=0
            shown = set()
            for word, st in groups.items():
                if "".join(st) in shown or len(st)<=1:
                    continue
                print("Meaning group "+str(i)+" = "+str(st))
                shown.add("".join(st))
                i+=1
        #precision - related
        pred_words_set = set()
        for pred_word, score in preds:
            if use_meanings:
                if pred_word not in groups:
                    groups[pred_word] = set(list(pred_word))
                found = False
                for gword in groups[pred_word]:
                    if gword in doc_gold_keywords:
                        tp+=1
                        found = True
                        if show_details:
                            print("Correctly predicted meaning: "+str(groups[pred_word])+" score= "+str(score))
                        break
                if not found:
                    fp += 1
                pred_words_set |= groups[pred_word]
            else:
                if pred_word in doc_gold_keywords:
                    if show_details:
                        print("Correctly predicted keyword: "+pred_word+" score= "+str(score))
                    tp+=1
                else:
                    fp+=1
                pred_words_set.add(pred_word)
        #recall related
        for gold_word in doc_gold_keywords:
            if gold_word not in pred_words_set:
                fn+=1
    if show_details:
        print("True positives: "+str(tp))
        print("False positives: "+str(fp))
        print("False negatives: "+str(fn))
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    print("Precision = "+str(precision)+"   Recall = "+str(recall))
print(predictions)
print(groups_dict)
calculate_stats(predictions, groups_dict, ds.keywords_combined)

({'C-57': [('congestion game', 235.5838995995128), ('nash equilibrium strategy profile', 160.2835180486844), ('cost function', 141.73954843120117), ('cglf', 130.9031554613868), ('failure probability', 126.91132205306069), ('cglfs', 116.35838815211987), ('step addition operation', 73.71216517400366), ('congestion', 37.412775526215476), ('task completion', 25.36725922992947), ('monderer', 21.817197778522477), ('group utility', 21.81719517771014), ('lemma', 20.7910128997117), ('stable profile', 20.43797876023906), ('courier', 16.911502121269525), ('deviator', 14.544798519014984), ('task execution', 14.544796785140093), ('cglf model', 14.544796785140093), ('koutsoupias', 14.544796785140093), ('sstability', 14.544795918202647), ('existence', 13.479049832731208)]}, {'C-57': {'congestion games': {'pscongestion game', 'congestion game', 'congestion setting', 'congestion games', 'congestion vector'}, 'congestion game': {'pscongestion game', 'congestion game', 'congestion setting', 'congestion g

ZeroDivisionError: division by zero

The above statistics assumes simple word matching.

If we aim for meaning matching (as initially assumed), the results will be following:

In [None]:
calculate_stats(predictions, groups_dict, ds.keywords_combined, use_meanings=True)

The meaning groups are:

In [None]:
for d in docs:
    i=0
    shown = set()
    for word, st in groups.items():
        if "".join(st) in shown or len(st)<=1:
            continue
        print(d+" - meaning group "+str(i)+" = "+str(st))
        shown.add("".join(st))
        i+=1

#Evaluate on testset

In [None]:
ts = semelval_loader(TESTSET_LOCATION)
ts.load_data()
ts.keywords = ts.keywords_combined
tkeywords, tdoc_keywords = ts.extract_keywords_candidates(root)
tembeddings = gather_embeddings(tkeywords)

docs = list(tdoc_keywords.keys())
keyword_lists = []
for d in docs:
    keyword_lists.append(tdoc_keywords[d])
pool = Pool(processes=PROCESS_COUNT)
results = pool.starmap(predict, zip(docs, keyword_lists, itertools.repeat(tkeywords), itertools.repeat(tembeddings),itertools.repeat(simmilarity_threshold), ) )
groups_dict={}
predictions={}
for p,g in results:
    for docid, vals in p.items():
        predictions[docid] = vals
    for docid, vals in g.items():
        groups_dict[docid] = vals
        
print("keyword stats: ")
calculate_stats(predictions, groups_dict, ts.keywords_combined)
print("meaning stats: ")
calculate_stats(predictions, groups_dict, ts.keywords_combined, use_meanings=True)