In this notebook, I validate a TextRank-based keyword extraction technique on the datasets published on [GitHub](https://github.com/SDuari/Keyword-Extraction-Datasets). The keyword extraction code is based on this Medium [article](https://towardsdatascience.com/textrank-for-keyword-extraction-by-python-c0bae21bcec0) by [Xu Liang](https://towardsdatascience.com/@bramblexu).

In [None]:
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm')

class TextRank4Keyword():
    """Rank the candidate words of a text with TextRank.

    A co-occurrence graph is built from the words that pass the part-of-speech
    and stop-word filters, and a PageRank-style iteration assigns a weight to
    every word.  Typical use: call ``analyze(text)`` and then
    ``get_keywords()``.

    ``set_stopwords`` and ``analyze`` use the module-level spaCy pipeline
    ``nlp`` and the imported ``STOP_WORDS``; the remaining methods only need
    plain Python containers and numpy.
    """

    def __init__(self):
        self.d = 0.85  # damping coefficient, usually is .85
        self.min_diff = 1e-5  # convergence threshold
        self.steps = 10  # iteration steps
        self.node_weight = None  # word -> weight, filled in by analyze()

    def set_stopwords(self, stopwords):
        """Flag ``STOP_WORDS`` plus the given extra words as stop words.

        The flag is set on the lexemes of the shared ``nlp.vocab``, so it
        stays in effect for every later use of ``nlp``, not only for the
        current call.
        """
        for word in STOP_WORDS.union(set(stopwords)):
            nlp.vocab[word].is_stop = True

    @staticmethod
    def _normalize(token):
        """Return the lowercased, stripped lemma of ``token``.

        The "-PRON-" placeholder lemma (emitted by spaCy 2.x for pronouns)
        carries no information, so the lowercased surface form is used then.
        """
        if token.lemma_ == "-PRON-":
            return token.lower_
        return token.lemma_.lower().strip()

    def sentence_segment(self, doc, candidate_pos, lower):
        """Return, per sentence of ``doc``, the normalized candidate words.

        A token is kept when its POS tag is in ``candidate_pos``, it is not a
        stop word and it is purely alphabetic.

        NOTE: ``lower`` is accepted for interface compatibility but has no
        effect -- the previous implementation had two identical branches that
        both lowercased the lemma, and that behaviour is kept so existing
        results do not change.
        """
        sentences = []
        for sent in doc.sents:
            selected_words = [
                self._normalize(token)
                for token in sent
                if token.pos_ in candidate_pos
                and not token.is_stop
                and token.is_alpha
            ]
            sentences.append(selected_words)
        return sentences

    def get_vocab(self, sentences):
        """Map every distinct word to an index, in order of first appearance."""
        vocab = OrderedDict()
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = len(vocab)
        return vocab

    def get_token_pairs(self, window_size, sentences):
        """Build the de-duplicated (word, following word) pairs per sentence.

        Each word is paired with the next ``window_size - 1`` words of the
        same sentence.  Pairs are returned in order of first occurrence.
        """
        token_pairs = []
        seen = set()  # O(1) duplicate check; the list only keeps the order
        for sentence in sentences:
            for i, word in enumerate(sentence):
                last = min(i + window_size, len(sentence))
                for j in range(i + 1, last):
                    pair = (word, sentence[j])
                    if pair not in seen:
                        seen.add(pair)
                        token_pairs.append(pair)
        return token_pairs

    def symmetrize(self, a):
        """Return ``a + a.T`` with the diagonal counted only once.

        If both ``a[i, j]`` and ``a[j, i]`` are set, the symmetric entry is
        their sum.
        """
        return a + a.T - np.diag(a.diagonal())

    def get_matrix(self, vocab, token_pairs):
        """Return the symmetric co-occurrence matrix, normalized by column.

        Columns that sum to zero (words that never co-occur with another
        candidate) are returned as all zeros.
        """
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            g[vocab[word1], vocab[word2]] = 1

        # Get symmetric matrix
        g = self.symmetrize(g)

        # Normalize matrix by column.  `out` must be given: with `where` alone
        # numpy leaves the skipped (zero-sum) columns uninitialised.
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, out=np.zeros_like(g), where=norm != 0)

        return g_norm

    def get_keywords(self, number=100000, limit=0.0):
        """Return the keywords sorted by decreasing weight.

        Args:
            number: maximum number of keywords to return; ``None`` disables
                the cap.
            limit: minimum weight a word needs to be returned.

        Returns:
            A list of words; empty when ``analyze`` has not produced any
            weights yet.
        """
        if not self.node_weight:
            return []
        ranked = sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True)
        keywords = [word for word, weight in ranked if weight >= limit]
        if number is None:
            return keywords
        return keywords[:max(number, 0)]

    def _rank(self, g):
        """Run the damped power iteration on the normalized matrix ``g``.

        Stops after ``self.steps`` iterations or as soon as the total weight
        changes by less than ``self.min_diff`` between two iterations.
        """
        pr = np.ones(g.shape[0], dtype=float)
        previous_total = 0.0
        for _ in range(self.steps):
            pr = (1 - self.d) + self.d * np.dot(g, pr)
            total = pr.sum()
            if abs(previous_total - total) < self.min_diff:
                break
            previous_total = total
        return pr

    def analyze(self, text,
                candidate_pos=('NOUN', 'PROPN'),
                window_size=4, lower=False, stopwords=()):
        """Main function to analyze text.

        Parses ``text`` with ``nlp``, ranks the candidate words and stores the
        result as a word -> weight dict in ``self.node_weight``.

        Args:
            text: raw text to analyze.
            candidate_pos: POS tags a token must have to become a candidate.
            window_size: co-occurrence window, see ``get_token_pairs``.
            lower: forwarded to ``sentence_segment`` (currently without effect).
            stopwords: extra stop words, see ``set_stopwords``.
        """
        # Set stop words
        self.set_stopwords(stopwords)

        # Parse text by spaCy
        doc = nlp(text)

        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower)  # list of list of words

        # Build vocabulary
        vocab = self.get_vocab(sentences)

        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)

        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)

        # Iterate the weights (pagerank values)
        pr = self._rank(g)

        # Get weight for each node
        self.node_weight = {word: pr[index] for word, index in vocab.items()}

In [None]:
# Extract the dataset archive with a shell command (IPython `!` syntax).
# Only the SemEval archive is extracted; the lines for the other archives are
# kept commented out.
# NOTE(review): the absolute /content/... paths look like a Colab session, and
# the later cells read /content/semeval, which presumably requires the working
# directory to be /content when this runs -- confirm.
# !unzip /content/NLM500.zip
# !unzip /content/Hulth2003.zip
# !unzip /content/WWW.zip
!unzip /content/semeval.zip

Archive:  /content/semeval.zip
   creating: semeval/
  inflating: semeval/.DS_Store       
  inflating: semeval/.RData          
  inflating: semeval/.Rhistory       
  inflating: semeval/C-1.key         
  inflating: semeval/C-1.txt         
  inflating: semeval/C-14.key        
  inflating: semeval/C-14.txt        
  inflating: semeval/C-17.key        
  inflating: semeval/C-17.txt        
  inflating: semeval/C-18.key        
  inflating: semeval/C-18.txt        
  inflating: semeval/C-19.key        
  inflating: semeval/C-19.txt        
  inflating: semeval/C-20.key        
  inflating: semeval/C-20.txt        
  inflating: semeval/C-22.key        
  inflating: semeval/C-22.txt        
  inflating: semeval/C-23.key        
  inflating: semeval/C-23.txt        
  inflating: semeval/C-27.key        
  inflating: semeval/C-27.txt        
  inflating: semeval/C-28.key        
  inflating: semeval/C-28.txt        
  inflating: semeval/C-29.key        
  inflating: semeval/C-29.txt      

In [None]:
import pandas as pd

# Start from two empty frames: one for the document texts and one for the gold
# keyphrases, both keyed by the file name.
textdf, keyworddf = (
    pd.DataFrame(columns=['filename', value_column])
    for value_column in ('text', 'keyword')
)

In [None]:
import os

directory = '/content/semeval'


def _read_files(directory, suffix, column, read):
    """Read every file in ``directory`` whose name ends with ``suffix``.

    Returns a DataFrame with one row per file: ``filename`` holds the file
    name without its extension and ``column`` holds ``read(file_object)``.
    """
    records = []
    for filename in os.listdir(directory):
        if not filename.endswith(suffix):
            continue
        # `with` closes the handle again; the frame is built once from the
        # collected records instead of being grown row by row
        # (DataFrame.append no longer exists in pandas 2.x).
        with open(os.path.join(directory, filename), "r") as f:
            records.append({'filename': os.path.splitext(filename)[0], column: read(f)})
    return pd.DataFrame(records, columns=['filename', column])


# Document texts (*.txt) as one string each, gold keyphrases (*.key) as a list of lines.
textdf = _read_files(directory, ".txt", 'text', lambda f: f.read())
keyworddf = _read_files(directory, ".key", 'keyword', lambda f: f.readlines())

In [None]:
# Keep only the documents for which both a text and a keyword file were found.
dataset = pd.merge(textdf, keyworddf, how='inner', on='filename')

In [None]:
dataset.head(3)

Unnamed: 0,filename,text,keyword
0,C-42,Demonstration of Grid-Enabled Ensemble Kalman ...,"[ensemble kalman filter\n, datum assimilation ..."
1,C-31,Apocrita: A Distributed Peer-to-Peer File Shar...,"[peer-to-peer\n, file sharing system\n, intran..."
2,H-49,Performance Prediction Using Spatial Autocorre...,"[performance prediction\n, information retriev..."


In [None]:
dataset.iloc[2]['keyword']

['performance prediction\n',
 'information retrieval\n',
 'spatial autocorrelation\n',
 'autocorrelation\n',
 'cluster hypothesis\n',
 'zero relevance judgment\n',
 'relationship of predictor\n',
 'predictor relationship\n',
 'predictive power of predictor\n',
 'predictor predictive power\n',
 'language model score\n',
 'ranking of query\n',
 'query ranking\n',
 'regularization\n']

In [None]:
def clean_up(keyword_list):
    """
    Flatten a list of keyphrases into a sorted list of unique lowercase words.

    We are interested in keywords not keyphrases so list of keyphrases are converted to list of keywords
    """
    words = set()
    for keyphrase in keyword_list:
        # split() without arguments already discards the trailing newline, so
        # nothing is sliced off: a last line without "\n" keeps its final letter.
        words.update(word.lower() for word in keyphrase.split())
    # Sorted so the result is the same on every run.
    return sorted(words)

In [None]:
# Replace each list of gold keyphrases by its list of distinct words.
# Only the 'keyword' column is needed, so apply clean_up to that column directly.
dataset['keyword'] = dataset['keyword'].apply(clean_up)

In [None]:
dataset.iloc[2]['keyword']

['predictive',
 'model',
 'relationship',
 'information',
 'spatial',
 'ranking',
 'score',
 'cluster',
 'query',
 'predictor',
 'autocorrelation',
 'judgment',
 'hypothesis',
 'zero',
 'language',
 'of',
 'retrieval',
 'regularization',
 'performance',
 'power',
 'prediction',
 'relevance']

In [None]:
def scores(gold_keywords, extracted_keywords):
    """Compute precision and recall of the extracted keywords.

    Args:
        gold_keywords: reference keywords.
        extracted_keywords: keywords produced by the extractor; every
            occurrence that is also a gold keyword counts as correct.

    Returns:
        ``{'precision': ..., 'recall': ...}``.  A score whose denominator
        would be zero (no extracted / no gold keywords) is reported as 0.0.
    """
    total_keywords = len(gold_keywords)
    total_summary_keywords = len(extracted_keywords)

    gold = set(gold_keywords)  # constant-time membership test
    correct_keywords = sum(1 for keyword in extracted_keywords if keyword in gold)

    precision = correct_keywords / total_summary_keywords if total_summary_keywords else 0.0
    recall = correct_keywords / total_keywords if total_keywords else 0.0

    return {'precision': precision, 'recall': recall}

In [None]:
tr4w = TextRank4Keyword()


def get_keywords(text):
    """Analyze ``text`` with the shared ``tr4w`` instance and return its keywords.

    Nouns and proper nouns are the candidates, with a co-occurrence window of 4.
    """
    tr4w.analyze(
        text,
        candidate_pos=['NOUN', 'PROPN'],
        window_size=4,
        lower=True,
    )
    keywords = tr4w.get_keywords()
    return keywords

In [None]:
# Extract keywords from every document text, then score them against the gold
# keywords of the same row.
dataset['extracted_keywords'] = dataset['text'].apply(get_keywords)
dataset['scores'] = [
    scores(gold, extracted)
    for gold, extracted in zip(dataset['keyword'], dataset['extracted_keywords'])
]

In [None]:
def get_precision(scores):
    """Return the value stored under 'precision' in a score dictionary."""
    precision = scores['precision']
    return precision


def get_recall(scores):
    """Return the value stored under 'recall' in a score dictionary."""
    recall = scores['recall']
    return recall

In [None]:
# Unpack the per-document score dictionaries into numeric columns.
dataset['recall'] = dataset['scores'].apply(get_recall)
dataset['precision'] = dataset['scores'].apply(get_precision)

In [None]:
dataset.head()

Unnamed: 0,filename,text,keyword,extracted_keywords,scores,recall,precision
0,C-42,Demonstration of Grid-Enabled Ensemble Kalman ...,"[development, high, model, application, gridwa...","[grid, tigre, model, job, resource, applicatio...","{'precision': 0.051094890510948905, 'recall': ...",0.875,0.051095
1,C-31,Apocrita: A Distributed Peer-to-Peer File Shar...,"[model, system, sharing, distribution, incomin...","[node, document, file, apocrita, indexing, ind...","{'precision': 0.04643962848297214, 'recall': 0...",0.75,0.04644
2,H-49,Performance Prediction Using Spatial Autocorre...,"[predictive, model, relationship, information,...","[retrieval, document, autocorrelation, informa...","{'precision': 0.037549407114624504, 'recall': ...",0.863636,0.037549
3,C-56,A Hierarchical Process Execution Support for G...,"[algorithm, computing, system, application, sc...","[process, execution, activity, grid, controlle...","{'precision': 0.03723404255319149, 'recall': 0...",0.875,0.037234
4,I-22,Realistic Cognitive Load Modeling for Enhancin...,"[shared, problem-solving, model, team, schema,...","[agent, load, model, information, team, task, ...","{'precision': 0.038461538461538464, 'recall': ...",0.814815,0.038462


In [None]:
dataset['recall'].mean()

0.7821065189534685

In [None]:
dataset['precision'].mean()

0.038814192116577634