In [None]:
!pip install --quiet wikipedia-api
!pip install --quiet rake-nltk
!pip install --quiet yake
!pip install --quiet keybert
!pip install --quiet pytextrank
!python3 -m spacy download en_core_web_sm

  Building wheel for wikipedia-api (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 60 kB 5.6 MB/s 
[K     |████████████████████████████████| 132 kB 35.2 MB/s 
[?25h  Building wheel for jellyfish (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 85 kB 3.8 MB/s 
[K     |████████████████████████████████| 235 kB 59.1 MB/s 
[K     |████████████████████████████████| 51 kB 6.5 MB/s 
[K     |████████████████████████████████| 4.7 MB 52.1 MB/s 
[K     |████████████████████████████████| 1.3 MB 56.9 MB/s 
[K     |████████████████████████████████| 101 kB 10.4 MB/s 
[K     |████████████████████████████████| 596 kB 50.0 MB/s 
[K     |████████████████████████████████| 6.6 MB 30.9 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 1.1 MB 50.5 MB/s 
[K     |████████████████████████████████| 47 kB 3.7 MB/s 
[K     |████████████████████████████████| 11.2 MB 49.5 MB/

In [None]:
import wikipediaapi
from operator import itemgetter
from nltk.probability import FreqDist
from nltk.util import ngrams
from collections import Counter
import string
from nltk.tokenize import sent_tokenize, word_tokenize
import re
from bisect import bisect_left
from collections import Counter
from nltk.stem.snowball import SnowballStemmer
from rake_nltk import Rake
from keybert import KeyBERT
import yake
import spacy
import pytextrank
import pandas as pd
import numpy as np
import nltk
import random
import os
# Download the NLTK 'stopwords' and 'punkt' packages.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)
# Load the small English spaCy model; as the last expression its repr is the cell output.
spacy.load('en_core_web_sm')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


<spacy.lang.en.English at 0x7f6381cc7b10>

In [None]:
# Shared English-Wikipedia client used by the helper functions below to look up pages,
# configured with the WIKI extract format.
# NOTE(review): newer wikipedia-api releases appear to require a `user_agent`
# argument -- confirm against the installed version.
wiki = wikipediaapi.Wikipedia(language='en', extract_format=wikipediaapi.ExtractFormat.WIKI)

In [None]:
def _most_frequent_words(text : list):
    """Return up to 20 ``(item, count)`` pairs for the most frequent items in ``text``.

    Pairs are ordered by descending count.
    """
    counts = FreqDist(text)
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:20]

def _get_n_grams(text : string, n : int):
    """Split ``text`` on whitespace and return its n-grams as a list.

    Each n-gram is whatever ``ngrams`` yields for ``n`` consecutive tokens.
    """
    tokens = text.split()
    return list(ngrams(tokens, n))

def _is_duplicate(keyword : string, keywords : list):
    """Tell whether ``keyword`` resolves to the same Wikipedia page text as any entry of ``keywords``.

    Pages are compared by the first 50 characters of their text. If ``keyword``
    is itself an element of ``keywords`` the result is always True, because it
    matches its own page.
    """
    known_prefixes = [wiki.page(candidate).text[:50] for candidate in keywords]
    keyword_prefix = wiki.page(keyword).text[:50]
    return keyword_prefix in known_prefixes


def _remove_duplicates(keywords : list):
    """Drop keywords whose Wikipedia page repeats one already kept; keep the first of each.

    Two keywords are treated as duplicates when the first 50 characters of their
    Wikipedia page text are equal (the same comparison ``_is_duplicate`` uses).

    The previous implementation removed items from ``keywords`` while iterating
    over it and tested each keyword against a list that contained the keyword
    itself, so unique keywords were removed and every other entry was skipped.

    Args:
        keywords: keyword strings; updated in place.

    Returns:
        The same list object, now holding only the first keyword for each page.
    """
    seen_prefixes = set()
    unique = []
    for keyword in keywords:
        # One page lookup per keyword; the prefix identifies the page.
        prefix = wiki.page(keyword).text[:50]
        if prefix not in seen_prefixes:
            seen_prefixes.add(prefix)
            unique.append(keyword)
    # Slice-assign so callers holding a reference to the list still see the result.
    keywords[:] = unique
    return keywords

def _topic_relevance_score(word, keywords : list):
    """Count how often the other keywords occur in the Wikipedia page of ``word``.

    The page text is tokenised with ``word_tokenize`` and lower-cased; the score
    is the total number of tokens equal to any keyword other than ``word``.
    The page is fetched and tokenised once, not once per keyword.

    NOTE(review): keywords are compared with single tokens as-is (not
    lower-cased), so multi-word or capitalised keywords presumably never match.

    Args:
        word: keyword whose Wikipedia page is inspected.
        keywords: all candidate keywords (may include ``word``).

    Returns:
        Integer occurrence count; 0 when there is no other keyword.
    """
    others = [y for y in keywords if y != word]
    if not others:
        # Nothing to count, so skip the page lookup entirely.
        return 0
    token_counts = Counter(token.lower() for token in word_tokenize(wiki.page(word).text))
    return sum(token_counts[y] for y in others)

def _get_stopwords():
    """Read ``stopwords.txt`` and return its lines as stemmed stopwords.

    Each line is decoded as UTF-8, has its newline removed and is stemmed with
    the English Snowball stemmer. The file is closed on exit and one stemmer
    instance is reused for all lines.

    Returns:
        List of stemmed stopword strings, in file order.
    """
    stemmer = SnowballStemmer("english")
    with open("stopwords.txt", "rb") as file:
        return [stemmer.stem(line.decode("utf-8").replace("\n", "")) for line in file]

def _remove_stopwords(document : str):
    """Return ``document`` lower-cased with stopwords and non-alphabetic tokens removed.

    The text is tokenised with ``word_tokenize``. A token is kept when it is
    purely alphabetic and its Snowball stem is not among the stems returned by
    ``_get_stopwords``. Kept tokens are lower-cased and joined with single spaces.

    Args:
        document: raw text.

    Returns:
        The filtered text as one space-separated string.
    """
    # A set gives constant-time membership tests; one stemmer serves all tokens.
    stopwords = set(_get_stopwords())
    stemmer = SnowballStemmer("english")
    kept = []
    for word in word_tokenize(document):
        lowered = word.lower()
        # Cheap alphabetic check first so non-words are never stemmed.
        if word.isalpha() and stemmer.stem(lowered) not in stopwords:
            kept.append(lowered)
    return " ".join(kept)


In [None]:
def get_doc_keywords(text: str):
    """Extract keywords from ``text`` using frequent n-grams validated against Wikipedia.

    Steps:
      1. Remove stopwords, then take the 10 most common bigrams and trigrams.
      2. Add the single most frequent word, plus the 3 most frequent words
         inside the selected trigrams and inside the selected bigrams.
      3. Keep candidates that have a Wikipedia page whose first 100 characters
         do not contain "refer to".
      4. Remove candidates that resolve to an already-seen page.
      5. Keep candidates whose topic relevance score is at least 10.

    Fixes over the previous version: trigrams are really built with n=3 (they
    were computed with n=2 and joined as "w0 w1w1"), the text is cleaned once
    instead of three times, and empty text no longer raises ``IndexError``.

    Args:
        text: raw document text.

    Returns:
        List of keyword strings.
    """
    cleaned = _remove_stopwords(text)

    bigrams = Counter(_get_n_grams(cleaned, 2)).most_common(10)
    trigrams = Counter(_get_n_grams(cleaned, 3)).most_common(10)

    keywords = [" ".join(gram) for gram, _count in bigrams]
    keywords += [" ".join(gram) for gram, _count in trigrams]

    frequent_words = _most_frequent_words(word_tokenize(cleaned))
    if frequent_words:
        keywords.append(frequent_words[0][0])

    trigram_tokens = [token for gram, _count in trigrams for token in gram]
    bigram_tokens = [token for gram, _count in bigrams for token in gram]
    keywords += [pair[0] for pair in _most_frequent_words(trigram_tokens)[:3]]
    keywords += [pair[0] for pair in _most_frequent_words(bigram_tokens)[:3]]

    # Keep only candidates that have a page and whose opening text lacks "refer to".
    keywords = [x for x in keywords if wiki.page(x).exists() and wiki.page(x).text[:100].find("refer to") == -1]

    keywords = _remove_duplicates(keywords)

    topic_relevance_scores = {kw: _topic_relevance_score(kw, keywords) for kw in keywords}

    return [kw for kw, score in topic_relevance_scores.items() if score >= 10]


def rake_extraction(row):
    """Extract keywords from one dataset row with RAKE and validate them against Wikipedia.

    Reads ``row["text"]`` and ``row["number_of_keywords"]``. The top
    ``number_of_keywords`` ranked phrases (1 to 3 words, repeated phrases
    excluded) are kept when they have a Wikipedia page whose first 100
    characters do not contain "refer to", then passed through ``_remove_duplicates``.
    """
    text = row["text"]
    n_keywords = row["number_of_keywords"]

    extractor = Rake(max_length= 3, min_length = 1, include_repeated_phrases=False)
    extractor.extract_keywords_from_text(text)

    # Ranked (score, phrase) pairs; only the phrase is needed.
    ranked = extractor.get_ranked_phrases_with_scores()
    candidates = [scored[1] for scored in ranked][:n_keywords]

    resolvable = []
    for candidate in candidates:
        if wiki.page(candidate).exists() and "refer to" not in wiki.page(candidate).text[:100]:
            resolvable.append(candidate)

    return _remove_duplicates(resolvable)
  


# Lazily created KeyBERT model shared by all keybert_extraction calls.
_KEYBERT_MODEL = None


def _get_keybert_model():
    """Return the shared KeyBERT instance, creating it on first use."""
    global _KEYBERT_MODEL
    if _KEYBERT_MODEL is None:
        _KEYBERT_MODEL = KeyBERT()
    return _KEYBERT_MODEL


def keybert_extraction(row):
    """Extract keywords from one dataset row with KeyBERT and validate them against Wikipedia.

    Reads ``row["text"]`` and ``row["number_of_keywords"]``. Stopwords are
    removed first, then KeyBERT proposes ``number_of_keywords`` phrases of 1 to
    3 words using max-sum selection. Phrases are kept when they have a
    Wikipedia page whose first 100 characters do not contain "refer to", then
    passed through ``_remove_duplicates``.

    The KeyBERT model is created once and reused; previously it was rebuilt
    for every row.

    Args:
        row: mapping with "text" and "number_of_keywords" entries.

    Returns:
        List of keyword strings.
    """
    text = row["text"]
    n_keywords = row["number_of_keywords"]
    text = _remove_stopwords(text)
    kw_model = _get_keybert_model()
    keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 3),use_maxsum=True, nr_candidates=n_keywords, top_n=n_keywords)
    # extract_keywords returns pairs; the first element is the phrase.
    keywords = [x[0] for x in keywords]
    keywords = [x for x in keywords if wiki.page(x).exists() and wiki.page(x).text[:100].find("refer to") == -1]
    keywords = _remove_duplicates(keywords)
    return keywords

def yake_extraction(row):
    """Extract keywords from one dataset row with YAKE and validate them against Wikipedia.

    Reads ``row["text"]`` and ``row["number_of_keywords"]``. Stopwords are
    removed first, then YAKE returns the top ``number_of_keywords`` phrases of
    up to 3 words. Phrases are kept when they have a Wikipedia page whose first
    100 characters do not contain "refer to", then passed through
    ``_remove_duplicates``.
    """
    raw_text = row["text"]
    n_keywords = row["number_of_keywords"]
    cleaned = _remove_stopwords(raw_text)

    extractor = yake.KeywordExtractor(n = 3, top = n_keywords)
    candidates = [scored[0] for scored in extractor.extract_keywords(cleaned)]

    resolvable = []
    for candidate in candidates:
        if wiki.page(candidate).exists() and "refer to" not in wiki.page(candidate).text[:100]:
            resolvable.append(candidate)

    return _remove_duplicates(resolvable)

# Lazily created spaCy pipeline (with the "textrank" component) shared by all calls.
_TEXTRANK_NLP = None


def _get_textrank_pipeline():
    """Return the shared spaCy pipeline with PyTextRank added, building it on first use."""
    global _TEXTRANK_NLP
    if _TEXTRANK_NLP is None:
        nlp = spacy.load("en_core_web_sm")
        # The component may only be added once per pipeline, so do it here.
        nlp.add_pipe("textrank")
        _TEXTRANK_NLP = nlp
    return _TEXTRANK_NLP


def textrank_extraction(row):
    """Extract keywords from one dataset row with PyTextRank and validate them against Wikipedia.

    Reads ``row["text"]`` and ``row["number_of_keywords"]``. Stopwords are
    removed first, then the first ``number_of_keywords`` entries of
    ``doc._.phrases`` are taken. Phrases are kept when they have a Wikipedia
    page whose first 100 characters do not contain "refer to", then passed
    through ``_remove_duplicates``.

    The spaCy pipeline is loaded once and reused; previously the model was
    reloaded for every row.

    Args:
        row: mapping with "text" and "number_of_keywords" entries.

    Returns:
        List of keyword strings.
    """
    text = row["text"]
    n_keywords = row["number_of_keywords"]

    text = _remove_stopwords(text)
    doc = _get_textrank_pipeline()(text)

    keywords = [phrase.text for phrase in doc._.phrases][:n_keywords]

    keywords = [x for x in keywords if wiki.page(x).exists() and wiki.page(x).text[:100].find("refer to") == -1]
    keywords = _remove_duplicates(keywords)

    return keywords

def to_list(list_string):
    """Parse the string form of a list (e.g. ``"['a', 'b']"``) into a list of strings.

    Leading "[" and trailing "]" characters are stripped, the rest is split on
    commas, and each piece loses surrounding whitespace and quote characters.
    An empty list string yields ``['']``.
    """
    inner = list_string.strip("[")
    inner = inner.strip("]")
    items = []
    for piece in inner.split(","):
        piece = piece.strip()
        piece = piece.strip('"')
        piece = piece.strip("'")
        items.append(piece.strip())
    return items

def get_dataset():
    """Load the Nguyen2007 documents and their gold keywords into a DataFrame.

    For every file in ``Nguyen2007/docsutf8`` the text is read and the key
    file ``Nguyen2007/keys/<stem>.key`` is looked up, where ``<stem>`` is the
    document file name up to its first dot. Each non-blank line of the key
    file is one keyword.

    Changes over the previous version: files are closed after reading, they
    are read explicitly as UTF-8, documents are processed in sorted file-name
    order so the row order is reproducible, and blank lines in key files
    (such as the one produced by a trailing newline) are no longer counted as
    keywords.

    Returns:
        DataFrame with columns "keywords" (list of str), "filename", "text"
        and "number_of_keywords" (length of the keyword list), one row per
        document.
    """
    docs_dir = os.path.join("Nguyen2007", "docsutf8")
    keys_dir = os.path.join("Nguyen2007", "keys")

    all_keywords = []
    all_text = []
    all_filename = []

    for f_name in sorted(os.listdir(docs_dir)):
        with open(os.path.join(docs_dir, f_name), "r", encoding="utf-8") as doc_file:
            text = doc_file.read()

        f_key = f_name.split(".")[0] + ".key"
        with open(os.path.join(keys_dir, f_key), "r", encoding="utf-8") as key_file:
            keywords = [line.strip() for line in key_file.read().splitlines() if line.strip()]

        all_keywords.append(keywords)
        all_text.append(text)
        all_filename.append(f_name)

    data = {
        "keywords": all_keywords,
        "filename": all_filename,
        "text": all_text,
        "number_of_keywords": [len(x) for x in all_keywords],
    }

    return pd.DataFrame(data)


In [None]:
def normalize_text(s):
    """Normalise a string for comparison.

    Lower-cases the text, deletes ASCII punctuation, replaces the standalone
    words "a", "an" and "the" with a space, and collapses runs of whitespace
    into single spaces (also trimming both ends).
    """
    import re
    import string

    lowered = s.lower()
    # Delete every character listed in string.punctuation.
    without_punctuation = lowered.translate(str.maketrans("", "", string.punctuation))
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punctuation, flags=re.UNICODE)
    return " ".join(without_articles.split())

def compute_exact_match(prediction, truth):
    """Return 1 when ``prediction`` and ``truth`` are equal after ``normalize_text``, else 0."""
    is_match = normalize_text(prediction) == normalize_text(truth)
    return 1 if is_match else 0


def precision(truth:list, predictions:list):
    """Fraction of predicted keywords that also appear in the gold keywords.

    Both lists are normalised with ``normalize_text`` before comparison. The
    numerator is the number of distinct normalised strings common to both
    lists; the denominator is the number of predictions. Returns 0 when there
    are no predictions.
    """
    if len(predictions) == 0:
        return 0
    normalized_predictions = [normalize_text(item) for item in predictions]
    normalized_truth = {normalize_text(item) for item in truth}
    hits = normalized_truth.intersection(normalized_predictions)
    return len(hits) / len(normalized_predictions)

def recall(truth:list, predictions:list):
    """Fraction of gold keywords that were predicted.

    Both lists are normalised with ``normalize_text`` before comparison. The
    numerator is the number of distinct normalised strings common to both
    lists; the denominator is the number of gold keywords.

    Args:
        truth: gold keyword strings.
        predictions: predicted keyword strings.

    Returns:
        0 when either list is empty (previously an empty ``truth`` with
        non-empty ``predictions`` raised ``ZeroDivisionError``), otherwise a
        float.
    """
    if len(predictions) == 0 or len(truth) == 0:
        return 0
    predictions = [normalize_text(x) for x in predictions]
    truth = [normalize_text(x) for x in truth]
    common_keywords = set(truth) & set(predictions)
    return len(common_keywords) / len(truth)

def f1_score(truth:list, predictions:list):
    """Harmonic mean of ``precision`` and ``recall`` for one document; 0 when both are 0."""
    prec = precision(truth, predictions)
    rec = recall(truth, predictions)
    denominator = prec + rec
    if denominator == 0:
        return 0
    return 2 * (prec * rec) / denominator

In [None]:
!unzip Nguyen2007.zip

In [None]:
# Build the evaluation DataFrame (one row per document) from the extracted Nguyen2007 folder.
df = get_dataset()

In [None]:
# Run RAKE on every row (axis=1 passes each row to the extractor); performs Wikipedia lookups per keyword.
df["rake"] = df.apply(rake_extraction, axis=1)

In [None]:
# Run YAKE on every row (axis=1 passes each row to the extractor); performs Wikipedia lookups per keyword.
df["yake"] = df.apply(yake_extraction, axis=1)

In [None]:
# Run KeyBERT on every row (axis=1 passes each row to the extractor); performs Wikipedia lookups per keyword.
df["keybert"] = df.apply(keybert_extraction, axis=1)

In [None]:
# Run PyTextRank on every row (axis=1 passes each row to the extractor); performs Wikipedia lookups per keyword.
df["textrank"] = df.apply(textrank_extraction, axis=1)

In [None]:
# Replace `df` with results stored in keywords.xlsx.
# NOTE(review): no visible cell writes keywords.xlsx, and the "louis_algo" column read
# further down is not computed in the visible cells -- presumably both come from an
# earlier run; confirm, otherwise Restart & Run All will not reproduce the results.
# The cells below parse each column value with to_list(), so the sheet presumably
# stores the keyword lists as strings.
df = pd.read_excel("keywords.xlsx")

In [None]:
# Gold keywords per document: parse each stored list string and drop empty entries.
truth = [[keyword for keyword in to_list(cell) if keyword != ''] for cell in df["keywords"]]

In [None]:
# RAKE: parse the stored predictions, pair them with the gold keywords per document,
# and report the per-document metrics averaged over all documents.
rake_predictions = [[keyword for keyword in to_list(cell) if keyword != ''] for cell in df["rake"]]
rake = list(zip(truth, rake_predictions))

rake_precision = [precision(gold, predicted) for gold, predicted in rake]
rake_recall = [recall(gold, predicted) for gold, predicted in rake]
rake_f1 = [f1_score(gold, predicted) for gold, predicted in rake]

print("Precision: ", sum(rake_precision) / len(rake_precision))
print("Recall: ", sum(rake_recall) / len(rake_recall))
print("F1-score: ", sum(rake_f1) / len(rake_f1))

Precision:  0.0021738698853350723
Recall:  0.002378419031428276
F1-score:  0.0022681438682367106


In [None]:
# YAKE: parse the stored predictions, pair them with the gold keywords per document,
# and report the per-document metrics averaged over all documents.
# The pairs are kept in `yake_pairs`: assigning them to `yake` would overwrite the
# imported `yake` module and break any later call to yake_extraction.
yake_predictions = [[x for x in to_list(x) if x != ''] for x in list(df["yake"])]
yake_pairs = list(zip(truth, yake_predictions))

yake_precision = [precision(x[0], x[1]) for x in yake_pairs]
yake_recall = [recall(x[0], x[1]) for x in yake_pairs]
yake_f1 = [f1_score(x[0], x[1]) for x in yake_pairs]

print("Precision: ", sum(yake_precision)/len(yake_precision))
print("Recall: ", sum(yake_recall)/len(yake_recall))
print("F1-score: ", sum(yake_f1)/len(yake_f1))


Precision:  0.02804980900413081
Recall:  0.029697346297601117
F1-score:  0.028759857113784416


In [None]:
# KeyBERT: parse the stored predictions, pair them with the gold keywords per document,
# and report the per-document metrics averaged over all documents.
keybert_predictions = [[keyword for keyword in to_list(cell) if keyword != ''] for cell in df["keybert"]]
keybert = list(zip(truth, keybert_predictions))

keybert_precision = [precision(gold, predicted) for gold, predicted in keybert]
keybert_recall = [recall(gold, predicted) for gold, predicted in keybert]
keybert_f1 = [f1_score(gold, predicted) for gold, predicted in keybert]

print("Precision: ", sum(keybert_precision) / len(keybert_precision))
print("Recall: ", sum(keybert_recall) / len(keybert_recall))
print("F1-score: ", sum(keybert_f1) / len(keybert_f1))

Precision:  0.009289651387289679
Recall:  0.009865442333894024
F1-score:  0.009551831421156837


In [None]:
# TextRank: parse the stored predictions, pair them with the gold keywords per document,
# and report the per-document metrics averaged over all documents.
textrank_predictions = [[keyword for keyword in to_list(cell) if keyword != ''] for cell in df["textrank"]]
textrank = list(zip(truth, textrank_predictions))

textrank_precision = [precision(gold, predicted) for gold, predicted in textrank]
textrank_recall = [recall(gold, predicted) for gold, predicted in textrank]
textrank_f1 = [f1_score(gold, predicted) for gold, predicted in textrank]

print("Precision: ", sum(textrank_precision) / len(textrank_precision))
print("Recall: ", sum(textrank_recall) / len(textrank_recall))
print("F1-score: ", sum(textrank_f1) / len(textrank_f1))

Precision:  0.014169661294425392
Recall:  0.01444934520414026
F1-score:  0.014300345336592689


In [None]:
# "louis_algo" column: parse the stored predictions, pair them with the gold keywords
# per document, and report the per-document metrics averaged over all documents.
louis_predictions = [[keyword for keyword in to_list(cell) if keyword != ''] for cell in df["louis_algo"]]
louis = list(zip(truth, louis_predictions))

louis_precision = [precision(gold, predicted) for gold, predicted in louis]
louis_recall = [recall(gold, predicted) for gold, predicted in louis]
louis_f1 = [f1_score(gold, predicted) for gold, predicted in louis]

print("Precision: ", sum(louis_precision) / len(louis_precision))
print("Recall: ", sum(louis_recall) / len(louis_recall))
print("F1-score: ", sum(louis_f1) / len(louis_f1))

Precision:  0.07097288676236045
Recall:  0.012190792711182386
F1-score:  0.017737858318188995
