In [1]:
import os
import string
import re 
import operator
import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from library import clean_text_simple,terms_to_graph,core_dec,accuracy_metrics

In [2]:
# Shared pre-processing resources used by the cells below.

# every ASCII punctuation mark except the dash
# (presumably so that intra-word dashes are kept by clean_text_simple -- confirm in library.py)
punct = ''.join(char for char in string.punctuation if char != '-')

# English stopword list from the NLTK corpus
stpwds = stopwords.words('english')

# Porter stemmer, applied to the gold standard keywords
stemmer = nltk.stem.PorterStemmer()

In [3]:
##################################
# read and pre-process abstracts #
##################################

path_to_abstracts = "data/Hulth2003testing/abstracts"
# sorted listing: documents are later matched to the gold standard by position
abstract_names = sorted(os.listdir(path_to_abstracts))

# report progress about five times per pass; clamped to 1 because
# round(len/5) is 0 for fewer than three files and 'counter % 0' would raise
abstracts_step = max(1, round(len(abstract_names)/5))

abstracts = []
for counter,filename in enumerate(abstract_names):
    # read file
    with open(os.path.join(path_to_abstracts, filename), 'r') as my_file:
        text = my_file.read().splitlines()
    text = ' '.join(text)
    # remove formatting (raw string: '\s' is an invalid escape in a plain literal)
    text = re.sub(r'\s+', ' ', text)
    abstracts.append(text)

    if counter % abstracts_step == 0:
        print(counter, 'files processed')

# tokenize/clean every abstract with the shared stopword list and punctuation set
abstracts_cleaned = []
for counter,abstract in enumerate(abstracts):
    my_tokens = clean_text_simple(abstract,my_stopwords=stpwds,punct=punct)
    abstracts_cleaned.append(my_tokens)

    if counter % abstracts_step == 0:
        print(counter, 'abstracts processed')

0 files processed
100 files processed
200 files processed
300 files processed
400 files processed
0 abstracts processed
100 abstracts processed
200 abstracts processed
300 abstracts processed
400 abstracts processed


In [4]:
###############################################
# read and pre-process gold standard keywords #
###############################################

path_to_keywords = "data/Hulth2003testing/uncontr"
# sorted listing: keyword files are later matched to the abstracts by position
keywd_names = sorted(os.listdir(path_to_keywords))

# report progress about five times; clamped to 1 because round(len/5) is 0
# for fewer than three files and 'counter % 0' would raise
keywds_step = max(1, round(len(keywd_names)/5))

keywds_gold_standard = []

for counter,filename in enumerate(keywd_names):
    # read file
    with open(os.path.join(path_to_keywords, filename), 'r') as my_file:
        text = my_file.read().splitlines()
    text = ' '.join(text)
    text = re.sub(r'\s+', ' ', text) # remove formatting (raw string avoids the invalid '\s' escape)
    text = text.lower()
    # turn string into list of keywords, preserving intra-word dashes
    # but breaking n-grams into unigrams
    keywds = text.split(';')
    keywds = [keywd.strip().split(' ') for keywd in keywds]
    keywds = [keywd for sublist in keywds for keywd in sublist] # flatten list
    # remove stopwords (rare but may happen due to n-gram breaking) and the empty
    # tokens produced by empty segments such as 'a;;b', a trailing ';' or an empty file
    keywds = [keywd for keywd in keywds if keywd and keywd not in stpwds]
    keywds_stemmed = [stemmer.stem(keywd) for keywd in keywds]
    # remove duplicates (may happen due to n-gram breaking); sorted so that the
    # list order is the same on every run instead of depending on set iteration order
    keywds_stemmed_unique = sorted(set(keywds_stemmed))
    keywds_gold_standard.append(keywds_stemmed_unique)

    if counter % keywds_step == 0:
        print(counter, 'files processed')

0 files processed
100 files processed
200 files processed
300 files processed
400 files processed


In [5]:
##############################
# precompute graphs-of-words #
##############################

# one graph per cleaned abstract; 11 is forwarded unchanged as the second
# argument of terms_to_graph (presumably the sliding-window size -- confirm in library.py)
gs = [terms_to_graph(cleaned_tokens,11) for cleaned_tokens in abstracts_cleaned]

In [6]:
##################################
# graph-based keyword extraction #
##################################

my_percentage = 0.33 # for PR and TF-IDF

method_names = ['kc','wkc','pr','tfidf']
# one independent result list per method
keywords = {method_name: [] for method_name in method_names}


def _main_core(core_numbers):
    """Return the keys of `core_numbers` whose value equals the maximum value.

    `core_numbers` is the mapping returned by core_dec (node name -> core number).
    Keys are returned in the mapping's iteration order, which is the order the
    previous sort-then-filter implementation produced (a stable sort followed by
    an equality filter keeps the original relative order). The maximum is computed
    once instead of once per item. An empty mapping yields an empty list.
    """
    if not core_numbers:
        return []
    top_core = max(core_numbers.values())
    return [name for name,core in core_numbers.items() if core == top_core]


# report progress about five times; clamped to 1 because round(len/5) is 0
# for fewer than three graphs and 'counter % 0' would raise
graphs_step = max(1, round(len(gs)/5))

for counter,g in enumerate(gs):
    # k-core: keep every node belonging to the main core
    core_numbers = core_dec(g,False)
    keywords['kc'].append(_main_core(core_numbers))

    # weighted k-core: same selection, on the weighted decomposition
    weighted_core_numbers = core_dec(g,True)
    keywords['wkc'].append(_main_core(weighted_core_numbers))

    # PageRank
    pr_scores = zip(g.vs['name'],g.pagerank())
    pr_scores = sorted(pr_scores, key=operator.itemgetter(1), reverse=True) # in decreasing order
    numb_to_retain = int(len(pr_scores)*my_percentage) # retain top 'my_percentage' % words as keywords
    keywords['pr'].append([name for name,score in pr_scores[:numb_to_retain]])

    if counter % graphs_step == 0:
        print(counter)

0
100
200
300
400


In [7]:
#############################
# TF-IDF keyword extraction #
#############################

# re-join the cleaned tokens so that TF-IDF starts from the same cleaned text as the other methods
# NOTE(review): TfidfVectorizer's default token_pattern keeps only runs of 2+ word
# characters, so dashed tokens are split again here -- confirm this is intended
abstracts_cleaned_strings = [' '.join(elt) for elt in abstracts_cleaned]
tfidf_vectorizer = TfidfVectorizer(stop_words=stpwds)
doc_term_matrix = tfidf_vectorizer.fit_transform(abstracts_cleaned_strings)
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement and
# fall back only on old releases. tolist() turns the numpy array into plain str objects.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()
# dense rows as plain lists (toarray() returns an ndarray rather than the legacy np.matrix)
vectors_list = doc_term_matrix.toarray().tolist()

# start from an empty list so that re-running this cell does not append a second copy
keywords['tfidf'] = []

# report progress about five times; clamped to 1 because round(len/5) is 0
# for fewer than three documents and 'counter % 0' would raise
tfidf_step = max(1, round(len(vectors_list)/5))

for counter,vector in enumerate(vectors_list):
    # (term, weight) pairs of the terms that actually occur in this document
    nonzero = [(term,weight) for term,weight in zip(terms,vector) if weight != 0]
    nonzero = sorted(nonzero, key=operator.itemgetter(1), reverse=True) # in decreasing order
    numb_to_retain = int(len(nonzero)*my_percentage) # retain top 'my_percentage' % words as keywords
    keywords['tfidf'].append([term for term,weight in nonzero[:numb_to_retain]])

    if counter % tfidf_step == 0:
        print(counter)

0
100
200
300
400


In [8]:
##########################
# performance comparison #
##########################

# per method: one accuracy_metrics result for every document
perf = {mn: [] for mn in method_names}

for idx,truth in enumerate(keywds_gold_standard):
    for mn in method_names:
        perf[mn].append(accuracy_metrics(keywords[mn][idx],truth))

lkgs = len(keywds_gold_standard)

# printed label and position of each metric inside an accuracy_metrics result
metric_positions = (('precision:', 0), ('recall:', 1), ('F-1 score:', 2))

for method,scores in perf.items():
    print(method + ' performance: \n')
    for label,position in metric_positions:
        # average over all documents, expressed as a percentage
        averaged = 100*sum(score[position] for score in scores)/lkgs
        print(label, round(averaged,2))
    print('\n')

kc performance: 

precision: 50.44
recall: 64.64
F-1 score: 53.67


wkc performance: 

precision: 62.82
recall: 49.24
F-1 score: 48.66


pr performance: 

precision: 54.36
recall: 34.69
F-1 score: 40.7


tfidf performance: 

precision: 59.21
recall: 38.5
F-1 score: 44.85


