In [1]:
import pandas as pd
# Evaluation Metrics (ROUGE)
from rouge_score import rouge_scorer
import numpy as np

### Reading all the predicted_files for evaluating results

In [2]:
# RAKE predictions
rake_pred_df = pd.read_pickle('www_results/rake_pred_df.pkl')

# YAKE predictions
yake_pred_df = pd.read_pickle('www_results/yake_pred_df.pkl')

# TF-IDF predictions
tf_idf_pred_df = pd.read_pickle('www_results/tf_idf_pred_df.pkl')

# KeyBERT predictions
key_bert_pred_df = pd.read_pickle('www_results/key_bert_pred_df.pkl')

# LDA predictions
lda_pred_df = pd.read_pickle('www_results/lda_pred_df.pkl')

# PositionRank predictions
pr_pred_df = pd.read_pickle('www_results/pos_rank_pred_df.pkl')

### Refrence Dataset with labeled keywords
- This data frame is used to evalute the results for the predicted keywords

In [3]:
data = pd.read_pickle('data/Processed_WWW.pkl')

In [4]:
data

Unnamed: 0,Doc_no,Abstract,Keywords
0,183,eigentrust algorithm reputation management p p...,"distributed eigenvector computation,peer-to-pe..."
1,10119,simulation verification automated composition ...,"automated reasoning,daml,distributed systems,o..."
2,11785,context content based trust policy semantic we...,"named graphs,semantic web,trust mechanisms,tru..."
3,12102,meteor web service annotation framework world ...,"ontology,semantic annotation of web services,s..."
4,13109,detecting web page structure adaptive viewing ...,"adaptive hypermedia,content adaptation,mobile ..."
...,...,...,...
495,14449751,flexible generative model preference aggregati...,"collaborative filtering,learning,meta search,p..."
496,14453157,evaluation informational navigational intent g...,"diversification,evaluation,information search ..."
497,14453752,template based question answering rdf data inc...,"natural language patterns,question answering,s..."
498,14454508,zencrowd leveraging probabilistic reasoning cr...,"crowdsourcing,entity linking,linked data,proba..."


In [5]:
doc_no = data['Doc_no']
abstracts = data['Abstract']
refrence_data = data['Keywords'].tolist()

In [6]:
"""
All the algorithms realted results
"""
algorithm = ["RAKE", "YAKE", "TF-IDF", "KeyBERT", "LDA", "Position Rank"]
algo_precision_scores = []
algo_recall_scores = []
algo_f1_scores = []

### ROUGE Metrics for Evaluation of the results

In [7]:
"""
This function returns the average of precision, recall and f1score
based on the score from rouge1, rouge2, rougeL rounding off the final 
value to 2 decimal places for each doc in the dataset
"""
def get_avg_scores(result):
    precision_score = []
    recall_score = []
    f1_score = []
    for rouge in ['rouge1','rouge2','rougeL']:
        precision_score.append(result[rouge][0]*2)
        recall_score.append(result[rouge][1]*2)
        f1_score.append(result[rouge][2]*2)
    precision_score = np.round(np.average(precision_score), 2)
    recall_score = np.round(np.average(recall_score), 2)
    f1_score = np.round(np.average(f1_score), 2)
    return precision_score, recall_score, f1_score

In [8]:
def get_Rouge_scores(refrence_data, test_data):
    precision_score = []
    recall_score = []
    f1_score = []
    
    for refrence, test in zip(refrence_data, test_data):
        
        # modifying the format as a string with spaces
        refrence = refrence.replace(",", " ")
        test = test.replace(",", " ")
        scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'], use_stemmer=True)
        scores = scorer.score(refrence, test)
        
        # Getting the avg scores and appending them to the list
        p_score, r_score, f_score = get_avg_scores(scores)
        precision_score.append(p_score)
        recall_score.append(r_score)
        f1_score.append(f_score)
        
    results_df = pd.DataFrame(zip(doc_no,abstracts,precision_score, recall_score, f1_score), columns=['Doc_no', 'Abstract','Avg_precision_score','Avg_recall_score','Avg_f1_score'])
    return results_df

In [9]:
"""
This method returns the average scores for 
Precison, Recall and F1 
on the whole document
"""
def get_final_results(result):
    return np.average(result['Avg_precision_score']),np.average(result['Avg_recall_score']),np.average(result['Avg_f1_score'])

### Results for the RAKE Algorithm

In [10]:
rake_data = rake_pred_df['Extracted_Keywords'].tolist()

In [11]:
rake_results = get_Rouge_scores(refrence_data, rake_data)

In [12]:
rake_results.head()

Unnamed: 0,Doc_no,Abstract,Avg_precision_score,Avg_recall_score,Avg_f1_score
0,183,eigentrust algorithm reputation management p p...,0.13,0.38,0.2
1,10119,simulation verification automated composition ...,0.58,0.87,0.69
2,11785,context content based trust policy semantic we...,0.29,0.77,0.42
3,12102,meteor web service annotation framework world ...,0.51,0.83,0.63
4,13109,detecting web page structure adaptive viewing ...,0.06,0.22,0.1


In [13]:
print("---------------- RAKE RESULTS ----------------")
final_res = get_final_results(rake_results)
algo_precision_scores.append(final_res[0])
algo_recall_scores.append(final_res[1])
algo_f1_scores.append(final_res[2])
print(f"1. Avg. Precision score on the whole document : {final_res[0]}")
print(f"2. Avg. Recall score on the whole document : {final_res[1]}")
print(f"3. Avg. F1 score on the whole document : {final_res[2]}")

---------------- RAKE RESULTS ----------------
1. Avg. Precision score on the whole document : 0.20918
2. Avg. Recall score on the whole document : 0.48929999999999996
3. Avg. F1 score on the whole document : 0.28082


### Results for the YAKE Algorithm

In [14]:
yake_data = yake_pred_df['Extracted_Keywords'].tolist()

In [15]:
yake_results = get_Rouge_scores(refrence_data, yake_data)

In [16]:
yake_results.head()

Unnamed: 0,Doc_no,Abstract,Avg_precision_score,Avg_recall_score,Avg_f1_score
0,183,eigentrust algorithm reputation management p p...,0.19,0.38,0.25
1,10119,simulation verification automated composition ...,0.79,0.73,0.76
2,11785,context content based trust policy semantic we...,0.78,1.38,0.99
3,12102,meteor web service annotation framework world ...,0.87,0.94,0.9
4,13109,detecting web page structure adaptive viewing ...,0.1,0.22,0.13


In [17]:
print("---------------- YAKE RESULTS ----------------")
final_res = get_final_results(yake_results)
algo_precision_scores.append(final_res[0])
algo_recall_scores.append(final_res[1])
algo_f1_scores.append(final_res[2])
print(f"1. Avg. Precision score on the whole document : {final_res[0]}")
print(f"2. Avg. Recall score on the whole document : {final_res[1]}")
print(f"3. Avg. F1 score on the whole document : {final_res[2]}")

---------------- YAKE RESULTS ----------------
1. Avg. Precision score on the whole document : 0.31689999999999996
2. Avg. Recall score on the whole document : 0.5381400000000001
3. Avg. F1 score on the whole document : 0.38094


### Results for TF-IDF

In [18]:
tf_idf_data = tf_idf_pred_df['Extracted_Keywords'].tolist()

In [19]:
tf_idf_results = get_Rouge_scores(refrence_data, tf_idf_data)

In [20]:
tf_idf_results.head()

Unnamed: 0,Doc_no,Abstract,Avg_precision_score,Avg_recall_score,Avg_f1_score
0,183,eigentrust algorithm reputation management p p...,0.17,0.38,0.23
1,10119,simulation verification automated composition ...,0.76,0.88,0.82
2,11785,context content based trust policy semantic we...,0.51,1.04,0.68
3,12102,meteor web service annotation framework world ...,0.84,1.1,0.95
4,13109,detecting web page structure adaptive viewing ...,0.08,0.22,0.12


In [21]:
print("---------------- TF-IDF RESULTS ----------------")
final_res = get_final_results(tf_idf_results)
algo_precision_scores.append(final_res[0])
algo_recall_scores.append(final_res[1])
algo_f1_scores.append(final_res[2])
print(f"1. Avg. Precision score on the whole document : {final_res[0]}")
print(f"2. Avg. Recall score on the whole document : {final_res[1]}")
print(f"3. Avg. F1 score on the whole document : {final_res[2]}")

---------------- TF-IDF RESULTS ----------------
1. Avg. Precision score on the whole document : 0.29954000000000003
2. Avg. Recall score on the whole document : 0.5691600000000001
3. Avg. F1 score on the whole document : 0.37492000000000003


### Results for KeyBert Algorithm

In [22]:
kb_data = key_bert_pred_df['Extracted_Keywords'].tolist()

In [23]:
kb_results = get_Rouge_scores(refrence_data, kb_data)

In [24]:
kb_results.head()

Unnamed: 0,Doc_no,Abstract,Avg_precision_score,Avg_recall_score,Avg_f1_score
0,183,eigentrust algorithm reputation management p p...,0.09,0.19,0.12
1,10119,simulation verification automated composition ...,0.54,0.58,0.56
2,11785,context content based trust policy semantic we...,0.59,1.12,0.77
3,12102,meteor web service annotation framework world ...,0.27,0.31,0.29
4,13109,detecting web page structure adaptive viewing ...,0.0,0.0,0.0


In [25]:
print("---------------- KeyBERT RESULTS ----------------")
final_res = get_final_results(kb_results)
algo_precision_scores.append(final_res[0])
algo_recall_scores.append(final_res[1])
algo_f1_scores.append(final_res[2])
print(f"1. Avg. Precision score on the whole document : {final_res[0]}")
print(f"2. Avg. Recall score on the whole document : {final_res[1]}")
print(f"3. Avg. F1 score on the whole document : {final_res[2]}")

---------------- KeyBERT RESULTS ----------------
1. Avg. Precision score on the whole document : 0.25610000000000005
2. Avg. Recall score on the whole document : 0.45124000000000003
3. Avg. F1 score on the whole document : 0.31272000000000005


### Results for LDA Algorithm

In [26]:
lda_data = lda_pred_df['Extracted_Keywords'].tolist()

In [27]:
lda_results = get_Rouge_scores(refrence_data, lda_data)

In [28]:
lda_results.head()

Unnamed: 0,Doc_no,Abstract,Avg_precision_score,Avg_recall_score,Avg_f1_score
0,183,eigentrust algorithm reputation management p p...,0.13,0.38,0.19
1,10119,simulation verification automated composition ...,0.51,0.77,0.62
2,11785,context content based trust policy semantic we...,0.39,1.04,0.56
3,12102,meteor web service annotation framework world ...,0.54,0.88,0.67
4,13109,detecting web page structure adaptive viewing ...,0.0,0.0,0.0


In [29]:
print("---------------- LDA RESULTS ----------------")
final_res = get_final_results(lda_results)
algo_precision_scores.append(final_res[0])
algo_recall_scores.append(final_res[1])
algo_f1_scores.append(final_res[2])
print(f"1. Avg. Precision score on the whole document : {final_res[0]}")
print(f"2. Avg. Recall score on the whole document : {final_res[1]}")
print(f"3. Avg. F1 score on the whole document : {final_res[2]}")

---------------- LDA RESULTS ----------------
1. Avg. Precision score on the whole document : 0.20872
2. Avg. Recall score on the whole document : 0.52298
3. Avg. F1 score on the whole document : 0.28778


### Results for PositionRank Algorithm

In [30]:
pr_data = pr_pred_df['Extracted_Keywords'].tolist()

In [31]:
pr_results = get_Rouge_scores(refrence_data, pr_data)

In [32]:
pr_results.head()

Unnamed: 0,Doc_no,Abstract,Avg_precision_score,Avg_recall_score,Avg_f1_score
0,183,eigentrust algorithm reputation management p p...,0.14,0.57,0.23
1,10119,simulation verification automated composition ...,0.45,0.68,0.54
2,11785,context content based trust policy semantic we...,0.99,1.12,1.05
3,12102,meteor web service annotation framework world ...,0.6,0.78,0.68
4,13109,detecting web page structure adaptive viewing ...,0.05,0.22,0.08


In [33]:
print("---------------- PositionRank RESULTS ----------------")
final_res = get_final_results(pr_results)
algo_precision_scores.append(final_res[0])
algo_recall_scores.append(final_res[1])
algo_f1_scores.append(final_res[2])
print(f"1. Avg. Precision score on the whole document : {final_res[0]}")
print(f"2. Avg. Recall score on the whole document : {final_res[1]}")
print(f"3. Avg. F1 score on the whole document : {final_res[2]}")

---------------- PositionRank RESULTS ----------------
1. Avg. Precision score on the whole document : 0.2101
2. Avg. Recall score on the whole document : 0.46776
3. Avg. F1 score on the whole document : 0.26124


### Comparing the overall results

In [34]:
algo_comparision = pd.DataFrame(zip(algorithm, algo_precision_scores, algo_recall_scores, algo_f1_scores), columns=['Algorithm','Avg Precision Score', 'Avg Recall Score','Avg F1 Scores'])

In [35]:
print("\n----- Comparision of various algorithms for keywords extraction on WWW dataset -----")
display(algo_comparision)


----- Comparision of various algorithms for keywords extraction on WWW dataset -----


Unnamed: 0,Algorithm,Avg Precision Score,Avg Recall Score,Avg F1 Scores
0,RAKE,0.20918,0.4893,0.28082
1,YAKE,0.3169,0.53814,0.38094
2,TF-IDF,0.29954,0.56916,0.37492
3,KeyBERT,0.2561,0.45124,0.31272
4,LDA,0.20872,0.52298,0.28778
5,Position Rank,0.2101,0.46776,0.26124
