# Evaluation of keyword extraction

In [1]:
#imports
import os
import re

import numpy as np
import pandas as pd

import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy import ARRAY, String

from sklearn.metrics import precision_score, recall_score, f1_score

from nltk.stem import WordNetLemmatizer
import nltk

In [2]:
#download packages
# Fetch each NLTK resource in turn (a no-op when it is already up to date).
for _resource in ("wordnet", "omw-1.4"):
    nltk.download(_resource)

# Shared lemmatizer instance used by the cleaning step further down.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ubuntu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
#define connection to db
# The connection URL is taken from the KE_DB_URL environment variable so that
# credentials do not have to be stored in the notebook. The literal below is
# only the local-development fallback.
# NOTE(review): the fallback still embeds a password - remove it before the
# notebook is shared or committed.
connect_string = os.environ.get(
    'KE_DB_URL',
    'postgresql+psycopg2://postgres:5050@localhost:5432/postgres',
)
#define sql queries
# Extracted keywords (from title and abstract) per record.
query_corpus = 'SELECT dbrecordid, "MeSH_title", "MeSH_abs" FROM ke_stage."corpus_keywords_MeSH"'
# Evaluation table joined against the extracted keywords on dbrecordid.
query_golden = 'SELECT * FROM ke_stage.corpus_small_key_eval LIMIT 30000000'

#create engine
engine = create_engine(connect_string)
#read data as df
df_corpus = pd.read_sql(query_corpus, engine)
df_golden = pd.read_sql(query_golden, engine)

In [4]:
#merge both dfs
# Inner join: only records whose dbrecordid occurs in both frames are kept.
result = df_corpus.merge(df_golden, how='inner', on='dbrecordid')

In [5]:
# strip the qualifier after '/' (keeping the main heading), split on commas, then lowercase and lemmatize each term
def clean_df(df, lemmatize=None):
    """Normalise the raw terms stored under ``df['mesh']``.

    For every term the text from the first ``/`` onwards is removed, the
    remainder is split on commas, and each fragment is lower-cased, stripped
    of surrounding whitespace and lemmatized.

    Parameters
    ----------
    df : DataFrame or mapping
        Object whose ``'mesh'`` entry iterates over one collection of raw
        term strings per record.
    lemmatize : callable, optional
        Function mapping a string to its lemma. When omitted, the
        ``lemmatize`` method of the module-level ``lemmatizer`` is used.

    Returns
    -------
    list of list of str
        One list of cleaned terms per record, in input order. Records whose
        ``'mesh'`` value is missing (``None`` or a float such as NaN) yield
        an empty list instead of raising ``TypeError``.
    """
    if lemmatize is None:
        # Resolved at call time so the notebook-level lemmatizer is picked up.
        lemmatize = lemmatizer.lemmatize

    res = []
    for row in df['mesh']:
        # Missing values cannot be iterated; treat them as "no terms".
        if row is None or isinstance(row, float):
            res.append([])
            continue
        temp = []
        for item in row:
            # Drop everything from the first '/' to the end of the term.
            item = re.sub(r'\/(.*)$', '', item)
            # split(',') returns [item] when there is no comma, so one code
            # path normalises single and comma-separated terms identically.
            for fragment in item.split(','):
                temp.append(lemmatize(fragment.lower().strip()))
        res.append(temp)
    return res

In [6]:
#replace nan-values with empty list
def replace_nan(df, column):
    """Replace every non-list entry of ``df[column]`` with a new empty list.

    The column is overwritten on ``df`` itself and the same ``df`` object is
    returned.
    """
    def _list_or_empty(value):
        # Anything that is not already a list (NaN, None, ...) becomes [].
        if isinstance(value, list):
            return value
        return []

    cleaned = df[column].apply(_list_or_empty)
    df[column] = cleaned
    return df

In [7]:
# Add the cleaned gold terms, then make sure all three keyword columns hold lists.
result['clean_mesh'] = clean_df(result)
for _column in ('clean_mesh', 'MeSH_title', 'MeSH_abs'):
    result = replace_nan(result, _column)

In [33]:
# Inspect the merged frame, including the newly added clean_mesh column.
result

Unnamed: 0,dbrecordid,MeSH_title,MeSH_abs,mesh,clean_mesh
0,M30823664,"[proteostasis, presenilins, membrane]","[regulation, signal transduction, catalytic su...","[Autophagy, Amyloid Precursor Protein Secretas...","[autophagy, amyloid precursor protein secretas..."
1,M35681513,[tumors],"[mice, tumorigenesis, report, lncrna]","[Animals, Tumor Suppressor Protein p53/genetic...","[animal, tumor suppressor protein p53, sarcoma..."
2,AGRICOLAIND44399627,"[finland, toxoplasma gondii, sheep, seropreval...","[domestic sheep, adult, season, odds ratio, ma...",[],[]
3,AGRISUS201600201127,"[membrane, chondrocytes]","[membrane, membrane, membrane, membrane, membr...",[],[]
4,M14977554,[],"[prokaryotic cells, streptococcus, tyrosine, a...","[Prokaryotic Cells/enzymology, Bacteria/enzymo...","[prokaryotic cells, bacteria, catalysis, histi..."
...,...,...,...,...,...
484351,BASE::ftdoajarticles:oai:doaj.org/article:fb14...,"[sculpture, compréhension]","[elements, forms, engraving, atmosphere, movem...",[],[]
484352,BASE::fthzi:oai:repository.helmholtz-hzi.de:10...,[streptococcus],"[plasminogen, plasminogen, plasminogen, cell l...",[],[]
484353,M31862392,"[biofilm, pseudomonas aeruginosa, quorum sensi...","[biofilm, biofilm, biofilm, biofilm, biofilm, ...","[Pyocyanine/metabolism, Pseudomonas aeruginosa...","[pyocyanine, pseudomonas aeruginosa, lichen, i..."
484354,ULIDAT00537066,[],"[schweiz, umwelt, methoden, methoden, oesterre...",[],[]


In [34]:
#get list with all keywords
# Extracted keywords per record: title keywords followed by abstract keywords.
auto_keywords = [
    title_kw + abstract_kw
    for title_kw, abstract_kw in zip(result['MeSH_title'], result['MeSH_abs'])
]
# Gold keywords per record, in the same row order.
true_keywords = list(result['clean_mesh'])

In [35]:
def remove_duplicates(liste):
    """Return a copy of ``liste`` in which every sublist has unique items.

    The first occurrence of each item is kept and the original order is
    preserved, so the result is the same on every run (a plain ``set``
    round-trip orders strings according to the per-process hash seed).

    Parameters
    ----------
    liste : iterable of iterables
        One collection of hashable items per record.

    Returns
    -------
    list of list
        One de-duplicated list per input sublist, in input order.
    """
    # dict keys are unique and keep insertion order.
    return [list(dict.fromkeys(sublist)) for sublist in liste]

In [36]:
#remove duplicates
# De-duplicate the extracted list first, then the gold list.
auto_keywords, true_keywords = (
    remove_duplicates(keywords) for keywords in (auto_keywords, true_keywords)
)

In [37]:
# Record counts of both lists, one per line (they must match).
print(len(auto_keywords), len(true_keywords), sep='\n')

484356
484356


In [38]:
# empty an entry of the first list when the entry at the same position in the parallel list is empty
def empty_lists(pred_l, true_l):
    """Blank out entries of ``pred_l`` whose counterpart in ``true_l`` is falsy.

    ``pred_l`` is modified in place: each affected position is replaced by a
    new empty list. The same ``pred_l`` object is returned.
    """
    for position, _ in enumerate(pred_l):
        if true_l[position]:
            # Counterpart has content: leave this entry untouched.
            continue
        pred_l[position] = []
    return pred_l

In [39]:
# Blank extracted keywords wherever the gold list is empty ...
auto_keywords = empty_lists(auto_keywords, true_keywords)
# ... then blank gold keywords wherever the extracted list is empty.
# After both calls a record is empty in both lists if it was empty in either.
true_keywords = empty_lists(true_keywords,auto_keywords)

In [41]:
# Record counts of both lists, one per line (they must match).
print(len(auto_keywords), len(true_keywords), sep='\n')

484356
484356


In [42]:
#get same words from both lists (true positives)
def get_intersection(extract_list, true_list):
    """Return, per position, the items found in both input sublists.

    The result has one list per entry of ``extract_list``; the order of the
    items inside each result list is not defined.
    """
    return [
        list(set(extract_list[idx]) & set(true_list[idx]))
        for idx in range(len(extract_list))
    ]

In [43]:
def count_list_entities(liste):
    """Return the total number of items across all sublists of ``liste``."""
    return sum(map(len, liste))

In [44]:
#remove empty lists
# Filter the two lists as pairs so that position i in auto_keywords always
# refers to the same record as position i in true_keywords. Filtering them
# independently only stays aligned when the empty entries coincide, i.e. when
# the mutual-emptying cell has been run beforehand.
_kept_pairs = [
    (auto, true)
    for auto, true in zip(auto_keywords, true_keywords)
    if auto != [] and true != []
]
auto_keywords = [auto for auto, _ in _kept_pairs]
true_keywords = [true for _, true in _kept_pairs]

In [45]:
# Record counts of both lists, one per line (they must match).
print(len(auto_keywords), len(true_keywords), sep='\n')

174631
174631


In [46]:
# True positives: keywords that occur in both the extracted and the gold list
# of the same record.
tp_count = count_list_entities(get_intersection(auto_keywords, true_keywords))
# NOTE(review): despite its name this counts the gold keywords (true_keywords),
# so the ratio below is recall-style; records emptied/dropped in earlier cells
# are not part of the denominator - confirm this is the intended metric.
total_averbis_count = count_list_entities(true_keywords)

# Guard against an empty evaluation set: print nan instead of raising
# ZeroDivisionError.
print(tp_count/total_averbis_count if total_averbis_count else float('nan'))

0.1898790084410128
