# Evaluation Keyword extraction

In [2]:
#imports
import pandas as pd
import re
import numpy as np

import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy import ARRAY, String

from sklearn.metrics import precision_score, recall_score, f1_score

from nltk.stem import WordNetLemmatizer
import nltk

In [113]:
#download packages
nltk.download("wordnet")
nltk.download("omw-1.4")

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data] Downloading package omw-1.4 to /home/ubuntu/nltk_data...


True

In [4]:
#define connection to db 
connect_string = 'postgresql+psycopg2://postgres:5050@localhost:5432/postgres'
#define sql queries
query_corpus = 'SELECT dbrecordid, "MeSH_title", "MeSH_abs" FROM ke_stage."corpus_keywords_MeSH"'
query_golden = 'SELECT * FROM ke_stage.corpus_small_key_eval LIMIT 30000000'

#create engine
engine = create_engine(connect_string)
#read data as df
df_corpus = pd.read_sql(query_corpus, engine)
df_golden = pd.read_sql(query_golden, engine)

In [3]:
#merge both dfs
result = pd.merge(df_corpus, df_golden, on=['dbrecordid'], how='inner')

In [6]:
#result.to_csv('/home/ubuntu/ullrich/my_code/data/eval_keywords.csv', sep=',', encoding='utf-8')
#result = pd.read_csv('/home/ubuntu/ullrich/my_code/data/eval_keywords.csv', sep=',')

In [4]:
test = result[:5000] 

In [114]:
# delete the main topic, lower and lemmatize the words
def clean_df(df):
    res = [] 
    for row in df['mesh']:
        temp = []
        for item in row:
            item = re.sub(r'\/(.*)$','', item)
            if ',' in item:
                sublist = item.split(',')
                sublist = [split_item.lower().strip() for split_item in sublist]
                sublist2 =[]
                for word in sublist:
                    sublist2.append(lemmatizer.lemmatize(word))
                temp.extend(sublist2)
            else:
                temp.append(lemmatizer.lemmatize(item.lower()))
        res.append(temp)
    return res

In [110]:
#replace nan-values with empty list
def replace_nan(df, column):
    df[column] = df[column].apply(lambda d: d if isinstance(d, list) else [])
    return df

In [120]:
result['clean_mesh'] = clean_df(result)
result = replace_nan(result,'clean_mesh')
result = replace_nan(result,'MeSH_title')
result = replace_nan(result,'MeSH_abs')

In [116]:
result

Unnamed: 0,dbrecordid,MeSH_title,MeSH_abs,mesh,clean_mesh
0,M35920101,"[risk reduction, australia, dementia, adults]","[primary care, government, research, education...","[Humans, Aged, Australia, Dementia/prevention ...","[human, aged, australia, dementia, risk reduct..."
1,M35772114,[],"[health, health, will, environment, leadership...","[New Zealand, Delivery of Health Care, Humans,...","[new zealand, delivery of health care, human, ..."
2,M34481326,"[transient, brain, theta rhythms, architecture...","[brain, form]","[Oxytocin/administration & dosage, Adult, Alph...","[oxytocin, adult, alpha rhythm, brain, brain, ..."
3,M35816696,[cyclization],[],"[Cyclization, Esters, Peptides/chemistry]","[cyclization, ester, peptide]"
4,M32661566,"[kinetics, river]","[river, phenotype, paper, nature, understandin...","[Ants/physiology, Animals, Adhesiveness, Adapt...","[ant, animal, adhesiveness, adaptation, physio..."
...,...,...,...,...,...
484351,SOMED546225,[],"[cancer, cancer, cancer, cancer, mortality, mo...",[],[]
484352,AGRICOLAIND606461909,"[cells, membranes, membranes, water]","[electrolysis, proton, membranes, membranes, m...",[],[]
484353,M31483941,"[human, light, light, skin, safety]","[goal, light, light, therapeutic, safety]","[Low-Level Light Therapy, Humans, Light, Skin/...","[low-level light therapy, human, light, skin, ..."
484354,AGRICOLAIND605914486,[microbial fuel cell],"[microalgae, membranes, membranes, microbial f...",[],[]


In [None]:
test['clean_mesh'] = clean_df(test)
test = replace_nan(test,'clean_mesh')
test = replace_nan(test,'MeSH_title')
test = replace_nan(test,'MeSH_abs')

In [122]:
#get list with all keywords
auto_keywords = list(result['MeSH_title'] + result['MeSH_abs'])
true_keywords = result['clean_mesh'].values.tolist()

In [137]:
def remove_duplicates(liste):
    unique_lists = []
    for sublist in liste:
        unique_sublist = list(set(sublist))
        unique_lists.append(unique_sublist)
    return unique_lists

In [138]:
#remove duplicates
auto_keywords = remove_duplicates(auto_keywords)
true_keywords = remove_duplicates(true_keywords)

In [93]:
print(len(auto_keywords))
print(len(true_keywords))

484356
484356


In [76]:
auto_keywords[7]

['radiation',
 'atmosphere',
 'algorithm',
 'separation',
 'methods',
 'temperature',
 'heat',
 'chinese',
 'dataset']

In [139]:
# empty the list, when parrallel list is empty
def empty_lists(pred_l, true_l):
    for i in range(len(pred_l)):
        if not true_l[i]:  # check, if list is empty
                pred_l[i] = []  
    return pred_l

In [140]:
auto_keywords = empty_lists(auto_keywords, true_keywords)

In [141]:
#get same words from both lists (true positives)
def get_intersection(extract_list, true_list):
    res_liste =[]
    for i in range(len(extract_list)):
        res_liste.append(list(set(extract_list[i]).intersection(set(true_list[i]))))
    return res_liste

In [142]:
def count_list_entities(liste):
    count = 0
    for each in liste:
        count = count + len(each)
    return count

- tp = intersection beider listen
- fp = kommt in auto_key aber nicht in intersection
- tn = wörter in averbis aber nicht in intersection

In [149]:
tp_count = count_list_entities(get_intersection(auto_keywords, true_keywords))
total_averbis_count = count_list_entities(true_keywords)

print(tp_count/total_averbis_count)

0.22633776808531467


In [148]:
print(precision)
print(recall)

0.19553225776884528
1604226.0


In [57]:
print(count_list_entitys(res_liste))

5067


In [58]:
print(count_list_entitys(true_keywords))

28655


In [None]:
fn = count_list_entities(true_keywords)
fn2 = fn - tp_count
#print(total_pred_count)
#precision = tp_count / total_pred_count
#recall = tp_count / tp_count + fn2

In [None]:
for x in auto_keywords[:100]:
    for y in true_keywords[:100]:      
        matches = list(set(list1).intersection(list2))
        #accuracy = match_count / total_count

In [27]:
for x in auto_keywords[:100]:
    for y in true_keywords[:100]:
        matches = list(set(x).intersection(y))
        #matches = MultiLabelBinarizer().fit_transform(matches)
        pre = precision_score(y,x)
        print(pre)

ValueError: Found input variables with inconsistent numbers of samples: [7, 24]

In [22]:
from sklearn.metrics import precision_score, recall_score, f1_score

def pre_re(true_list, predicted_list):
    # Berechnung der true positives (TP), false positives (FP) und false negatives (FN)
    tp = len([1 for true, pred in zip(true_list, predicted_list) if pred in true])
    fp = len([1 for pred in predicted_list if pred not in true_list])
    fn = len([1 for true, pred in zip(true_list, predicted_list) if true not in pred])
    
    # Berechnung des Precision-Scores
    precision = tp / (tp + fp)

    # Berechnung des Recall-Scores
    recall = tp / (tp + fn)

    # Berechnung des F1-Scores
    #f1 = 2 * (int(precision) * int(recall)) / (int(precision) + int(recall))

    return precision, recall

In [110]:
# Berechnung des F1-Scores
precision, recall = pre_re(true_keywords, auto_keywords)

#print("F1-Score:", f1_score)
print("precision:", precision)
print('recall', recall)

precision: 0.0
recall 0.0


In [116]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
auto_keywords = mlb.fit([x for x in auto_keywords])
true_keywords = mlb.fit([x for x in true_keywords])

In [20]:
def calculation(true_list, pred_list):
    result = [] 
    tmp = {}
    trues = [item in true_list for item in pred_list]
    f1 = f1_score(trues, [True] * len(trues), average='weighted')
    #result.append(tmp)
    return f1

In [21]:
calculation(true_keywords,auto_keywords)

0.0004433497536945813

In [28]:
def calculate_precision_recall(expected, predicted):
    tp = 0  # True Positives
    fp = 0  # False Positives
    fn = 0  # False Negatives

    for i in range(len(expected)):
        for j in range(len(expected[i])):
            if expected[i][j] in predicted[i]:
                tp += 1
            else:
                fn += 1

    for i in range(len(predicted)):
        for j in range(len(predicted[i])):
            if predicted[i][j] not in expected[i]:
                fp += 1

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)

    return precision, recall

In [26]:
def calculate_precision_recall(expected, predicted):
    expected = [x for x in expected]
    predicted = [x for x in expected]  
    tp = len(set(expected) & set(predicted))  # True Positives
    fp = len(predicted) - tp  # False Positives
    fn = len(expected) - tp  # False Negatives

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)

    return precision, recall

In [29]:
precision, recall = calculate_precision_recall(true_keywords, auto_keywords)
print("Precision:", precision)
print("Recall:", recall)

Precision: 0.07772199560715407
Recall: 0.17746731148128247


- nur Englische Terme wurden annotiert?

In [105]:
def calculate_acc_score(combinations, pred_class):
    result = [] 
    total_count = len(pred_class)
    match_count = 0
    for item in pred_class:
        if item in combinations:
            match_count = match_count + 1
            print(item)
    #print(match_count)        
    accuracy = match_count / total_count
    return result