In [1]:
import os
import itertools
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score


## Verb phrase/noun phrase extraction performance evaluation

In [2]:
original = pd.read_csv(os.path.join('..', 'datasets', 'activity_dataset_final.csv'), sep=';')
# Exlude cases which contain multiple verb phrase cases 
# Such cases might come from disjunctive/conjunctiove clauses

def single_verb(x):
    if pd.isnull(x):
        return True
    return len(list(itertools.chain([v.split('|') for v in x.split(';')]))) == 1
    
ind_original = original['Verb phrases'].apply(single_verb)
original = original[ind_original]
ind_original_vp = ~pd.isnull(original['Verb phrases'])
original['VerbRequired'] = ind_original_vp.astype(int)
original['NounRequired'] = (~pd.isnull(original['Noun phrases'])).astype(int)


In [3]:
df_stanza = pd.read_csv('stanza-phrases.csv',sep=';')
df_stanza = df_stanza[ind_original]
df_stanza['VerbRequired'] = (~pd.isnull(df_stanza['VerbPhrases'])).astype(int)
df_stanza['NounRequired'] = (~pd.isnull(df_stanza['NounPhrases'])).astype(int)
df_stanza.head()


Unnamed: 0,Subject,Activity,VerbPhrases,NounPhrases,ProperNouns,VerbRequired,NounRequired
0,User,Open Enclosure,,Open Enclosure,,0,1
1,Liquid Cooling Module,Monitor LCM Status,Monitors,Status,LCM,1,1
2,TRS,Periodic Set Reference Temperature,,Periodic Set Reference Temperature,,0,1
3,User,Get M10 Tilt Angles,Gets,Angles,M10 Tilt,1,1
4,Translation Stage Drive,Preset FSS Translation Stage,,Preset FSS Translation Stage,,0,1


Accuracy of detection if verb phrase is extracted when required for transformation

In [4]:
print(confusion_matrix(original['VerbRequired'], df_stanza['VerbRequired']))
print('Accuracy:', accuracy_score(original['VerbRequired'], df_stanza['VerbRequired']))


[[ 106   10]
 [ 349 1506]]
Accuracy: 0.8178589548452562


Accuracy of detection if noun phrase is extracted when required for transformation

In [5]:
print(confusion_matrix(original['NounRequired'], df_stanza['NounRequired']))
print('Accuracy:', accuracy_score(original['NounRequired'], df_stanza['NounRequired']))


[[  27    6]
 [  44 1894]]
Accuracy: 0.9746321664129883


Calculate matching statistics for both cases when transformation output is repersented one or more noun phrases, or by a "verb phrase-noun phrase tuple". For the latter case, it is important to calculate errors for both extraction of verb phrase and noun phrase parts. If the output is represented by noun phrase, verb phrase entry in the original dataset is empty. 


In [6]:
def equal_verbs(verb1, verb2):
    if pd.isnull(verb1) & pd.isnull(verb2): return True
    if (~pd.isnull(verb1) and pd.isnull(verb2)) or (pd.isnull(verb1) and ~pd.isnull(verb2)): return False
    return verb1.lower() == verb2.lower()

def equal_outputs(out1, out2):
    if pd.isnull(out1) & pd.isnull(out2): return True
    if (~pd.isnull(out1) & pd.isnull(out2)) | (pd.isnull(out1) & ~pd.isnull(out2)): return False
    out1 = out1.split('|')
    out2 = out2.split('|')
    return sorted(list(map(lambda x: x.lower(), out1))) == sorted(list(map(lambda x: x.lower(), out2)))

# Extract matching statistics for cases when transformation output will be noun
# No need to check for verb extraction statistics
stanza_matches_outputn = pd.Series(map(lambda x, y: equal_outputs(x, y), original[~ind_original_vp]['Noun phrases'], df_stanza[~ind_original_vp]['NounPhrases']), 
                                   index=df_stanza[~ind_original_vp].index)

# Extract matching statistics for cases when transformation output will be as "verb-noun" type (e.g. association-class)
stanza_matches_outputv_np = pd.Series(map(lambda x, y: equal_outputs(x, y), original[ind_original_vp]['Noun phrases'], df_stanza[ind_original_vp]['NounPhrases']),
                                     index=df_stanza[ind_original_vp].index)
stanza_matches_outputv_vp = pd.Series(map(lambda x, y: equal_verbs(x, y), original[ind_original_vp]['Verb phrases'], df_stanza[ind_original_vp]['VerbPhrases']),
                                     index=df_stanza[ind_original_vp].index)


In [7]:
stanza_matches_outputn_nenp = stanza_matches_outputn[~pd.isnull(df_stanza[~ind_original_vp]['NounPhrases'])]
prec_outputn = sum(stanza_matches_outputn_nenp)/len(stanza_matches_outputn_nenp)
print('Noun extraction matching precision for transformation cases when output will be noun:', prec_outputn)
# No need to calculate precision for verb phrase part, as output will be only noun
print('Verb extraction matching precision for transformation cases when output will be noun:', np.nan)
stanza_matches_outputv_ne = ~pd.isnull(df_stanza[ind_original_vp]['NounPhrases'])
stanza_matches_outputv_nenp = stanza_matches_outputv_np[stanza_matches_outputv_ne]
prec_outputv_np = sum(stanza_matches_outputv_nenp)/len(stanza_matches_outputv_nenp)
print('Noun extraction matching precision for transformation cases when output will be noun:', prec_outputv_np)
stanza_matches_outputv_nevp = stanza_matches_outputv_vp[stanza_matches_outputv_ne]
prec_outputv_vp = sum(stanza_matches_outputv_nevp)/len(stanza_matches_outputv_nevp)  
print('Verb extraction matching precision for transformation cases when output will be noun:', prec_outputv_vp)


Noun extraction matching precision for transformation cases when output will be noun: 0.831858407079646
Verb extraction matching precision for transformation cases when output will be noun: nan
Noun extraction matching precision for transformation cases when output will be noun: 0.7604924454392837
Verb extraction matching precision for transformation cases when output will be noun: 0.6233911583659765


In [8]:
recall_outputn = sum(stanza_matches_outputn)/len(stanza_matches_outputn)
print('Noun extraction matching recall for transformation cases when output will be noun:', recall_outputn)
print('Verb extraction matching recall for transformation cases when output will be noun:', np.nan)
recall_outputv_np = sum(stanza_matches_outputv_np)/len(stanza_matches_outputv_np)
print('Noun extraction matching recall for transformation cases when output will be noun:', recall_outputv_np)
recall_outputv_vp = sum(stanza_matches_outputv_vp)/len(stanza_matches_outputv_vp)
print('Verb extraction matching recall for transformation cases when output will be noun:', recall_outputv_vp)


Noun extraction matching recall for transformation cases when output will be noun: 0.8103448275862069
Verb extraction matching recall for transformation cases when output will be noun: nan
Noun extraction matching recall for transformation cases when output will be noun: 0.7471698113207547
Verb extraction matching recall for transformation cases when output will be noun: 0.6215633423180593


In [9]:
def f1_score(precision, recall):
    return 2*precision*recall/(precision+recall)

print('Noun extraction matching F1 score for transformation cases when output will be noun:', f1_score(prec_outputn, recall_outputn))
print('Verb extraction matching F1 score for transformation cases when output will be noun:', np.nan)
print('Noun extraction matching F1 score for transformation cases when output will be noun:', f1_score(prec_outputv_np, recall_outputv_np))
print('Verb extraction matching F1 score for transformation cases when output will be noun:', f1_score(prec_outputv_vp, recall_outputv_vp)) 
    

Noun extraction matching F1 score for transformation cases when output will be noun: 0.8209606986899564
Verb extraction matching F1 score for transformation cases when output will be noun: nan
Noun extraction matching F1 score for transformation cases when output will be noun: 0.7537722648716106
Verb extraction matching F1 score for transformation cases when output will be noun: 0.6224759085614535


## Named entity recognition performance evaluation


In [10]:
ner_original = pd.read_csv(os.path.join('..', 'datasets', 'ner_dataset_final.csv'), sep=';')
ner_stanza = pd.read_csv('stanza-ner.csv', sep=';')


In [11]:
ner_original.head()

Unnamed: 0,Entry,Entities,EntityType
0,Storyboard,,
1,Building,,
2,review,,
3,end,,
4,Charles,Charles,PERSON


In [12]:
ner_stanza.head()

Unnamed: 0,Entry,Entities,EntityType
0,Storyboard,,
1,Building,,
2,review,,
3,end,,
4,Charles,Charles,PERSON


Detect if entity is extracted when required

In [13]:
ind_has_entity = ~pd.isnull(ner_original['Entities'])
ner_original['HasEntity'] = ind_has_entity.astype(int)
ner_stanza['HasEntity'] = (~pd.isnull(ner_stanza['Entities'])).astype(int)
print(confusion_matrix(ner_original['HasEntity'], ner_stanza['HasEntity']))
print('Accuracy:', accuracy_score(ner_original['HasEntity'], ner_stanza['HasEntity']))


[[1472  122]
 [  97  353]]
Accuracy: 0.8928571428571429


Calculate performance if valid named entities are extracted (at token level)

In [14]:
ner_matches = pd.Series(map(lambda x, y: equal_outputs(x, y), ner_original[ind_has_entity]['Entities'], ner_stanza[ind_has_entity]['Entities']),
                        index=ner_stanza[ind_has_entity].index)
stanza_matches_ne_token = ner_matches[~pd.isnull(ner_stanza[ind_has_entity]['Entities'])]
prec_ner_token = sum(stanza_matches_ne_token)/len(stanza_matches_ne_token)
print('NER precision:', prec_ner_token)
recall_ner_token = sum(ner_matches)/len(ner_matches)
print('NER recall:', recall_ner_token)
print('NER F1:', f1_score(prec_ner_token, recall_ner_token))


NER precision: 0.9320113314447592
NER recall: 0.7311111111111112
NER F1: 0.8194271481942714


Calculate performance if valid named entity types are extracted

In [15]:
ner_matches = pd.Series(map(lambda x, y: equal_outputs(x, y), ner_original[ind_has_entity]['EntityType'], ner_stanza[ind_has_entity]['EntityType']),
                        index=ner_stanza[ind_has_entity].index)
stanza_matches_ne_token = ner_matches[~pd.isnull(ner_stanza[ind_has_entity]['EntityType'])]
prec_ner_token = sum(stanza_matches_ne_token)/len(stanza_matches_ne_token)
print('NER precision:', prec_ner_token)
recall_ner_token = sum(ner_matches)/len(ner_matches)
print('NER recall:', recall_ner_token)
print('NER F1:', f1_score(prec_ner_token, recall_ner_token))


NER precision: 0.9348441926345609
NER recall: 0.7355555555555555
NER F1: 0.8233117135179748
