# JupyterLab Notebook to support the NLP-ADAPT experiment
## Copyright (c) Greg M. Silverman: Regents of the University of Minnesota

**Data prerequisites for analysis:**
1. Output from n-annotator systems (see [NLP-ADAPT](https://github.com/nlpie/nlp-adapt))
2. Manually annotated documents using BRAT annotation software
3. word2phrase trained model

In [None]:
from xmljson import badgerfish as bf
from xmljson import cobra as ca 
from xml.etree.ElementTree import fromstring
import json
from json import dumps
import pandas as pd
import re
import numpy as np
import math
from scipy import stats
import os, glob, path
from scipy.stats.mstats import gmean
import time
import gensim
import pandas as pd
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
from gensim.utils import tokenize
from sklearn.feature_extraction.text import CountVectorizer 

## Process CAS XMI

In [None]:
# https://gist.github.com/douglasmiranda/5127251
def find(key, dictionary):
    """
    Recursively yield every value stored under ``key`` in nested JSON.

    ``dictionary`` is normally a dict, but lists are walked element by
    element and any other value (str, int, None, ...) is ignored, so mixed
    lists such as ``[{"a": 1}, "text"]`` no longer raise AttributeError.

    A value found under ``key`` is yielded as-is; the search does not
    descend into that value.
    """
    if isinstance(dictionary, list):
        for item in dictionary:
            yield from find(key, item)
        return
    if not isinstance(dictionary, dict):
        # scalars carry no keys, so there is nothing to search
        return

    for k, v in dictionary.items():
        if k == key:
            yield v
        elif isinstance(v, (dict, list)):
            yield from find(key, v)

In [None]:
def system_sofa(lines):
    """
    Traverse the nested JSON form of a CAS XMI document and extract the
    Sofa (subject of analysis) element that carries the document text.

    Parameters:
        lines: sequence of strings; only ``lines[0]`` is parsed as XML.

    Returns:
        (text, xmi) where ``text`` is the Sofa element containing a
        ``sofaString`` entry and ``xmi`` is the list of children of the
        XMI root element.

    Raises:
        ValueError: if no Sofa element with a ``sofaString`` is present
        (previously this surfaced as an UnboundLocalError).
    """
    # XML -> Cobra-convention mapping -> JSON text -> plain Python objects
    # (presumably the round trip normalises xmljson's containers; TODO confirm)
    d = json.loads(dumps(ca.data(fromstring(lines[0]))))

    # print to get JSON representation of file
    # print(d)

    xmi = d.get("{http://www.omg.org/XMI}XMI").get("children")

    text = None
    for x in xmi:
        is_sofa = x.get('{http:///uima/cas.ecore}Sofa')
        # no break: when several Sofa elements qualify, the last one is kept
        if is_sofa and list(find('sofaString', is_sofa)):
            text = is_sofa

    if text is None:
        raise ValueError("no Sofa element with a 'sofaString' found in XMI document")

    return text, xmi

def myprint(d, begin, end):
    """
    Map a span to its associated text.

    Walks ``d`` in insertion order. The first dict-valued entry is
    searched recursively and its result is returned. Otherwise the entry
    at position 3 (the fourth one), unless its key is ``'mimeType'``,
    is sliced with ``[begin:end]`` when it is truthy.
    NOTE(review): position 3 is presumably where ``sofaString`` sits in
    the Cobra mapping of the Sofa attributes; confirm against real data.

    Returns:
        the slice ``v[begin:end]``, or None when nothing qualifies.
    """
    for i, (k, v) in enumerate(d.items()):
        if isinstance(v, dict):
            # recurse once (the original called this twice and discarded
            # the first result)
            return myprint(v, begin, end)
        if i == 3 and k != 'mimeType':
            if v:
                return v[begin:end]
    return None

# test
# test slicing from text
#v = myprint(text, 0, 1540)
#print(v)

## Initialization class for UIMA system annotation retrieval

In [None]:
class AnnotationSystems(object):
    """
    Registry of the UIMA type-system namespaces and the per-system
    annotation type names used when walking CAS XMI documents.

    CAS XMI -> JSON mappings are done as per the Cobra convention, so
    element keys take the form ``'{' + namespace + '}' + type_name``.
    """

    def __init__(self):
        """Populate the namespace and type-name lists."""

        # annotation base types (namespaces)
        self.annotation_type_system = ['http:///org/metamap/uima/ts.ecore',
                                     'http:///org/apache/ctakes/typesystem/type/syntax.ecore',
                                     'http:///org/apache/uima/ruta/type.ecore',
                                     'http:///edu/uth/clamp/nlp/typesystem.ecore',
                                     'http:///org/apache/ctakes/typesystem/type/textsem.ecore',
                                     'http:///org/apache/ctakes/typesystem/type/textspan.ecore',
                                     'http:///biomedicus/v2.ecore']

        # annotation relation types: TODO -> future examination of these
        self.annotation_type_relations = ['{http:///edu/uth/clamp/nlp/typesystem.ecore}ClampNameEntityUIMA',
                                     '{http:///org/apache/ctakes/typesystem/type/textsem.ecore}SemanticArgument',
                                     '{http:///org/apache/ctakes/typesystem/type/textsem.ecore}Predicate']

        self.annotation_relation_attributes = ['ClampRelationUIMA', 'SemanticRoleRelation']

        # system types of interest
        self.biomedicus_types = ['Acronym',
                         'DayOfWeek',
                         'DictionaryTerm',
                         'Fuzzy',
                         'Historical',
                         'IndefiniteQuantifierCue',
                         'ModificationCue',
                         #'Negated',
                         'NormForm',
                         'Number',
                         'NumberRange',
                         'OtherAcronymSense',
                         'ParseToken',
                         'SeasonWord',
                         'StandaloneQuantifier',
                         'TemporalPhrase',
                         'TextData',
                         'TimeUnits',
                         'UmlsConcept',
                         'YearNumber']

        self.clamp_types = ['All',
                             'Any',
                             'BaseToken',
                             #'CW',
                             'Chunk',
                             'ClampNameEntityUIMA',
                             #'ClampRelationUIMA', # TODO with NN
                             #'ConllDependencyNode',
                             'NUM',
                             'RutaBasic',
                             #'SPACE',
                             #'SPECIAL',
                             #'SW',
                             'Segment',
                             'Sentence',
                             'TokenSeed',
                             'W']

        self.ctakes_types = ['AnatomicalSiteMention',
                             'ConllDependencyNode',
                             'ContractionToken',
                             'DateAnnotation',
                             'DiseaseDisorderMention',
                             'EntityMention',
                             'EventMention',
                             'FractionAnnotation',
                             'IdentifiedAnnotation',
                             'MeasurementAnnotation',
                             'MedicationMention',
                             # NOTE(review): cTAKES appears to name this type
                             # 'NumToken' -- confirm the spelling below
                             'NumeToken',
                             'Predicate',
                             'ProcedureMention',
                             'RangeAnnotation',
                             'RomanNumeralAnnotation',
                             'SemanticArgument', # TODO with NN
                             #'SemanticRoleRelation',
                             'Sentence',
                             'SignSymptomMention',
                             'UmlsConcept',
                             'WordToken']

        self.metamap_types = ['AcronymAbbrev',
                             'Annotation',
                             'AnnotationBase',
                             'Candidate',
                             #'Negation'#,
                             'Phrase',
                             'Span',
                             'Utterance']

        self.amicus_types = ['AnatomicalSiteMention',
                             'Candidate',
                             'Chunk',
                             'IndefiniteQuantifierCue',
                             #'MedicationMention',
                             'Number',
                             'Phrase',
                             'Predicate',
                             'SemanticArgument',
                             'SignSymptomMention',
                             'StandaloneQuantifier',
                             'UmlsConcept']

        self.amicus_type = ['IndefiniteQuantifierCue','StandaloneQuantifier','Number']


    def get_system_type(self, system):
        """
        Return the list of annotation type names for ``system``.

        Parameters:
            system: one of 'biomedicus', 'clamp', 'ctakes', 'metamap',
                'amicus'.

        Raises:
            ValueError: for any other system name (previously this
            surfaced as an UnboundLocalError).
        """
        types_by_system = {
            'biomedicus': self.biomedicus_types,
            'clamp': self.clamp_types,
            'ctakes': self.ctakes_types,
            'metamap': self.metamap_types,
            'amicus': self.amicus_types,
        }

        try:
            return types_by_system[system]
        except (KeyError, TypeError):
            raise ValueError('unknown annotation system: {0!r}'.format(system)) from None


def get_system_annotations(xmi, system, case, text, type_of_analysis=None):
    """
    Traverse the JSON representation of an XMI CAS object (nested mapping
    as per the Cobra convention) and collect the annotations of ``system``.

    Parameters:
        xmi: list of child elements of the XMI root.
        system: system name understood by AnnotationSystems.get_system_type.
        case: case identifier stored on every emitted annotation.
        text: Sofa element passed to myprint to recover the covered text.
        type_of_analysis: when not None, relationship types listed in
            AnnotationSystems.annotation_relation_attributes are walked
            instead of being skipped.

    Returns:
        (df, annotation_out): the de-duplicated annotations as a DataFrame
        and as a list of dicts with keys system/type/begin/end/text/case.

    NB: the relationship branches only resolve the linked elements; the
    looked-up attributes are currently discarded (TODO in the original).
    Relations link to their arguments by XMI id, e.g.
      <textsem:SemanticRoleRelation argument="34379" predicate="34372" xmi:id="34385"/>
      <textsem:SemanticArgument begin="1400" end="1403" relation="34385" xmi:id="34379"/>
      <textsem:Predicate begin="1385" end="1396" relations="34385" xmi:id="34372"/>
    """

    annSys = AnnotationSystems()
    types = annSys.get_system_type(system)
    relation_attributes = annSys.annotation_relation_attributes

    xmi_id = '{http://www.omg.org/XMI}id'
    clamp_entity = '{http:///edu/uth/clamp/nlp/typesystem.ecore}ClampNameEntityUIMA'
    semantic_argument = '{http:///org/apache/ctakes/typesystem/type/textsem.ecore}SemanticArgument'
    semantic_predicate = '{http:///org/apache/ctakes/typesystem/type/textsem.ecore}Predicate'

    annotation_out = []

    # traverse xmi JSON blob
    for x in xmi:
        for t in types: # parse by system type
            for ats in annSys.annotation_type_system: # with base types
                key = '{' + ats + '}' + t

                if not list(find(key, x)):
                    continue

                if type_of_analysis is not None and t in relation_attributes: # traverse relationship type
                    if t == 'ClampRelationUIMA':
                        endTo = x[key]['attributes']['entTo']
                        endFrom = x[key]['attributes']['entFrom']

                        for x1 in xmi:
                            if list(find(clamp_entity, x1)):
                                entity = x1[clamp_entity]['attributes']
                                if entity[xmi_id] == endTo:
                                    pass  # TODO: record span of the 'to' entity
                                if entity[xmi_id] == endFrom:
                                    pass  # TODO: record span of the 'from' entity

                    if t == 'SemanticRoleRelation':
                        # fixed: the relation element is x; the original
                        # indexed the type-name string t (TypeError)
                        argument = x[key]['attributes']['argument']
                        predicate = x[key]['attributes']['predicate']

                        for x1 in xmi:
                            if list(find(semantic_argument, x1)):
                                if x1[semantic_argument]['attributes'][xmi_id] == argument:
                                    pass  # TODO: record span of the argument

                        for x1 in xmi:
                            # fixed: test the candidate x1, not the relation x
                            if list(find(semantic_predicate, x1)):
                                if x1[semantic_predicate]['attributes'][xmi_id] == predicate:
                                    pass  # TODO: record span of the predicate

                elif t not in relation_attributes: # normal system type
                    begin = x[key]['attributes']['begin']
                    end = x[key]['attributes']['end']
                    d = {'system': system, 'type': t, 'begin': begin, 'end': end, 'text': myprint(text, int(begin), int(end)), 'case': case}

                    if d not in annotation_out:
                        annotation_out.append(d)

    df = pd.DataFrame(annotation_out)

    # test:
    # examine output from parsed xmi
    # print(df)
    return df, annotation_out

## Various methods for parsing BRAT ann files

In [None]:
# brat
# https://stackoverflow.com/questions/15325182/how-to-filter-rows-in-pandas-by-regex

# entity = test[test.brat_id.str.startswith("T")]
# relation = test[test.brat_id.str.startswith("R")]
# attribute = test[test.brat_id.str.startswith("A")]

In [None]:
def get_span(string):
    """
    Return (first, last) offsets of a BRAT entity mapping as strings:
    the first run of digits that follows whitespace and the run of
    digits that ends the string.
    """
    found = re.search(r'\s(\d+).*?(\d+$)', string)
    return found.groups()

def get_entity_type(string):
    """Return the first whitespace-delimited token (the entity type)."""
    return string.split(maxsplit=1)[0]

def get_relation_type(string):
    """Return the leading token of a BRAT relation mapping (its type)."""
    tokens = string.split(maxsplit=1)
    return tokens[0]

def get_relation_entities(string):
    """
    Return the entity ids of the two relation arguments, i.e. the part
    after ':' in the second and third tokens.
    """
    tokens = string.split()
    first_arg = tokens[1].split(':')[1]
    second_arg = tokens[2].split(':')[1]
    return (first_arg, second_arg)
    
def get_attribute_type(string):
    """Return the attribute type: the first token of the mapping."""
    first_token = string.split(maxsplit=1)[0]
    return first_token

def get_attribute_entity(string):
    """Return the second token: the id of the entity the attribute marks."""
    tokens = string.split()
    return tokens[1]

def get_attribute_degree(string):
    """Return the third token (the attribute value), or None if absent."""
    tokens = string.split()
    return tokens[2] if len(tokens) > 2 else None

## Methods for parsing BRAT annotations

In [None]:
def get_gold_annotations(df):
    """
    Split BRAT annotations into entity / relation / attribute columns and
    break the entity span into begin/end.

    ``df`` is modified in place (seven derived columns are added).

    Returns:
        (df, span, comp, span_comp): the augmented input, the begin/end
        frame, the entity rows, and the entity rows joined with begin/end.
    """

    # annotation kind is given by the id prefix,
    # as per spec: http://brat.nlplab.org/standoff.html
    brat_ids = df.brat_id.str
    entity_rows = df[brat_ids.startswith("T")]
    relation_rows = df[brat_ids.startswith("R")]
    attribute_rows = df[brat_ids.startswith("A")]

    # derived columns, in creation order: (column, source rows, parser);
    # rows of another kind are left null by index alignment
    derived = (
        ('entity_span', entity_rows, get_span),
        ('entity_type', entity_rows, get_entity_type),
        ('relation_type', relation_rows, get_relation_type),
        ('relation_entities', relation_rows, get_relation_entities),
        ('attribute_type', attribute_rows, get_attribute_type),
        ('attribute_entity', attribute_rows, get_attribute_entity),
        ('attribute_degree', attribute_rows, get_attribute_degree),
    )
    for column, rows, parser in derived:
        df[column] = rows['brat_mapping'].apply(parser)

    # entity rows only
    has_span = df.entity_span.notnull()
    comp = df[has_span][['text', 'entity_span', 'brat_id', 'entity_type', 'case']]

    # one column per span boundary
    span = comp['entity_span'].apply(pd.Series)
    span.columns = ['begin', 'end']

    # re-attach the boundaries to their rows by index
    joined = span.merge(comp, how='inner', left_index=True, right_index=True)
    span_comp = joined[['text', 'begin', 'end', 'brat_id', 'case']]

    return df, span, comp, span_comp

In [None]:
# get relations from .ann df
def get_gold_relations(df, span_comp):
    """
    Build one row per BRAT relation with the text and span of both of
    its argument entities, looked up in ``span_comp`` by entity id and case.
    """

    # relation rows only, reduced to the relation columns
    has_relation = df.relation_entities.notnull()
    comp_r = df[has_relation][['relation_type', 'relation_entities', 'brat_id', 'case']]

    # one column per relation argument
    args = comp_r['relation_entities'].apply(pd.Series)
    args.columns = ['entity1_r', 'entity2_r']

    # re-attach the arguments to their rows by index
    relations = args.merge(comp_r, how='inner', left_index=True, right_index=True)
    relations = relations[['relation_type', 'relation_entities','entity1_r', 'entity2_r', 'case']]

    def with_entity(id_column, renames):
        # left join keeps relations whose argument entity is not found
        joined = relations.merge(span_comp, how='left',
                                 left_on=[id_column, 'case'],
                                 right_on=['brat_id', 'case'])
        return joined.rename(columns=renames)

    left_side = with_entity('entity1_r', {'text': 'text_entity1',
                                          'begin': 'entity1_begin',
                                          'end': 'entity1_end',
                                          'brat_id': 'entity1',
                                          'case_y': 'case'})
    right_side = with_entity('entity2_r', {'text': 'text_entity2',
                                           'begin': 'entity2_begin',
                                           'end': 'entity2_end',
                                           'brat_id': 'entity2',
                                           'case_y': 'case'})

    # both sides share a row order, so pair them up by index;
    # shared column names come back suffixed _x / _y
    paired = left_side.merge(right_side, how='inner', left_index=True, right_index=True)
    paired = paired[['relation_type_x', 'entity1_r_x', 'entity2_r_x',
                     'text_entity1', 'entity1_begin', 'entity1_end', 'entity1',
                     'text_entity2', 'entity2_begin', 'entity2_end', 'entity2',
                     'case_x']]

    return paired.rename(columns={'relation_type_x': 'relation_type',
                                  'entity1_r_x': 'entity1_r',
                                  'entity2_r_x': 'entity2_r',
                                  'case_x': 'case'})

In [None]:
# get attributes from .ann df
def get_gold_attributes(df, span_comp):
    """
    Return the BRAT attribute rows together with the text of the entity
    each attribute refers to (matched on entity id and case).
    """
    has_attribute = df.attribute_entity.notnull()
    attribute_rows = df[has_attribute][
        ['attribute_type', 'attribute_entity', 'attribute_degree', 'brat_id', 'case']]

    # left join: attributes whose entity is missing keep a null text
    joined = attribute_rows.merge(span_comp,
                                  how='left',
                                  left_on=['attribute_entity','case'],
                                  right_on=['brat_id', 'case'])
    # brat_id exists on both sides; keep the attribute's own id
    joined = joined.rename(columns={'case_x': 'case', 'brat_id_x': 'brat_id'})

    return joined[['attribute_type', 'attribute_entity', 'attribute_degree', 'brat_id', 'text']]

### Various methods 

In [None]:
def df_to_list(df, ann_type):
    """
    Convert a system or gold entity dataframe into a list of unique dicts.

    Every dict has case/begin/end/text. ``ann_type == 'system'`` adds
    'system' and 'type'; ``ann_type == 'gold'`` adds 'gold_entity_type'
    (read from the 'entity_type' column). Any other value adds nothing.
    """
    shared_fields = ('case', 'begin', 'end', 'text')
    # output key -> row attribute, per annotation kind
    extra_fields = {
        'system': (('system', 'system'), ('type', 'type')),
        'gold': (('gold_entity_type', 'entity_type'),),
    }

    records = []
    for row in df.itertuples(index=True):
        record = {name: getattr(row, name) for name in shared_fields}
        for out_key, attribute in extra_fields.get(ann_type, ()):
            record[out_key] = getattr(row, attribute)

        if record in records:
            continue
        records.append(record)

    return records

In [None]:
# co-occurences 
def get_cooccurences(gold_out, system_out, nested = True):
    """
    Pair gold annotations with system annotations of the same case.

    nested=True: one span lies inside the other while sharing at most one
    boundary (identical spans do not qualify). nested=False: the spans
    cross a boundary of the gold span. In both modes the system text may
    be at most twice as long as the gold text, and each gold annotation
    takes the first qualifying system annotation.

    Returns an object carrying the counts, the (system, gold) matches and
    the (placeholder, gold) false negatives.
    """
    class Coocurences(object):
        def __init__(self):
            self.gold_system_match = 0
            self.gold_only = 0
            self.system_only = 0
            self.system_n = 0
            self.gold_n = 0
            self.matches = []
            self.false_negatives = []

    def contains(sb, se, gb, ge):
        # system inside gold, or gold inside system; never identical spans
        return ((sb >= gb and se < ge) or
                (sb > gb and se <= ge) or
                (gb >= sb and ge < se) or
                (gb > sb and ge <= se))

    def crosses(sb, se, gb, ge):
        # system span straddles the end, the start, or both ends of gold
        return ((sb >= gb and sb < ge and se > ge) or
                (sb < gb and se > gb and se < ge) or
                (sb < gb and se > gb and se > ge))

    spans_qualify = contains if nested else crosses
    result = Coocurences()

    for g in gold_out:
        g_begin = int(g['begin'])
        g_end = int(g['end'])
        g_case = g['case']

        for s in system_out:
            s_begin = int(s['begin'])
            s_end = int(s['end'])
            s_case = s['case']

            if (g_case == s_case
                    and spans_qualify(s_begin, s_end, g_begin, g_end)
                    and (s, g) not in result.matches
                    and len(s['text']) <= 2*len(g['text'])):
                result.matches.append((s, g))
                break
        else:
            # loop ran out without a match, so count a false negative
            result.gold_only += 1

            # all-None system placeholder, used for bic analysis
            placeholder = dict.fromkeys(
                ('case', 'begin', 'end', 'text', 'system', 'type'))
            result.false_negatives.append((placeholder, g))

    # use for metrics
    n_matches = len(result.matches)
    result.gold_system_match = n_matches
    result.system_only = len(system_out) - n_matches
    result.system_n = len(system_out)
    result.gold_n = len(gold_out)

    # sanity check
    if len(gold_out) - result.gold_system_match < 0:
        print(result.matches)

    return result


In [None]:
def get_metrics(system_only, gold_only, gold_system_match, system_n):
    """
    Return an instance holding the given counts, with a method that
    computes confusion matrix metrics from them.

    Parameters:
        system_only: count used as false positives (FP).
        gold_only: count used as false negatives (FN).
        gold_system_match: count used as true positives (TP).
        system_n: total number of system annotations.
    """

    class Metrics(object):
        """
        metrics class
        """
        def __init__(self):
            self.system_only = system_only
            self.gold_only = gold_only
            self.gold_system_match = gold_system_match
            self.system_n = system_n

        def get_confusion_metrics(self):
            """
            compute confusion matrix measures, as per
            https://stats.stackexchange.com/questions/51296/how-do-you-calculate-precision-and-recall-for-multiclass-classification-using-co

            Returns (F, recall, precision, TP, FP, FN, TP_FN_R, TM).
            F, recall and precision are 2-element arrays; index 1 holds
            the value for the positive class, index 0 is a by-product and
            is typically nan.

            Raises ZeroDivisionError when system_n is 0.
            """
            TP = self.gold_system_match
            FP = self.system_only
            FN = self.gold_only
            TM = TP/math.sqrt(self.system_n) # TigMetric

            c = np.asarray([[0, FP], [FN, TP]])
            diagonal = np.diag(c)

            # the first class has a zero diagonal, so its ratios are 0/0;
            # silence the resulting numpy warnings, the values are unchanged
            with np.errstate(divide='ignore', invalid='ignore'):
                recall = diagonal / np.sum(c, axis = 1)
                precision = diagonal / np.sum(c, axis = 0)
                F = 2*(precision*recall)/(precision + recall)

            # Tignanelli Metric: TP itself when there are no false negatives
            TP_FN_R = TP if FN == 0 else TP/FN

            return F, recall, precision, TP, FP, FN, TP_FN_R, TM


    return Metrics()

In [None]:
# sanity test

#recall = TP/(FN + TP)

#precision = TP/(FP + TP)
#F = 2*(precision*recall)/(precision + recall)

#print('F-score: {0}, precision: {1}, recall: {2}'.format(F, precision, recall))

## Read in all CAS XMI and BRAT ann files for analysis

In [None]:
def get_evaluation_data(cs, case_config, run_all=True):
    """
    Load the gold (BRAT .ann) data, and optionally the system (CAS XMI)
    annotations, for every case of the given partition.

    Parameters:
        cs: settings object providing gold_path, system_path, systems
            and amicus.
        case_config: (cases, txt_directory, partition).
        run_all: when false, system annotations are skipped and only the
            gold files are read.

    Returns:
        (gold_df, system_df, system_out)
    """

    gold_root = cs.gold_path
    cases, txt_directory, partition = case_config
    gold_directory = gold_root + txt_directory

    # named columns from file
    ann_columns = ['brat_id', 'brat_mapping', 'text']

    gold_df = pd.DataFrame()
    system_df = pd.DataFrame()
    system_out = [] # write sys_out

    # amicus partition is pared down to just amicus
    systems = cs.amicus if partition == 'amicus' else cs.systems

    for case in cases: # iterate through partitioned cases

        if run_all: # used to control whether to skip creating system annotations
            for system in systems:
                system_file = cs.system_path + system + '_out/' + case + '-v1.txt.xmi'

                with open(system_file, 'r') as f:
                    lines = [line.strip() for line in f.readlines()]

                # sofa text plus the parsed XMI children
                text, xmi = system_sofa(lines)

                # system annotations as df and as list
                df, sys_out = get_system_annotations(xmi, system, case, text)
                system_df = pd.concat([system_df, df], ignore_index=True)

                # newest annotations go to the front of the list
                system_out = sys_out + system_out

        # dataframe of annotations from .ann
        gold_temp = pd.read_table(gold_directory + case + '-v1.ann',
                                  header=None,
                                  names=ann_columns)
        gold_temp['case'] = case
        gold_df = pd.concat([gold_df, gold_temp], ignore_index=True)

    return gold_df, system_df, system_out

In [None]:
def metrics_out(cs, gold_out, system_out, case_config, nested = True):
    """
    Compute confusion metrics for every combination of system, system
    annotation type and gold entity type that has at least one match.

    Returns a DataFrame with one row per such combination.
    """

    metrics = pd.DataFrame()
    cases, txt_directory, partition = case_config

    # amicus partition is pared down to just amicus
    systems = cs.amicus if partition == 'amicus' else cs.systems

    for system_name in systems:
        for gold_entity in cs.gold_entities:
            gold = [g for g in gold_out if g['gold_entity_type'] == gold_entity]

            # system types for iterable
            for type_name in AnnotationSystems().get_system_type(system_name):
                system = [s for s in system_out
                          if s['system'] == system_name and s['type'] == type_name]

                c = get_cooccurences(gold, system, nested) # matches, FN, etc.

                if not c.gold_system_match > 0:
                    continue

                # confusion matrix metrics -> dictionary -> df
                confusion = get_metrics(c.system_only,
                                        c.gold_only,
                                        c.gold_system_match,
                                        c.system_n).get_confusion_metrics()
                F, recall, precision, TP, FP, FN, TP_FN_R, TM = confusion

                row = {'system': system_name,
                       'type': type_name,
                       'entity': gold_entity,
                       'F': F[1],
                       'precision': precision[1],
                       'recall': recall[1],
                       'TP': TP,
                       'FN': FN,
                       'FP': FP,
                       'TP/FN': TP_FN_R,
                       'n_gold': c.gold_n,
                       'n_sys': c.system_n,
                       'TM': TM}

                metrics = pd.concat([metrics, pd.DataFrame(row, index=[0])], ignore_index=True)
                # needed due to duplicates in brat entity list!! TODO: remove
                metrics.drop_duplicates(keep='last', inplace=True)

    return metrics


In [None]:
# # get all anotations
# def get_gold_ann_data():

#     import re, os, glob, path
#     import pandas as pd

#     directory_to_parse = cs.gold_path + '/all'
#     os.chdir(directory_to_parse)

#     test = pd.DataFrame()
#     #for fname in glob.glob("/Volumes/GrenziData/development/nlp/nlpie/projects/trauma/brat_files/all/*.ann"):
#     for fname in glob.glob(directory_to_parse + '/*.ann'):
#         # get filename and use for processed output filename
#         t = os.path.basename(fname)
#         u = t.split('.')[0]

#         with open(fname) as f:
#             colnames=['brat_id', 'brat_mapping', 'text'] 
#             # read in files
#             temp = pd.read_table(f.name, header=None, names=colnames)
#             temp['case'] = t.split('.')[0].split('-')[0]
#         test = pd.concat([test,temp], ignore_index=True)
    
#     return test        

In [None]:
def _write_sheets_by_value(frame, column, path):
    """
    write one Excel workbook at `path` holding one sheet per distinct value of `column`

    frame: DataFrame to split
    column: name of the column whose distinct values become the sheet names
    path: full path of the workbook to create
    """
    # the context manager saves and closes the workbook even when a sheet fails to
    # write; ExcelWriter.save() is no longer available in pandas 2.x
    with pd.ExcelWriter(path) as writer:
        for value in sorted(set(frame[column].tolist())):
            # sheet_name is passed by keyword: positional use is deprecated in pandas
            frame[frame[column] == value].to_excel(writer, sheet_name=value)


def all_annotations_out(cs, comp, entity_attributes, entity_relations, partition):
    """
    write BRAT annotations to Excel for analysis

    Three workbooks are written under cs.output_path, each with one sheet per type:
    entities (by 'entity_type'), attributes (by 'attribute_type') and
    relationships (by 'relation_type').

    cs: configuration object providing output_path
    comp: DataFrame of annotated entities with an 'entity_type' column
    entity_attributes: DataFrame with an 'attribute_type' column
    entity_relations: DataFrame with a 'relation_type' column
    partition: partition name, used as suffix of the workbook file names
    """
    _write_sheets_by_value(
        comp, 'entity_type',
        cs.output_path + '/brat_annotated_entities_'+ partition +'.xlsx')

    _write_sheets_by_value(
        entity_attributes, 'attribute_type',
        cs.output_path + '/brat_annotated_attributes_'+ partition +'.xlsx')

    _write_sheets_by_value(
        entity_relations, 'relation_type',
        cs.output_path + '/brat_annotated_relationships_'+ partition + '.xlsx')

In [None]:
def get_gold_txt(cs, case_config):
    """
    get gold text for writing to JSON

    Reads every '*.txt' file found under cs.txt_path + txt_directory.

    cs: configuration object providing txt_path
    case_config: tuple (cases, txt_directory, partition) as returned by
        CaseSystem.case_config

    returns: (text_out, cases) where text_out is a DataFrame with columns
        'text' (file content) and 'case' (case id taken from the file name),
        and cases is the configured case list followed by the case id of
        every file read
    """
    cases, txt_directory, partition = case_config
    # work on a copy so the list owned by the caller is not modified by the appends below
    cases = list(cases)

    rows = []
    # sorted: glob returns files in arbitrary order, sorting keeps the output reproducible
    for fname in sorted(glob.glob(cs.txt_path + txt_directory + '*.txt')):
        # case id: file name up to the first '.', then up to the first '-'
        case = os.path.basename(fname).split('.')[0].split('-')[0]

        with open(fname) as f:
            rows.append({'text': f.read(), 'case': case})
        cases.append(case)

    # build the frame once; explicit columns keep the schema when no file matched
    text_out = pd.DataFrame(rows, columns=['text', 'case'])

    return text_out, cases

In [None]:
def annotations_to_json(span_comp, comp, gold_text, cases):
    """
    BRAT annotations to JSON as per Prodigy spec

    Prints one JSON list holding a record per case, each of the form
    {"spans": [{"start": ..., "end": ..., "label": ...}, ...], "text": ...}.

    span_comp: DataFrame with 'begin', 'end' and 'case' columns
    comp: DataFrame joined to span_comp on the index; supplies 'entity_type'
    gold_text: DataFrame whose first column is the note text, plus a 'case' column
    cases: iterable of case ids to emit; repeated ids are emitted once
    """
    cols_to_keep = ['begin', 'end', 'case', 'entity_type']

    span = span_comp.merge(comp, how='inner', left_index=True, right_index=True)
    # both inputs carry 'case', so the merge suffixes them; keep the left one
    span = span.rename(columns={'case_x': 'case'})
    span = span[cols_to_keep]

    #cases = [] TODO: use CaseSystem case_config object

    records = []
    # dict.fromkeys drops repeated case ids while keeping their order
    for case in dict.fromkeys(cases):
        txt = gold_text[gold_text['case'] == case]
        if txt.empty:
            # no text for this case: skip it instead of reusing the previous case's text
            continue

        # text is the first column; when a case has several rows the last one is used
        out_text = re.sub(r'\|', ' ', txt.iloc[-1, 0])

        df = span[span['case'] == case]
        spans = [{"start": int(start), "end": int(end), "label": label}
                 for start, end, label in zip(df['begin'], df['end'], df['entity_type'])]

        records.append({"spans": spans, "text": out_text})

    # emit every case, not only the last one processed
    print(json.dumps(records))

In [None]:
def annotated_patterns_to_json(cs, comp, case_config):
    """
    BRAT annotations to JSONL and Excel

    Appends the unique {"label", "pattern"} pairs of the configured cases to
    cs.output_path + '/match_patterns.jsonl' (one JSON object per line, sorted),
    then writes the de-duplicated annotated text of ALL rows in comp to
    '/annotated_entities_for_patterns.xlsx', one sheet per entity type.

    cs: configuration object providing output_path
    comp: DataFrame with 'case', 'text' and 'entity_type' columns
    case_config: tuple (cases, txt_directory, partition); only cases is used
    """
    cases, txt_directory, partition = case_config

    # add to set for unique pattern
    patterns = set()
    for case in cases:
        mp = comp[comp['case'] == case]
        for text, label in zip(mp['text'], mp['entity_type']):
            patterns.add(json.dumps({"label": label, "pattern": text}))

    # open the file once (append mode, as before) and close it when done;
    # it used to be reopened for every case and never closed
    with open(cs.output_path + '/match_patterns.jsonl', 'a') as f:
        for p in sorted(patterns):
            f.write(p + '\n')

    # output to spreadsheet by entity
    out = comp[['entity_type', 'text']]

    # the context manager saves the workbook; ExcelWriter.save() is gone in pandas 2.x
    with pd.ExcelWriter(cs.output_path + '/annotated_entities_for_patterns.xlsx') as writer:
        for entity in sorted(set(out['entity_type'].tolist())):
            # non in-place drop_duplicates avoids modifying a slice of `out`
            e = out[out['entity_type'] == entity].drop_duplicates(keep='last')
            e = e.sort_values(by=['text'])
            e.to_excel(writer, sheet_name=entity, columns=['text'], index=False)

In [None]:
def read_case(case):
    """
    read the file at path `case` and return its lines without line terminators
    """
    with open(case) as source:
        contents = source.read()
    return contents.splitlines()

# test
# read_case('/Users/gms/development/nlp/nlpie/projects/trauma/cases/pilot.txt')

## Initialization object for reading in BRAT and XMI CAS

In [None]:
class CaseSystem(object):
    """
    Configuration object:
    cases by pilot, training and validation sets
    paths by output, gold and system locations
    directories by partition case types
    extensions by gold and system
    systems
    """

    def __init__(self):
        self.systems = ['biomedicus', 'clamp', 'ctakes', 'metamap']
        self.amicus = ['amicus']
        self.pilot_directory = '/pass_one/'
        self.new_directory = '/new_data/'
        self.training_directory = '/training_set/'
        self.validation_directory = '/validation_set/'
        self.amicus_directory = '/adapt_amicus/'
        self.all_directory = '/all/'
        self.output_path = '/Users/gms/development/nlp/nlpie/projects/trauma/output'
        self.gold_path = '/Users/gms/development/nlp/nlpie/projects/trauma/brat_files'
        self.txt_path = '/Users/gms/development/nlp/nlpie/projects/trauma/txt_files'
        self.system_path = '/Users/gms/development/nlp/nlpie/data/adapt-pass_one/'
        self.gold_ftype = '/*.ann'
        self.system_ftype = '/*.txt'

        # partitioned cases to be read into list
        self.case_directory = '/Users/gms/development/nlp/nlpie/projects/trauma/cases/'
        self.pilot_set = 'pilot.txt'
        self.training_set = 'training.txt'
        self.validation_set = 'validation.txt'
        self.new_set = 'new.txt'
        self.test_set = 'test.txt'

        self.gold_entities = ['Age',
                         'AirbagPresence',
                         'DriverPassengerStatus',
                         'EjectFromCar',
                         'Entrapment',
                         'Extricationtime',
                         'Gender',
                         'HeadOn',
                         'IndicationProcedure',
                         'InsuranceStatus',
                         'LocationIntrusion',
                         'OtherMinor',
                         'OtherSevere',
                         'Procedure',
                         'Rollover',
                         'SeatbeltPresence',
                         'SeverityIntrusion',
                         'TBone',
                         'VehicleSpeed']

        self.gold_amicus_entities = ['Age',
                         'AirbagPresence',
                         'DriverPassengerStatus',
                         'HeadOn',
                         'IndicationProcedure',
                         'InsuranceStatus',
                         'OtherMinor',
                         'Procedure',
                         'SeatbeltPresence',
                         'SeverityIntrusion',
                         'TBone',
                         'VehicleSpeed']

    def _read_cases(self, case_file):
        """
        read one case list file from case_directory into a list of lines
        """
        with open(self.case_directory + case_file) as f:
            return f.read().splitlines()

    # default config to training
    def case_config(self, partition='training'):
        """
        resolve a partition name to its case list and text directory

        partition: one of 'pilot', 'test', 'training', 'validation', 'all',
            'amicusall', 'amicus', 'new'

        returns: tuple (cases, txt_directory, partition); txt_directory is
            None for the 'test' partition

        raises: ValueError for an unknown partition name
        """
        directories = {
            'pilot': self.pilot_directory,
            'test': None,
            'training': self.training_directory,
            'validation': self.validation_directory,
            'all': self.all_directory,
            'amicusall': self.amicus_directory, # run against 4-system output/full set
            'amicus': self.amicus_directory, # run against amicus munged set
            'new': self.new_directory,
        }

        # partitions backed by exactly one case list file
        single_sets = {
            'pilot': self.pilot_set,
            'test': self.test_set,
            'training': self.training_set,
            'validation': self.validation_set,
            'new': self.new_set,
        }

        if partition not in directories:
            # previously fell through every branch and died on an unbound local
            raise ValueError("unknown partition '{}'; expected one of: {}".format(
                partition, ', '.join(sorted(directories))))

        txt_directory = directories[partition]

        if partition in single_sets:
            cases = self._read_cases(single_sets[partition])
        else:
            # 'all' and both amicus partitions start from training + validation
            cases = self._read_cases(self.training_set) + self._read_cases(self.validation_set)
            if partition != 'all':
                # amicus partitions exclude the pilot cases (set difference: order not preserved)
                cases = list(set(cases) - set(self._read_cases(self.pilot_set)))

        return cases, txt_directory, partition
        

In [None]:
def test_cs():
    """
    smoke check of CaseSystem: print the number of cases and the text
    directory configured for the 'new' partition
    """
    case_list, directory, _ = CaseSystem().case_config('new')

    for item in (len(case_list), directory):
        print(item)

test_cs()

In [None]:
def gold_system_annotation(m, tp = True):
    """
    parse and return set of TP or FN annotations

    m: iterable of pairs; element [0] is the system annotation (read only when
       tp is True) and element [1] the gold annotation, both mappings
    tp: True -> rows describe system/gold matches;
        False -> rows describe false negatives ('sys text' is 'FN', other
        system columns are None)

    returns: DataFrame with one row per pair, in reverse input order
    """
    records = []

    for i in m:
        gold = i[1]

        if tp:
            system = i[0]
            d = {'case':system['case'],'sys text':system['text'],'sys':system['system'],'sys type': system['type'],
                 'sys begin':system['begin'], 'sys end': system['end'],'gold entity':gold['gold_entity_type'],
                 'gold begin':gold['begin'], 'gold end': gold['end'], 'gold text':gold['text']}
        else:
            d = {'case':gold['case'],'sys text': 'FN','sys': None,'sys type': None,
                 'sys begin': None, 'sys end': None,'gold entity':gold['gold_entity_type'],'gold begin':gold['begin'],
                 'gold end': gold['end'], 'gold text':gold['text']}

        records.append(d)

    # rows used to be prepended one concat at a time (quadratic); build the frame
    # once and reverse to keep the same newest-first row order
    records.reverse()
    return pd.DataFrame(records)

In [None]:
def best_in_class():
    """
    return the hard-coded best-in-class table as a list of dicts with
    keys 'system', 'type' and 'entity' (one dict per line of the table)
    """
    best ='''biomedicus Number Age
ctakes Sentence AirbagPresence
ctakes FractionAnnotation DriverPassengerStatus
biomedicus IndefiniteQuantifierCue EjectFromCar
biomedicus OtherAcronymSense Entrapment
ctakes Predicate HeadOn
ctakes SignSymptomMention IndicationProcedure
metamap Candidate InsuranceStatus
ctakes Predicate OtherMinor
biomedicus OtherAcronymSense OtherSevere
ctakes RomanNumeralAnnotation Procedure
biomedicus Acronym Procedure
ctakes Predicate Rollover
metamap Phrase SeatbeltPresence
ctakes SignSymptomMention SeatbeltPresence
biomedicus IndefiniteQuantifierCue SeverityIntrusion
biomedicus Acronym TBone
biomedicus IndefiniteQuantifierCue VehicleSpeed
'''
    fields = ('system', 'type', 'entity')
    tokens = best.replace('\n',' ').split(' ')

    # consume the flat token stream three at a time; an incomplete trailing
    # group (the empty token left by the final newline) is dropped
    return [dict(zip(fields, tokens[k:k + 3])) for k in range(0, len(tokens) - 2, 3)]

In [None]:
def geometric_mean(metrics, cs):
    """
    1. Group by entity type
    2. Get rank average of F1, TP/FN, TM
        http://www.datasciencemadesimple.com/rank-dataframe-python-pandas-min-max-dense-rank-group/
        https://stackoverflow.com/questions/46686315/in-pandas-how-to-create-a-new-column-with-a-rank-according-to-the-mean-values-o?rq=1
    3. Take geomean of 2.
        https://stackoverflow.com/questions/42436577/geometric-mean-applied-on-row

    metrics: DataFrame with 'entity', 'F', 'TP/FN' and 'TM' columns
    cs: configuration object providing gold_entities (the groups, in output order)

    returns: new DataFrame with the rows of every entity group plus the columns
        'F1 rank', 'TP/FN rank', 'TM rank' (rank 1 = highest value, ties
        averaged) and 'Gmean' (row-wise geometric mean of the three ranks);
        metrics itself is not modified
    """
    rank_columns = ['F1 rank', 'TP/FN rank', 'TM rank']

    frames = []
    for g in cs.gold_entities: # traverse entity groups to compute
        # explicit copy: adding columns to a bare filtered slice triggers
        # pandas' SettingWithCopyWarning
        df = metrics[metrics['entity'] == g].copy()

        df['F1 rank'] = df['F'].rank(ascending=False, method='average')
        df['TP/FN rank'] = df['TP/FN'].rank(ascending=False, method='average')
        df['TM rank'] = df['TM'].rank(ascending=False, method='average')
        # select the rank columns by name: "last three columns" is wrong when
        # metrics already carries rank/Gmean columns from an earlier run
        df['Gmean'] = gmean(df[rank_columns].to_numpy(), axis=1)

        frames.append(df)

    if not frames:
        return pd.DataFrame()

    # single concat instead of growing the result inside the loop
    return pd.concat(frames, ignore_index=True)

# Main function

In [None]:
%%time
# Task: parse xmi, brat cases and do "stuff" with data
def _bic_annotations(best, ann, system_ann):
    """
    collect TP and FN annotations for each best-in-class entry

    best: list of dicts with 'system', 'type' and 'entity' keys
    ann: gold annotations DataFrame with an 'entity_type' column
    system_ann: system annotations DataFrame with 'system' and 'type' columns

    returns: DataFrame of the TP rows followed by the FN rows of every entry,
        in entry order (empty DataFrame when best is empty)
    """
    frames = []
    for b in best:
        gold = ann[ann['entity_type'] == b['entity']]
        system = system_ann[(system_ann['system'] == b['system']) & (system_ann['type'] == b['type'])]

        c = get_cooccurences(df_to_list(gold, 'gold'), df_to_list(system, 'system'))

        frames.append(gold_system_annotation(c.matches))
        frames.append(gold_system_annotation(c.false_negatives, False))

    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()


def main():
    """
    interactive driver: prompt for a run type and execute the matching task

    1 -> read system and BRAT data, write system/gold CSVs and ranked metrics
    2 -> write BRAT entities, attributes and relations to Excel
    3 -> print annotated JSON and write match patterns for the training partition
    4 -> rank the metrics saved in test_metrics.csv and write bic_ranking.csv
    5 -> top n system types per entity (lowest Gmean first) with TP/FN annotations
    6 -> TP/FN annotations for a hand-picked entity and system annotation type
    7 -> recompute and print metrics from the saved system/gold CSVs

    The partition and the coverage label used in file names are set by the
    `partition` and `nest` locals below.
    """
    cs = CaseSystem()
    start = time.time()
    rtype = int(input("Run: 1->generate data; 2->brat ann out; 3->patterns out; 4->metrics; 5->top n bic w/ TP/FN; 6->amicus mash; 7->metrics"))

    nested = False # used to control type of overlapping match
    nest = 'system_coverage'

    partition = 'pilot'
    #partition = 'amicus'

    if rtype == 1:
        # option 1: run evaluation on ADAPT versus brat
        brat, system, system_out = get_evaluation_data(cs, cs.case_config(partition), run_all=True)

        # write to output to save time!
        system.to_csv(cs.output_path + '/system_out'+ partition +'.csv')

        print('end get eval data', (time.time() - start))
        brat, span, comp, span_comp = get_gold_annotations(brat)

        print('end get gold annotations', (time.time() - start))

        gold = span.merge(comp, how='inner', left_index=True, right_index=True)
        gold.to_csv(cs.output_path + '/gold_out'+ partition + '.csv')

        gold_out = df_to_list(gold, 'gold')

        # 'nested' sets the type of span coverage: True: gold covers system; False: system covers gold
        m = metrics_out(cs, gold_out, system_out, cs.case_config(partition), nested)

        print('end metrics out', (time.time() - start))

        print(m.head())
        # get geometric mean of ranked averages of F1, TP/FN, TM
        metrics = geometric_mean(m, cs)

        print('end geometric mean', (time.time() - start))
        metrics.to_csv(cs.output_path + '/test_metrics'+ partition + nest + '.csv')

        print(gold.head())
        print(system.head())

    elif rtype == 2:
        # option 2: write brat annotations to Excel
        brat, system, system_out = get_evaluation_data(cs, cs.case_config(partition), run_all=False)
        brat, span, comp, span_comp = get_gold_annotations(brat)
        entity_relations = get_gold_relations(brat, span_comp)
        entity_attributes = get_gold_attributes(brat, span_comp)
        all_annotations_out(cs, comp, entity_attributes, entity_relations, partition)

    elif rtype == 3:
        # option 3: get annotated JSON, matching patterns -> NER label
        brat, system, system_out = get_evaluation_data(cs, cs.case_config('training'), run_all=False)
        brat, span, comp, span_comp = get_gold_annotations(brat)
        gold_text, cases = get_gold_txt(cs, cs.case_config('training'))
        annotations_to_json(span_comp, comp, gold_text, cases)
        annotated_patterns_to_json(cs, comp, cs.case_config('training'))

    elif rtype == 4:
        # run geometric mean (once; the result used to be computed twice)
        # NOTE(review): this reads '/test_metrics.csv', not the partition/nest-suffixed
        # file written by option 1 -- confirm which file is intended
        data = pd.read_csv(cs.output_path + '/test_metrics.csv')

        geometric_mean(data, cs).to_csv(cs.output_path + '/bic_ranking.csv')

    elif rtype == 5:
        n = input('Select top n:')
        print('Processing top' + n + ' for best system types:')
        top_n = int(n)

        # get system -> brat annotations from disk to save processing time
        system_ann = pd.read_csv(cs.output_path + '/system_out'+ partition +'.csv')
        ann = pd.read_csv(cs.output_path + '/gold_out'+ partition +'.csv')
        data = pd.read_csv(cs.output_path + '/test_metrics'+ partition + nest +'.csv')

        best = []
        top_frames = []
        for g in cs.gold_entities: # traverse entities for generating bic metrics
            filter_metrics = data[data['entity'] == g].sort_values(by=['Gmean']).head(top_n)
            for index, row in filter_metrics[['system', 'type', 'entity']].iterrows():
                best.append({'system':row['system'],'type': row['type'], 'entity': row['entity']})
            top_frames.append(filter_metrics)

        top_metrics = pd.concat(top_frames, ignore_index=True) if top_frames else pd.DataFrame()

        # TP/FN annotations of the selected system types, written to csv
        bic = _bic_annotations(best, ann, system_ann)
        bic.to_csv(cs.output_path + '/best_in_class_annotations_top_' + n + '_' + partition +'.csv')

        # the context manager saves the workbook; ExcelWriter.save() is gone in pandas 2.x
        with pd.ExcelWriter(cs.output_path + '/best_in_class_'+ partition +'_draft.xlsx') as writer:
            top_metrics.to_excel(writer,sheet_name='Top ' + n + ' sys annotations')

            for ge in cs.gold_entities: # to sheet by entity
                b = bic[bic['gold entity'] == ge]
                if not b.empty:
                    b.to_excel(writer,sheet_name=ge)

    elif rtype == 6:
        # get system -> brat annotations for specified lists of entities and system types
        # used for bic analysis
        system_ann = pd.read_csv(cs.output_path + '/system_out'+ partition +'.csv')
        ann = pd.read_csv(cs.output_path + '/gold_out'+ partition +'.csv')

        entity_of_interest =  ['SeverityIntrusion'] #['IndicationProcedure']

        data = pd.read_csv(cs.output_path + '/test_metrics'+ partition + nest +'.csv')

        best = []
        metric_frames = []
        for g in entity_of_interest:
            filter_metrics = data[(data['entity'] == g) &
                        (data['type'] == 'Sentence')]
#             filter_metrics = data[(data['entity'] == g) &
#                         ((data['type'] == 'IndefiniteQuantifierCue') |
#                          (data['type'] == 'StandaloneQuantifier') |
#                          (data['type'] == 'Number'))]
#             filter_metrics = data[(data['entity'] == g) &
#                         ((data['type'] == 'SignSymptomMention') |
#                          (data['type'] == 'UmlsConcept'))]
#             filter_metrics = data[(data['entity'] == g) &
#                         ((data['type'] == 'Sentence') |
#                          (data['type'] == 'Phrase'))]

            for index, row in filter_metrics[['system', 'type', 'entity']].iterrows():
                best.append({'system':row['system'],'type': row['type'], 'entity': row['entity']})

            metric_frames.append(filter_metrics)

            print(best)

        # accumulated metrics of all entities of interest (was the never-initialised `temp`)
        selected_metrics = pd.concat(metric_frames, ignore_index=True) if metric_frames else pd.DataFrame()
        print(selected_metrics.head(10))

        # only choose those from pilot eval
        #chosen_best = ['IndefiniteQuantifierCue','StandaloneQuantifier','Number'] #SeverityIntrusion -> gold coverage
        #chosen_best = ['SignSymptomMention','UmlsConcept'] # IndicationProcedure -> gold coverage
        #chosen_best = ['Sentence','Phrase'] # IndicationProcedure -> sys coverage
        chosen_best = ['Sentence'] # SeverityIntrusion -> sys coverage

        bic_ann = _bic_annotations([b for b in best if b['type'] in chosen_best], ann, system_ann)

        # was `temp_ann.to_csv(...)`, a name that is never defined
        bic_ann.to_csv(cs.output_path + '/best_in_class_annotations_' + partition +'.csv')

        with pd.ExcelWriter(cs.output_path + '/best_in_class_all_'+ partition + ' ' + entity_of_interest[0] + nest +'.xlsx') as writer:
            selected_metrics.to_excel(writer,sheet_name='All sys annotations')

            for ge in entity_of_interest:
                t = bic_ann[bic_ann['gold entity'] == ge]
                if not t.empty:
                    t.to_excel(writer,sheet_name=ge)

    elif rtype == 7:
        # print metrics
        system_ann = pd.read_csv(cs.output_path + '/system_out'+ partition +'.csv')
        ann = pd.read_csv(cs.output_path + '/gold_out'+ partition +'.csv')

        gold_out = df_to_list(ann, 'gold')
        system_out = df_to_list(system_ann, 'system')

        m = metrics_out(cs, gold_out, system_out, cs.case_config(partition), True)

        print(m.head())
        metrics = geometric_mean(m, cs)
        print(metrics.head())

    else:
        print('Unknown run type:', rtype)

if __name__ == '__main__':
    main()

In [None]:
def validate_partitions():
    """
    print the size of the 'pilot' and 'amicusall' case lists, then the size
    of their union and of their intersection
    """
    cs = CaseSystem()

    pilot_cases, _, _ = cs.case_config('pilot')
    print(len(pilot_cases))

    amicus_cases, _, _ = cs.case_config('amicusall')
    print(len(amicus_cases))

    pilot_ids = set(pilot_cases)
    amicus_ids = set(amicus_cases)
    print(len(pilot_ids | amicus_ids))
    print(len(pilot_ids & amicus_ids))


validate_partitions()

In [None]:
def get_ngrams(data, analyzer=None):
    """
    split text into n-grams and join the words of each n-gram with '_'

    data: text to analyze
    analyzer: callable mapping text to a list of n-gram strings; defaults to
        a CountVectorizer analyzer producing 1- and 2-grams

    returns: list of n-grams with whitespace replaced by '_'
    """
    if analyzer is None:
        # previously relied on a global `analyzer` that this file only defines
        # as a local of semantic_phrases
        analyzer = CountVectorizer(ngram_range=(1,2)).build_analyzer()
    return ['_'.join(y.split()) for y in analyzer(data)]

def semantic_phrases():
    """
    method for generating semantic lists
    NB: need to run remotely, due to w2p model containing PHI

    Compares every n-gram of the gold 'IndicationProcedure' annotations and of
    selected system annotation types against the study terms, and writes the
    pairs whose model similarity exceeds the threshold to Excel and csv.
    """

    # use for creation of n-grams
    # https://stackoverflow.com/questions/13423919/computing-n-grams-using-python
    vectorizer = CountVectorizer(ngram_range=(1,2)) # use for n-gram generation
    analyzer = vectorizer.build_analyzer()
    partition = 'amicusall'
    output_path = '/home/gms/projects/trauma/data'
    threshold = 0.5

    system_ann = pd.read_csv(output_path + '/system_out'+ partition +'.csv')
    ann = pd.read_csv(output_path + '/gold_out'+ partition +'.csv')
    phrase = KeyedVectors.load_word2vec_format(datapath("/home/gms/projects/word2vec/fairview-vectors-c5_2010-2014-phrase1.bin"), binary=True)

    # ADAPT study terms on IndicationProcedure
    terms = ['unresponsive',
            'unconscious',
            'agonal',
            'hypotensive',
            'tachycardic',
            'diminished_breath_sounds',
            'absent_breath_sounds',
            'breath_sounds',
            'desaturation',
            'cpr',
            'massive_hemorrhage',
            'entrapment']

    # Future research on Procedure entity
    # terms =['intubation',
    # 'et_tube',
    # 'iv',
    # 'io',
    # 'prbc',
    # 'transfusion',
    # 'txa',
    # 'lucas',
    # 'extrication',
    # 'igel',
    # 'airway',
    # 'intraosseous',
    # 'intravenous_access',
    # 'tranexamic',
    # 'bolus']

    # `key in phrase` works on gensim 3.x and 4.x alike; the `.vocab` attribute
    # used before is not available in gensim 4
    known_terms = [term for term in terms if term in phrase]

    def similarity_hits(frame):
        """
        yield (row, term, token, similarity) for every n-gram of frame['text']
        whose similarity to a study term exceeds the threshold
        """
        for index, row in frame.sort_values(by=['case']).iterrows():
            for token in get_ngrams(row['text'], analyzer):
                if token not in phrase: # ensure token is in model, otherwise error is thrown
                    continue
                for term in known_terms:
                    score = phrase.similarity(term, token)
                    if score > threshold:
                        yield row, term, token, score

    def get_gold_similarities(writer):
        """
        get list of synonymous terms from gold standard annotations by entity of interest
        """
        gold = ann[ann['entity_type'] == 'IndicationProcedure']
        d = [{'term': term, 'begin': row['begin'], 'end': row['end'],  'case': row['case'], 'token': token, 'entity_type': row['entity_type'], 'sentence': row['text'], 'cos distance': score}
             for row, term, token, score in similarity_hits(gold)]
        g = pd.DataFrame(d)
        g.to_excel(writer,sheet_name='fvphrases_gold_w2v_hits')
        g.to_csv('fvphrases_gold.csv')

    def get_sys_similarities(writer):
        """
        get list of synonymous terms from system annotations by bic annotations for entity of interest
        """
        mask = (((system_ann['system'] == 'ctakes') & (system_ann['type'] == 'SignSymptomMention')) | ((system_ann['system'] == 'biomedicus') & (system_ann['type'] == 'UmlsConcept')) | ((system_ann['system'] == 'metamap') & (system_ann['type'] == 'Phrase')) | ((system_ann['system'] == 'ctakes') & (system_ann['type'] == 'Sentence')) | ((system_ann['system'] == 'clamp') & (system_ann['type'] == 'Sentence')))
        d = [{'term': term, 'begin': row['begin'], 'end': row['end'], 'system': row['system'], 'type': row['type'], 'case': row['case'], 'token': token, 'sentence': row['text'], 'cos distance': score}
             for row, term, token, score in similarity_hits(system_ann[mask])]
        s = pd.DataFrame(d)
        s.to_excel(writer,sheet_name='fvphrases_sys_w2v_hits')
        s.to_csv('fvphrases_sys.csv')

    # the context manager saves the workbook; ExcelWriter.save() is gone in pandas 2.x
    with pd.ExcelWriter('semantic_similarity_112_cases-new_terms.xlsx') as writer:
        get_gold_similarities(writer)
        get_sys_similarities(writer)

In [None]:
#fv = KeyedVectors.load_word2vec_format(datapath("/Users/gms/Downloads/GoogleNews-vectors-negative300.bin"), binary=True)
#pmc = KeyedVectors.load_word2vec_format(datapath("/Users/gms/Downloads/GoogleNews-vectors-negative300.bin"), binary=True)
#phrase = KeyedVectors.load_word2vec_format(datapath("/home/gms/projects/word2vec/fairview-vectors-c5_2010-2014-phrase1.bin"), binary=True)
#pmc = KeyedVectors.load_word2vec_format(datapath("/home/gms/projects/word2vec/pmc-vectors_lc.bin"), binary=True)
#fv = KeyedVectors.load_word2vec_format(datapath("/home/gms/projects/word2vec/fairview-vectors-c5.bin"), binary=True)
#phrase = KeyedVectors.load_word2vec_format(datapath("/home/gms/projects/word2vec/fairview-vectors-c5_2010-2014-phrase1.bin"), binary=True)

In [None]:
def levenshtein(seq1, seq2):
    """
    Levenshtein (edit) distance between two sequences, computed by filling
    the full dynamic-programming table; the result is a numpy float

    from https://stackabuse.com/levenshtein-distance-and-text-similarity-in-python/
    """
    n_rows = len(seq1) + 1
    n_cols = len(seq2) + 1
    table = np.zeros((n_rows, n_cols))

    # first column / first row: distance to the empty prefix
    table[:, 0] = np.arange(n_rows)
    table[0, :] = np.arange(n_cols)

    for r, left in enumerate(seq1, start=1):
        for c, right in enumerate(seq2, start=1):
            # a matching pair costs nothing on the diagonal, a substitution costs 1
            substitution = 0 if left == right else 1
            table[r, c] = min(
                table[r - 1, c] + 1,
                table[r - 1, c - 1] + substitution,
                table[r, c - 1] + 1,
            )

    return table[n_rows - 1, n_cols - 1]

In [None]:
def _append_unique(target, seen, frame, keys):
    """
    append one dict (restricted to `keys`) per row of `frame` to `target`,
    skipping rows whose key values were already added
    """
    for index, row in frame.iterrows():
        fingerprint = tuple(row[k] for k in keys)
        if fingerprint not in seen:
            seen.add(fingerprint)
            target.append(dict(zip(keys, fingerprint)))


def build_annotation_lists(cs, case_config):
    """
    build de-duplicated gold and system annotation lists for the configured cases

    Reads '/fvphrases_sys.csv' and '/fvphrases_gold.csv' from cs.output_path.

    cs: configuration object providing output_path
    case_config: tuple (cases, txt_directory, partition); only cases is used

    returns: (gold_l, sys_l)
        gold_l: dicts with 'case', 'token', 'term' for gold rows of entity type
            'IndicationProcedure'
        sys_l: dicts with 'case', 'system', 'type', 'token', 'term'
        both cover every case in cases (empty lists when cases is empty)
    """
    cases, txt_directory, partition = case_config
    sys_hits = pd.read_csv(cs.output_path + '/fvphrases_sys.csv')
    gold_hits = pd.read_csv(cs.output_path + '/fvphrases_gold.csv')
    print(cases)

    sys_keys = ['case', 'system', 'type', 'token', 'term'] # system key for determining LD
    gold_keys = ['case', 'token', 'term'] # gold key compared to the system key

    gold_l, sys_l = [], []
    gold_seen, sys_seen = set(), set()

    for case in cases:
        # filter into new names: overwriting the full frames left nothing for the next case
        case_sys = sys_hits[sys_hits['case'] == case]
        case_gold = gold_hits[(gold_hits['case']  == case) & (gold_hits['entity_type'] == 'IndicationProcedure')]

        _append_unique(sys_l, sys_seen, case_sys, sys_keys)
        _append_unique(gold_l, gold_seen, case_gold, gold_keys)

    # return after all cases; the return used to sit inside the loop, so only
    # the first case was processed and an empty case list returned None
    return gold_l, sys_l
        
    

In [None]:
# build the gold and system annotation lists for the 'test' partition
cs = CaseSystem()
gold_l, sys_l = build_annotation_lists(cs, cs.case_config('test'))

def get_min_ld(gold_l, sys_l):
    """
    iterate over system annotations to find gold annotation with min LD

    gold_l: list of gold annotation dicts with a 'token' key
    sys_l: list of system annotation dicts with a 'token' key

    Prints, for each system annotation, the annotation followed by the tuple
    (closest gold annotation, distance); an empty tuple is printed when
    gold_l is empty.
    """
    for annotation in sys_l:
        closest = ()
        best_score = float('inf')
        for candidate in gold_l:
            score = levenshtein(annotation['token'], candidate['token'])
            if score < best_score: # strict comparison: the first gold annotation wins ties
                best_score = score
                closest = (candidate, best_score)

        # print the annotation itself: rebuilding it from a frozenset of its
        # items (as before) printed the keys in arbitrary order
        print(annotation, closest)

get_min_ld(gold_l, sys_l)