In [193]:
from xmljson import badgerfish as bf
from xmljson import cobra as ca 
from xml.etree.ElementTree import fromstring
import json
from json import dumps
import pandas as pd
import re



In [194]:
# https://gist.github.com/douglasmiranda/5127251
def find(key, dictionary):
    """Recursively yield every value stored under ``key`` in a nested structure.

    The search walks ``dictionary`` in iteration order. A value stored under
    ``key`` is yielded as-is (it is not searched any further); other values
    are descended into when they are dicts or lists.

    Args:
        key: the dictionary key to look for.
        dictionary: a dict whose values may be dicts, lists or scalars.

    Yields:
        Each value found under ``key``, outermost/first occurrences first.
    """
    for k, v in dictionary.items():
        if k == key:
            yield v
        elif isinstance(v, dict):
            yield from find(key, v)
        elif isinstance(v, list):
            yield from _find_in_list(key, v)


def _find_in_list(key, items):
    """Yield matches for ``key`` from the dicts (and nested lists) inside ``items``."""
    for item in items:
        if isinstance(item, dict):
            yield from find(key, item)
        elif isinstance(item, list):
            yield from _find_in_list(key, item)
        # Scalars (strings, numbers, None) cannot contain keys, so they are skipped
        # instead of failing on a missing ``.items()``.

In [195]:
def system_sofa(lines):
    """Parse an XMI document and return its Sofa element and top-level children.

    Args:
        lines: sequence whose first item is the XMI document as an XML string.
            Only ``lines[0]`` is parsed.

    Returns:
        tuple ``(text, xmi)`` where ``xmi`` is the list stored under
        ``"children"`` of the ``{http://www.omg.org/XMI}XMI`` root and ``text``
        is the last ``{http:///uima/cas.ecore}Sofa`` element in that list that
        contains a ``sofaString`` key.

    Raises:
        ValueError: if no Sofa element containing ``sofaString`` is found.
    """
    sofa_key = '{http:///uima/cas.ecore}Sofa'

    # Round-trip through JSON so the parsed tree is made of the plain
    # dict/list objects that json.loads produces.
    data = dumps(ca.data(fromstring(lines[0])))
    d = json.loads(data)

    xmi = d.get("{http://www.omg.org/XMI}XMI").get("children")

    text = None
    for t in xmi:
        u = t.get(sofa_key)
        if u and list(find('sofaString', u)):
            text = u

    if text is None:
        # Previously this surfaced as an UnboundLocalError at the return statement.
        raise ValueError('no ' + sofa_key + ' element with a sofaString was found')

    return text, xmi

# use to get text by given span                               
def myprint(d, begin, end):
    """Return the ``[begin:end]`` slice of the text held in a nested mapping.

    Items of ``d`` are visited in iteration order. The first value that is a
    dict is searched recursively and its result is returned immediately.
    Otherwise the item at position 3 (the fourth item) is used, provided its
    key is not ``'mimeType'`` and its value is truthy.
    NOTE(review): presumably the fourth attribute of the Sofa element is its
    ``sofaString`` -- confirm against the parser output.

    Args:
        d: mapping to search (e.g. the Sofa element).
        begin: start offset of the slice.
        end: end offset of the slice (exclusive).

    Returns:
        The sliced value, or ``None`` when no item qualifies.
    """
    i = 0
    for k, v in d.items():
        if isinstance(v, dict):
            # Recurse exactly once; calling twice doubled the work at every nesting level.
            return myprint(v, begin, end)
        elif i == 3 and k != 'mimeType':
            if v:
                return v[begin:end]
        i += 1
    return None
        
# test slicing from text
#v = myprint(text, 0, 1540)
#print(v)

#print(text)

In [196]:
class AnnotationSystems(object):
    """Catalogue of the annotation type names handled for each NLP system.

    Attributes (all plain lists of strings set in ``__init__``):
        annotation_type_system: type-system namespace URIs used to build
            qualified element names of the form ``'{' + uri + '}' + type``.
        annotation_type_relations: fully qualified element names involved in
            relation annotations.
        annotation_relation_attributes: type names treated as relations.
        biomedicus_types, clamp_types, ctakes_types, metamap_types,
        amicus_types: type names looked up per system.
        amicus_type: a shorter list of amicus type names.
    """

    def __init__(self):
        self.annotation_type_system = ['http:///org/metamap/uima/ts.ecore',
                                       'http:///org/apache/ctakes/typesystem/type/syntax.ecore',
                                       'http:///org/apache/uima/ruta/type.ecore',
                                       'http:///edu/uth/clamp/nlp/typesystem.ecore',
                                       'http:///org/apache/ctakes/typesystem/type/textsem.ecore',
                                       'http:///org/apache/ctakes/typesystem/type/textspan.ecore',
                                       'http:///biomedicus/v2.ecore']

        self.annotation_type_relations = ['{http:///edu/uth/clamp/nlp/typesystem.ecore}ClampNameEntityUIMA',
                                          '{http:///org/apache/ctakes/typesystem/type/textsem.ecore}SemanticArgument',
                                          '{http:///org/apache/ctakes/typesystem/type/textsem.ecore}Predicate']

        self.annotation_relation_attributes = ['ClampRelationUIMA', 'SemanticRoleRelation']

        self.biomedicus_types = ['Acronym',
                                 'DayOfWeek',
                                 'DictionaryTerm',
                                 'Fuzzy',
                                 'Historical',
                                 'IndefiniteQuantifierCue',
                                 'ModificationCue',
                                 #'Negated',
                                 'NormForm',
                                 'Number',
                                 'NumberRange',
                                 'OtherAcronymSense',
                                 'ParseToken',
                                 'SeasonWord',
                                 'StandaloneQuantifier',
                                 'TemporalPhrase',
                                 'TextData',
                                 'TimeUnits',
                                 'UmlsConcept',
                                 'YearNumber']

        self.clamp_types = ['All',
                            'Any',
                            'BaseToken',
                            #'CW',
                            'Chunk',
                            'ClampNameEntityUIMA',
                            #'ClampRelationUIMA', # TODO with NN
                            #'ConllDependencyNode',
                            'NUM',
                            'RutaBasic',
                            #'SPACE',
                            #'SPECIAL',
                            #'SW',
                            'Segment',
                            'Sentence',
                            'TokenSeed',
                            'W']

        self.ctakes_types = ['AnatomicalSiteMention',
                             'ConllDependencyNode',
                             'ContractionToken',
                             'DateAnnotation',
                             'DiseaseDisorderMention',
                             'EntityMention',
                             'EventMention',
                             'FractionAnnotation',
                             'IdentifiedAnnotation',
                             'MeasurementAnnotation',
                             'MedicationMention',
                             'NumeToken',
                             'Predicate',
                             'ProcedureMention',
                             'RangeAnnotation',
                             'RomanNumeralAnnotation',
                             'SemanticArgument', # TODO with NN
                             #'SemanticRoleRelation',
                             'Sentence',
                             'SignSymptomMention',
                             'UmlsConcept',
                             'WordToken']

        self.metamap_types = ['AcronymAbbrev',
                              'Annotation',
                              'AnnotationBase',
                              'Candidate',
                              #'Negation'#,
                              'Phrase',
                              'Span',
                              'Utterance']

        # NOTE(review): 'StandalonQuantifier' and 'UMLSConcept' are spelled
        # differently from 'StandaloneQuantifier' / 'UmlsConcept' used in the
        # other lists -- confirm against the amicus output before changing.
        self.amicus_types = ['AnatomicalSiteMention',
                             'Candidate',
                             'Chunk',
                             'IndefiniteQuantifierCue',
                             #'MedicationMention',
                             'Number',
                             'Phrase',
                             'Predicate',
                             'SemanticArgument',
                             'SignSymptomMention',
                             'StandalonQuantifier',
                             'UMLSConcept']

        self.amicus_type = ['IndefiniteQuantifierCue','StandaloneQuantifier','Number']


    def get_system_type(self, system):
        """Return the list of annotation type names registered for ``system``.

        Args:
            system: one of ``'biomedicus'``, ``'clamp'``, ``'ctakes'``,
                ``'metamap'`` or ``'amicus'``.

        Returns:
            The corresponding ``*_types`` list (the same list object held by
            this instance, not a copy).

        Raises:
            ValueError: if ``system`` is not one of the known names.
        """
        types_by_system = {
            'biomedicus': self.biomedicus_types,
            'clamp': self.clamp_types,
            'ctakes': self.ctakes_types,
            'metamap': self.metamap_types,
            'amicus': self.amicus_types,
        }

        if system not in types_by_system:
            # Previously an unknown name fell through every branch and failed
            # with an UnboundLocalError on the return statement.
            raise ValueError('unknown annotation system: {0!r}; expected one of {1}'.format(
                system, sorted(types_by_system)))

        return types_by_system[system]


def get_system_annotations(xmi, system, case, text, type_of_analysis=None):
    """Collect the span annotations one NLP system produced for one document.

    For every element of ``xmi`` and every type name registered for ``system``
    (combined with each known type-system namespace), elements containing the
    qualified key ``'{' + namespace + '}' + type`` are turned into records.

    Args:
        xmi: list of parsed XMI child elements (dicts).
        system: system name understood by ``AnnotationSystems.get_system_type``.
        case: identifier stored in the ``'case'`` field of every record.
        text: mapping passed to ``myprint`` to slice out the covered text.
        type_of_analysis: when not ``None``, types listed in
            ``annotation_relation_attributes`` go through the relation branch
            instead of being ignored. That branch only resolves the linked
            elements; it does not add anything to the output yet.

    Returns:
        tuple ``(df, annotation_out)``: ``annotation_out`` is a list of unique
        dicts with keys ``system``, ``type``, ``begin``, ``end``, ``text`` and
        ``case``; ``df`` is ``pd.DataFrame(annotation_out)``.
    """
    annSys = AnnotationSystems()
    types = annSys.get_system_type(system)

    clamp_entity = '{http:///edu/uth/clamp/nlp/typesystem.ecore}ClampNameEntityUIMA'
    ctakes_argument = '{http:///org/apache/ctakes/typesystem/type/textsem.ecore}SemanticArgument'
    ctakes_predicate = '{http:///org/apache/ctakes/typesystem/type/textsem.ecore}Predicate'
    xmi_id = '{http://www.omg.org/XMI}id'

    annotation_out = []

    for x in xmi:
        for t in types:
            for ats in annSys.annotation_type_system:
                key = '{' + ats + '}' + t

                if not list(find(key, x)):
                    continue

                if type_of_analysis is not None and t in annSys.annotation_relation_attributes:
                    # Relations are linked to their endpoints by xmi:id, e.g.
                    # <textsem:SemanticRoleRelation argument="34379" category="A1" ... predicate="34372" xmi:id="34385"/>
                    # <textsem:SemanticArgument begin="1400" end="1403" label="A1" relation="34385" sofa="1" xmi:id="34379"/>
                    # <textsem:Predicate begin="1385" end="1396" frameSet="extricate.01" relations="34385" sofa="1" xmi:id="34372"/>
                    # TODO: the endpoint attributes resolved below are not emitted yet.
                    if t == 'ClampRelationUIMA':
                        endTo = x[key]['attributes']['entTo']
                        endFrom = x[key]['attributes']['entFrom']

                        for x1 in xmi:
                            if list(find(clamp_entity, x1)):
                                entity_attributes = x1[clamp_entity]['attributes']
                                if entity_attributes[xmi_id] == endTo:
                                    entity_to = entity_attributes
                                if entity_attributes[xmi_id] == endFrom:
                                    entity_from = entity_attributes

                    if t == 'SemanticRoleRelation':
                        # Index the element ``x`` here: ``t`` is the type name
                        # (a string) and cannot be subscripted with a key.
                        argument = x[key]['attributes']['argument']
                        predicate = x[key]['attributes']['predicate']

                        for x1 in xmi:
                            if list(find(ctakes_argument, x1)):
                                if x1[ctakes_argument]['attributes'][xmi_id] == argument:
                                    argument_attributes = x1[ctakes_argument]['attributes']

                        for x1 in xmi:
                            # Test the candidate ``x1``, not the relation element ``x``.
                            if list(find(ctakes_predicate, x1)):
                                if x1[ctakes_predicate]['attributes'][xmi_id] == predicate:
                                    predicate_attributes = x1[ctakes_predicate]['attributes']

                elif t not in annSys.annotation_relation_attributes:
                    begin = x[key]['attributes']['begin']
                    end = x[key]['attributes']['end']
                    d = {'system': system, 'type': t, 'begin': begin, 'end': end, 'text': myprint(text, int(begin), int(end)), 'case': case}

                    # Keep each distinct record only once.
                    if d not in annotation_out:
                        annotation_out.append(d)

    df = pd.DataFrame(annotation_out)
    return df, annotation_out

In [197]:
# brat
# https://stackoverflow.com/questions/15325182/how-to-filter-rows-in-pandas-by-regex

# entity = test[test.brat_id.str.startswith("T")]
# relation = test[test.brat_id.str.startswith("R")]
# attribute = test[test.brat_id.str.startswith("A")]

In [198]:
def get_span(string):
    """Return the first and last integers of a brat mapping string.

    The pattern looks for whitespace followed by a run of digits, then
    (lazily) anything, then a run of digits at the very end of the string.

    Returns:
        tuple of two digit strings ``(first, last)``.

    Raises:
        AttributeError: if the pattern does not match (the match is ``None``).
    """
    found = re.search(r'\s(\d+).*?(\d+$)', string)
    return found.groups()

def get_entity_type(string):
    """Return the first whitespace-delimited token of ``string`` (the entity type)."""
    leading_token = string.split(maxsplit=1)[0]
    return leading_token

def get_relation_type(string):
    """Return the first whitespace-delimited token of ``string`` (the relation type)."""
    return string.split()[0]

def get_relation_entities(string):
    """Return the two entity ids referenced by a brat relation string.

    The second and third whitespace-delimited tokens are expected to look like
    ``Arg1:T1`` / ``Arg2:T2``; the part after the first colon of each is returned.

    Returns:
        tuple ``(first_id, second_id)``.
    """
    tokens = string.split()
    first_arg, second_arg = tokens[1], tokens[2]
    first_id = first_arg.split(':')[1]
    second_id = second_arg.split(':')[1]
    return (first_id, second_id)
    
def get_attribute_type(string):
    """Return the first whitespace-delimited token of ``string`` (the attribute type)."""
    return string.split(maxsplit=1)[0]

def get_attribute_entity(string):
    """Return the second whitespace-delimited token of ``string`` (the entity id)."""
    tokens = string.split()
    return tokens[1]

def get_attribute_degree(string):
    """Return the third whitespace-delimited token of ``string``, or ``None`` if there is none."""
    tokens = string.split()
    return tokens[2] if len(tokens) > 2 else None

In [199]:
def get_gold_annotations(test):
    """Split a brat ``.ann`` frame into entity, relation and attribute fields.

    Rows are classified by the first letter of ``brat_id`` (``T`` entity,
    ``R`` relation, ``A`` attribute) and their ``brat_mapping`` string is
    parsed into new columns. ``test`` is modified in place: the columns
    ``entity_span``, ``entity_type``, ``relation_type``, ``relation_entities``,
    ``attribute_type``, ``attribute_entity`` and ``attribute_degree`` are
    added, each filled only on the rows of its own category.

    Args:
        test: DataFrame with at least ``brat_id``, ``brat_mapping``, ``text``
            and ``case`` columns.

    Returns:
        tuple ``(test, span, comp, span_comp)``:
            test: the input frame with the added columns.
            span: ``begin`` / ``end`` columns for every entity row.
            comp: entity rows with ``text``, ``entity_span``, ``brat_id``,
                ``entity_type`` and ``case``.
            span_comp: entity rows with ``text``, ``begin``, ``end``,
                ``brat_id`` and ``case``.
    """
    # na=False: a row without a brat_id belongs to no category; without it the
    # boolean mask contains NaN and cannot be used for selection.
    entity = test[test.brat_id.str.startswith("T", na=False)]
    relation = test[test.brat_id.str.startswith("R", na=False)]
    attribute = test[test.brat_id.str.startswith("A", na=False)]

    # Assignment aligns on the index, so rows of other categories get NaN.
    test['entity_span'] = entity['brat_mapping'].apply(get_span)
    test['entity_type'] = entity['brat_mapping'].apply(get_entity_type)

    test['relation_type'] = relation['brat_mapping'].apply(get_relation_type)
    test['relation_entities'] = relation['brat_mapping'].apply(get_relation_entities)

    test['attribute_type'] = attribute['brat_mapping'].apply(get_attribute_type)
    test['attribute_entity'] = attribute['brat_mapping'].apply(get_attribute_entity)
    test['attribute_degree'] = attribute['brat_mapping'].apply(get_attribute_degree)

    # get entities
    comp = test.loc[test.entity_span.notnull(),
                    ['text', 'entity_span', 'brat_id', 'entity_type', 'case']]

    # Build the two span columns directly from the tuples. Unlike
    # ``.apply(pd.Series)`` this also yields a two-column frame when there are
    # no entity rows at all.
    span = pd.DataFrame(comp['entity_span'].tolist(), index=comp.index, columns=['begin', 'end'])

    span_comp = span.merge(comp, how='inner', left_index=True, right_index=True)
    span_comp = span_comp[['text', 'begin', 'end', 'brat_id', 'case']]

    return test, span, comp, span_comp

In [200]:
# get relations from .ann df
def get_gold_relations(test, span_comp):
    """Join every gold relation with the text and span of its two entities.

    Args:
        test: DataFrame with ``relation_type``, ``relation_entities`` (a pair
            of entity ids, null on non-relation rows), ``brat_id`` and ``case``.
        span_comp: entity frame with ``text``, ``begin``, ``end``, ``brat_id``
            and ``case``; entities are looked up by ``(brat_id, case)``.

    Returns:
        DataFrame with one row per relation and the columns ``relation_type``,
        ``entity1_r``, ``entity2_r``, ``text_entity1``, ``entity1_begin``,
        ``entity1_end``, ``entity1``, ``text_entity2``, ``entity2_begin``,
        ``entity2_end``, ``entity2`` and ``case``. The frame is empty (with
        the same columns) when ``test`` contains no relations.
    """
    comp_r = test.loc[test.relation_entities.notnull(),
                      ['relation_type', 'relation_entities', 'brat_id', 'case']]

    # Expand the id pairs into two columns. Building the frame from the list
    # of tuples also works when there are no relation rows, where
    # ``.apply(pd.Series)`` would return a Series instead of a DataFrame.
    span_r = pd.DataFrame(comp_r['relation_entities'].tolist(),
                          index=comp_r.index,
                          columns=['entity1_r', 'entity2_r'])

    span_comp_r = span_r.merge(comp_r, how='inner', left_index=True, right_index=True)
    span_comp_r = span_comp_r[['relation_type', 'relation_entities', 'entity1_r', 'entity2_r', 'case']]

    # get entities for relationship
    entity_1r = span_comp_r.merge(span_comp, how='left', left_on=['entity1_r', 'case'], right_on=['brat_id', 'case'])
    entity_1r = entity_1r.rename(columns={'text': 'text_entity1', 'begin': 'entity1_begin', 'end': 'entity1_end', 'brat_id': 'entity1', 'case_y': 'case'})

    entity_2r = span_comp_r.merge(span_comp, how='left', left_on=['entity2_r', 'case'], right_on=['brat_id', 'case'])
    entity_2r = entity_2r.rename(columns={'text': 'text_entity2', 'begin': 'entity2_begin', 'end': 'entity2_end', 'brat_id': 'entity2', 'case_y': 'case'})

    # merge entities for relationship; the columns shared by both sides get _x / _y suffixes
    entity_relation = entity_1r.merge(entity_2r, how='inner', left_index=True, right_index=True)

    cols_to_keep = ['relation_type_x', 'entity1_r_x', 'entity2_r_x', 'text_entity1', 'entity1_begin', 'entity1_end', 'entity1',
                    'text_entity2', 'entity2_begin', 'entity2_end', 'entity2', 'case_x']
    entity_relation = entity_relation[cols_to_keep]
    entity_relation = entity_relation.rename(columns={'relation_type_x': 'relation_type', 'entity1_r_x': 'entity1_r', 'entity2_r_x': 'entity2_r', 'case_x': 'case'})

    return entity_relation

In [201]:
# get attributes from .ann df
def get_gold_attributes(test, span_comp):
    """Attach the text of the referenced entity to every gold attribute row.

    Args:
        test: DataFrame with ``attribute_type``, ``attribute_entity`` (null on
            non-attribute rows), ``attribute_degree``, ``brat_id`` and ``case``.
        span_comp: entity frame matched on ``(brat_id, case)``.

    Returns:
        DataFrame with the columns ``attribute_type``, ``attribute_entity``,
        ``attribute_degree``, ``brat_id`` (the attribute's own id) and ``text``.
    """
    has_attribute = test.attribute_entity.notnull()
    attribute_rows = test[has_attribute][
        ['attribute_type', 'attribute_entity', 'attribute_degree', 'brat_id', 'case']
    ]

    # Left join keeps attributes whose entity is missing from span_comp.
    joined = attribute_rows.merge(
        span_comp,
        how='left',
        left_on=['attribute_entity', 'case'],
        right_on=['brat_id', 'case'],
    )
    joined = joined.rename(columns={'case_x': 'case', 'brat_id_x': 'brat_id'})

    return joined[['attribute_type', 'attribute_entity', 'attribute_degree', 'brat_id', 'text']]

In [202]:
def df_to_list(df, ann_type):
    """Convert annotation rows into a list of unique dictionaries.

    Args:
        df: DataFrame with ``case``, ``begin``, ``end`` and ``text`` columns,
            plus ``system`` / ``type`` (for ``'system'``) or ``entity_type``
            (for ``'gold'``).
        ann_type: ``'system'`` adds the ``system`` and ``type`` fields,
            ``'gold'`` adds ``gold_entity_type``; any other value adds nothing.

    Returns:
        list of dicts in first-seen order, duplicates dropped.
    """
    records = []
    for row in df.itertuples(index=True):
        record = dict(case=row.case, begin=row.begin, end=row.end, text=row.text)

        if ann_type == 'system':
            record['system'] = row.system
            record['type'] = row.type
        elif ann_type == 'gold':
            record['gold_entity_type'] = row.entity_type

        if record in records:
            continue
        records.append(record)

    return records

In [203]:
def gold_system_list(gold, system):
    """Turn the gold and system annotation frames into lists of unique dicts.

    Args:
        gold: DataFrame with ``case``, ``begin``, ``end``, ``text`` and
            ``entity_type`` columns.
        system: DataFrame with ``case``, ``begin``, ``end``, ``text``,
            ``system`` and ``type`` columns.

    Returns:
        tuple ``(gold_out, system_out)``. Gold records carry the keys ``case``,
        ``begin``, ``end``, ``text`` and ``gold_entity_type``; system records
        carry ``case``, ``begin``, ``end``, ``text``, ``system`` and ``type``.
        Duplicates are dropped, first-seen order is kept.
    """
    def unique_records(frame, field_map):
        # field_map: (output key, row attribute) pairs, in output order.
        collected = []
        for row in frame.itertuples(index=True):
            record = {out_key: getattr(row, attr) for out_key, attr in field_map}
            if record not in collected:
                collected.append(record)
        return collected

    shared_fields = [('case', 'case'), ('begin', 'begin'), ('end', 'end'), ('text', 'text')]

    gold_out = unique_records(gold, shared_fields + [('gold_entity_type', 'entity_type')])
    system_out = unique_records(system, shared_fields + [('system', 'system'), ('type', 'type')])

    return gold_out, system_out

In [204]:
def get_matches(gold_out, system_out, system=None,
                output_dir='/Users/gregsilverman/development/nlpie_dev/nlp/nlpie/projects/trauma/'):
    """Pair system annotations with the gold annotations they nest with.

    A system record ``s`` and a gold record ``g`` of the same case are paired
    when one span lies inside the other (sharing at most one boundary) and the
    system text is at least half as long as the gold text. A summary of all
    pairs is written to ``gold_<system>_summary.csv`` in ``output_dir``.

    Args:
        gold_out: list of gold dicts (``case``, ``begin``, ``end``, ``text``,
            ``gold_entity_type``).
        system_out: list of system dicts (``case``, ``begin``, ``end``,
            ``text``, ``system``, ``type``).
        system: label used in the CSV file name. When ``None`` it is derived
            from the ``system`` values found in ``system_out`` (joined with
            ``_``), or ``'unknown'`` if there are none.
        output_dir: directory the CSV is written to. The default is the
            location that used to be hard-coded here.

    Returns:
        list of ``(system_record, gold_record)`` tuples.
    """
    import os

    # system annotation contained in gold annotation
    matches = []

    for g in gold_out:
        g_begin = int(g['begin'])
        g_end = int(g['end'])
        g_case = g['case']
        for s in system_out:
            s_begin = int(s['begin'])
            s_end = int(s['end'])
            s_case = s['case']
            if (g_case == s_case and
                ((s == g) or (s_begin >= g_begin and s_end < g_end) or (s_begin > g_begin and s_end <= g_end) or
                 (g_begin >= s_begin and g_end < s_end) or (g_begin > s_begin and g_end <= s_end)) and
                ((s, g) not in matches)):

                if len(s['text'])*2 >= len(g['text']):
                    matches.append((s, g))

    d = []
    for m in matches:
        # The case comes from the matched gold record (identical to the system
        # record's case); the previous code read it from an undefined ``row``.
        d.append({'case': m[1]['case'], 'span_gold': (m[1]['begin'], m[1]['end']), 'span_system': (m[0]['begin'], m[0]['end']), 'text_gold': m[1]['text'], 'text_system': m[0]['text'], 'system': m[0]['system'], 'type_gold': m[1]['gold_entity_type'], 'type_system': m[0]['type']})

    analysis = pd.DataFrame(d)

    if system is None:
        # The file name used to depend on an undefined name ``system``.
        names = sorted({str(s['system']) for s in system_out if 'system' in s})
        system = '_'.join(names) if names else 'unknown'

    analysis.to_csv(os.path.join(output_dir, 'gold_' + system + '_summary.csv'))

    return matches

In [205]:

#             if not inner:
#                 if (((s_begin >= g_begin and s_begin < g_end and s_end > g_end) or 
#                      (s_begin < g_begin and s_end > g_begin and s_end < g_end) or 
#                      (g_begin >= s_begin and g_begin < s_end and g_end > s_end) or 
#                      (g_begin < s_begin and g_end > s_begin and g_end < s_end)) and 
#                     ((s, g) not in c.matches) and g_case == s_case):
#     #            if (((s_begin >= g_begin and s_begin <= g_end) or (g_begin >= s_begin and g_begin <= s_end)) and ((s, g) not in matches) and g_case == s_case):
#                         #print('gold standard anotation: {0}, biomedicus annotation {1}'.format(g, b))
#                     if len(s['text']) <= 2*len(g['text']): 
#                         c.matches.append((s, g))
#                         mMatch = True
#                         break

In [206]:
# co-occurences 
def get_cooccurences(gold_out, system_out, nested = True):
    """Count which gold annotations co-occur with a system annotation.

    For each gold record the system records are scanned in order and the first
    one that qualifies is recorded as its match. A system record qualifies when
    it has the same case, its span relates to the gold span as required by
    ``nested``, the pair is not already recorded, and its text is at most twice
    as long as the gold text.

    Args:
        gold_out: list of gold dicts with ``begin``, ``end``, ``case``, ``text``.
        system_out: list of system dicts with ``begin``, ``end``, ``case``, ``text``.
        nested: when true, one span must lie inside the other (sharing at most
            one boundary); when false, the spans must overlap according to the
            three partial-overlap conditions below.

    Returns:
        A ``Coocurences`` object with the attributes ``matches`` (list of
        ``(system, gold)`` pairs), ``false_negatives`` (list of
        ``(placeholder, gold)`` pairs, the placeholder having only ``None``
        values), ``gold_system_match``, ``gold_only``, ``system_only``
        (``len(system_out) - len(matches)``), ``system_n`` and ``gold_n``.
    """

    class Coocurences(object):
        """Container for the co-occurrence tallies."""
        def __init__(self):
            self.gold_system_match = 0
            self.gold_only = 0
            self.system_only = 0
            self.system_n = 0
            self.gold_n = 0
            self.matches = []
            self.false_negatives = []

    def spans_qualify(sys_start, sys_stop, gold_start, gold_stop):
        if nested:
            return ((sys_start >= gold_start and sys_stop < gold_stop) or
                    (sys_start > gold_start and sys_stop <= gold_stop) or
                    (gold_start >= sys_start and gold_stop < sys_stop) or
                    (gold_start > sys_start and gold_stop <= sys_stop))
        return ((sys_start >= gold_start and sys_start < gold_stop and sys_stop > gold_stop) or
                (sys_start < gold_start and sys_stop > gold_start and sys_stop < gold_stop) or
                (sys_start < gold_start and sys_stop > gold_start and sys_stop > gold_stop))

    tally = Coocurences()

    for gold in gold_out:
        gold_start = int(gold['begin'])
        gold_stop = int(gold['end'])
        gold_case = gold['case']

        for candidate in system_out:
            cand_start = int(candidate['begin'])
            cand_stop = int(candidate['end'])
            cand_case = candidate['case']

            if (gold_case == cand_case
                    and spans_qualify(cand_start, cand_stop, gold_start, gold_stop)
                    and (candidate, gold) not in tally.matches
                    and len(candidate['text']) <= 2*len(gold['text'])):
                tally.matches.append((candidate, gold))
                break
        else:
            # No system record matched this gold record.
            tally.gold_only += 1
            placeholder = dict.fromkeys(('case', 'begin', 'end', 'text', 'system', 'type'))
            tally.false_negatives.append((placeholder, gold))

    match_count = len(tally.matches)
    tally.gold_system_match = match_count
    tally.system_only = len(system_out) - match_count
    tally.system_n = len(system_out)
    tally.gold_n = len(gold_out)

    if len(gold_out) - tally.gold_system_match < 0:
        print(tally.matches)

    return tally


In [207]:
def get_metrics(system_only, gold_only, gold_system_match, system_n):
    """Wrap co-occurrence counts in an object that computes evaluation metrics.

    Args:
        system_only: number of system annotations without a gold match (FP).
        gold_only: number of gold annotations without a system match (FN).
        gold_system_match: number of matched annotations (TP).
        system_n: total number of system annotations.

    Returns:
        A ``Metrics`` instance exposing the four counts as attributes and the
        method ``get_confusion_metrics()``.
    """
    import math
    import numpy as np

    class Metrics(object):
        """
        metrics
        """
        def __init__(self):
            self.system_only = system_only
            self.gold_only = gold_only
            self.gold_system_match = gold_system_match
            self.system_n = system_n

        def get_confusion_metrics(self):
            """Compute the metrics from the confusion matrix ``[[0, FP], [FN, TP]]``.

            Returns:
                tuple ``(F, recall, precision, TP, FP, FN, TP_FN_R, TM)``.
                ``F``, ``recall`` and ``precision`` are length-2 arrays; index 1
                holds the values for the annotated class, i.e.
                ``recall[1] = TP / (FN + TP)`` and
                ``precision[1] = TP / (FP + TP)``. Index 0 is built from the
                zero cell of the matrix, so ``F[0]`` is always ``nan``.
                ``TP_FN_R`` is ``TP / FN`` when ``FN > 0`` and ``TP`` otherwise.
                ``TM`` is ``TP / sqrt(system_n)``.

            Raises:
                ZeroDivisionError: if ``system_n`` is 0.
            """
            TP = self.gold_system_match
            FP = self.system_only
            FN = self.gold_only
            TM = TP/math.sqrt(self.system_n) # TigMetric

            # https://stats.stackexchange.com/questions/51296/how-do-you-calculate-precision-and-recall-for-multiclass-classification-using-co
            c = np.asarray([[0, FP], [FN, TP]])

            # The zero cell makes 0/0 unavoidable at index 0; keep the nan
            # result but do not emit a warning for it on every call.
            with np.errstate(divide='ignore', invalid='ignore'):
                recall = np.diag(c) / np.sum(c, axis = 1)
                precision = np.diag(c) / np.sum(c, axis = 0)
                F = 2*(precision*recall)/(precision + recall)

            # A non-positive FN falls back to TP, so the ratio is always defined.
            TP_FN_R = TP/FN if FN > 0 else TP

            return F, recall, precision, TP, FP, FN, TP_FN_R, TM

    return Metrics()

In [208]:
# def get_metrics(system_only, gold_only, gold_system_match, system_n):
#     import numpy as np
#     import math
#     from scipy import stats
    
#     class Metrics(object):
#         """
#         metrics 
#         """
#         def __init__(self):
#             self = self    

#             self.TN = 0
#             self.TP = gold_system_match
#             self.FP = 0
#             self.FN = 0
#             self.TM = 0
#             self.TP_FN_R = 0
#             self.recall = np.array([]) 
#             self.precision = np.array([])
#             self.F = np.array([])
            
            
#         def get_confusion_metrics(self, system_only, gold_only, gold_system_match):
#             confusion = [[0, system_only],[gold_only,gold_system_match]]
#             c = np.asarray(confusion)
#             recall = np.diag(c) / np.sum(c, axis = 1)
#             precision = np.diag(c) / np.sum(c, axis = 0)
#             F = 2*(precision*recall)/(precision + recall)
            
#             return F, recall, precision
            
#     m = Metrics()

#     #confusion = [[0, system_only],[gold_only,gold_system_match]]
#     #c = np.asarray(confusion)

#     m.TP = gold_system_match
#     m.FP = system_only
#     m.FN = gold_only
#     m.TM = m.TP/math.sqrt(system_n) # TigMetric
    
#     if m.FN == 0:
#         m.TP_FN_R = m.TP
#     elif m.FN > 0:
#         m.TP_FN_R = m.TP/m.FN
    
#     # https://stats.stackexchange.com/questions/51296/how-do-you-calculate-precision-and-recall-for-multiclass-classification-using-co
#     #m.recall = np.diag(c) / np.sum(c, axis = 1)
#     #m.precision = np.diag(c) / np.sum(c, axis = 0)
#     #m.F = 2*(m.precision*m.recall)/(m.precision + m.recall)
    
#     #F, recall, precision = m.get_confusion_metrics(test_only, gold_only, gold_test_match)
    
#     #print('F-score: {0}, precision: {1}, recall: {2}'.format(F, precision, recall))
    
#     return m #, m.get_confusion_metrics(test_only, gold_only, gold_test_match) #precision, recall, F, FN, FP, TP, TP_FN_R, TM

In [209]:
#recall = TP/(FN + TP)

#precision = TP/(FP + TP)
#F = 2*(precision*recall)/(precision + recall)

#print('F-score: {0}, precision: {1}, recall: {2}'.format(F, precision, recall))

In [210]:
def get_evaluation_data(cs, case_config, run_all=True):
    """
    Read the gold (brat ``.ann``) rows and, optionally, the system (``.xmi``)
    annotations for every case of a partition.

    Parameters
    ----------
    cs : configuration object
        Must expose ``gold_path``, ``system_path`` and ``systems`` / ``amicus``.
    case_config : tuple
        ``(cases, txt_directory, partition)``; ``cs.amicus`` is used as the
        system list when ``partition == 'amicus'``, ``cs.systems`` otherwise.
    run_all : bool
        When True, parse ``<system_path><system>_out/<case>-v1.txt.xmi`` for
        every case/system pair. When False only the gold files are read.

    Returns
    -------
    test : pandas.DataFrame
        Raw gold rows (columns ``brat_id``, ``brat_mapping``, ``text``, ``case``).
    system_df : pandas.DataFrame
        Concatenation of the frames returned by ``get_system_annotations``.
    system_out_test : list
        Concatenation of the ``sys_out`` lists returned by
        ``get_system_annotations``, most recently parsed batch first.
    """
    cases, txt_directory, partition = case_config
    gold_directory = cs.gold_path + txt_directory
    colnames = ['brat_id', 'brat_mapping', 'text']

    systems = cs.amicus if partition == 'amicus' else cs.systems

    # Collect the pieces and concatenate once at the end; concatenating inside
    # the loop re-copies everything gathered so far on every iteration.
    gold_frames = []
    system_frames = []
    system_batches = []

    for case in cases:
        if run_all:
            for system in systems:
                system_file = cs.system_path + system + '_out/' + case + '-v1.txt.xmi'

                with open(system_file, 'r') as f:
                    lines = [x.strip() for x in f.readlines()]

                text, xmi = system_sofa(lines)
                df, sys_out = get_system_annotations(xmi, system, case, text)

                system_frames.append(df)
                system_batches.append(sys_out)

        # create dataframe of annotations from .ann
        gold_file = gold_directory + case + '-v1.ann'
        temp = pd.read_table(gold_file, header=None, names=colnames)
        temp['case'] = case
        gold_frames.append(temp)

    test = pd.concat(gold_frames, ignore_index=True) if gold_frames else pd.DataFrame()
    system_df = pd.concat(system_frames, ignore_index=True) if system_frames else pd.DataFrame()

    # The previous implementation prepended each new batch
    # (``sys_out + system_out_test``); keep that newest-first ordering.
    system_out_test = [record for batch in reversed(system_batches) for record in batch]

    return test, system_df, system_out_test

In [211]:
#print(system_out)

In [212]:
'''
get type system metrics against annotation entities
'''

def metrics_out(cs, gold_out, system_out, case_config, nested = True):
    """
    Score every (system, system type, gold entity) combination.

    Parameters
    ----------
    cs : configuration object
        Supplies ``systems`` / ``amicus`` and ``gold_entities``.
    gold_out : list of dict
        Gold records; each is filtered on its ``'gold_entity_type'`` key.
    system_out : list of dict
        System records; each is filtered on its ``'system'`` and ``'type'`` keys.
    case_config : tuple
        ``(cases, txt_directory, partition)``; only ``partition`` is used, to
        pick ``cs.amicus`` (when it equals ``'amicus'``) or ``cs.systems``.
    nested : bool
        Forwarded unchanged to ``get_cooccurences``.

    Returns
    -------
    pandas.DataFrame
        One row per combination whose ``gold_system_match`` is positive;
        an empty frame when nothing matched.
    """
    _, _, partition = case_config
    systems = cs.amicus if partition == 'amicus' else cs.systems

    metrics = pd.DataFrame()

    for system_name in systems:
        for entity in cs.gold_entities:
            gold_records = [g for g in gold_out if g['gold_entity_type'] == entity]

            for type_name in AnnotationSystems().get_system_type(system_name):
                system_records = [
                    s for s in system_out
                    if s['system'] == system_name and s['type'] == type_name
                ]

                # TODO refactor to a class
                c = get_cooccurences(gold_records, system_records, nested)
                if not (c.gold_system_match > 0):
                    continue

                scorer = get_metrics(c.system_only, c.gold_only, c.gold_system_match, c.system_n)
                F, recall, precision, TP, FP, FN, TP_FN_R, TM = scorer.get_confusion_metrics()

                # F / precision / recall are indexable; element 1 is reported
                # (presumably the positive class -- confirm against get_metrics).
                row = pd.DataFrame({'system': system_name,
                                    'type': type_name,
                                    'entity': entity,
                                    'F': F[1],
                                    'precision': precision[1],
                                    'recall': recall[1],
                                    'TP': TP,
                                    'FN': FN,
                                    'FP': FP,
                                    'TP/FN': TP_FN_R,
                                    'n_gold': c.gold_n,
                                    'n_sys': c.system_n,
                                    'TM': TM},
                                   index=[0])

                metrics = pd.concat([metrics, row], ignore_index=True)
                # needed due to duplicates in brat entity list!! TODO: remove
                metrics = metrics.drop_duplicates(keep='last')

    return metrics


In [213]:
# get all annotations
def get_gold_ann_data(cs=None):
    """
    Read every brat ``.ann`` file under ``<cs.gold_path>/all`` into one frame.

    Parameters
    ----------
    cs : configuration object, optional
        Must expose ``gold_path``. When omitted, a notebook-level ``cs``
        variable is used if one exists (the previous implicit behaviour),
        otherwise a fresh ``CaseSystem()``.

    Returns
    -------
    pandas.DataFrame
        Columns ``brat_id``, ``brat_mapping``, ``text`` plus ``case`` (the file
        name up to its first ``.`` and then up to its first ``-``). Files are
        read in sorted name order so the row order is reproducible. An empty
        frame is returned when no ``.ann`` file is found.
    """
    import glob
    import os

    if cs is None:
        # Backwards compatibility: this function used to read a global `cs`.
        cs = globals().get('cs')
        if cs is None:
            cs = CaseSystem()

    # The glob pattern below is built from this directory, so the process
    # working directory is no longer changed as a side effect.
    directory_to_parse = cs.gold_path + '/all'
    colnames = ['brat_id', 'brat_mapping', 'text']

    frames = []
    for fname in sorted(glob.glob(directory_to_parse + '/*.ann')):
        temp = pd.read_table(fname, header=None, names=colnames)
        temp['case'] = os.path.basename(fname).split('.')[0].split('-')[0]
        frames.append(temp)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)
            

In [214]:

# output all annotations: change test variable in cell block above accordingly to get these
# def get_all_annotations():
#     test = get_gold_ann_data()

#     test, span, comp, span_comp = get_gold_annotations(test)
#     entity_relations = get_gold_relations(test, span_comp)
#     entity_attributes = get_gold_attributes(test, span_comp)

#     comp.to_csv('/Users/gregsilverman/development/nlpie_dev/nlp/nlpie/projects/trauma/entity_ann_trauma.csv')
#     entity_relations.to_csv('/Users/gregsilverman/development/nlpie_dev/nlp/nlpie/projects/trauma/entity_relations_ann_trauma.csv')
#     entity_attributes.to_csv('/Users/gregsilverman/development/nlpie_dev/nlp/nlpie/projects/trauma/entity_attributes_ann_trauma.csv')


In [215]:
def _write_sheets_by_category(frame, column, workbook_path):
    """Write ``frame`` to ``workbook_path`` with one worksheet per distinct value of ``column``."""
    # The context manager saves and closes the workbook even if a sheet fails;
    # ExcelWriter.save() no longer exists in pandas >= 2.0.
    with pd.ExcelWriter(workbook_path) as writer:
        for category in sorted(set(frame[column].tolist())):
            frame[frame[column] == category].to_excel(writer, sheet_name=category)


def all_annotations_out(cs, comp, entity_attributes, entity_relations, partition):
    """
    Export the gold annotations to three Excel workbooks in ``cs.output_path``.

    Each workbook holds one worksheet per distinct category value, in sorted
    order:

    * ``brat_annotated_entities_<partition>.xlsx``      -- ``comp`` by ``entity_type``
    * ``brat_annotated_attributes_<partition>.xlsx``    -- ``entity_attributes`` by ``attribute_type``
    * ``brat_annotated_relationships_<partition>.xlsx`` -- ``entity_relations`` by ``relation_type``

    Parameters
    ----------
    cs : configuration object exposing ``output_path``.
    comp, entity_attributes, entity_relations : pandas.DataFrame
        Must contain the category column named above.
    partition : str
        Suffix used in the three file names.

    Returns
    -------
    None
    """
    _write_sheets_by_category(
        comp, 'entity_type',
        cs.output_path + '/brat_annotated_entities_' + partition + '.xlsx')

    _write_sheets_by_category(
        entity_attributes, 'attribute_type',
        cs.output_path + '/brat_annotated_attributes_' + partition + '.xlsx')

    _write_sheets_by_category(
        entity_relations, 'relation_type',
        cs.output_path + '/brat_annotated_relationships_' + partition + '.xlsx')

In [216]:
# entity_attributes.groupby(['attribute_type','attribute_degree'])['attribute_type'].count()

In [217]:
#print(comp.groupby(['entity_type'])['entity_type'].count())

#mask = comp['entity_type'] == 'Age'
#print(comp[mask])

In [218]:
# entity_relations.groupby(['relation_type'])['relation_type'].count()

In [219]:
'''
get annotation text
'''
def get_gold_txt(cs, case_config):
    """
    Read every ``*.txt`` document of a partition.

    Parameters
    ----------
    cs : configuration object exposing ``txt_path``.
    case_config : tuple
        ``(cases, txt_directory, partition)``; only ``txt_directory`` is used.
        The configured ``cases`` list is deliberately left untouched -- the
        previous implementation appended the discovered case ids to it, which
        silently grew the caller's list (e.g. ``cs.training_set``).

    Returns
    -------
    test : pandas.DataFrame
        Columns ``text`` (full file content) and ``case``, one row per file,
        in sorted file-name order.
    cases : list of str
        Case id of every file read, in the same order as the rows of ``test``.
        The id is the file name up to its first ``.`` and then up to its
        first ``-``.
    """
    import glob
    import os

    _, txt_directory, _ = case_config

    records = []
    for fname in sorted(glob.glob(cs.txt_path + txt_directory + '*.txt')):
        case = os.path.basename(fname).split('.')[0].split('-')[0]
        with open(fname) as f:
            records.append({'text': f.read(), 'case': case})

    # Build the frame once; naming the columns keeps them present even when
    # no file was found.
    test = pd.DataFrame(records, columns=['text', 'case'])
    cases = [record['case'] for record in records]

    return test, cases

In [220]:
# create JSON with text and span as keys

def annotations_to_json(span_comp, comp, gold_text, case='AC71231'):
    """
    Print one case's text and annotated spans as a JSON list.

    The printed value is ``[{"spans": [{"start", "end", "label"}, ...], "text": ...}]``.

    Parameters
    ----------
    span_comp, comp : pandas.DataFrame
        Joined on their index; the result must provide ``begin``, ``end``,
        ``case`` (``case_x`` after the join is renamed to ``case``) and
        ``entity_type``.
    gold_text : pandas.DataFrame
        Must have a ``case`` column; the document text is taken from its
        first column.
    case : str
        Case id to export. Defaults to the id that used to be hard-coded.

    Raises
    ------
    ValueError
        If ``gold_text`` has no row for ``case``.
    """
    cols_to_keep = ['begin', 'end', 'case', 'entity_type']

    span = span_comp.merge(comp, how='inner', left_index=True, right_index=True)
    span = span.rename(columns={'case_x': 'case'})[cols_to_keep]

    case_spans = span[span['case'] == case]
    case_text = gold_text[gold_text['case'] == case]

    if case_text.empty:
        raise ValueError('no text found for case {0!r}'.format(case))

    # First column holds the text; when several rows match, the last one wins,
    # as it did when the rows were looped over.
    out_text = re.sub(r'\|', ' ', case_text.iloc[-1, 0])

    spans = [
        {"start": int(begin), "end": int(end), "label": label}
        for begin, end, label in zip(case_spans['begin'],
                                     case_spans['end'],
                                     case_spans['entity_type'])
    ]

    # Key order ("spans" before "text") is kept so the printed JSON is unchanged.
    s = {"spans": spans, "text": out_text}

    print(json.dumps([s]))

In [221]:
# create jsonl, csv for annotated patterns
def annotated_patterns_to_json(cs, comp, case_config):
    """
    Export annotated text patterns as JSONL and as an Excel workbook.

    * ``<cs.output_path>/match_patterns.jsonl`` is opened in append mode and
      receives one ``{"label": entity_type, "pattern": text}`` line per unique
      pattern found in the configured cases, in sorted order.
    * ``<cs.output_path>/annotated_entities_for_patterns.xlsx`` gets one
      worksheet per entity type (from ALL rows of ``comp``, not only the
      configured cases) listing its de-duplicated texts sorted alphabetically.

    Parameters
    ----------
    cs : configuration object exposing ``output_path``.
    comp : pandas.DataFrame
        Must contain ``case``, ``text`` and ``entity_type`` columns.
    case_config : tuple
        ``(cases, txt_directory, partition)``; only ``cases`` is used.

    Returns
    -------
    None
    """
    cases, txt_directory, partition = case_config

    # A set keeps each serialised pattern once across all configured cases.
    selected = comp[comp['case'].isin(cases)]
    patterns = {
        json.dumps({"label": label, "pattern": text})
        for text, label in zip(selected['text'], selected['entity_type'])
    }

    # Open the output once and let the context manager flush/close it; it was
    # previously reopened for every case and never closed.
    with open(cs.output_path + '/match_patterns.jsonl', 'a') as f:
        for p in sorted(patterns):
            f.write(p + '\n')

    # output to spreadsheet by entity
    out = comp[['entity_type', 'text']]

    # ExcelWriter.save() no longer exists in pandas >= 2.0; the context manager
    # saves and closes the workbook on exit.
    with pd.ExcelWriter(cs.output_path + '/annotated_entities_for_patterns.xlsx') as writer:
        for entity in sorted(set(out['entity_type'].tolist())):
            e = (out[out['entity_type'] == entity]
                 .drop_duplicates(keep='last')
                 .sort_values(by=['text']))
            e.to_excel(writer, sheet_name=entity, columns=['text'], index=False)
    
    #print(json.dumps(pattern, sort_keys=True))
    #s.update(d)
#f.close

#print(s)

# test slicing from text
#v = myprint(text, 0, 1540)
#print(v)

In [222]:
class CaseSystem(object):
    """
    Configuration object:
    cases by pilot, training, validation and new sets
    paths by output, gold, txt and system locations
    directories by partition case types
    extensions by gold and system
    systems
    """

    def __init__(self, project_root='/Users/gms/development/nlp/nlpie'):
        """
        Parameters
        ----------
        project_root : str
            Directory that contains ``projects/trauma`` and ``data``. The
            default reproduces the paths that were previously hard-coded.
        """
        # annotation systems to evaluate, and the merged ("amicus") pseudo-system
        self.systems = ['biomedicus', 'clamp', 'ctakes', 'metamap']
        self.amicus = ['amicus']

        # sub-directory (relative to gold_path / txt_path) of each partition
        self.pilot_directory = '/pass_one/'
        self.new_directory = '/new_data/'
        self.training_directory = '/training_set/'
        self.validation_directory = '/validation_set/'
        self.amicus_directory = '/adapt_amicus/'
        self.all_directory = '/all/'

        # base locations
        trauma_path = project_root + '/projects/trauma'
        self.output_path = trauma_path + '/output'
        self.gold_path = trauma_path + '/brat_files'
        self.txt_path = trauma_path + '/txt_files'
        self.system_path = project_root + '/data/'

        # glob suffixes for gold and system files
        self.gold_ftype = '/*.ann'
        self.system_ftype = '/*.txt'

        self.pilot_set = ['AC71231',
                          'AC74787',
                          'AL725222',
                          'AL731702',
                          'AL731704',
                          'AL733357',
                          'AL740639',
                          'AL742150',
                          'AL749501',
                          'AL753536']

        self.training_set = ['AC71231',
                             'AC72114',
                             #'AC72244', # issues with too large file
                             'AC72253',
                             'AC72336',
                             'AC73846',
                             'AC73930',
                             'AC74744',
                             'AC74787',
                             'AC74877',
                             'AC7579',
                             'AC7701',
                             'AL711198',
                             'AL711277',
                             'AL711278',
                             'AL713195',
                             'AL714496',
                             'AL715220',
                             'AL715243',
                             'AL715482',
                             'AL719219',
                             'AL722081',
                             'AL722083',
                             'AL722382',
                             'AL724394',
                             'AL724464',
                             'AL725183',
                             'AL725222',
                             'AL725608',
                             'AL725821',
                             'AL730352',
                             'AL731702',
                             'AL731704',
                             'AL733357',
                             'AL734634',
                             'AL735140',
                             'AL735651',
                             'AL740639',
                             'AL741783',
                             'AL741825',
                             'AL741827',
                             'AL742150',
                             'AL742510',
                             'AL742565',
                             'AL742910',
                             'AL743316',
                             'AL743429',
                             'AL746324',
                             'AL749396',
                             'AL749501',
                             'AL753532',
                             'AL753536',
                             'AL76615',
                             'AL77146',
                             'AX7941',
                             'MS71663',
                             'NP71049',
                             'RF7241',
                             'TD70320',
                             'al76592']

        self.validation_set = ['AC71734',
                               'AC72148',
                               'AC72242',
                               'AC72666',
                               'AC72825',
                               'AC72996',
                               'AC73178',
                               'AC73805',
                               'AC74444',
                               'AC74464',
                               'AC74831',
                               'AC74846',
                               'AL710225',
                               'AL710238',
                               'AL710582',
                               'AL711233',
                               'AL711578',
                               'AL712404',
                               'AL713181',
                               'AL713696',
                               'AL713698',
                               'AL721960',
                               'AL723667',
                               'AL723966',
                               'AL724390',
                               'AL724391',
                               'AL724913',
                               'AL725290',
                               'AL726570',
                               'AL728175',
                               'AL729303',
                               'AL730355',
                               'AL732160',
                               'AL732487',
                               'AL733571',
                               'AL733991',
                               'AL736690',
                               'AL737026',
                               'AL737342',
                               'AL737916',
                               'AL738954',
                               'AL738975',
                               'AL739435',
                               'AL740173',
                               'AL740913',
                               'AL743595',
                               'AL744275',
                               'AL744464',
                               'AL744694',
                               'AL745137',
                               'AL747275',
                               'AL747982',
                               'AL748508',
                               'AL748716',
                               'AL752098',
                               'AL752157',
                               'AL76062',
                               'AL77500',
                               'RF7718',
                               'TG70292',
                               'TW70891',
                               'WS71500',
                               'ac73876']

        self.new_set = ['AC71056',
                        'AC71361',
                        'AC71554',
                        'AC71586',
                        'AC71789',
                        'AC71817',
                        'AC71884',
                        'AC71902',
                        'AC72080',
                        'AC72098',
                        'AC72254',
                        'AC72383',
                        'AC72560',
                        'AC72640',
                        'AC72785',
                        'AC72995',
                        'AC73035',
                        'AC73312',
                        'AC73677',
                        'AC73807',
                        'AC73856',
                        'AC74181',
                        'AC74187',
                        'AC74533',
                        'AC74601',
                        'AC74616',
                        'AC7493',
                        'AC74933',
                        'AC75032',
                        'AC7616',
                        'AC7688',
                        'AC7969',
                        'PZ72028']

        self.gold_entities = ['Age',
                              'AirbagPresence',
                              'DriverPassengerStatus',
                              'EjectFromCar',
                              'Entrapment',
                              'Extricationtime',
                              'Gender',
                              'HeadOn',
                              'IndicationProcedure',
                              'InsuranceStatus',
                              'LocationIntrusion',
                              'OtherMinor',
                              'OtherSevere',
                              'Procedure',
                              'Rollover',
                              'SeatbeltPresence',
                              'SeverityIntrusion',
                              'TBone',
                              'VehicleSpeed']

        self.gold_amicus_entities = ['Age',
                                     'AirbagPresence',
                                     'DriverPassengerStatus',
                                     'HeadOn',
                                     'IndicationProcedure',
                                     'InsuranceStatus',
                                     'OtherMinor',
                                     'Procedure',
                                     'SeatbeltPresence',
                                     'SeverityIntrusion',
                                     'TBone',
                                     'VehicleSpeed']

    # default config to training
    def case_config(self, partition='training'):
        """
        Resolve a partition name to its case list and sub-directory.

        Parameters
        ----------
        partition : str
            One of ``'pilot'``, ``'training'``, ``'validation'``, ``'all'``,
            ``'amicusall'``, ``'amicus'`` or ``'new'``.

        Returns
        -------
        tuple
            ``(cases, txt_directory, partition)``. ``cases`` is a new list, so
            callers may modify it without altering this configuration.

        Raises
        ------
        ValueError
            If ``partition`` is not one of the names above.
        """
        combined = self.training_set + self.validation_set

        # Training + validation without the pilot cases, de-duplicated. Built
        # in list order (not via list(set(...))) so the case order is the same
        # on every run regardless of string hash randomisation.
        pilot = set(self.pilot_set)
        non_pilot = [case for case in dict.fromkeys(combined) if case not in pilot]

        partitions = {
            'pilot': (self.pilot_directory, self.pilot_set),
            'training': (self.training_directory, self.training_set),
            'validation': (self.validation_directory, self.validation_set),
            'all': (self.all_directory, combined),
            'amicusall': (self.amicus_directory, non_pilot),  # run against 4-system output/full set
            'amicus': (self.amicus_directory, non_pilot),     # run against amicus munged set
            'new': (self.new_directory, self.new_set),
        }

        if partition not in partitions:
            raise ValueError('unknown partition {0!r}; expected one of {1}'.format(
                partition, sorted(partitions)))

        txt_directory, cases = partitions[partition]

        return list(cases), txt_directory, partition
        

In [223]:
#cs = CaseSystem()
#cases, txt_directory = cs.case_config('all')

#print(len(cases))
#print(txt_directory)

In [224]:
def gold_system_annotation(m, tp = True):
    """
    Flatten (system, gold) annotation pairs into a DataFrame, one row per pair.

    Parameters
    ----------
    m : iterable of pairs
        Element ``[1]`` of each pair is the gold record (keys ``case``,
        ``gold_entity_type``, ``begin``, ``end``, ``text``). Element ``[0]`` is
        the system record (keys ``case``, ``text``, ``system``, ``type``,
        ``begin``, ``end``) and is only read when ``tp`` is True.
    tp : bool
        True  -> rows describe matches and carry both sides.
        False -> rows describe false negatives: ``'sys text'`` is the literal
        ``'FN'`` and the remaining system columns are ``None``.

    Returns
    -------
    pandas.DataFrame
        Columns ``case, sys text, sys, sys type, sys begin, sys end,
        gold entity, gold begin, gold end, gold text``; empty when ``m`` is.
    """
    records = []

    for pair in m:
        gold = pair[1]

        if tp:
            system = pair[0]
            record = {'case': system['case'], 'sys text': system['text'], 'sys': system['system'],
                      'sys type': system['type'], 'sys begin': system['begin'], 'sys end': system['end']}
        else:
            record = {'case': gold['case'], 'sys text': 'FN', 'sys': None,
                      'sys type': None, 'sys begin': None, 'sys end': None}

        record.update({'gold entity': gold['gold_entity_type'], 'gold begin': gold['begin'],
                       'gold end': gold['end'], 'gold text': gold['text']})
        records.append(record)

    # Build the frame once instead of concatenating a one-row frame per pair.
    return pd.DataFrame(records)

In [225]:
def best_in_class():
    """
    Return the hand-picked best (system, type) combination(s) per gold entity.

    Returns
    -------
    list of dict
        One dict per row of the table below, with keys ``system``, ``type``
        and ``entity`` (in that order).
    """
    table = '''biomedicus Number Age
ctakes Sentence AirbagPresence
ctakes FractionAnnotation DriverPassengerStatus
biomedicus IndefiniteQuantifierCue EjectFromCar
biomedicus OtherAcronymSense Entrapment
ctakes Predicate HeadOn
ctakes SignSymptomMention IndicationProcedure
metamap Candidate InsuranceStatus
ctakes Predicate OtherMinor
biomedicus OtherAcronymSense OtherSevere
ctakes RomanNumeralAnnotation Procedure
biomedicus Acronym Procedure
ctakes Predicate Rollover
metamap Phrase SeatbeltPresence
ctakes SignSymptomMention SeatbeltPresence
biomedicus IndefiniteQuantifierCue SeverityIntrusion
biomedicus Acronym TBone
biomedicus IndefiniteQuantifierCue VehicleSpeed
'''
    fields = ('system', 'type', 'entity')

    # Each line holds exactly three space-separated columns.
    return [dict(zip(fields, row.split(' '))) for row in table.splitlines()]

In [226]:
def geometric_mean(metrics, cs):
    """
    Rank the metrics within each gold entity and combine the ranks.

    For every entity in ``cs.gold_entities`` the rows of ``metrics`` with that
    ``entity`` are ranked in descending order (rank 1 = highest value, ties
    share the average rank) on ``F``, ``TP/FN`` and ``TM``. The three ranks are
    stored as ``F1 rank``, ``TP/FN rank`` and ``TM rank`` and their geometric
    mean as ``Gmean``.

    Parameters
    ----------
    metrics : pandas.DataFrame
        Needs ``entity``, ``F``, ``TP/FN`` and ``TM`` columns unless it is
        empty. It is not modified.
    cs : configuration object exposing ``gold_entities``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``metrics``, grouped in ``cs.gold_entities`` order, with
        the four extra columns. Empty (same columns) when there is nothing
        to rank.
    """
    from scipy.stats.mstats import gmean
    # 1. Group by entity type
    # 2. Get rank average of F1, TP/FN, TM
    # http://www.datasciencemadesimple.com/rank-dataframe-python-pandas-min-max-dense-rank-group/
    # https://stackoverflow.com/questions/46686315/in-pandas-how-to-create-a-new-column-with-a-rank-according-to-the-mean-values-o?rq=1
    # 3. Take geomean of 2.
    # https://stackoverflow.com/questions/42436577/geometric-mean-applied-on-row

    rank_sources = [('F1 rank', 'F'), ('TP/FN rank', 'TP/FN'), ('TM rank', 'TM')]
    rank_columns = [rank_name for rank_name, _ in rank_sources]

    ranked = []
    # metrics_out returns a column-less frame when nothing matched; guard so
    # the 'entity' lookup below cannot raise KeyError.
    if not metrics.empty:
        for g in cs.gold_entities:
            # .copy(): the new columns are added to an independent frame, not
            # to a slice of the caller's `metrics`.
            df = metrics[metrics['entity'] == g].copy()
            if df.empty:
                continue

            for rank_name, source in rank_sources:
                df[rank_name] = df[source].rank(ascending=False, method='average')

            # Select the rank columns by name rather than by position.
            df['Gmean'] = gmean(df[rank_columns].to_numpy(), axis=1)
            ranked.append(df)

    if not ranked:
        extra = [name for name in rank_columns + ['Gmean'] if name not in metrics.columns]
        return pd.DataFrame(columns=list(metrics.columns) + extra)

    return pd.concat(ranked, ignore_index=True)

In [227]:
# print('a')
# test, system, system_out = get_evaluation_data(cs, cs.case_config('pilot'), run_all=True)
# test, span, comp, span_comp = get_gold_annotations(test)
# print('b')

In [228]:
#%%time
import time
# Task: parse xmi, brat cases and do "stuff" with data
def main():
    """Prompt for a run option and execute the matching pipeline step.

    Run options (as listed in the prompt):
      1 -> generate system/gold data and metrics, write them to CSV
      2 -> brat annotations out
      3 -> annotated patterns out (training partition)
      4 -> geometric-mean ranking of a saved metrics CSV
      5 -> top-n best-in-class systems per gold entity, with matches and
           false negatives
      6 -> annotations for a single hard-coded entity of interest
      7 -> recompute metrics from saved system/gold CSVs ('amicus' partition)

    A non-integer or unknown option prints a message and returns without
    doing any work.
    """
    cs = CaseSystem()
    start = time.time()
    try:
        rtype = int(input("Run: 1->generate data; 2->brat ann out; 3->patterns out; 4->metrics; 5->top n bic w/ TP/FN; 6->amicus mash; 7->metrics"))
    except ValueError:
        print('Run option must be an integer between 1 and 7')
        return

    nested = False # used to control type of overlapping match
    nest = 'system_coverage'  # suffix used in the metrics file names

    partition = 'amicusall'

    if rtype == 1:
        _run_full_evaluation(cs, partition, nested, nest, start)
    elif rtype == 2:
        _export_brat_annotations(cs, partition)
    elif rtype == 3:
        _export_training_patterns(cs)
    elif rtype == 4:
        _rank_saved_metrics(cs)
    elif rtype == 5:
        _export_top_n_best_in_class(cs, partition, nest)
    elif rtype == 6:
        _export_entity_of_interest(cs, partition, nest)
    elif rtype == 7:
        _recompute_metrics_from_csv(cs)
    else:
        print('Unknown run option:', rtype)


def _run_full_evaluation(cs, partition, nested, nest, start):
    """Option 1: build system and gold data, score them and write everything to CSV."""
    test, system, system_out = get_evaluation_data(cs, cs.case_config(partition), run_all=True)

    # write to output to save time!
    system.to_csv(cs.output_path + '/system_out'+ partition +'.csv')

    print('end get eval data', (time.time() - start))
    test, span, comp, span_comp = get_gold_annotations(test)

    print('end get gold annotations', (time.time() - start))

    gold = span.merge(comp, how='inner', left_index=True, right_index=True)
    gold.to_csv(cs.output_path + '/gold_out'+ partition + '.csv')

    gold_out = df_to_list(gold, 'gold')

    # 'nested' sets the type of span coverage: True: gold covers system; False: system covers gold
    m = metrics_out(cs, gold_out, system_out, cs.case_config(partition), nested)

    print('end metrics out', (time.time() - start))

    print(m.head())
    # get geometric mean of ranked averages of F1, TP/FN, TM
    metrics = geometric_mean(m, cs)

    print('end geometric mean', (time.time() - start))
    metrics.to_csv(cs.output_path + '/test_metrics'+ partition + nest + '.csv')

    print(gold.head())
    print(system.head())


def _export_brat_annotations(cs, partition):
    """Option 2: hand gold components, attributes and relations to all_annotations_out."""
    test, system, system_out = get_evaluation_data(cs, cs.case_config(partition), run_all=False)
    test, span, comp, span_comp = get_gold_annotations(test)
    entity_relations = get_gold_relations(test, span_comp)
    entity_attributes = get_gold_attributes(test, span_comp)
    all_annotations_out(cs, comp, entity_attributes, entity_relations, partition)


def _export_training_patterns(cs):
    """Option 3: get annotated JSON, matching patterns -> NER label (training partition)."""
    test, system, system_out = get_evaluation_data(cs, cs.case_config('training'), run_all=False)
    test, span, comp, span_comp = get_gold_annotations(test)
    gold_text, cases = get_gold_txt(cs, cs.case_config('training'))
    annotations_to_json(span_comp, comp, gold_text)
    annotated_patterns_to_json(cs, comp, cs.case_config('training'))


def _rank_saved_metrics(cs):
    """Option 4: rank the saved test_metrics.csv and write the result to bic_ranking.csv."""
    data = pd.read_csv(cs.output_path + '/test_metrics.csv')
    # computed once; the ranking was previously calculated twice and the
    # first result thrown away
    geometric_mean(data, cs).to_csv(cs.output_path + '/bic_ranking.csv')


def _recompute_metrics_from_csv(cs):
    """Option 7: recompute metrics for the 'amicus' partition from saved CSVs and print them."""
    partition = 'amicus'
    sys_df = pd.read_csv(cs.output_path + '/system_out'+ partition +'.csv')
    ann = pd.read_csv(cs.output_path + '/gold_out'+ partition +'.csv')

    gold_out = df_to_list(ann, 'gold')
    system_out = df_to_list(sys_df, 'system')

    # last argument is 'nested' (type of span coverage); this option uses True
    m = metrics_out(cs, gold_out, system_out, cs.case_config(partition), True)

    print(m.head())
    # get geometric mean of ranked averages of F1, TP/FN, TM
    metrics = geometric_mean(m, cs)
    print(metrics.head())


def _collect_annotations(ann, sys_df, best_specs, show_matches=False):
    """Stack matches and false negatives for each (system, type, entity) spec.

    ``best_specs`` is a list of dicts with the keys 'system', 'type' and
    'entity'.  For each spec the gold rows of that entity are compared with
    the system rows of that system/type through get_cooccurences; the matches
    and the false negatives are converted with gold_system_annotation and
    appended in that order.  Returns an empty DataFrame when there is nothing
    to collect.
    """
    frames = []
    for b in best_specs:
        gold = ann[ann['entity_type'] == b['entity']]
        system = sys_df[(sys_df['system'] == b['system']) & (sys_df['type'] == b['type'])]

        g1 = df_to_list(gold, 'gold')
        s1 = df_to_list(system, 'system')
        c = get_cooccurences(g1, s1)

        matched = gold_system_annotation(c.matches)
        if show_matches:
            print(matched.head(10))
        missed = gold_system_annotation(c.false_negatives, False)

        frames.extend([matched, missed])

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


def _write_entity_sheets(writer, annotations, entities):
    """Write one worksheet per entity that has rows in ``annotations``."""
    # nothing was collected: there is no 'gold entity' column to filter on
    if 'gold entity' not in annotations.columns:
        return
    for ge in entities:
        t = annotations[annotations['gold entity'] == ge]
        if not t.empty:
            t.to_excel(writer,sheet_name=ge)


def _export_top_n_best_in_class(cs, partition, nest):
    """Option 5: export the n lowest-Gmean system types per gold entity with their annotations."""
    n = input('Select top n:')

    print('Processing top' + n + ' for best system types:')
    # validated up front; the raw string is still used in file and sheet names
    top_n = int(n)

    # get system -> brat annotations
    sys_df = pd.read_csv(cs.output_path + '/system_out'+ partition +'.csv')
    ann = pd.read_csv(cs.output_path + '/gold_out'+ partition +'.csv')
    data = pd.read_csv(cs.output_path + '/test_metrics'+ partition + nest +'.csv')

    best_specs = []
    top_frames = []
    for g in cs.gold_entities:
        top = data[data['entity'] == g].sort_values(by=['Gmean']).head(top_n)
        best_specs.extend(top[['system', 'type', 'entity']].to_dict('records'))
        top_frames.append(top)

    if top_frames:
        top_summary = pd.concat(top_frames, ignore_index=True)
    else:
        top_summary = pd.DataFrame()

    annotations = _collect_annotations(ann, sys_df, best_specs)
    annotations.to_csv(cs.output_path + '/best_in_class_annotations_top_' + n + '_' + partition +'.csv')

    # the context manager saves and closes the workbook (ExcelWriter.save()
    # no longer exists in pandas 2.x)
    with pd.ExcelWriter(cs.output_path + '/best_in_class_'+ partition +'_draft.xlsx') as writer:
        top_summary.to_excel(writer,sheet_name='Top ' + n + ' sys annotations')
        _write_entity_sheets(writer, annotations, cs.gold_entities)


def _export_entity_of_interest(cs, partition, nest):
    """Option 6: export all annotations of the chosen system types for one entity of interest."""
    # get system -> brat annotations
    sys_df = pd.read_csv(cs.output_path + '/system_out'+ partition +'.csv')
    ann = pd.read_csv(cs.output_path + '/gold_out'+ partition +'.csv')

    entity_of_interest =  ['SeverityIntrusion'] #['IndicationProcedure']

    # only choose those from pilot eval; alternatives noted by the author:
    #   ['IndefiniteQuantifierCue','StandaloneQuantifier','Number']  SeverityIntrusion -> gold coverage
    #   ['SignSymptomMention','UmlsConcept']                         IndicationProcedure -> gold coverage
    #   ['Sentence','Phrase']                                        IndicationProcedure -> sys coverage
    chosen_best = ['Sentence'] # SeverityIntrusion -> sys coverage

    data = pd.read_csv(cs.output_path + '/test_metrics'+ partition + nest +'.csv')

    best_specs = []
    selected_frames = []
    for g in entity_of_interest:
        selected = data[(data['entity'] == g) & (data['type'].isin(chosen_best))]
        best_specs.extend(selected[['system', 'type', 'entity']].to_dict('records'))
        selected_frames.append(selected)

        print(best_specs)

    if selected_frames:
        summary = pd.concat(selected_frames, ignore_index=True)
    else:
        summary = pd.DataFrame()

    print(summary)

    annotations = _collect_annotations(ann, sys_df, best_specs, show_matches=True)
    annotations.to_csv(cs.output_path + '/best_in_class_annotations_' + partition +'.csv')

    # the context manager saves and closes the workbook (ExcelWriter.save()
    # no longer exists in pandas 2.x)
    with pd.ExcelWriter(cs.output_path + '/best_in_class_all_'+ partition + ' ' + entity_of_interest[0] + nest +'.xlsx') as writer:
        summary.to_excel(writer,sheet_name='All sys annotations', engine='openpyxl')
        _write_entity_sheets(writer, annotations, entity_of_interest)


# Start the interactive pipeline only when this code runs as the main program
# (not when it is imported as a module).
if __name__ == '__main__':
    main()

Run: 1->generate data; 2->brat ann out; 3->patterns out; 4->metrics; 5->top n bic w/ TP/FN; 6->amicus mash; 7->metrics 6


[{'system': 'clamp', 'type': 'Sentence', 'entity': 'SeverityIntrusion'}, {'system': 'ctakes', 'type': 'Sentence', 'entity': 'SeverityIntrusion'}]
   Unnamed: 0  system      type             entity         F  precision  \
0         147   clamp  Sentence  SeverityIntrusion  0.006479   0.003255   
1         150  ctakes  Sentence  SeverityIntrusion  0.002391   0.001200   

     recall  TP  FN     FP     TP/FN  n_gold  n_sys        TM  F1 rank  \
0  0.661290  82  42  25108  1.952381     124  25190  0.516654      1.0   
1  0.306452  38  86  31619  0.441860     124  31657  0.213574      3.0   

   TP/FN rank  TM rank     Gmean  
0         1.0      1.0  1.000000  
1         3.0      2.0  2.620741  
       case                                           sys text    sys  \
0  AL725290                                         site; > 18  clamp   
1  AL738975                                                in.  clamp   
2  AL738975                                                in.  clamp   
3   TD70

In [229]:
# Best-in-class selections, one "<system> <annotation type> <gold entity>"
# triple per line, flattened into a single list of space-separated tokens.
best ='''biomedicus Number Age
ctakes Sentence AirbagPresence
ctakes FractionAnnotation DriverPassengerStatus
biomedicus IndefiniteQuantifierCue EjectFromCar
biomedicus OtherAcronymSense Entrapment
ctakes Predicate HeadOn
ctakes SignSymptomMention IndicationProcedure
metamap Candidate InsuranceStatus
ctakes Predicate OtherMinor
biomedicus OtherAcronymSense OtherSevere
ctakes RomanNumeralAnnotation Procedure
ctakes Predicate Rollover
metamap Phrase SeatbeltPresence
ctakes SignSymptomMention SeatbeltPresence
biomedicus IndefiniteQuantifierCue SeverityIntrusion
biomedicus Acronym TBone
biomedicus IndefiniteQuantifierCue VehicleSpeed
'''.replace('\n',' ').split(' ')

#print(best)

# Walk the token list three at a time and turn each complete triple into a
# record; the empty token left by the final newline never completes a triple
# and is therefore dropped.
best_of = [
    {'system': best[pos], 'type': best[pos + 1], 'entity': best[pos + 2]}
    for pos in range(0, len(best) - 2, 3)
]
        

In [230]:
# Sanity check on the case partitions held by CaseSystem.
c = CaseSystem()

# number of cases in the training set
print(len(c.training_set))

# number of pilot cases that are also in the training set; as the last
# expression of the cell this is the value the notebook displays
len(set(c.pilot_set).intersection(set(c.training_set)))

59


10

In [231]:
#sorted([line.strip() for line in open("/Users/gms/development/nlp/nlpie/projects/trauma/brat_files/new_data/out.txt", 'r')])