In [1]:
import pandas as pd
import numpy as np
import math
import pymysql
import time 
import functools as ft
import glob   
import operator as op
import shelve
from itertools import combinations, product
from sqlalchemy.engine import create_engine
from datetime import datetime
from ast import literal_eval
from scipy import stats  
from scipy.stats.mstats import gmean
from pythonds.basic.stack import Stack
from pythonds.trees.binaryTree import BinaryTree, height #, clear_tree_out, post_order
from collections import defaultdict

In [2]:
# config class for analysis
class AnalysisConfig(object):
    """
    Notebook-wide configuration.

    Attributes:
        systems: names of the NLP systems to evaluate.
        data_dir: directory that holds the system annotation CSV files; metric
            and per-system match files are written beneath it.
    """
    def __init__(self):
        # Other selections used during development:
        #   ['biomedicus', 'clamp', 'ctakes', 'metamap', 'quick_umls'],
        #   ['metamap'], ['biomedicus'], ['quick_umls']
        self.systems = ['biomedicus', 'clamp', 'ctakes', 'metamap']
        self.data_dir = '/Users/gms/development/nlp/nlpie/data/amicus-u01/output/'

    def corpus_config(self, corpus):
        """
        Return ``(usys_data, ref_data)`` for ``corpus``.

        ``usys_data`` is the name of the system-annotation CSV file and
        ``ref_data`` the name of the reference-annotation table.

        Raises:
            ValueError: if ``corpus`` is neither 'mipacq' nor 'i2b2'
                (previously this surfaced as an UnboundLocalError).
        """
        if corpus == 'mipacq':
            usys_data = 'analytical_cui_mipacq_concepts.csv'
            ref_data = 'test.mipacq_all'
        elif corpus == 'i2b2':
            usys_data = 'analytical_cui_i2b2_concepts.csv'
            ref_data = 'test.i2b2_all'
        else:
            raise ValueError("unknown corpus: %r (expected 'mipacq' or 'i2b2')" % (corpus,))

        return usys_data, ref_data
        
# Corpus analysed by the rest of the notebook, and the shared configuration object.
corpus = 'mipacq'
analysisConf = AnalysisConfig()

# System-annotation file and reference table for the active corpus.
usys, ref = analysisConf.corpus_config(corpus)
print(analysisConf.systems, (usys, ref))

['biomedicus', 'clamp', 'ctakes', 'metamap'] ('analytical_cui_mipacq_concepts.csv', 'test.mipacq_all')


In [3]:
# annotation class for UIMA systems
class AnnotationSystems(object):
    """
    CAS XMI annotation types of interest, and output directory, per system.

    For every system ``<name>`` the instance exposes ``<name>_types`` (the list
    of annotation type names to evaluate) and, except for quick_umls,
    ``<name>_dir`` (the name of that system's output directory).
    """
    def __init__(self):
        # --- biomedicus ---
        # Not selected: "biomedicus.v2.Negated", "biomedicus.v2.Acronym",
        #               "biomedicus.v2.DictionaryTerm", "biomedicus.v2.Historical"
        self.biomedicus_dir = "biomedicus_out/"
        self.biomedicus_types = ["biomedicus.v2.UmlsConcept"]

        # --- clamp ---
        # Not selected: "org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode",
        #               "edu.uth.clamp.nlp.typesystem.ClampRelationUIMA"
        self.clamp_dir = "clamp_out/"
        self.clamp_types = ["edu.uth.clamp.nlp.typesystem.ClampNameEntityUIMA"]

        # --- ctakes ---
        # Not selected (all under org.apache.ctakes.typesystem.type):
        #   textspan.Sentence, textsem.DiseaseDisorderMention, textsem.MedicationMention,
        #   textsem.ProcedureMention, refsem.UmlsConcept, textsem.SignSymptomMention,
        #   textsem.AnatomicalSiteMention, textsem.MeasurementAnnotation,
        #   textsem.EventMention, textsem.EntityMention, textsem.Predicate,
        #   syntax.WordToken
        self.ctakes_dir = "ctakes_out/"
        self.ctakes_types = ['ctakes_mentions_all']

        # --- metamap ---
        # Not selected (all under org.metamap.uima.ts):
        #   Utterance, Span, Phrase, CuiConcept, Negation
        self.metamap_dir = "metamap_out/"
        self.metamap_types = ["org.metamap.uima.ts.Candidate"]

        # --- quick_umls (no output directory is configured) ---
        # Not selected: 'concept' and the 'concept_<similarity>_<overlap>_<best_match>'
        #   variants for cosine / dice / jaccard, length / score, true / false
        self.quick_umls_types = ['concept_jaccard_score_False']

        # A second configuration used to be kept here as a disabled string block.
        # It selected, instead of the types above:
        #   biomedicus: Acronym, DictionaryTerm, Historical
        #   ctakes:     textspan.Sentence, textsem.Predicate, syntax.WordToken
        #   metamap:    Utterance, Span, Phrase
        # and left the clamp types disabled.

    def get_system_type(self, system):
        """
        Return ``(types, view, output)`` for ``system``.

        ``types`` is the system's list of annotation type names, ``view`` is
        "Analysis" for biomedicus and "_InitialView" for every other system
        (presumably the CAS view name -- confirm), and ``output`` is the
        system's output directory name (``None`` for quick_umls).

        Raises:
            ValueError: if ``system`` is not a known system (previously this
                surfaced as an UnboundLocalError).
        """
        known = {
            'biomedicus': (self.biomedicus_types, self.biomedicus_dir),
            'clamp': (self.clamp_types, self.clamp_dir),
            'ctakes': (self.ctakes_types, self.ctakes_dir),
            'metamap': (self.metamap_types, self.metamap_dir),
            'quick_umls': (self.quick_umls_types, None),
        }
        if system not in known:
            raise ValueError("unknown system: %r (expected one of %s)" % (system, sorted(known)))

        types, output = known[system]
        view = "Analysis" if system == "biomedicus" else "_InitialView"

        return types, view, output

annSys = AnnotationSystems()

In [4]:
# qumls = pd.read_csv('/Users/gms/development/nlp/nlpie/data/amicus-u01/output/qumls_similarity.csv')

# #print(len(qumls))
# #print(len(qumls.drop_duplicates()))

# cols_to_keep = ['begin', 'end', 'note_id', 'cui', 'similarity', 'overlap', 'best_match', 'system', 'type'] 

# qumls = qumls[cols_to_keep].drop_duplicates()

# #print(len(qumls))

# #print(len(qumls[['overlap']=='score']))
# print(len(qumls[(qumls.similarity=='jaccard') & (qumls.overlap=='length') & (qumls.best_match=='true')].drop_duplicates()))
# print(len(qumls[(qumls.similarity=='jaccard') & (qumls.overlap=='length') & (qumls.best_match=='false')].drop_duplicates()))
# print(len(qumls[(qumls.similarity=='jaccard') & (qumls.overlap=='score') & (qumls.best_match=='true')].drop_duplicates()))
# print(len(qumls[(qumls.similarity=='jaccard') & (qumls.overlap=='score') & (qumls.best_match=='false')].drop_duplicates()))

# print(len(qumls[(qumls.similarity=='cosine') & (qumls.overlap=='length') & (qumls.best_match=='true')].drop_duplicates()))
# print(len(qumls[(qumls.similarity=='cosine') & (qumls.overlap=='length') & (qumls.best_match=='false')].drop_duplicates()))
# print(len(qumls[(qumls.similarity=='cosine') & (qumls.overlap=='score') & (qumls.best_match=='true')].drop_duplicates()))
# print(len(qumls[(qumls.similarity=='cosine') & (qumls.overlap=='score') & (qumls.best_match=='false')].drop_duplicates()))

# print(len(qumls[(qumls.similarity=='dice') & (qumls.overlap=='length') & (qumls.best_match=='true')].drop_duplicates()))
# print(len(qumls[(qumls.similarity=='dice') & (qumls.overlap=='length') & (qumls.best_match=='false')].drop_duplicates()))
# print(len(qumls[(qumls.similarity=='dice') & (qumls.overlap=='score') & (qumls.best_match=='true')].drop_duplicates()))
# print(len(qumls[(qumls.similarity=='dice') & (qumls.overlap=='score') & (qumls.best_match=='false')].drop_duplicates()))

In [5]:
%reload_ext Cython

In [6]:
def get_notes(analysis_type, corpus):
    """
    Return the note ids used to select data for an analysis.

    When ``analysis_type`` contains 'test' the result is a small fixed set of
    test notes (get_metric_data keeps only these notes). Otherwise the result
    is the list of notes that get_metric_data excludes; for mipacq these are
    the notes that did not meet the minimal criteria for parsing.

    Args:
        analysis_type: analysis name; only the presence of the substring
            'test' is inspected here.
        corpus: 'mipacq' or 'i2b2'.

    Returns:
        A new list of note id strings.

    Raises:
        ValueError: if ``corpus`` is not a known corpus.
        NotImplementedError: for a test analysis on 'i2b2', for which no test
            notes are defined. (Both cases previously failed with an
            UnboundLocalError at the return statement.)
    """
    if corpus not in ('mipacq', 'i2b2'):
        raise ValueError("unknown corpus: %r (expected 'mipacq' or 'i2b2')" % (corpus,))

    if 'test' in analysis_type:
        if corpus == 'i2b2':
            raise NotImplementedError("no test notes are defined for corpus 'i2b2'")

        # test set of notes
        notes = ['522412787',
                 '617637585',
                 '3307880735-8',
                 '9080688558',
                 '618370565',
                 '573718188',
                 '534584',
                 '60891',
                 '62620',
                 '616172834']
        # Further candidate test notes, currently disabled:
        #   '4130154172-4', '3580478614', '5024581165-5', '4486835700-9',
        #   '534828617', '8154986253', '533855209', '60118', '3537704982-3',
        #   '617637585', '60118', '9045889026', '8819868493-8', '533698',
        #   '535978760'
        print('TEST NOTES!')
        return notes

    if corpus == 'mipacq':
        # these did not meet the minimal criteria for parsing
        return ["0595040941-0",
                "0778429553-0",
                "1014681675",
                "2889522952-2",
                "3080383448-5",
                "3300000926-3",
                "3360037185-3",
                "3580973392",
                "3627629462-3",
                "4323116051-4",
                "477704053-4",
                "528317073",
                "531702602",
                "534061073",
                "54832076",
                "5643725437-6",
                "5944412090-5",
                "6613169476-6",
                "7261075903-7",
                "7504944368-7",
                "7999462393-7",
                "8131081430",
                "8171084310",
                "8193787896",
                "8295055184-8",
                "8823185307-8"]

    # corpus == 'i2b2'
    return ['0081',
            '0401']
# Show the note ids get_notes() returns for a non-test ('entity') analysis of the
# active corpus (the module-level `corpus`).
print(get_notes('entity', corpus))

['0595040941-0', '0778429553-0', '1014681675', '2889522952-2', '3080383448-5', '3300000926-3', '3360037185-3', '3580973392', '3627629462-3', '4323116051-4', '477704053-4', '528317073', '531702602', '534061073', '54832076', '5643725437-6', '5944412090-5', '6613169476-6', '7261075903-7', '7504944368-7', '7999462393-7', '8131081430', '8171084310', '8193787896', '8295055184-8', '8823185307-8']


In [7]:
%%cython

import numpy as np # access to Numpy from Python layer
import math

class Metrics(object):
    """
    Confusion-matrix metrics for one system evaluated against the gold standard.

    Args:
        system_only: annotations found only by the system (false positives).
        gold_only: annotations found only in the gold standard (false negatives).
        gold_system_match: annotations found by both (true positives).
        system_n: total number of system annotations.
        neither: count for the "neither" cell of the matrix (true negatives);
            defaults to 0.
    """
    def __init__(self, system_only, gold_only, gold_system_match, system_n, neither = 0):
        self.system_only = system_only
        self.gold_only = gold_only
        self.gold_system_match = gold_system_match
        self.system_n = system_n
        self.neither = neither

    def get_confusion_metrics(self, test = False):
        """
        Compute confusion matrix measures, as per
        https://stats.stackexchange.com/questions/51296/how-do-you-calculate-precision-and-recall-for-multiclass-classification-using-co

        Args:
            test: when false (default) F, recall and precision are computed per
                class from the 2x2 matrix and returned as length-2 arrays whose
                element [1] is the positive class; when true they are scalars
                computed from TP, FP and FN.

        Returns:
            ``(F, recall, precision, TP, FP, FN, TP_FN_R, TM)`` where
            ``TP_FN_R`` is TP/FN (TP itself when FN is not positive) and ``TM``
            is TP / sqrt(system_n) (0.0 when system_n is not positive).
        """
        # Plain Python ints instead of C-typed ints: `/` below is then true
        # division when run as Python 3 and does not depend on how the Cython
        # cell is compiled (language_level / cdivision).
        TP = int(self.gold_system_match)
        FP = int(self.system_only)
        FN = int(self.gold_only)

        # TigMetric; guarded so an empty system result does not divide by zero
        TM = TP / math.sqrt(self.system_n) if self.system_n > 0 else 0.0

        if not test:
            # rows: gold negative / positive; columns: system negative / positive
            c = np.asarray([[self.neither, self.system_only],
                            [self.gold_only, self.gold_system_match]])
            recall = np.diag(c) / np.sum(c, axis = 1)
            precision = np.diag(c) / np.sum(c, axis = 0)
            F = 2*(precision*recall)/(precision + recall)
        else:
            # scalar variant; a zero denominator yields 0.0 instead of raising
            precision = TP/(TP+FP) if (TP+FP) else 0.0
            recall = TP/(TP+FN) if (TP+FN) else 0.0
            F = 2*(precision*recall)/(precision + recall) if (precision + recall) else 0.0

        # Tignanelli Metric: TP per FN, or TP itself when there are no FN
        if FN > 0:
            TP_FN_R = TP/FN
        else:
            TP_FN_R = TP

        return F, recall, precision, TP, FP, FN, TP_FN_R, TM

In [8]:
def write_out(name, analysis_type, c):
    """
    Save a system's matching and reference-only annotation sets as text files.

    Two files are written under ``analysisConf.data_dir + 'single_system_out/'``,
    named ``<name>_<analysis_type>_<corpus>_matches.txt`` and
    ``<name>_<analysis_type>_<corpus>_ref_only.txt``, with one ``str()``
    rendering of an item per line. They make it easy to merge system
    combinations later.

    Args:
        name: system name used as the file-name prefix.
        analysis_type: analysis name used in the file name.
        c: object providing ``corpus``, ``matches`` and ``false_negatives``.
    """
    stem = analysisConf.data_dir + 'single_system_out/' + name + '_' + analysis_type + '_' + c.corpus

    def _dump(path, items):
        # one item per line, rendered with str()
        with open(path, 'w') as handle:
            for entry in list(items):
                handle.write("%s\n" % str(entry))

    # matches first, then the reference-only (false negative) set
    _dump(stem + '_matches.txt', c.matches)
    _dump(stem + '_ref_only.txt', c.false_negatives)

In [9]:
#%%cython 

#from __main__ import write_out

#import numpy as np # access to Numpy from Python layer
def label_vector(doc, ann, labels) -> np.ndarray:
    """
    Build a per-position label vector for one document.

    Args:
        doc: length of the vector to create (passed to ``np.zeros``).
        ann: iterable of annotations exposing ``begin``, ``end`` and ``label``
            attributes; positions ``begin`` .. ``end - 1`` are labelled.
        labels: iterable of label values. The k-th label (1-based, in
            iteration order) is encoded as the number k; 0 means "no label".
            Note that an unordered collection such as a set gives an
            arbitrary encoding.

    Returns:
        Float array of length ``doc``. Where annotations with different labels
        overlap, the label that comes later in ``labels`` wins, exactly as if
        the labels had been written one after another.

    Raises:
        IndexError: if an annotation reaches past the end of the vector.
    """
    v = np.zeros(doc)

    # label -> code, 0 is reserved for no label. A label that is not equal to
    # itself (NaN) can never match an annotation under ==, so it gets no code.
    code_of = {lab: code for code, lab in enumerate(labels, start=1) if lab == lab}

    # Single pass over the annotations instead of one scan per label; taking
    # the maximum reproduces "later labels overwrite earlier ones".
    for a in ann:
        code = code_of.get(a.label)
        if code is None:
            continue
        idx = np.arange(a.begin, a.end)
        v[idx] = np.maximum(v[idx], code)

    return v

# test confusion matrix elements for vectorized annotation set; includes TN
def confused(sys1, ann1):
    """
    Count confusion-matrix cells for two equally long label vectors.

    Args:
        sys1: system label vector (0 = no label, >= 1 = a label code).
        ann1: reference label vector with the same encoding.

    Returns:
        ``(TP, TN, FP, FN)`` position counts. Positions where both vectors
        carry a label but the codes differ are counted in none of the cells.
    """
    sys_labeled = sys1 >= 1
    sys_blank = sys1 == 0
    ref_blank = ann1 == 0

    # True Positive: the system assigns a label and it equals the reference label.
    TP = np.sum(np.logical_and(sys_labeled, ann1 == sys1))

    # True Negative: neither side assigns a label.
    TN = np.sum(np.logical_and(sys_blank, ref_blank))

    # False Positive: the system assigns a label where the reference has none.
    FP = np.sum(np.logical_and(sys_labeled, ref_blank))

    # False Negative: the reference has a label where the system has none.
    FN = np.sum(np.logical_and(sys_blank, ann1 >= 1))

    return TP, TN, FP, FN


def get_cooccurences(ref, sys, analysis_type, corpus, single_sys = True, name = None):
    """
    Count exact-match co-occurrences between system and reference annotations.

    The function does two things in order:
      1. runs ``test_io`` (a per-character labelling check), which also stores
         its label vectors in a shelve file and prints confusion-matrix metrics;
      2. computes exact matches and reference-only annotations by joining the
         two frames on span and note id, and records the counts.

    Relaxed (non-exact) matching is still TODO.

    Args:
        ref: reference annotations; the columns read here are ``file``,
            ``start``, ``end`` and ``value``.
        sys: system annotations; the columns read here are ``note_id``,
            ``begin``, ``end`` and ``cui``. A ``name`` attribute must already be
            set on the frame, because ``test_io`` reads ``sys.name`` before it
            is assigned further down.
        analysis_type: string containing 'entity', 'cui' or 'full'.
        corpus: corpus name, stored on the result (``write_out`` reads it).
        single_sys: when true, the inputs are de-duplicated on the analysis
            columns and the match / reference-only sets are written to file.
        name: system name assigned to ``sys.name`` in the single-system case.

    Returns:
        A ``Coocurences`` instance carrying the match and false-negative sets
        and the derived counts.

    NOTE(review):
      - ``test_io`` only defines ``docs`` when ``analysis_type`` is exactly
        'entity' or 'full'; any other value fails inside ``test_io``.
      - ``test_io`` reads i2b2 source files for 'entity' and mipacq source
        files for 'full', independent of ``corpus`` -- confirm this is intended.
      - both joins below use begin/end/note_id, but the 'cui' branch reduces
        ``sys`` to cui/note_id first -- presumably that path fails; confirm.
      - for 'full' the join does not compare ``cui`` with ``value`` -- confirm.
      - ``df_to_set`` is not defined in this part of the notebook.
    """
    # container for the co-occurrence counts and sets
    class Coocurences(object):
        """Mutable record of match counts and match / false-negative sets."""

        def __init__(self):
            self.ref_system_match = 0
            self.ref_only = 0
            self.system_only = 0
            self.system_n = 0
            self.ref_n = 0
            self.matches = set()
            self.false_negatives = set()
            self.corpus = corpus
            self.cases = set(ref["file"].tolist()) # cases to label 

    c = Coocurences()
    
    # test for converting to vectorization and i-o labeling
    def test_io():
        """
        Label every character of every reference case for both annotation sets.

        Returns a list of ``[TP, TN, FP, FN]`` per document and, as a side
        effect, stores the label vectors, case ids and labels in a shelve file
        named after ``sys.name``.
        """
        test = c.cases
        # (case id, document length in characters) per case
        # NOTE(review): the files opened here are never closed explicitly.
        if analysis_type == 'entity':
            docs = [(x, len(open("/Users/gms/development/nlp/nlpie/data/amicus-u01/i2b2/source_data/test_data/" + x + ".txt", 'r').read())) for x in test]
        elif analysis_type == 'full':
            docs = [(x, len(open("/Users/gms/development/nlp/nlpie/data/amicus-u01/mipacq/source_data/source/" + x + ".source", 'r').read())) for x in test]

        # reference annotations renamed to the begin/end/case/label schema
        ann = ref.copy()
        ann = ann.rename(index=str, columns={"start": "begin", "file": "case"}).copy()
        cols_to_keep = ['begin', 'end', 'case', 'label']
        if analysis_type == 'entity':
            # entity-only: a single generic label
            labels = ["concept"]
            ann["label"] = 'concept'
            ann = ann[cols_to_keep].copy()
        elif analysis_type == 'full':  
            # full: the label is the reference value on one side and the cui on the other
            ann["label"] = ann["value"]
            # NOTE(review): this writes a column into the caller's `sys` frame
            sys["label"] = sys["cui"]
            labels = set(ref['value'].tolist())
            print('labels', len(set(labels)))

        sys_ = sys.rename(index=str, columns={"note_id": "case"}).copy()
        
        # needed for entity-only
        if analysis_type == 'entity':
            sys_["label"] = 'concept'
        
        sys_ = sys_[cols_to_keep]
       
        # NOTE(review): tp, tn, fp, fn, out and t are never used below
        tp = []
        tn = []
        fp = []
        fn = []
        cvals = []
        out = []
        t = []
        d = defaultdict(list)
        
        for n in range(len(docs)):
            # annotations of this case as row tuples (attribute access in label_vector)
            a1 = [i for i in ann[ann["case"] == docs[n][0]].copy().itertuples(index=False)]
            s1 = [i for i in sys_[sys_["case"] == docs[n][0]].copy().itertuples(index=False)]

            ann1 = label_vector(docs[n][1], a1, labels)
            sys1 = label_vector(docs[n][1], s1, labels)
            
            TP, TN, FP, FN = confused(sys1, ann1)
            cvals.append([TP, TN, FP, FN])
            
                 
            # keep the integer label vectors for later merging experiments
            d['sys'].append(list([int(i) for i in sys1]))
            d['oracle'].append(list([int(i) for i in ann1]))
            d['case'].append(docs[n][0])
            
            '''
            print("tn:", np.intersect1d(np.where(ann1 == 0)[0], np.where(sys1 == 0)[0]),  
                  "tp:", np.intersect1d(np.where(ann1 == 1)[0], np.where(sys1 == 1)[0]), 
                  "fn:", np.intersect1d(np.where(ann1 == 1)[0], np.where(sys1 == 0)[0]), 
                  "fp:", np.intersect1d(np.where(ann1 == 0)[0], np.where(sys1 == 1)[0]))
            '''
        d['labels'] = labels
        
        # persist under the system's name; requires sys.name to be set by the caller
        corp = shelve.open('/Users/gms/Desktop/' + sys.name + '.dat')
        
        for k in d:
            corp[k] = d[k]
        
        corp.close()
       
        return cvals
    
    # per-character totals over all documents; TN is passed to Metrics as `neither`
    TP, TN, FP, FN = np.sum(test_io(), axis=0)
    F, recall, precision, TP, FP, FN, TP_FN_R, TM = Metrics(FP, FN, TP, len(sys), TN).get_confusion_metrics() # TN is passed as `neither`
    print('test_io():', TP, TN, FP, FN, F, recall, precision)
    
    # non-vectorized: reduce both frames to the columns that define a unique annotation
    if 'entity' in analysis_type and single_sys: # mipacq n -> 16793
        cols_to_keep = ['begin', 'end', 'note_id']
        sys = sys[cols_to_keep].drop_duplicates()
        ref = ref[['start', 'end', 'file']].drop_duplicates()
        sys.name = name
    elif 'cui' in analysis_type and single_sys: # mipacq n -> 10799
        cols_to_keep = ['cui', 'note_id']
        sys = sys[cols_to_keep].drop_duplicates()
        # do not overestimate FP
        sys = sys[~sys['cui'].isnull()] 
        ref = ref[['value', 'file']].drop_duplicates()
        ref = ref[~ref['value'].isnull()]
        sys.name = name
    elif 'full' in analysis_type and single_sys: # mipacq n -> 17393
        cols_to_keep = ['begin', 'end', 'cui', 'note_id']
        sys = sys[cols_to_keep].drop_duplicates()
        sys = sys[~sys['cui'].isnull()]
        ref = ref[['start', 'end', 'value', 'file']].drop_duplicates()
        ref = ref[~ref['value'].isnull()]
        sys.name = name
    
    # matches via inner join
    matches = pd.merge(sys, ref, how = 'inner', left_on=['begin','end','note_id'], right_on = ['start','end','file']) 
    # reference-only via left outer join
    fn = pd.merge(ref, sys, how = 'left', left_on=['start','end','file'], right_on = ['begin','end','note_id']) 
    
    fn = fn[fn['begin'].isnull()] # reference rows with no system match
    
    # columns (in reference naming) that identify a match / false negative
    if 'entity' in analysis_type and single_sys:
        cols_to_keep = ['start', 'end', 'file']
    else:
        cols_to_keep = ['start', 'end', 'value', 'file']
        
        
    matches = matches[cols_to_keep]
    fn = fn[cols_to_keep]
    
    # use for metrics 
    c.matches = c.matches.union(df_to_set(matches, analysis_type, 'ref'))
    c.false_negatives = c.false_negatives.union(df_to_set(fn, analysis_type, 'ref'))
    c.ref_system_match = len(c.matches)
    c.system_only = len(sys) - len(c.matches)
    c.system_n = len(sys)
    c.ref_n = len(ref)
    c.ref_only = len(c.false_negatives)

    # sanity check: matches plus reference-only must add up to the reference size
    if len(ref) - c.ref_system_match < 0:
        print('Error: ref_system_match > len(ref)!')
    if len(ref) != c.ref_system_match + c.ref_only:
        print('Error: ref count mismatch!')
   
    # save TP/FN
    if single_sys:
        print(analysis_type)
        write_out(sys.name, analysis_type, c)

    return c 

In [10]:
# merging test for i-o labeled data

import numpy as np
import shelve
         
# load shelve
def read_shelve(path='/Users/gms/Desktop/test.dat'):
    """
    Open a shelve file and return the open shelf.

    Args:
        path: shelve file to open; defaults to the location this notebook has
            always used, so existing ``read_shelve()`` calls are unchanged.

    Returns:
        The open shelf object. The caller is responsible for calling
        ``close()`` on it when done.
    """
    corp = shelve.open(path)

    return corp
        
test = read_shelve()

# Sample for testing: the first 500 positions of two stored reference ("oracle")
# label vectors.
# A loop that used to precede these lines was removed: its `i == 3` branch could
# never run and everything it assigned was overwritten here.
# NOTE(review): that loop looked at cases 3 and 4, while the vectors used are
# 3 and 5 -- confirm which pair is intended.
t0 = np.array(test['oracle'][3][0:500])
t1 = np.array(test['oracle'][5][0:500])

# .tolist() yields built-in ints. list(ndarray) would yield numpy scalars, which
# are not instances of int and therefore fall through every isinstance(..., int)
# branch of intersection() / union().
l0 = t0.tolist()
l1 = t1.tolist()
#l0 = [0, [4, 1], 4, 4, 0, 0, 0, 8, 0, 0] 
#l1 = [0, [1, 4], 4, 0, 0, 0, 0, 8, 8, 8]

def intersection(lst1, lst2): 
    """
    Intersect two label entries, each either a single label or a list of labels.

    Any value that is not a list is treated as a single label. This includes
    numpy integer scalars, which are not instances of ``int`` and previously
    matched no branch, so the result was always 0.

    Returns:
        The list of common labels when there is more than one, the single
        common label when there is exactly one, and 0 when there is none.
    """
    left = set(lst1) if isinstance(lst1, list) else {lst1}
    right = set(lst2) if isinstance(lst2, list) else {lst2}
    out = list(left & right)

    if len(out) > 1:
        return out
    elif len(out) == 1:
        return out[0]
    else:
        return 0
    
def union(lst1, lst2): 
    """
    Unite two label entries, each either a single label or a list of labels.

    Any value that is not a list is treated as a single label. This includes
    numpy integer scalars, which are not instances of ``int`` and previously
    matched no branch, so the result was always an empty list.

    Returns:
        The single label when the union has exactly one member, otherwise the
        list of all labels (an empty list when both inputs are empty lists).
    """
    left = set(lst1) if isinstance(lst1, list) else {lst1}
    right = set(lst2) if isinstance(lst2, list) else {lst2}
    out = list(left | right)

    if len(out) == 1:
        out = out[0]
    return out
    
# union and intersect
def umerges(l0, l1):
    """
    Position-wise union of two label sequences.

    Returns a list as long as ``l0`` whose i-th item is ``union(l0[i], l1[i])``.
    """
    return [union(l0[pos], l1[pos]) for pos in range(len(l0))]
    
%timeit un = umerges(l0, l1)

def imerges(l0, l1):
    """
    Position-wise intersection of two label sequences.

    Returns a list as long as ``l0`` whose i-th item is
    ``intersection(l0[i], l1[i])``.
    """
    return [intersection(l0[pos], l1[pos]) for pos in range(len(l0))]

%timeit inter = imerges(l0, l1)

union = [
    ( [set(x) | set(y)] if isinstance(x, list) and isinstance(y, list)
      else [set(x) | set([y])] if isinstance(x, list) and isinstance(y, int)
      else [set([x]) | set(y)] if isinstance(x, int) and isinstance(y, list)
      else [set([x]) | set([y])])
        
     for x, y in zip(l0, l1)
]

# unpack map object
*y, = list(map(list, zip(*union)))
%timeit y

intersection = [
    ( [set(x) & set(y)] if isinstance(x, list) and isinstance(y, list)
      else [set(x) & set([y])] if isinstance(x, list) and isinstance(y, int)
      else [set([x]) & set(y)] if isinstance(x, int) and isinstance(y, list)
      else [set([x]) & set([y])])
      for x, y in zip(l0, l1)
    
]

*x, = list(map(list, zip(*intersection)))
%timeit x

300 µs ± 9.61 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
337 µs ± 21.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
19.9 ns ± 0.645 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)
17.9 ns ± 0.293 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [11]:
def get_metric_data(training_notes, analysis_type, corpus):
    """
    Load the reference and system annotations for one analysis.

    Reference annotations are read from the corpus' MySQL table and system
    annotations from the corpus' CSV file under ``analysisConf.data_dir``.
    When ``analysis_type`` contains 'test' only the notes in
    ``training_notes`` are kept; otherwise those notes are excluded.

    Args:
        training_notes: list of note ids to keep (test) or exclude (non-test).
        analysis_type: analysis name; only the substring 'test' is inspected.
        corpus: corpus name understood by ``AnalysisConfig.corpus_config``.

    Returns:
        ``(ref_ann, sys_ann)`` as DataFrames; ``sys_ann`` is de-duplicated.
    """
    import os  # local import: only needed for the connection-URL override

    # NOTE(review): the fallback URL embeds database credentials in the notebook.
    # Set AMICUS_DB_URL in the environment and remove the literal when possible.
    db_url = os.environ.get('AMICUS_DB_URL', 'mysql+pymysql://gms:nej123@localhost/test')
    engine = create_engine(db_url, pool_pre_ping=True, pool_size=20, max_overflow=30)

    usys_file, ref_table = AnalysisConfig().corpus_config(corpus)

    sys_ann = pd.read_csv(analysisConf.data_dir + usys_file, dtype={'note_id': str})

    # The table name comes from the notebook's own configuration; the note ids
    # are passed as a bound parameter.
    if 'test' not in analysis_type:
        sql = "SELECT * FROM " + ref_table + " where file not in %(training_notes)s"
        sys_ann = sys_ann[~sys_ann['note_id'].isin(training_notes)]
    else:
        sql = "SELECT * FROM " + ref_table + " where file in %(training_notes)s"
        sys_ann = sys_ann[sys_ann['note_id'].isin(training_notes)]

    try:
        ref_ann = pd.read_sql(sql, params={"training_notes":training_notes}, con=engine)
    finally:
        # a new engine is created on every call; release its connection pool
        engine.dispose()

    sys_ann = sys_ann.drop_duplicates()

    return ref_ann, sys_ann

In [12]:
%%cython
import pandas as pd
from scipy import stats
from scipy.stats.mstats import gmean

def geometric_mean(metrics):
    """
    Add rank columns and their row-wise geometric mean to ``metrics``.

    1. Rank F, TP/FN and TM in descending order (ties get the average rank)
        http://www.datasciencemadesimple.com/rank-dataframe-python-pandas-min-max-dense-rank-group/
        https://stackoverflow.com/questions/46686315/in-pandas-how-to-create-a-new-column-with-a-rank-according-to-the-mean-values-o?rq=1
    2. Take the geometric mean of the three ranks from step 1.
        https://stackoverflow.com/questions/42436577/geometric-mean-applied-on-row

    Args:
        metrics: DataFrame with columns 'F', 'TP/FN' and 'TM'. It is modified
            in place: 'F1 rank', 'TP/FN rank', 'TM rank' and 'Gmean' are added
            or refreshed.

    Returns:
        The same ``metrics`` object. An empty frame is returned unchanged.
    """
    if metrics.empty:
        # nothing to rank (e.g. no system produced a match)
        return metrics

    rank_sources = {'F1 rank': 'F', 'TP/FN rank': 'TP/FN', 'TM rank': 'TM'}
    for rank_col, source_col in rank_sources.items():
        metrics[rank_col] = metrics[source_col].rank(ascending=False, method='average')

    # Select the rank columns by name: on a repeated call they already exist and
    # are no longer the last three columns of the frame.
    metrics['Gmean'] = gmean(metrics[list(rank_sources)].values, axis=1)

    return metrics  

In [13]:
def generate_metrics(analysis_type, corpus, single_sys = None):
    """
    Compute exact-match metrics for every configured system and save them as CSV.

    For each system and each of its annotation types, the system annotations of
    that type are compared with the reference via ``get_cooccurences``; when
    there is at least one match, the confusion-matrix metrics are recorded.
    The resulting table (with rank columns added by ``geometric_mean``) is
    printed and written to
    ``<data_dir><corpus>_metrics_<analysis_type>_<timestamp>.csv``.

    Args:
        analysis_type: analysis name passed through to the helpers.
        corpus: corpus name.
        single_sys: accepted for compatibility; it does not change the output
            (a value other than None used to fail with an unbound file name).
    """
    start = time.time()

    systems = AnalysisConfig().systems
    records = []  # one dict per (system, type) that has matches
    print('corpus:', corpus)

    training_notes = get_notes(analysis_type, corpus)
    ref_ann, sys_ann = get_metric_data(training_notes, analysis_type, corpus)

    for sys in systems:
        types, _, _ = AnnotationSystems().get_system_type(sys) # system types for iterable
        for t in types:
            print(t)

            system = sys_ann[sys_ann['type'] == str(t)]

            if sys == 'quick_umls':
                # keep only sufficiently similar QuickUMLS candidates
                system = system[system.similarity.astype(float) >= 0.75]

            system = system.drop_duplicates()
            # get_cooccurences reads the system name from this attribute
            system.name = sys

            c = get_cooccurences(ref_ann, system, analysis_type, corpus, True, system.name) # get matches, FN, etc.

            print(c.ref_n, c.ref_only, c.system_n, c.system_only, c.ref_system_match)

            # Recorded per type (inside the type loop), so a system with several
            # types contributes one row for each of them.
            if c.ref_system_match > 0: # compute confusion matrix metrics
                F, recall, precision, TP, FP, FN, TP_FN_R, TM = Metrics(c.system_only, c.ref_only, c.ref_system_match, c.system_n).get_confusion_metrics()
                records.append({'system': sys, 
                                'type': t, 
                                'F': F[1], 
                                'precision': precision[1], 
                                'recall': recall[1], 
                                'TP': TP, 
                                'FN': FN, 
                                'FP': FP, 
                                'TP/FN': TP_FN_R,
                                'n_gold': c.ref_n, 
                                'n_sys': c.system_n, 
                                'TM': TM})
            else:
                print("NO EXACT MATCHES FOR", t)

        elapsed = (time.time() - start)
        print("elapsed:", sys, elapsed)

    # build the table once instead of growing it with concat inside the loop
    metrics = pd.DataFrame(records).drop_duplicates(keep='last').reset_index(drop=True)

    elapsed = (time.time() - start)
    if metrics.empty:
        print("NO METRICS: no system produced an exact match")
    else:
        # geometric_mean adds the rank columns to `metrics` in place
        print(geometric_mean(metrics))

    timestamp = datetime.now().timestamp()

    file_name = 'metrics_'

    metrics.to_csv(analysisConf.data_dir + corpus + '_' + file_name + analysis_type + '_' + str(timestamp) + '.csv')

    print("total elapsed time:", elapsed) 

# used to iterate through mm scores
def generate_metrics_mm(analysis_type, corpus, single_sys = None):
    """
    Compute exact-match metrics for MetaMap at a series of score thresholds.

    MetaMap annotations with a score are evaluated at thresholds 500, 550, ...,
    1000 (``score >= threshold``). The resulting table (with rank columns added
    by ``geometric_mean``) is printed and written to
    ``<data_dir><corpus>_mm_metrics_<analysis_type>_<timestamp>.csv``.

    Changes from the previous behaviour:
      - only 'metamap' is evaluated; the data is filtered to MetaMap rows, so
        iterating over the other configured systems reported MetaMap results
        under their names;
      - every threshold filters the full scored set; previously the filtered
        frame was carried over, so any later pass started from the rows that
        survived the highest threshold.

    Args:
        analysis_type: analysis name passed through to the helpers.
        corpus: corpus name.
        single_sys: accepted for compatibility; it does not change the output
            (a value other than None used to fail with an unbound file name).
    """
    start = time.time()
    systems = [s for s in AnalysisConfig().systems if s == 'metamap']
    records = []  # one dict per threshold that has matches
    print('corpus:', corpus)

    training_notes = get_notes(analysis_type, corpus)
    ref_ann, sys_ann = get_metric_data(training_notes, analysis_type, corpus)

    # scored MetaMap annotations, one row per distinct span / note / score
    scored = sys_ann[(sys_ann.score.notnull()) & (sys_ann['system'] == 'metamap')]
    scored = scored[['begin', 'end', 'note_id', 'system', 'score']].drop_duplicates()
    scored['score'] = scored['score'].astype(int)

    for sys in systems:
        types, _, _ = AnnotationSystems().get_system_type(sys) # system types for iterable
        for t in types:
            print(t)

            for i in range(500, 1050, 50): 

                at_threshold = scored[scored["score"] >= i].copy()

                print('score:', i, len(at_threshold), at_threshold.columns) 

                # get_cooccurences reads the system name from this attribute
                at_threshold.name = sys + str(i)

                c = get_cooccurences(ref_ann, at_threshold, analysis_type, corpus, True, at_threshold.name) # get matches, FN, etc.

                print(c.ref_n, c.ref_only, c.system_n, c.system_only, c.ref_system_match)

                if c.ref_system_match > 0: # compute confusion matrix metrics
                    F, recall, precision, TP, FP, FN, TP_FN_R, TM = Metrics(c.system_only, c.ref_only, c.ref_system_match, c.system_n).get_confusion_metrics()
                    records.append({'system': sys + '_score_' + str(i), 
                                    'type': t, 
                                    'F': F[1], 
                                    'precision': precision[1], 
                                    'recall': recall[1], 
                                    'TP': TP, 
                                    'FN': FN, 
                                    'FP': FP, 
                                    'TP/FN': TP_FN_R,
                                    'n_gold': c.ref_n, 
                                    'n_sys': c.system_n, 
                                    'TM': TM})
                else:
                    print("NO EXACT MATCHES FOR", t)
                elapsed = (time.time() - start)
                print("elapsed:", sys, elapsed)

    # build the table once instead of growing it with concat inside the loop
    metrics = pd.DataFrame(records).drop_duplicates(keep='last').reset_index(drop=True)

    elapsed = (time.time() - start)
    if metrics.empty:
        print("NO METRICS: no threshold produced an exact match")
    else:
        # geometric_mean adds the rank columns to `metrics` in place
        print(geometric_mean(metrics))

    timestamp = datetime.now().timestamp()

    file_name = 'mm_metrics_'
    metrics.to_csv(analysisConf.data_dir + corpus + '_' + file_name + analysis_type + '_' + str(timestamp) + '.csv')

    print("total elapsed time:", elapsed) 

In [14]:
# read in system matches from file

def get_ref_n(analysis_type, corpus=None):
    """
    Count the distinct reference annotations for the given analysis type.

    :param analysis_type: string containing 'entity', 'cui' or 'full'; selects which
                          reference columns identify a distinct annotation. Any other
                          value deduplicates on all columns.
    :param corpus: corpus name forwarded to get_notes / get_metric_data. When omitted,
                   the module-level ``corpus`` variable is used, which is the only
                   source this function read before the parameter was added.
    :return: number of distinct reference rows (int)
    """
    if corpus is None:
        # Backward-compatible default. NOTE(review): callers that select a corpus
        # locally (e.g. inside main()) should pass it explicitly, otherwise the
        # notebook-level value is silently used here.
        corpus = globals()['corpus']

    training_notes = get_notes(analysis_type, corpus)
    ref_ann, _ = get_metric_data(training_notes, analysis_type, corpus)

    # do not overestimate fn: keep only the identifying columns before deduplicating
    if 'entity' in analysis_type:
        ref_ann = ref_ann[['start', 'end', 'file']]
    elif 'cui' in analysis_type:
        ref_ann = ref_ann[['value', 'file']]
    elif 'full' in analysis_type:
        ref_ann = ref_ann[['start', 'end', 'value', 'file']]

    return len(ref_ann.drop_duplicates())

def get_sys_data(system, analysis_type, corpus):
    """
    Return the distinct annotations produced by one system.

    :param system: value matched against the 'system' column of the loaded data
    :param analysis_type: string containing 'entity', 'cui' or 'full'; selects the
                          columns that identify a distinct annotation
    :param corpus: corpus name forwarded to get_notes / get_metric_data
    :return: DataFrame restricted to the identifying columns, duplicates removed
    :raises ValueError: if analysis_type names none of the supported analyses
    """
    # Resolve the identifying columns first so an unsupported analysis type fails
    # with a clear message before any data is loaded.
    if 'entity' in analysis_type:
        cols_to_keep = ['begin', 'end', 'note_id']
    elif 'cui' in analysis_type:
        cols_to_keep = ['cui', 'note_id']
    elif 'full' in analysis_type:
        cols_to_keep = ['begin', 'end', 'cui', 'note_id']
    else:
        raise ValueError("unsupported analysis_type: %r" % (analysis_type,))

    training_notes = get_notes(analysis_type, corpus)
    _, data = get_metric_data(training_notes, analysis_type, corpus)

    out = data[data['system'] == system].copy()

    if system == 'quick_umls':
        # quick_umls rows are additionally filtered on similarity score and run type
        keep = (out.similarity.astype(float) >= 0.75) & (out["type"] == 'concept_jaccard_score_False')
        out = out[keep]

    return out[cols_to_keep].drop_duplicates()

def get_system_matches(system, analysis_type, corpus):
    """
    Load the stored match and reference-only sets for one system.

    Two text files are read from ``analysisConf.data_dir + 'single_system_out/'``:
    ``<system>_<analysis_type>_<corpus>_matches.txt`` and
    ``<system>_<analysis_type>_<corpus>_ref_only.txt``. Every non-blank line is
    parsed with ``ast.literal_eval`` and collected into a set.

    :param system: system name used in the file names
    :param analysis_type: analysis type used in the file names
    :param corpus: corpus name used in the file names
    :return: (data_matches, data_fn) -- set parsed from the matches file and set
             parsed from the ref_only file
    """
    dir_test = analysisConf.data_dir + 'single_system_out/'
    prefix = dir_test + system + '_' + analysis_type + '_' + corpus

    def read_set(path):
        # the context manager closes the handle even if a line fails to parse;
        # blank lines are skipped because literal_eval cannot evaluate ''
        with open(path) as handle:
            return set(literal_eval(line.strip()) for line in handle if line.strip())

    data_matches = read_set(prefix + '_matches.txt')
    data_fn = read_set(prefix + '_ref_only.txt')

    return data_matches, data_fn

Code to generate QuickUMLS system annotations (must run from shell):

import os, glob
from client import get_quickumls_client
from quickumls import QuickUMLS
import pandas as pd

directory_to_parse = '/Users/gms/development/nlp/nlpie/data/amicus-u01/mipacq/data_in/'
quickumls_fp = '/Users/gms/development/nlp/engines_misc_tools/QuickUMLS/data/'
os.chdir(directory_to_parse)

#similarity = ['dice', 'cosine', 'jaccard', 'overlap']
similarity = ['jaccard']
overlapping_criteria = ['score', 'length']

# One matcher per (similarity, overlap criterion) pair; every *.txt note in
# directory_to_parse is run through it and the rows are appended to qumls.csv.
for s in similarity:
    for o in overlapping_criteria:
        #matcher = get_quickumls_client(similarity_name)
        # The criterion is passed by keyword (a positional argument after a keyword
        # argument is a SyntaxError) and is the single value `o`, not the whole list.
        matcher = QuickUMLS(quickumls_fp=quickumls_fp, overlapping_criteria=o, threshold=0.7, window=5, similarity_name=s)
        test = pd.DataFrame()
        for fname in glob.glob(directory_to_parse + '*.txt'):
            t = os.path.basename(fname)
            u = t.split('.')[0]  # note id = file name up to the first '.'
            # open the globbed path itself so names containing extra dots still resolve
            with open(fname) as f:
                f1 = f.read()
            out = matcher.match(f1, best_match=True, ignore_syntax=False)
            for i in out:
                # only the first candidate of each match group is kept
                i[0]['note_id'] = u
                frames = [ test, pd.DataFrame(i[0], index = [0]) ]
                test = pd.concat(frames, ignore_index=True)
        test['system'] = 'quick_umls'
        # NOTE(review): if the match rows already carry a 'similarity' column this
        # assignment replaces it with the similarity *name* -- confirm that is intended.
        test['similarity'] = s
        test['overlap'] = o
        test['type'] = 'concept'
        # note_id is set per row in the loop above; it is no longer overwritten here
        # with the id of the last file processed.
        test['best_match'] = 'true'
        temp = test.rename(columns={'start': 'begin'}).copy()
        print(temp.tail())

        temp.to_csv('/Users/gms/development/nlp/nlpie/data/amicus-u01/output/qumls.csv', mode='a', header=False)

GENERATE merges

In [15]:
class SetTotals(object):
    """
    Bundles a reference count, a system count and a set of matches, and derives the
    reference-only and system-only totals from them.
    """
    def __init__(self, ref_n, sys_n, match_set): #_left, match_set_right):
        # `ref_ann` holds the reference *count* handed in as ref_n
        self.ref_ann, self.sys_n, self.match_set = ref_n, sys_n, match_set

    def get_ref_sys(self):
        """
        :return: (ref_only, sys_only, n_matches, match_set) where the first two are the
                 stored counts minus the size of the match set
        """
        n_matches = len(self.match_set)
        return (self.ref_ann - n_matches,
                self.sys_n - n_matches,
                n_matches,
                self.match_set)

In [16]:
def merge_eval(ref_only, system_only, ref_system_match, matches, system_n, ref_n):
    """
    Build the dictionary of confusion-matrix metrics for a merged system.

    :params: ref_only, system_only, ref_system_match, system_n, ref_n -> counts
             (they are combined arithmetically below)
             matches -> accepted for interface compatibility; not read here
    :return: dictionary with keys F, precision, recall, TP, FN, FP, TP/FN,
             n_gold, n_sys, TM

    Two consistency checks only print a message; they never raise.
    """
    # reference-only plus matched annotations must add up to the reference total
    if ref_only + ref_system_match != ref_n:
        print('ERROR!')

    # get evaluation metrics
    confusion = Metrics(system_only, ref_only, ref_system_match, system_n)
    F, recall, precision, TP, FP, FN, TP_FN_R, TM = confusion.get_confusion_metrics()

    labels = ('F', 'precision', 'recall', 'TP', 'FN', 'FP', 'TP/FN', 'n_gold', 'n_sys', 'TM')
    values = (F[1], precision[1], recall[1], TP, FN, FP, TP_FN_R, ref_n, system_n, TM)
    d = dict(zip(labels, values))

    # system annotations that are not false positives must all be true positives
    if system_n - FP != TP:
        print('inconsistent system n!')

    return d

QUERY TO VALIDATE qumls system counts
select count(*), type from (select cui, begin, end, note_id, type from
(SELECT distinct *
 FROM test.qumls_cui
where note_id not in ("0595040941-0",
                            "0778429553-0",
                            "1014681675",
                            "2889522952-2",
                            "3080383448-5",
                            "3300000926-3",
                            "3360037185-3",
                            "3580973392",
                            "3627629462-3",
                            "4323116051-4",
                            "477704053-4",
                            "528317073",
                            "531702602",
                            "534061073",
                            "54832076",
                            "5643725437-6",
                            "5944412090-5",
                            "6613169476-6",
                            "7261075903-7",
                            "7504944368-7",
                            "7999462393-7",
                            "8131081430",
                            "8171084310",
                            "8193787896",
                            "8295055184-8",
                            "8823185307-8") 
                            and similarity >= 0.8 ) t
group by cui, begin, end, note_id, type) t
group by type;

In [17]:
%%cython

import operator as op
import pandas as pd
from __main__ import get_system_matches, get_sys_data

def process_sentence(pt, sentence, analysis_type, corpus):
    """
    Recursively evaluate the parse tree of a Boolean sentence over annotator systems.

       :param pt: binary parse tree; nodes must provide getLeftChild(),
                  getRightChild() and getRootVal(). Leaves hold system names,
                  inner nodes hold '&' or '|'.
       :param sentence: object exposing n_and / n_or (operator counts) and sentence
                        (the raw expression; a bare system name when both counts are 0)
       :param analysis_type: string containing 'entity', 'full' or 'cui'
       :param corpus: corpus name forwarded to get_system_matches / get_sys_data
       :return: Results instance; .results is the merged set of matches and
                .sytem_merges the merged system annotations (DataFrame)
       :raises ValueError: if analysis_type names none of the supported analyses

    Each node's result is computed only from its own two operands, so expressions
    whose operands are both compound, e.g. ((A&B)|(C&D)), evaluate correctly.
    """

    class Results(object):
        def __init__(self):
            self.results = set()
            #self.operations = []
            self.sytem_merges = pd.DataFrame()

    r = Results()

    if 'entity' in analysis_type:
        cols_to_keep = ['begin', 'end', 'note_id'] # span only
    elif 'full' in analysis_type:
        cols_to_keep = ['cui', 'begin', 'end', 'note_id'] # span and concept
    elif 'cui' in analysis_type:
        cols_to_keep = ['cui', 'note_id'] # concept per note
    else:
        raise ValueError("unsupported analysis_type: %r" % (analysis_type,))

    oper = {'&': op.and_, '|': op.or_}

    def resolve(operand):
        """Turn an evaluated child into a (match set, system DataFrame) pair."""
        if isinstance(operand, str):
            # leaf node: the operand is a system name, load its stored data
            matches, _ = get_system_matches(operand, analysis_type, corpus)
            return matches, get_sys_data(operand, analysis_type, corpus)
        # inner node: already a (match set, system DataFrame) tuple
        return operand

    def evaluate(parseTree):
        if parseTree:
            leftC = evaluate(parseTree.getLeftChild())
            rightC = evaluate(parseTree.getRightChild())

            if leftC and rightC:
                symbol = parseTree.getRootVal()
                fn = oper[symbol]

                left, left_sys = resolve(leftC)
                right, right_sys = resolve(rightC)

                # create match set based on boolean operation
                match_set = fn(left, right)

                # combine the system annotations the same way: union for '|',
                # inner join on the identifying columns for '&'
                if symbol == '|':
                    # pd.concat replaces DataFrame.append, which pandas 2.0 removed
                    df = pd.concat([left_sys, right_sys])
                else:
                    df = left_sys.merge(right_sys, on=cols_to_keep, how='inner')
                df = df[cols_to_keep].drop_duplicates()

                # the node evaluated last is the root, so these end up holding the
                # result for the whole sentence
                r.results = match_set
                r.sytem_merges = df

                if len(df) == 0:
                    print('wtf!')

                # an empty df is still returned with its columns so that a parent
                # '&' merge on cols_to_keep keeps working
                return match_set, df
            else:
                return parseTree.getRootVal()

    if sentence.n_or > 0 or sentence.n_and > 0:
        evaluate(pt)

    # trivial case
    elif sentence.n_or == 0 and sentence.n_and == 0:

        r.results, _ = get_system_matches(sentence.sentence, analysis_type, corpus)
        r.sytem_merges = get_sys_data(sentence.sentence, analysis_type, corpus)
        print('trivial:', sentence.sentence, len(r.results), len(r.sytem_merges))

    return r

In [18]:
"""
Incoming Boolean sentences are parsed into a binary tree.

Test expressions to parse:

sentence = '((((A&B)|C)|D)&E)'

sentence = '(E&(D|(C|(A&B))))'

sentence = '(((A|(B&C))|(D&(E&F)))|(H&I))'

"""
# build parse tree from passed sentence
# using grammatical rules of Boolean logic
def buildParseTree(fpexp):
    """
       Iteratively build a binary parse tree from a Boolean sentence.
       :param fpexp: sentence whose tokens -- '(', ')', '&', '|' and operand names --
                     are separated by whitespace, e.g. ' ( A & B ) '
       :return: root of the parse tree; operators sit on inner nodes and operands
                on the nodes below them
       Sentences of the following shapes are intended (after tokens are spaced out):
       '(A&B)', '(A|B)', '((A|B)&C)'
    """
    operators = ('&', '|')

    root = BinaryTree('')
    pending = Stack()  # ancestors to return to once a sub-expression is complete
    pending.push(root)
    node = root

    for token in fpexp.split():
        if token == '(':
            # start a sub-expression: descend into a fresh left child
            node.insertLeft('')
            pending.push(node)
            node = node.getLeftChild()
        elif token == ')':
            # sub-expression finished: climb back to the saved ancestor
            node = pending.pop()
        elif token in operators:
            # label the current node and descend into a fresh right child
            node.setRootVal(token)
            node.insertRight('')
            pending.push(node)
            node = node.getRightChild()
        else:
            # any other token is an operand: store it and climb back to the parent
            node.setRootVal(token)
            node = pending.pop()

    return root

def make_parse_tree(payload):
    """
    Put a Boolean sentence into the spaced-out form buildParseTree expects and parse it.
    :param payload: sentence to preprocess, e.g. '(A&B)'
    :return pt: parse tree built from the preprocessed sentence

    The preprocessed sentence is printed before the tree is built.
    """
    # surround every bracket and operator with spaces, in the order ( ) & |
    sentence = payload
    for symbol in ('(', ')', '&', '|'):
        sentence = sentence.replace(symbol, ' ' + symbol + ' ')
    # single pass collapsing doubled spaces
    sentence = sentence.replace('  ', ' ')

    print(sentence)

    #pt.postorder()
    return buildParseTree(sentence)

class Sentence(object):
    """Boolean sentence together with the number of '&' and '|' operators it contains."""

    def __init__(self, sentence):
        # operator counts tell callers whether the sentence is a bare system name
        self.n_and, self.n_or = sentence.count('&'), sentence.count('|')
        self.sentence = sentence
    
def get_metrics(boolean_expression, analysis_type, corpus):
    """
    Evaluate one Boolean merge of annotator systems against the reference.
        :params: boolean expression in form of '(<annotator_engine_name1><boolean operator><annotator_engine_name2>)',
                 or a bare engine name
                 analysis_type: analysis type string passed on to process_sentence and get_ref_n
                 corpus: corpus name passed on to process_sentence
        :return: dictionary of confusion-matrix values produced by merge_eval
    """
    parsed = Sentence(boolean_expression)
    tree = make_parse_tree(parsed.sentence)
    merged = process_sentence(tree, parsed, analysis_type, corpus)

    system_n = len(merged.sytem_merges)
    # NOTE(review): corpus is not forwarded here, so the reference count comes from
    # whichever corpus get_ref_n resolves on its own -- confirm it matches `corpus`.
    reference_n = get_ref_n(analysis_type)

    totals = SetTotals(reference_n, system_n, merged.results)
    reference_only, system_only, reference_system_match, _ = totals.get_ref_sys()

    # get overall TP/TF and various other counts for running confusion matrix metric analysis
    return merge_eval(reference_only, system_only, reference_system_match,
                      merged.results, system_n, reference_n)

In [19]:
# generate all combinations of given list of annotators:
def expressions(l, n):
    """
    Yield every left-nested Boolean expression over n annotators chosen from l.

    For each n-combination of l and each assignment of '&' / '|' to the n - 1 gaps,
    yields a nested list such as [['a', '&', 'b'], '|', 'c']; for n == 1 the bare
    element itself is yielded.
    """
    for chosen in combinations(l, n):
        first, *remaining = chosen
        for symbols in product(('&', '|'), repeat=n - 1):
            nested = first
            # fold left to right, wrapping what has been built so far at every step
            for symbol, operand in zip(symbols, remaining):
                nested = [nested, symbol, operand]
            yield nested

def run_ensemble(l, analysis_type, corpus):
    """
    Evaluate every Boolean combination of the systems in l and write the metrics to CSV.

    :param l: list of system names
    :param analysis_type: analysis type passed to get_metrics and used in the file name
    :param corpus: corpus name passed to get_metrics and used in the file name

    The collected rows are passed through geometric_mean, written to
    ``analysisConf.data_dir`` under a timestamped name, and printed.
    """
    # textual clean-up applied to str() of the nested expression list
    cleanup = (('[', '('), (']', ')'), ("'", ""), (",", ""), (" ", ""))

    metrics = pd.DataFrame()

    for size in range(1, len(l) + 1):
        for merge in list(expressions(l, size)):
            if size > 1:
                # format Boolean sentence for parse tree
                flat = " ".join(str(part) for part in merge)
                for old, new in cleanup:
                    flat = flat.replace(old, new)
                merge = '(' + flat + ')'

            row = get_metrics(merge, analysis_type, corpus)
            row['merge'] = merge
            metrics = pd.concat([metrics, pd.DataFrame(row, index=[0])], ignore_index=True, sort=False)

    timestamp = datetime.timestamp(datetime.now())
    out_path = (analysisConf.data_dir + corpus + '_all_merge_metrics_'
                + analysis_type + '_' + str(timestamp) + '.csv')

    geometric_mean(metrics).to_csv(out_path)
    print(geometric_mean(metrics))

In [20]:
def df_to_set(df, analysis_type = 'entity', df_type = 'sys'):
    """
    Convert annotation rows to a set of tuples.

    :param df: DataFrame of system or reference annotations
    :param analysis_type: string containing 'entity', 'cui' or 'full'
    :param df_type: 'sys' selects the system column names; any other value selects
                    the reference column names
    :return: set with one tuple per distinct row over the selected columns
    """
    from_system = df_type == 'sys'

    # column names that make up a tuple, per analysis type and data source
    if 'entity' in analysis_type:
        fields = ('begin', 'end', 'note_id') if from_system else ('start', 'end', 'file')
    elif 'cui' in analysis_type:
        fields = ('cui', 'note_id') if from_system else ('value', 'file')
    elif 'full' in analysis_type:
        fields = ('begin', 'end', 'cui', 'note_id') if from_system else ('start', 'end', 'value', 'file')

    columns = [getattr(df, field) for field in fields]
    return set(zip(*columns))

In [21]:
#TESTS -> ensemble:
def test_match_consistency(matches, ref_only, ref_n, sys):
    """test for reference only/match set consistency:
        params: match, system and reference only sets"""
   
    print('len', len(sys), len(matches), len(matches.union(sys)), len(matches.intersection(sys)))
    assert len(matches.union(ref_only)) == ref_n, 'Reference annotation mismatch union'
    assert len(matches.intersection(sys)) == len(matches), 'System annotation mismatch intersect'
    assert len(matches.union(sys)) == len(sys), 'System annotation mismatch union'
    assert len(matches.intersection(ref_only)) == 0, 'Reference annotation mismatch intersect'

def test_systems(analysis_type, systems, corpus):
    """Run the match-consistency check for the first system listed in `systems`."""
    first = systems[0]
    sys = df_to_set(get_sys_data(first, analysis_type, corpus), analysis_type)

    matches, ref_only = get_system_matches(first, analysis_type, corpus)
    test_match_consistency(matches, ref_only, get_ref_n(analysis_type), sys)

    print('Match consistency:', len(sys),get_ref_n(analysis_type))

def test_metrics(ref, sys_m, match_m):
    """
    Compare the two ways Metrics.get_confusion_metrics reports F, recall and precision.

    :param ref: collection of reference annotations (only its length is used)
    :param sys_m: collection of merged system annotations (only its length is used)
    :param match_m: set of matches
    Asserts that element [1] of each default-mode value equals the value returned
    when get_confusion_metrics is called with True, then prints the pairs.
    """
    reference_n, system_n = len(ref), len(sys_m)

    print('Test metrics:', type(reference_n), type(system_n), type(match_m))

    totals = SetTotals(reference_n, system_n, match_m)
    reference_only, system_only, reference_system_match, _ = totals.get_ref_sys()

    default_run = Metrics(system_only, reference_only, reference_system_match, system_n)
    F, recall, precision, _, _, _, _, _ = default_run.get_confusion_metrics()

    test_run = Metrics(system_only, reference_only, reference_system_match, system_n)
    F_, recall_, precision_, _, _, _, _, _ = test_run.get_confusion_metrics(True)

    assert F[1] == F_, 'F1 issue'
    assert recall[1] == recall_, 'recall issue'
    assert precision[1] == precision_, 'precision issue'

    for default_value, test_value in ((F, F_), (recall, recall_), (precision, precision_)):
        print(default_value[1], test_value)

def test_count(analysis_type, corpus):
    """Print how many stored matches ctakes, clamp, biomedicus and metamap all share."""
    # test match counts:
    loaded = {}
    for name in ('ctakes', 'clamp', 'biomedicus', 'metamap'):
        loaded[name], _ = get_system_matches(name, analysis_type, corpus)

    shared = loaded['clamp'].intersection(loaded['ctakes'])
    shared = loaded['biomedicus'].intersection(shared)
    shared = loaded['metamap'].intersection(shared)

    print('count:', len(shared))
    
def test_ensemble(analysis_type, corpus):
    """
    Cross-check merged system annotations against merged stored matches.

    Prints per-system row and set sizes and the sizes of several merges, then, for
    the intersection of biomedicus, ctakes and quick_umls, asserts that FP and FN
    counts are the same whether they are derived from the stored match sets or from
    the reference annotations, and finally runs test_metrics on that merge.

    :param analysis_type: string containing 'entity', 'cui' or 'full'
    :param corpus: corpus name forwarded to the data loaders
    """
    print('ensemble:')
    # Get mixed system_n
    training_notes = get_notes(analysis_type, corpus)
    ref_ann, data = get_metric_data(training_notes, analysis_type, corpus)

    if 'entity' in analysis_type:
        sys_cols = ['begin', 'end', 'note_id']
    elif 'cui' in analysis_type:
        sys_cols = ['cui', 'note_id']
    elif 'full' in analysis_type:
        sys_cols = ['begin', 'end', 'cui', 'note_id']

    # raw (not deduplicated) rows per system
    raw = {}
    for name in ('biomedicus', 'ctakes', 'clamp', 'metamap', 'quick_umls'):
        raw[name] = data[data["system"] == name][sys_cols].copy()

    print('systems:', *(len(raw[name]) for name in
                        ('biomedicus', 'clamp', 'ctakes', 'metamap', 'quick_umls')))

    # distinct annotations per system, as sets of tuples
    annotations = {}
    for name in ('biomedicus', 'ctakes', 'clamp', 'metamap', 'quick_umls'):
        annotations[name] = df_to_set(get_sys_data(name, analysis_type, corpus), analysis_type)
        print(len(annotations[name]))

    b9 = annotations['biomedicus']
    ct = annotations['ctakes']
    cl = annotations['clamp']
    mm = annotations['metamap']
    qu = annotations['quick_umls']

    print('various merges:')
    print(len(b9), len(cl), len(ct), len(mm), len(qu))
    clamp_and_ctakes = cl.intersection(ct)
    print(len(mm.intersection(b9.intersection(clamp_and_ctakes))))
    print(len(mm.union(b9.intersection(clamp_and_ctakes))))
    print(len(mm.union(b9.union(clamp_and_ctakes))))
    print(len(mm.union(b9.union(cl.union(ct)))))
    print(len(b9.intersection(ct)))

    sys_m = b9.intersection(ct.intersection(qu))
    print('sys_m:', len(sys_m))

    # Get match merges:
    stored = {}
    for name in ('ctakes', 'clamp', 'biomedicus', 'metamap', 'quick_umls'):
        stored[name], _ = get_system_matches(name, analysis_type, corpus)

    match_m = stored['biomedicus'].intersection(stored['ctakes'].intersection(stored['quick_umls']))
    print('match_m:', len(match_m))

    # reference df to set
    if 'entity' in analysis_type:
        ref_cols = ['end', 'start','file']
    elif 'cui' in analysis_type:
        ref_cols = ['value','file']
    elif 'full' in analysis_type:
        ref_cols = ['end', 'start', 'value','file']

    ref = df_to_set(ref_ann[ref_cols], analysis_type, 'ref')

    print('ref:', len(ref))

    # test difference:
    fp_from_matches, fp_from_ref = len(sys_m - match_m), len(sys_m - ref)
    print('FP:', fp_from_matches, fp_from_ref)
    assert fp_from_matches == fp_from_ref, 'FP mismatch'

    fn_from_matches, fn_from_sys = len(ref - match_m), len(ref - sys_m)
    print('FN:', fn_from_matches, fn_from_sys)
    assert fn_from_matches == fn_from_sys, 'FN mismatch'

    test_metrics(ref, sys_m, match_m)

In [22]:
%%time
def main():
    """Prompt for a run mode and dispatch to the matching analysis or test routine."""

    rtype = int(input("Run: 1->Single systems; 2->Ensemble; 3->Tests; 4-> MM Test"))

    # corpora: i2b2, mipacq, fv017
    # analyses: entity only (exact span), cui by document, full (aka entity and cui
    #           on exact span/exact cui)
    #           NB: add "_test" using mipacq to generate small test sample
    # systems: ctakes, biomedicus, clamp, metamap, quick_umls
    #
    # TODO -> Vectorization (entity only and full):
    #         add switch for use of TN on single system performance evaluations
    #         add switch for overlap matching versus exact span
    #      -> Other tasks besides concept extraction
    #      -> Use of https://scikit-learn.org/stable/modules/generated/sklearn.metrics.fbeta_score.html

    # NOTE(review): `corpus` and `analysisConf` are locals of main(); functions that
    # read same-named module-level variables do not see the values assigned here.
    corpus = 'i2b2'
    #corpus = 'mipacq'
    analysis_type = 'entity'
    #analysis_type = 'full'
    analysisConf =  AnalysisConfig()
    print(analysisConf.systems, analysisConf.corpus_config(corpus))

    def single_systems():
        generate_metrics(analysis_type, corpus)

    def ensemble():
        run_ensemble(['ctakes','biomedicus','clamp','metamap','quick_umls'], analysis_type, corpus)

    def tests():
        test_systems(analysis_type, ['biomedicus'], corpus)
        test_count(analysis_type, corpus)
        test_ensemble(analysis_type, corpus)

    def mm_test():
        generate_metrics_test(analysis_type, corpus)

    # any other number selects nothing and main() simply returns
    action = {1: single_systems, 2: ensemble, 3: tests, 4: mm_test}.get(rtype)
    if action is not None:
        action()

if __name__ == '__main__':
    main()

Run: 1->Single systems; 2->Ensemble; 3->Tests; 4-> MM Test 2


['biomedicus', 'clamp', 'ctakes', 'metamap'] ('analytical_cui_i2b2_concepts.csv', 'test.i2b2_all')
ctakes




trivial: ctakes 12780 52630


  app.launch_new_instance()


biomedicus
trivial: biomedicus 13626 64917
clamp
trivial: clamp 23181 52884
metamap
trivial: metamap 14661 107927
quick_umls
trivial: quick_umls 14579 85676
 ( ctakes & biomedicus ) 
 ( ctakes | biomedicus ) 
 ( ctakes & clamp ) 
 ( ctakes | clamp ) 
 ( ctakes & metamap ) 
 ( ctakes | metamap ) 
 ( ctakes & quick_umls ) 
 ( ctakes | quick_umls ) 
 ( biomedicus & clamp ) 
 ( biomedicus | clamp ) 
 ( biomedicus & metamap ) 
 ( biomedicus | metamap ) 
 ( biomedicus & quick_umls ) 
 ( biomedicus | quick_umls ) 
 ( clamp & metamap ) 
 ( clamp | metamap ) 
 ( clamp & quick_umls ) 
 ( clamp | quick_umls ) 
 ( metamap & quick_umls ) 
 ( metamap | quick_umls ) 
 ( ( ctakes & biomedicus ) & clamp ) 
 ( ( ctakes & biomedicus ) | clamp ) 
 ( ( ctakes | biomedicus ) & clamp ) 
 ( ( ctakes | biomedicus ) | clamp ) 
 ( ( ctakes & biomedicus ) & metamap ) 
 ( ( ctakes & biomedicus ) | metamap ) 
 ( ( ctakes | biomedicus ) & metamap ) 
 ( ( ctakes | biomedicus ) | metamap ) 
 ( ( ctakes & biomedicus ) 

In [23]:
# test dkpro-cassis
from cassis import *
import numpy as np
import collections
from typing import List, Set

# Locations read by get_ann: ts_test holds TypeSystem.xml, dir_test holds the XMI files.
ts_test = "/Users/gms/development/nlp/nlpie/data/irr_mts/"
dir_test = "/Users/gms/development/nlp/nlpie/data/irr_mts/"
#fname = "527982345-v1.txt.xmi"
fname = "92_1666/alber475.xmi"
# text before the first '/' of fname
case = fname.partition('/')[0]
view_name = "_InitialView"
# named tuple describing one annotated span
Span = collections.namedtuple('Span', 'begin end label')
  
def get_ann(fname, dir_test, ts_test, view_name):
    """
    Read the "webanno.custom.Term" annotations of one XMI file.

    :param fname: XMI file name, appended to dir_test
    :param dir_test: directory prefix of the XMI file
    :param ts_test: directory prefix of 'TypeSystem.xml'
    :param view_name: name of the CAS view to read
    :return: (ann, sofa_string, labels) -- list of Span(begin, end, termType), the
             view's sofa string, and the set of termType values encountered
    """
    #type_name = "org.apache.ctakes.typesystem.type.textsem.DiseaseDisorderMention"
    type_name = "webanno.custom.Term"

    with open(ts_test + 'TypeSystem.xml', 'rb') as ts_file:
        typesystem = load_typesystem(ts_file)
    with open(dir_test + fname, 'rb') as xmi_file:
        cas = load_cas_from_xmi(xmi_file, typesystem=typesystem)
    view = cas.get_view(view_name)

    values = {}   # attribute name -> value; overwritten for every annotation
    spans = []
    labels = set()
    attribs = get_attribs(view.select(type_name))

    # only parse if type exists in file
    if view.select(type_name):
        for annotation in view.select(type_name):
            for key in attribs:
                # direct attribute lookup by name
                value = annotation.__getattribute__(key)
                values[key] = value

                if key == 'termType':
                    labels.add(value)

            # one Span per annotation, once all of its attributes have been read
            if attribs:
                spans.append(Span(values["begin"], values["end"], values["termType"]))

    return spans, view.sofa_string, labels
# extract attributes from cas Annotation object
# extract attributes from cas Annotation object

def get_attribs(v):
    """
    Collect attribute names from the objects in v, in first-seen order.

    Names containing '__' are ignored. For each object, names are taken from its
    __dir__() until one is met that has already been collected; the rest of that
    object's names are then skipped.

    :param v: iterable of objects
    :return: list of collected attribute names
    """
    collected = []
    for item in v:
        for name in item.__dir__():
            if '__' in name:
                continue
            if name in collected:
                # an already-known name ends the scan of this object
                break
            collected.append(name)

    return collected