In [28]:
import gevent
import pandas as pd
from pandas.api.types import is_numeric_dtype
import numpy as np
import math
import pymysql
import time 
import functools as ft
import glob   
import operator as op
import shelve
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
from pathlib import Path
from itertools import combinations, product, permutations
from sqlalchemy.engine import create_engine
from datetime import datetime
from ast import literal_eval
from scipy import stats  
from scipy.stats.mstats import gmean
from pythonds.basic.stack import Stack
from pythonds.trees.binaryTree import BinaryTree
from collections import defaultdict
from typing import List, Set, Tuple 

In [29]:
# STEP-1: CHOOSE YOUR CORPUS
# TODO: allow for list of corpora
corpus = 'fairview' #options include 'fairview', 'mipacq' OR 'i2b2'

# STEP-2: CHOOSE YOUR DATA DIRECTORY; this is where output data will be saved on your machine
data_directory = '/Users/gms/development/nlp/nlpie/data/ensembling-u01/output/' 

# STEP-3: CHOOSE WHICH SYSTEMS YOU'D LIKE TO EVALUATE AGAINST THE CORPUS REFERENCE SET
systems = ['biomedicus', 'clamp', 'ctakes', 'metamap', 'quick_umls']

# STEP-4: CHOOSE TYPE OF RUN
rtype = 2      # OPTIONS INCLUDE: 1->Single systems; 2->Ensemble; 3->Tests; 4-> MM Test
               # The Ensemble can include the max system set ['ctakes','biomedicus','clamp','metamap','quick_umls']
    
# STEP-5: CHOOSE WHAT TYPE OF ANALYSIS YOU'D LIKE TO RUN ON THE CORPUS
analysis_type = 'entity' #options include 'entity' OR 'full'

# STEP-(6A): ENTER DETAILS FOR ACCESSING MANUAL ANNOTATION DATA
database_type = 'mysql+pymysql' # We use mysql+pymql as default
database_username = 'gms'
database_password = 'nej123' 
database_url = 'localhost' # HINT: use localhost if you're running database on your local machine
database_name = 'concepts' # Enter database name
table_name = corpus + '_all' # Enter the table within the database where your reference data is stored

# STEP-(6B): ENTER DETAILS FOR ACCESSING SYSTEM ANNOTATION DATA
system_annotation = 'analytical_'+corpus+'.csv' # OPTIONS include 'analytical_cui_mipacq_concepts.csv' OR 'analytical_cui_i2b2_concepts.csv' 

# STEP-7: WE'LL CREATE A 'SYSTEM OUTPUT' DIRECTORY FOR YOU INSIDE THE DIRECTORY YOU SPECIFIED IN (STEP 2)
single_sys_dir = Path(data_directory + "single_system_out")
single_sys_dir.mkdir(parents=True, exist_ok=True)
dir_out = Path(data_directory + 'single_system_out/')

# STEP-8: CREATE A DB CONNECTION POOL
engine_request = str(database_type)+'://'+database_username+':'+database_password+"@"+database_url+'/'+database_name
engine = create_engine(engine_request, pool_pre_ping=True, pool_size=20, max_overflow=30)

# STEP-(9A): FILTER BY SEMTYPE
filter_semtype = True

# STEP-(9B): IF STEP-(9A) == True -> CHOOSE REFERENCE SEMTYPES TO FILTER BY
semtypes = ['Drug']

In [30]:
# config class for analysis
class AnalysisConfig():
    """
    Configuration object:
    systems to use
    notes by corpus
    paths by output, gold and system location
    """
    def __init__(self):
        self = self    
        self.systems = systems
        self.data_dir = data_directory
    
    def corpus_config(self): 
        usys_data = system_annotation
        ref_data = database_name+'.'+table_name
        return usys_data, ref_data

analysisConf =  AnalysisConfig()
usys, ref = analysisConf.corpus_config()

In [31]:
# class SemanticTypes():
    
#     def __init__(self):
#         self = self
#         self.biomedicus_types = ['placeholder']
#         self.ctakes_types = {'DiseaseDisorderMention'}
#         self.clamp_types = {'problem'}
#         self.reference_types = {'Finding'}
    
#     def get_system_type(self):  
        
#         if system == 'biomedicus':
#             semtypes = self.biomedicus_types
#         elif system == 'ctakes':
#             semtypes = self.ctakes_types
#         elif system == 'clamp':
#             semtypes = self.clamp_types
#         elif system == 'reference':
#             semtypes = self.reference_types
            
#         return semtypes

In [32]:
class SemanticTypes(object):
    
    def __init__(self, semtypes):
        self = self
        
        sql = "SELECT st.tui, abbreviation, clamp_name, ctakes_name FROM concepts.semantic_groups sg join semantic_types st on sg.tui = st.tui where fairview_name in ({})"\
           .format(', '.join(['%s' for _ in semtypes]))  
        
        stypes = pd.read_sql(sql, params=[semtypes], con=engine) 
        
        self.biomedicus_types = set(stypes['tui'].tolist())
        self.clamp_types = set(stypes['clamp_name'].tolist()[0].split(','))
        self.ctakes_types = set(stypes['ctakes_name'].tolist()[0].split(','))
        self.metamap_types = set(stypes['abbreviation'].tolist())
        self.qumls_types = set(stypes['tui'].tolist())
        self.reference_types = semtypes
    
    def get_system_type(self, system):  
        
        if system == 'biomedicus':
            semtypes = self.biomedicus_types
        elif system == 'ctakes':
            semtypes = self.ctakes_types
        elif system == 'clamp':
            semtypes = self.clamp_types
        elif system == 'metamap':
            semtypes = self.metamap_types
        elif system == 'quick_umls':
            semtypes = self.qumls_types
        elif system == 'reference':
            semtypes = self.reference_types
            
        return semtypes
    
SemanticTypes(['Finding']).get_system_type('ctakes')

{'DiseaseDisorderMention', 'SignSymptomMention'}

In [33]:
# annotation class for systems
class AnnotationSystems():
    """   
    System annotations of interest for UMLS concept extraction
    NB: ctakes combines all "mentions" annotation types
    
    """
    def __init__(self):
        
        """ 
        annotation base types
        """   
        
        self.biomedicus_types = ["biomedicus.v2.UmlsConcept"]
        self.clamp_types = ["edu.uth.clamp.nlp.typesystem.ClampNameEntityUIMA"]
        self.ctakes_types = ['ctakes_mentions']
        self.metamap_types = ["org.metamap.uima.ts.Candidate"]
        self.quick_umls_types = ['concept_jaccard_score_False']
       
    def get_system_type(self, system):
        
        """
        return system types
        """
        
        if system == "biomedicus":
            view = "Analysis"
        else:
            view = "_InitialView"

        if system == 'biomedicus':
            types = self.biomedicus_types

        elif system == 'clamp':
            types = self.clamp_types

        elif system == 'ctakes':
            types = self.ctakes_types

        elif system == 'metamap':
            types = self.metamap_types
        
        elif system == "quick_umls":
            types = self.quick_umls_types
            
        return types, view
    
annSys = AnnotationSystems()

In [34]:
%reload_ext Cython

In [35]:
%%cython

import numpy as np # access to Numpy from Python layer
import math

class Metrics(object):
    """
    metrics class:
    returns an instance with confusion matrix metrics
    """
    def __init__(self, system_only, gold_only, gold_system_match, system_n, neither = 0): # neither: no sys or manual annotation

        self = self    
        self.system_only = system_only
        self.gold_only = gold_only
        self.gold_system_match = gold_system_match
        self.system_n = system_n
        self.neither = neither
        
    def get_confusion_metrics(self, corpus = None, test = False):
        
        """
        compute confusion matrix measures, as per  
        https://stats.stackexchange.com/questions/51296/how-do-you-calculate-precision-and-recall-for-multiclass-classification-using-co
        """
        cdef:
            int TP, FP, FN
            double TM

        TP = self.gold_system_match
        FP = self.system_only
        FN = self.gold_only
        
        TM = TP/math.sqrt(self.system_n) # TigMetric
       
        if not test:
            
            if corpus == 'casi':
                recall = TP/(TP + FN)
                precision = TP/(TP + FP)
                F = 2*(precision*recall)/(precision + recall)
            else:
                if self.neither == 0:
                    confusion = [[0, self.system_only],[self.gold_only,self.gold_system_match]]
                else:
                    confusion = [[self.neither, self.system_only],[self.gold_only,self.gold_system_match]]
                c = np.asarray(confusion)
                recall = np.diag(c) / np.sum(c, axis = 1)
                precision = np.diag(c) / np.sum(c, axis = 0)
                F = 2*(precision*recall)/(precision + recall)
        else:
            precision = TP/(TP+FP)
            recall = TP/(TP+FN)
            F = 2*(precision*recall)/(precision + recall)
        
        # Tignanelli Metric
        if FN == 0:
            TP_FN_R = TP
        elif FN > 0:
            TP_FN_R = TP/FN
 
        return F, recall, precision, TP, FP, FN, TP_FN_R, TM

In [36]:
def write_out(name: str, analysis_type: str, c: object):
   
    """
    write matching and reference-only sets to file for use in merging combinations
    """
    
    # write output to file
    dir_out = analysisConf.data_dir + 'single_system_out/'
    with open(dir_out + name + '_' + analysis_type + '_' + c.corpus + '_matches.txt', 'w') as f:
        for item in list(c.matches):
            f.write("%s\n" % str(item))

    # write to file
    with open(dir_out + name + '_' + analysis_type + '_' + c.corpus + '_ref_only.txt', 'w') as f:
        for item in list(c.false_negatives):
            f.write("%s\n" % str(item))

In [37]:
def df_to_set(df, analysis_type = 'entity', df_type = 'sys', corpus = None):
    
    # get values for creation of series of type tuple
    if 'entity' in analysis_type: 
        if corpus == 'casi':
            arg = df.case, df.overlap
        else:    
            if df_type == 'sys':
                arg = df.begin, df.end, df.note_id
            else:
                arg = df.start, df.end, df.file
            
    elif 'cui' in analysis_type:
        if df_type == 'sys':
            arg = df.cui, df.note_id
        else:
            arg = df.value, df.file
    elif 'full' in analysis_type:
        if df_type == 'sys':
            arg = df.begin, df.end, df.cui, df.note_id
        else:
            arg = df.start, df.end, df.value, df.file
    
    return set(list(zip(*arg)))

In [38]:
%%cython 

from __main__ import write_out, df_to_set, engine
import numpy as np 
import pandas as pd

def get_cooccurences(ref, sys, analysis_type: str, corpus: str, single_sys = True, name = None):
    """
    get coocurences between system and reference; exact match; TODO: add relaxed
    """
    # cooccurences
    class Coocurences(object):
        
        def __init__(self):
            self.ref_system_match = 0
            self.ref_only = 0
            self.system_only = 0
            self.system_n = 0
            self.ref_n = 0
            self.matches = set()
            self.false_negatives = set()
            self.corpus = corpus
            self.cases = set(ref["file"].tolist()) # cases to label 

    c = Coocurences()
    
    if corpus != 'casi':
        if 'entity' in analysis_type and single_sys: # mipacq n -> 16793
            cols_to_keep = ['begin', 'end', 'note_id']
            sys = sys[cols_to_keep].drop_duplicates()
            ref = ref[['start', 'end', 'file']].drop_duplicates()
            sys.name = name
        elif 'cui' in analysis_type and single_sys: # mipacq n -> 10799
            cols_to_keep = ['cui', 'note_id']
            sys = sys[cols_to_keep].drop_duplicates()
            # do not overestimate FP
            sys = sys[~sys['cui'].isnull()] 
            ref = ref[['value', 'file']].drop_duplicates()
            ref = ref[~ref['value'].isnull()]
            sys.name = name
        elif 'full' in analysis_type and single_sys: # mipacq n -> 17393
            cols_to_keep = ['begin', 'end', 'cui', 'note_id']
            sys = sys[cols_to_keep].drop_duplicates()
            sys = sys[~sys['cui'].isnull()]
            ref = ref[['start', 'end', 'value', 'file']].drop_duplicates()
            ref = ref[~ref['value'].isnull()]
            sys.name = name

        # matches via inner join
        matches = pd.merge(sys, ref, how = 'inner', left_on=['begin','end','note_id'], right_on = ['start','end','file']) 
        # reference-only via left outer join
        fn = pd.merge(ref, sys, how = 'left', left_on=['start','end','file'], right_on = ['begin','end','note_id']) 

        fn = fn[fn['begin'].isnull()] # get as outer join with no match

        if 'entity' in analysis_type and single_sys:
            cols_to_keep = ['start', 'end', 'file']
        else:
            cols_to_keep = ['start', 'end', 'value', 'file']

        matches = matches[cols_to_keep]
        fn = fn[cols_to_keep]

        # use for metrics 
        c.matches = c.matches.union(df_to_set(matches, analysis_type, 'ref'))
        c.false_negatives = c.false_negatives.union(df_to_set(fn, analysis_type, 'ref'))
        c.ref_system_match = len(c.matches)
        c.system_only = len(sys) - len(c.matches)
        c.system_n = len(sys)
        c.ref_n = len(ref)
        c.ref_only = len(c.false_negatives)
        
    else:
        sql = "select `case` from test.amia_2019_analytical_v where overlap = 1 and `system` = %(sys.name)s"  
        
        matches = pd.read_sql(sql, params={"sys.name":sys.name}, con=engine)
        
        sql = "select `case` from test.amia_2019_analytical_v where (overlap = 0 or overlap is null) and `system` = %(sys.name)s"  
        
        fn = pd.read_sql(sql, params={"sys.name":sys.name}, con=engine)
        
        c.matches = df_to_set(matches, 'entity', 'sys', 'casi')
        c.fn = df_to_set(fn, 'entity', 'sys', 'casi')
        c.ref_system_match = len(c.matches)
        c.system_only = len(sys) - len(c.matches)
        c.system_n = len(matches) + len(fn)
        c.ref_n = len(matches) + len(fn)
        c.ref_only = len(fn)
        
    # sanity check
    if len(ref) - c.ref_system_match < 0:
        print('Error: ref_system_match > len(ref)!')
    if len(ref) != c.ref_system_match + c.ref_only:
        print('Error: ref count mismatch!')
   
    # save TP/FN
    if single_sys and corpus != 'casi':
        print(analysis_type)
        write_out(sys.name, analysis_type, c)
    return c 

In [39]:
#%%cython 

#from __main__ import write_out

#import numpy as np # access to Numpy from Python layer
def label_vector(doc: str, ann: List[int], labels: List[str]) -> np.array:

    #print(ann, doc, labels)

    v = np.zeros(doc)
    labels = list(labels)
    
    for (i, lab) in enumerate(labels):
        i += 1  # 0 is reserved for no label
        idxs = [np.arange(a.begin, a.end) for a in ann if a.label == lab]
            
        idxs = [j for mask in idxs for j in mask]
        v[idxs] = i

    return v

# test confusion matrix elements for vectorized annotation set; includes TN
def confused(sys1, ann1):
    TP = np.sum(np.logical_and(sys1 >= 1, ann1 == sys1 ))

    # True Negative (TN): we predict a label of 0 (negative), and the true label is 0.
    TN = np.sum(np.logical_and(sys1 == 0, ann1 == 0))

    # False Positive (FP): we predict a label of 1 (positive), but the true label is 0.
    FP = np.sum(np.logical_and(sys1 >= 1, ann1 == 0))

    # False Negative (FN): we predict a label of 0 (negative), but the true label is 1.
    FN = np.sum(np.logical_and(sys1 == 0, ann1 >= 1))
    
    return TP, TN, FP, FN


def get_cooccurences_vec(ref, sys, analysis_type: str, corpus: str, single_sys = True, name = None):
    """
    get coocurences between system and reference; exact match; TODO: add relaxed
    """
    # test cooccurences
    class Coocurences(object):
        
        def __init__(self):
            self.ref_system_match = 0
            self.ref_only = 0
            self.system_only = 0
            self.system_n = 0
            self.ref_n = 0
            self.matches = set()
            self.false_negatives = set()
            self.corpus = corpus
            self.cases = set(ref["file"].tolist()) # cases to label 

    c = Coocurences()
    
    # vectorization and i-o labeling
    def test_io():
        test = c.cases
        if analysis_type == 'entity':
            docs = [(x, len(open("/Users/gms/development/nlp/nlpie/data/ensembling-u01/i2b2/source_data/test_data/" + x + ".txt", 'r').read())) for x in test]

        ann = ref.copy()
        ann = ann.rename(index=str, columns={"start": "begin", "file": "case"}).copy()
        cols_to_keep = ['begin', 'end', 'case', 'label']
        if analysis_type == 'entity':
            labels = ["concept"]
            ann["label"] = 'concept'
            ann = ann[cols_to_keep].copy()

        sys_ = sys.rename(index=str, columns={"note_id": "case"}).copy()
        
        # need for enttity-only
        if analysis_type == 'entity':
            sys_["label"] = 'concept'
        
        sys_ = sys_[cols_to_keep]
       
        tp = []
        tn = []
        fp = []
        fn = []
        cvals = []
        out = []
        t = []
        d = defaultdict(list)
        
        for n in range(len(docs)):
            a1 = [i for i in ann[ann["case"] == docs[n][0]].copy().itertuples(index=False)]
            s1 = [i for i in sys_[sys_["case"] == docs[n][0]].copy().itertuples(index=False)]

            ann1 = label_vector(docs[n][1], a1, labels)
            sys1 = label_vector(docs[n][1], s1, labels)
            
            TP, TN, FP, FN = confused(sys1, ann1)
            cvals.append([TP, TN, FP, FN])
                 
            d['sys'].append(list([int(i) for i in sys1]))
            d['oracle'].append(list([int(i) for i in ann1]))
            d['case'].append(docs[n][0])
            
            '''
            print("tn:", np.intersect1d(np.where(ann1 == 0)[0], np.where(sys1 == 0)[0]),  
                  "tp:", np.intersect1d(np.where(ann1 == 1)[0], np.where(sys1 == 1)[0]), 
                  "fn:", np.intersect1d(np.where(ann1 == 1)[0], np.where(sys1 == 0)[0]), 
                  "fp:", np.intersect1d(np.where(ann1 == 0)[0], np.where(sys1 == 1)[0]))
            '''
        d['labels'] = labels
        
        corp = shelve.open('/Users/gms/Desktop/' + sys.name + '_' + corpus + '.dat')
        
        for k in d:
            corp[k] = d[k]
        
        corp.close()
        return cvals
    
    if corpus == 'i2b2':
        TP, TN, FP, FN = np.sum(test_io(), axis=0)
        F, recall, precision, TP, FP, FN, TP_FN_R, TM = Metrics(FP, FN, TP, len(sys), TN).get_confusion_metrics() #no TN
        print('test_io():', TP, TN, FP, FN, np.mean(F), np.mean(recall), np.mean(precision))
    
    # non-vectorized:
    if corpus != 'casi':
        if 'entity' in analysis_type and single_sys: # mipacq n -> 16793
            cols_to_keep = ['begin', 'end', 'note_id']
            sys = sys[cols_to_keep].drop_duplicates()
            ref = ref[['start', 'end', 'file']].drop_duplicates()
            sys.name = name
        elif 'cui' in analysis_type and single_sys: # mipacq n -> 10799
            cols_to_keep = ['cui', 'note_id']
            sys = sys[cols_to_keep].drop_duplicates()
            # do not overestimate FP
            sys = sys[~sys['cui'].isnull()] 
            ref = ref[['value', 'file']].drop_duplicates()
            ref = ref[~ref['value'].isnull()]
            sys.name = name
        elif 'full' in analysis_type and single_sys: # mipacq n -> 17393
            cols_to_keep = ['begin', 'end', 'cui', 'note_id']
            sys = sys[cols_to_keep].drop_duplicates()
            sys = sys[~sys['cui'].isnull()]
            ref = ref[['start', 'end', 'value', 'file']].drop_duplicates()
            ref = ref[~ref['value'].isnull()]
            sys.name = name

        # matches via inner join
        matches = pd.merge(sys, ref, how = 'inner', left_on=['begin','end','note_id'], right_on = ['start','end','file']) 
        # reference-only via left outer join
        fn = pd.merge(ref, sys, how = 'left', left_on=['start','end','file'], right_on = ['begin','end','note_id']) 

        fn = fn[fn['begin'].isnull()] # get as outer join with no match

        if 'entity' in analysis_type and single_sys:
            cols_to_keep = ['start', 'end', 'file']
        else:
            cols_to_keep = ['start', 'end', 'value', 'file']

        matches = matches[cols_to_keep]
        fn = fn[cols_to_keep]

        # use for metrics 
        c.matches = c.matches.union(df_to_set(matches, analysis_type, 'ref'))
        c.false_negatives = c.false_negatives.union(df_to_set(fn, analysis_type, 'ref'))
        c.ref_system_match = len(c.matches)
        c.system_only = len(sys) - len(c.matches)
        c.system_n = len(sys)
        c.ref_n = len(ref)
        c.ref_only = len(c.false_negatives)
        
    else:
        sql = "select `case` from test.amia_2019_analytical_v where overlap = 1 and `system` = %(sys.name)s"  
        
        matches = pd.read_sql(sql, params={"sys.name":sys.name}, con=engine)
        
        sql = "select `case` from test.amia_2019_analytical_v where (overlap = 0 or overlap is null) and `system` = %(sys.name)s"  
        
        fn = pd.read_sql(sql, params={"sys.name":sys.name}, con=engine)
        
        c.matches = df_to_set(matches, 'entity', 'sys', 'casi')
        c.fn = df_to_set(fn, 'entity', 'sys', 'casi')
        c.ref_system_match = len(c.matches)
        c.system_only = len(sys) - len(c.matches)
        c.system_n = len(matches) + len(fn)
        c.ref_n = len(matches) + len(fn)
        c.ref_only = len(fn)
        
    # sanity check
    if len(ref) - c.ref_system_match < 0:
        print('Error: ref_system_match > len(ref)!')
    if len(ref) != c.ref_system_match + c.ref_only:
        print('Error: ref count mismatch!')
   
    # save TP/FN
    if single_sys and corpus != 'casi':
        print(analysis_type)
        write_out(sys.name, analysis_type, c)
    return c 

In [40]:
# merging test for i-o labeled data
import numpy as np
import shelve
# load shelve
def read_shelve():
    corp = shelve.open('/Users/gms/Desktop/test.dat')

    return corp
        
#test = read_shelve()

def test_merge_vector(test):
    # get sample for testing
    for case in test['case'][3:5]:
        for i in range(len(test['case'][3:5])):
            if i == 3:
                t0 = test['oracle'][3][0:750]
            else:
                t1 = test['oracle'][4][0:750]

            #print('case:', case, test['sys'][i], test['oracle'][i], confused(np.array(test['sys'][i]), np.array(test['oracle'][i])))
        #print(t0, t1)

    t0 = np.array(test['oracle'][3][0:750])
    t1 = np.array(test['oracle'][5][0:750])

    l0 = list(t0)
    l1 = list(t1)
    
    l0 = [0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0] 
    l1 = [0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0]
    
    print(l0, l1)

    def intersection(lst1, lst2): 
        out = list()
        if isinstance(lst1, set) and isinstance(lst2, set):
            out = (set(lst1) & set(lst2))
        elif isinstance(lst1, set) and isinstance(lst2, np.int64):
            out = (set(lst1) & set([lst2]))
        elif isinstance(lst1, np.int64) and isinstance(lst2, set):
            out = (set([lst1]) & set(lst2))
        elif isinstance(lst1, np.int64) and isinstance(lst2, np.int64):
            out = (set([lst1]) & set([lst2]))
        #if len(out) > 1:
        return out
        #elif len(out) == 1:
        #    return out[0]
        #else:
        #    return 0

    def union(lst1, lst2): 
        out = list()
        if isinstance(lst1, set) and isinstance(lst2, set):
            out = set(lst1) | set(lst2)
        elif isinstance(lst1, set) and isinstance(lst2, np.int64):
            out = set(lst1) | set([lst2])
        elif isinstance(lst1, np.int64) and isinstance(lst2, set):
            out = set([lst1]) | set(lst2)
        elif isinstance(lst1, np.int64) and isinstance(lst2, np.int64):
            out = set([lst1]) | set([lst2])
        #if len(out) == 1:
        #    #out = out[0]
        return out

    # union and intersect
    def umerges(l0, l1):
        #un = [0]*len(l0)
        #for i in range(len(l0)):
        #    un[i] = union(l0[i], l1[i])

        return [union(l0[i], l1[i]) for i in range(len(l0))]

    %timeit un = umerges(l0, l1)
    
    x = umerges(l0, l1)
    

    #l2 = [1, {1, 4}, {3}, {2, 4}, {1}, 0, 2, 3, {0, 8}, {1, 8}]
    
    #print(umerges(x, l2))
    
    def imerges(l0, l1):
        #inter = [0]*len(l0)
        #for i in range(len(l0)):
        

        return [intersection(l0[i], l1[i]) for i in range(len(l0))]
    
    %timeit x = imerges(l0, l1)
    
    '''
    union = [
        ( [set(x) | set(y)] if isinstance(x, list) and isinstance(y, list)
          else [set(x) | set([y])] if isinstance(x, list) and isinstance(y, int)
          else [set([x]) | set(y)] if isinstance(x, int) and isinstance(y, list)
          else [set([x]) | set([y])])

         for x, y in zip(l0, l1)
    ]

    # unpack map object
    #*y, = list(map(list, zip(*union)))
    #%timeit list(map(list, zip(*union)))

    intersection = [
        ( [set(x) & set(y)] if isinstance(x, list) and isinstance(y, list)
          else [set(x) & set([y])] if isinstance(x, list) and isinstance(y, int)
          else [set([x]) & set(y)] if isinstance(x, int) and isinstance(y, list)
          else [set([x]) & set([y])])
          for x, y in zip(l0, l1)

    ]

    #*x, = list(map(list, zip(*intersection)))
    #%timeit list(map(list, zip(*intersection)))
    '''
#test_merge_vector(test)

In [41]:
# blah -=
# https://kawahara.ca/how-to-compute-truefalse-positives-and-truefalse-negatives-in-python-for-binary-classification-problems/
def confusing(sys1, ann1):
    TP = np.sum(np.logical_and(sys1 == 1, ann1 == sys1))

    # True Negative (TN): we predict a label of 0 (negative), and the true label is 0.
    TN = np.sum(np.logical_and(sys1 == 0, ann1 == 0))

    # False Positive (FP): we predict a label of 1 (positive), but the true label is 0.
    FP = np.sum(np.logical_and(sys1 == 1, ann1 == 0))

    # False Negative (FN): we predict a label of 0 (negative), but the true label is 1.
    FN = np.sum(np.logical_and(np.logical_or(sys1 == 0, sys1 is None), ann1 == 1))
    
    return TP, TN, FP, FN

#%%cython
#import numpy as np # access to Numpy from Python layer
#import time
#from __main__ import read_shelve
def imerge(l0, l1):

    return [
        ( [
            set(x) & set(y)] if isinstance(x, list) and  isinstance(y, list)
            else [set(x) & set([y])] if isinstance(x, list) and  isinstance(y, np.int64)
            else [set(x) & y] if isinstance(x, list) and isinstance(y, set)
            else [x & y] if isinstance(x, set) and isinstance(y, set)
            else [x & set(y)] if isinstance(x, set) and isinstance(y, list)
            else [x & set([y])] if isinstance(x, set) and isinstance(y, np.int64)
            else [set([x]) & set(y)] if isinstance(x, np.int64) and  isinstance(y, list)
            else [set([x]) & y] if isinstance(x, np.int64) and isinstance(y, set)
            else [set([x]) & set([y])])
        for x, y in zip(l0, l1)
    ]

def umerge(l0, l1):

    return [
        ( [
            set(x) | set(y)] if isinstance(x, list) and  isinstance(y, list)
            else [set(x) | set([y])] if isinstance(x, list) and  isinstance(y, np.int64)
            else [set(x) | y] if isinstance(x, list) and isinstance(y, set)
            else [x | y] if isinstance(x, set) and isinstance(y, set)
            else [x | set(y)] if isinstance(x, set) and isinstance(y, list)
            else [x | set([y])] if isinstance(x, set) and isinstance(y, np.int64)
            else [set([x]) | y] if isinstance(x, np.int64) and isinstance(y, set)
            else [set([x]) | set(y)] if isinstance(x, np.int64) and  isinstance(y, list)
            else [set([x]) | set([y])])
        for x, y in zip(l0, l1)
    ]

def imerge_int(l0, l1):
    return [
        ( [
            set(x) & set(y)] if isinstance(x, list) and  isinstance(y, list)
            else [set(x) & set([y])] if isinstance(x, list) and isinstance(y, int)
            else [set(x) & y] if isinstance(x, list) and isinstance(y, set)
            else [x & y] if isinstance(x, set) and isinstance(y, set)
            else [x & set(y)] if isinstance(x, set) and isinstance(y, list)
            else [x & set([y])] if isinstance(x, set) and isinstance(y, int)
            else [set([x]) & set(y)] if isinstance(x, int) and  isinstance(y, list)
            else [set([x]) & y] if isinstance(x, int) and isinstance(y, set)
            else [set([x]) & set([y])])
        for x, y in zip(l0, l1)
    ]

def umerge_int(l0, l1):
    return [
        ( [
            set(x) | set(y)] if isinstance(x, list) and  isinstance(y, list)
            else [set(x) | set([y])] if isinstance(x, list) and  isinstance(y, int)
            else [set(x) | y] if isinstance(x, list) and isinstance(y, set)
            else [x | y] if isinstance(x, set) and isinstance(y, set)
            else [x | set(y)] if isinstance(x, set) and isinstance(y, list)
            else [x | set([y])] if isinstance(x, set) and isinstance(y, int)
            else [set([x]) | y] if isinstance(x, int) and isinstance(y, set)
            else [set([x]) | set(y)] if isinstance(x, int) and  isinstance(y, list)
            else [set([x]) | set([y])])
        for x, y in zip(l0, l1)
    ]

def test_merge_shelve():
    ctakes = shelve.open('/Users/gms/Desktop/ctakes_' + corpus + '.dat')
    clamp = shelve.open('/Users/gms/Desktop/clamp_' + corpus + '.dat')
    mm = shelve.open('/Users/gms/Desktop/metamap_' + corpus + '.dat')

    print(test['case'][0:2])

    #t0 = np.array(test['oracle'][3][0:750])
    #t1 = np.array(test['oracle'][5][0:750])

    start = time.perf_counter()

    sys = []
    oracles = []
    confuzz = []
    for i in range(len(ctakes['case'])):
        t0 = np.array(ctakes['sys'][i])
        t1 = np.array(clamp['sys'][i])
        t2 = np.array(mm['sys'][i])
        oracle = np.array(ctakes['oracle'][i])

        #print(ctakes['case'][i])

        l0 = list(t0)
        l1 = list(t1)
        l2 = list(t2)

        z = *map(list, zip(*umerge(l0, l1))),

        #%time  *map(list, zip(*umerge(l0, l1))),
        #%time  *map(list, zip(*umerge(z[0], l1))),

        t = *map(list, zip(*umerge(z[0], l2))),

        from functools import reduce 
        import itertools
        import operator

        test = [list(i) for i in t[0]]

        replaced = [[none] if len(wd) == 0  else wd for wd in t[0]]

        replaced = [[1] if wd == {0, 1}  else wd for wd in replaced]


        sys.append(replaced)

        tp, tn, fp, fn = confusing(np.array(list(itertools.chain.from_iterable(replaced))), oracle)

        confuzz.append((tp, tn, fp, fn))
        f, recall, precision, tp, fp, fn, tp_fn_r, tm = Metrics(fp, fn, tp, len(oracle), tn).get_confusion_metrics() #no tn
    print(' --- ')

    print(len(list(itertools.chain.from_iterable(oracles))), len(list(itertools.chain.from_iterable(sys))))
    print(list(map(sum, zip(*confuzz))))
    z = list(map(sum, zip(*confuzz)))
    # tp, tn, fp, fn -> (fp, fn, tp, len(oracle), tn)
    f, recall, precision, tp, fp, fn, tp_fn_r, tm = Metrics(z[2], z[3], z[0], len(list(itertools.chain.from_iterable(sys))), z[1]).get_confusion_metrics() #no tn
    print('test_io():', tp, tn, fp, fn, np.mean(f), np.mean(recall), np.mean(precision))
    elapsed = (time.perf_counter() - start)
    print('time 1:', elapsed)

In [42]:
@ft.lru_cache(maxsize=None)
def get_metric_data(analysis_type: str, corpus: str):
   
    usys_file, ref_table = AnalysisConfig().corpus_config()
    systems = AnalysisConfig().systems
    
    sys_ann = pd.read_csv(analysisConf.data_dir + usys_file, dtype={'note_id': str})
    
    sql = "SELECT * FROM " + ref_table  
    
    ref_ann = pd.read_sql(sql, con=engine)
    sys_ann = sys_ann.drop_duplicates()
    
    
    
    return ref_ann, sys_ann

In [43]:
%%cython
import pandas as pd
from scipy import stats
from scipy.stats.mstats import gmean

def geometric_mean(metrics):
    """
    1. Get rank average of F1, TP/FN, TM
        http://www.datasciencemadesimple.com/rank-dataframe-python-pandas-min-max-dense-rank-group/
        https://stackoverflow.com/questions/46686315/in-pandas-how-to-create-a-new-column-with-a-rank-according-to-the-mean-values-o?rq=1
    2. Take geomean of 2.
        https://stackoverflow.com/questions/42436577/geometric-mean-applied-on-row
    """
    
    data = pd.DataFrame() 

    metrics['F1 rank']=metrics['F'].rank(ascending=0,method='average')
    metrics['TP/FN rank']=metrics['TP/FN'].rank(ascending=0,method='average')
    metrics['TM rank']=metrics['TM'].rank(ascending=0,method='average')
    metrics['Gmean'] = gmean(metrics.iloc[:,-3:],axis=1)

    return metrics  

In [44]:
def generate_metrics(analysis_type: str, corpus: str, filter_semtype = False, single_sys = None):
    start = time.time()

    systems = AnalysisConfig().systems
    metrics = pd.DataFrame()

    ref_ann, sys_ann = get_metric_data(analysis_type, corpus)
    
    for sys in systems:
        
        if filter_semtype:
            st = SemanticTypes(semtypes).get_system_type(sys)
            ref_ann = ref_ann[ref_ann['semtype'].isin(SemanticTypes(semtypes).get_system_type('reference'))]
            
        types, _ = AnnotationSystems().get_system_type(sys) # system types for iterable
        for t in types:
            print(t)

            if filter_semtype:
                system_annotations = sys_ann[sys_ann['semtypes'].isin(st)].copy()
            else:
                system_annotations = sys_ann.copy()

            system = system_annotations[system_annotations['type'] == str(t)]

            if sys == 'quick_umls':
                system = system[system.score.astype(float) >= .8]

            if sys == 'metamap':
                system = system[system.score.abs().astype(int) >= 800]

            system = system.drop_duplicates()
            system.name = sys

            c = get_cooccurences(ref_ann, system, analysis_type, corpus, True, system.name) # get matches, FN, etc.

            print(c.ref_n, c.ref_only, c.system_n, c.system_only, c.ref_system_match)

        if c.ref_system_match > 0: # compute confusion matrix metrics and write to dictionary -> df
            F, recall, precision, TP, FP, FN, TP_FN_R, TM = Metrics(c.system_only, c.ref_only, c.ref_system_match, c.system_n).get_confusion_metrics(corpus)

            if corpus == 'casi':
                if sys == 'biomedicus':
                    t = 'biomedicus.v2.Acronym'

                d = {'system': sys, 
                     'type': t, 
                     'F': F, 
                     'precision': precision, 
                     'recall': recall, 
                     'FN': FN, 
                     'TP/FN': TP_FN_R,
                     'n_gold': c.ref_n, 
                     'n_sys': c.system_n, 
                     'TM': TM}
            else:
                d = {'system': sys, 
                     'type': t, 
                     'F': F[1], 
                     'precision': precision[1], 
                     'recall': recall[1], 
                     'TP': TP, 
                     'FN': FN, 
                     'FP': FP, 
                     'TP/FN': TP_FN_R,
                     'n_gold': c.ref_n, 
                     'n_sys': c.system_n, 
                     'TM': TM}

            data = pd.DataFrame(d,  index=[0])
            metrics = pd.concat([metrics, data], ignore_index=True)
            metrics.drop_duplicates(keep='last', inplace=True)
        else:
            print("NO EXACT MATCHES FOR", t)
        elapsed = (time.time() - start)
        print("elapsed:", sys, elapsed)
     
    elapsed = (time.time() - start)
    print(geometric_mean(metrics))
    
    now = datetime.now()
    timestamp = datetime.timestamp(now)
    
    if single_sys is None:
        file_name = 'metrics_'
    
    metrics.to_csv(analysisConf.data_dir + corpus + '_' + file_name + analysis_type + '_' + str(timestamp) + '.csv')
    
    print("total elapsed time:", elapsed) 

# use to iterate through mm scores
def generate_metrics_mm(analysis_type: str, corpus: str, single_sys = None):
    start = time.time()
    #systems = ["biomedicus","ctakes","metamap","clamp","quick_umls"]
    systems = AnalysisConfig().systems
    #systems = ["quick_umls"]
    metrics = pd.DataFrame()

    ref_ann, sys_ann = get_metric_data(analysis_type, corpus)
    
    sys_ann = sys_ann[(sys_ann.score.notnull()) & (sys_ann['system'] == 'metamap')]
    sys_ann = sys_ann[['begin', 'end', 'note_id', 'system', 'score']].drop_duplicates()
    sys_ann.score = sys_ann.score.astype(int)
    
    for sys in systems:
        types, __ = AnnotationSystems().get_system_type(sys) # system types for iterable
        for t in types:
            print(t)

            for i in range(500, 1050, 50): 

                sys_ann = sys_ann[(sys_ann["score"] >= i)].copy()

                sys_ann.name = sys + str(i)

                c = get_cooccurences(ref_ann, sys_ann, analysis_type, corpus, True, sys_ann.name) # get matches, FN, etc.

                print(c.ref_n, c.ref_only, c.system_n, c.system_only, c.ref_system_match)

                #print(i, len(system))

                if c.ref_system_match > 0: # compute confusion matrix metrics and write to dictionary -> df
                    F, recall, precision, TP, FP, FN, TP_FN_R, TM = Metrics(c.system_only, c.ref_only, c.ref_system_match, c.system_n).get_confusion_metrics()
                    d = {'system': sys + '_score_' + str(i), 
                         'type': t, 
                         'F': F[1], 
                         'precision': precision[1], 
                         'recall': recall[1], 
                         'TP': TP, 
                         'FN': FN, 
                         'FP': FP, 
                         'TP/FN': TP_FN_R,
                         'n_gold': c.ref_n, 
                         'n_sys': c.system_n, 
                         'TM': TM}

                    data = pd.DataFrame(d,  index=[0])
                    metrics = pd.concat([metrics, data], ignore_index=True)
                    metrics.drop_duplicates(keep='last', inplace=True)
                else:
                    print("NO EXACT MATCHES FOR", t)
                elapsed = (time.time() - start)
                print("elapsed:", sys, elapsed)
     
    elapsed = (time.time() - start)
    print(geometric_mean(metrics))
    
    now = datetime.now()
    timestamp = datetime.timestamp(now)
    # UIMA or QuickUMLS
    if single_sys is None:
        file_name = 'mm_metrics_'
    metrics.to_csv(analysisConf.data_dir + corpus + '_' + file_name + analysis_type + '_' + str(timestamp) + '.csv')
    
    print("total elapsed time:", elapsed)

In [45]:
# read in system matches from file
@ft.lru_cache(maxsize=None)
def get_ref_n(analysis_type: str, corpus, filter_semtype = False) -> int:
    
    ref_ann, _ = get_metric_data(analysis_type, corpus)
    
    if filter_semtype:
        ref_ann = ref_ann[ref_ann['semtype'].isin(SemanticTypes(semtypes).get_system_type('reference'))]
            
    if corpus == 'casi':
        return len(ref_ann)
        
    else:
        # do not overestimate fn
        if 'entity' in analysis_type:
            ref_ann = ref_ann[['start', 'end', 'file']].drop_duplicates()
        elif 'cui' in analysis_type:
            ref_ann = ref_ann[['value', 'file']].drop_duplicates()
        elif 'full' in analysis_type:
            ref_ann = ref_ann[['start', 'end', 'value', 'file']].drop_duplicates()
        else:
            pass

        ref_n = len(ref_ann.drop_duplicates())

        return ref_n
    
@ft.lru_cache(maxsize=None)
def get_sys_data(system: str, analysis_type: str, corpus: str, filter_semtype = False) -> pd.DataFrame:
   
    _, data = get_metric_data(analysis_type, corpus)
    
    out = data[data['system'] == system].copy()
    
    if filter_semtype:
        st = SemanticTypes(semtypes).get_system_type(system)
    
    if corpus == 'casi':
        cols_to_keep = ['case', 'overlap'] 
        #cols_to_keep = ['case', 'begin', 'end'] 
        out = out[cols_to_keep].drop_duplicates()
        
        return out
        
    else:
        if filter_semtype:
            out = data[data['semtypes'].isin(st)].copy()
                
        else:
            out = data[data['system']== system].copy()

        if system == 'quick_umls':
            out = out[(out.score.astype(float) >= 0.8) & (out["type"] == 'concept_jaccard_score_False')]
        
        if system == 'metamap':
            out = out[out.score.abs().astype(int) >= 800]
        
#         if system == 'biomedicus':
#             out = out[out.score.abs().astype(int) >= 0.6]
            

        if 'entity' in analysis_type:
            cols_to_keep = ['begin', 'end', 'note_id']
        elif 'cui' in analysis_type:
            cols_to_keep = ['cui', 'note_id']
        elif 'full' in analysis_type:
            cols_to_keep = ['begin', 'end', 'cui', 'note_id']

        out = out[cols_to_keep]

        return out.drop_duplicates()

@ft.lru_cache(maxsize=None)
def get_system_matches(system: str, analysis_type: str, corpus: str):
   
    if corpus == 'casi':
        
        sql = "select `case`, overlap from test.amia_2019_cases where overlap = 1 and `system` = %(system)s"  
        data_matches = df_to_set(pd.read_sql(sql, params={"system":system}, con=engine), 'entity', 'sys', 'casi')
        
        sql = "select `case`, overlap from test.amia_2019_cases where (overlap = 0 or overlap is null) and `system` = %(system)s"  
        data_fn = df_to_set(pd.read_sql(sql, params={"system":system}, con=engine), 'entity', 'sys', 'casi')
        
    else:
        
        dir_test = analysisConf.data_dir + 'single_system_out/'

        file = dir_test + system + '_' + analysis_type + '_' + corpus + '_matches.txt'
        data_matches = set(literal_eval(line.strip()) for line in open(file))

        file = dir_test + system + '_' + analysis_type + '_' + corpus + '_ref_only.txt'
        data_fn = set(literal_eval(line.strip()) for line in open(file)) 

    return data_matches, data_fn

In [46]:
class SetTotals(object):
    """ 
    returns an instance with merged match set numbers using either union or intersection of elements in set 
    """
    def __init__(self, ref_n, sys_n, match_set):

        self = self    
        self.ref_ann = ref_n
        self.sys_n = sys_n
        self.match_set = match_set

    def get_ref_sys(self):

        ref_only = self.ref_ann - len(self.match_set)
        sys_only = self.sys_n - len(self.match_set)

        return ref_only, sys_only, len(self.match_set), self.match_set

In [47]:
# TODO: refactor generate_metrics()

def merge_eval(ref_only: int, system_only: int, ref_system_match: int, system_n: int, ref_n: int) -> dict:
    """
    Generate confusion matrix params
    :params: ref_only, system_only, reference_system_match -> sets
    matches, system_n, reference_n -> counts
    :return: dictionary object
    
    """

    if ref_only + ref_system_match != ref_n:
        print('ERROR!')

    # get evaluation metrics
    d = {}
    
    F, recall, precision, TP, FP, FN, TP_FN_R, TM  = Metrics(system_only, ref_only, ref_system_match, system_n).get_confusion_metrics()

    d = {
         'F': F[1], 
         'precision': precision[1], 
         'recall': recall[1], 
         'TP': TP, 
         'FN': FN, 
         'FP': FP, 
         'TP/FN': TP_FN_R,
         'n_gold': ref_n, 
         'n_sys': system_n, 
         'TM': TM
    }
    
    
    if system_n - FP != TP:
        print('inconsistent system n!')

    return d

In [48]:
@ft.lru_cache(maxsize=None)
def process_sentence(pt, sentence, analysis_type, corpus):
    """
    Recursively evaluate parse tree, 
    with check for existence before build
       :param sentence: to process
       :return class of merged annotations, boolean operated system df 
    """
    
    class Results(object):
        def __init__(self):
            self.results = set()
            #self.operations = []
            self.system_merges = pd.DataFrame()
            
    r = Results()
    
    if 'entity' in analysis_type and corpus != 'casi': 
        cols_to_keep = ['begin', 'end', 'note_id'] # entity only
    elif 'full' in analysis_type: 
        cols_to_keep = ['cui', 'begin', 'end', 'note_id'] # entity only
    elif 'cui' in analysis_type:
        cols_to_keep = ['cui', 'note_id'] # entity only
    elif corpus == 'casi':
        cols_to_keep = ['case', 'overlap']
    
    def evaluate(parseTree):
        oper = {'&': op.and_, '|': op.or_}
        
        if parseTree:
            leftC = gevent.spawn(evaluate, parseTree.getLeftChild())
            rightC = gevent.spawn(evaluate, parseTree.getRightChild())
            
            if leftC.get() and rightC.get():
                query = set()
                system_query = pd.DataFrame()
                fn = oper[parseTree.getRootVal()]
                
                if isinstance(leftC.get(), str):
                    # get system as leaf node 
                    left, _ = get_system_matches(leftC.get(), analysis_type, corpus)
                    left_sys = get_sys_data(leftC.get(), analysis_type, corpus, filter_semtype)
                
                elif isinstance(leftC.get(), tuple):
                    left = leftC.get()[0]
                    l_sys = leftC.get()[1]
                
                if isinstance(rightC.get(), str):
                    # get system as leaf node
                    right, _ = get_system_matches(rightC.get(), analysis_type, corpus)
                    right_sys = get_sys_data(rightC.get(), analysis_type, corpus, filter_semtype)
                    
                elif isinstance(rightC.get(), tuple):
                    right = rightC.get()[0]
                    r_sys = rightC.get()[1]
                    
                # create match set based on boolean operation
                match_set = fn(left, right)
               
                if corpus != 'casi':
                    if fn == op.or_:
                        r.results = r.results.union(match_set)

                        if isinstance(leftC.get(), str) and isinstance(rightC.get(), str):
                            frames = [left_sys, right_sys]
                            df = pd.concat(frames,  ignore_index=True)
                            df = df[cols_to_keep].drop_duplicates(subset=cols_to_keep)

                        elif isinstance(leftC.get(), str) and isinstance(rightC.get(), tuple):
                            frames = [left_sys, r_sys]
                            df = pd.concat(frames,  ignore_index=True)
                            df = df[cols_to_keep].drop_duplicates(subset=cols_to_keep)

                        elif isinstance(leftC.get(), tuple) and isinstance(rightC.get(), str):
                            frames = [l_sys, right_sys]
                            df = pd.concat(frames,  ignore_index=True)
                            df = df[cols_to_keep].drop_duplicates(subset=cols_to_keep)

                        elif isinstance(leftC.get(), tuple) and isinstance(rightC.get(), tuple):
                            frames = [l_sys, r_sys]
                            df = pd.concat(frames,  ignore_index=True)
                            df = df[cols_to_keep].drop_duplicates(subset=cols_to_keep)

                    if fn == op.and_:
                        if len(r.results) == 0:
                            r.results = match_set
                        r.results = r.results.intersection(match_set)

                        if isinstance(leftC.get(), str) and isinstance(rightC.get(), str):
                            df = left_sys.merge(right_sys, on=cols_to_keep, how='inner')
                            df = df[cols_to_keep].drop_duplicates(subset=cols_to_keep)

                        elif isinstance(leftC.get(), str) and isinstance(rightC.get(), tuple):
                            df = left_sys.merge(r_sys, on=cols_to_keep, how='inner')
                            df = df[cols_to_keep].drop_duplicates(subset=cols_to_keep)

                        elif isinstance(leftC.get(), tuple) and isinstance(rightC.get(), str):
                            df = l_sys.merge(right_sys, on=cols_to_keep, how='inner')
                            df = df[cols_to_keep].drop_duplicates(subset=cols_to_keep)

                        elif isinstance(leftC.get(), tuple) and isinstance(rightC.get(), tuple):
                            df = l_sys.merge(r_sys, on=cols_to_keep, how='inner')
                            df = df[cols_to_keep].drop_duplicates(subset=cols_to_keep)
                else:
                    if fn == op.or_:
                        r.results = r.results.union(match_set)

                        if isinstance(leftC, str) and isinstance(rightC, str):
                            df = left_sys.append(right_sys)
                            df = df.drop_duplicates()

                        elif isinstance(leftC, str) and isinstance(rightC, tuple):
                            df = left_sys.append(r_sys)
                            df = df.drop_duplicates()

                        elif isinstance(leftC, tuple) and isinstance(rightC, str):
                            df = right_sys.append(l_sys)
                            df = df.drop_duplicates()

                        elif isinstance(leftC, tuple) and isinstance(rightC, tuple):
                            df = l_sys.append(r_sys)
                            df = df.drop_duplicates()

                    if fn == op.and_:
                        if len(r.results) == 0:
                            r.results = match_set
                        r.results = r.results.intersection(match_set)

                        if isinstance(leftC, str) and isinstance(rightC, str):
                            df = left_sys.merge(right_sys, on=cols_to_keep, how='inner')
                            df = df.drop_duplicates()

                        elif isinstance(leftC, str) and isinstance(rightC, tuple):
                            df = left_sys.merge(r_sys, on=cols_to_keep, how='inner')
                            df = df.drop_duplicates()

                        elif isinstance(leftC, tuple) and isinstance(rightC, str):
                            df = l_sys.merge(right_sys, on=cols_to_keep, how='inner')
                            df = df.drop_duplicates()

                        elif isinstance(leftC, tuple) and isinstance(rightC, tuple):
                            df = l_sys.merge(r_sys, on=cols_to_keep, how='inner')
                            df = df.drop_duplicates()
                
                # get matched results
                query.update(r.results)
                
                # get combined system results
                r.system_merges = df
                
                if len(df) > 0:
                    system_query = system_query.append(df)
                else:
                    print('wtf!')
                    
                return query, system_query
            else:
                return parseTree.getRootVal()
    
    if sentence.n_or > 0 or sentence.n_and > 0:
        evaluate(pt)  
    
    # trivial case
    elif sentence.n_or == 0 and sentence.n_and == 0:
        r.results, _ = get_system_matches(sentence.sentence, analysis_type, corpus)
        r.system_merges = get_sys_data(sentence.sentence, analysis_type, corpus, filter_semtype)
        #print('trivial:', sentence.sentence, len(r.results), len(r.system_merges))
    
    return r

In [49]:
"""
Incoming Boolean sentences are parsed into a binary tree.

Test expressions to parse:

sentence = '((((A&B)|C)|D)&E)'

sentence = '(E&(D|(C|(A&B))))'

sentence = '(((A|(B&C))|(D&(E&F)))|(H&I))'

"""
# build parse tree from passed sentence
# using grammatical rules of Boolean logic
def buildParseTree(fpexp):
    """
       Iteratively build parse tree from passed sentence using grammatical rules of Boolean logic
       :param fpexp: sentence to parse
       :return eTree: parse tree representation
       Incoming Boolean sentences are parsed into a binary tree.
       Test expressions to parse:
       sentence = '(A&B)'
       sentence = '(A|B)'
       sentence = '((A|B)&C)'
       
    """
    fplist = fpexp.split()
    pStack = Stack()
    eTree = BinaryTree('')
    pStack.push(eTree)
    currentTree = eTree

    for i in fplist:

        if i == '(':
            currentTree.insertLeft('')
            pStack.push(currentTree)
            currentTree = currentTree.getLeftChild()
        elif i not in ['&', '|', ')']:
            currentTree.setRootVal(i)
            parent = pStack.pop()
            currentTree = parent
        elif i in ['&', '|']:
            currentTree.setRootVal(i)
            currentTree.insertRight('')
            pStack.push(currentTree)
            currentTree = currentTree.getRightChild()
        elif i == ')':
            currentTree = pStack.pop()
        else:
            raise ValueError

    return eTree

def make_parse_tree(payload):
    """
    Ensure data to create tree are in standard form
    :param sentence: sentence to preprocess
    :return pt, parse tree graph
            sentence, processed sentence to build tree
            a: order
    """
    def preprocess_sentence(sentence):
        # prepare statement for case when a boolean AND/OR is given
        sentence = payload.replace('(', ' ( '). \
            replace(')', ' ) '). \
            replace('&', ' & '). \
            replace('|', ' | '). \
            replace('  ', ' ')
        return sentence

    sentence = preprocess_sentence(payload)
    print(sentence)
    
    pt = buildParseTree(sentence)
    #pt.postorder() 
    
    return pt

class Sentence(object):

    def __init__(self, sentence):
        self = self
        self.n_and = sentence.count('&')
        self.n_or = sentence.count('|')
        self.sentence = sentence
    
def get_metrics(boolean_expression: str, analysis_type: str, corpus: str):
    """
    Traverse binary parse tree representation of Boolean sentence
        :params: boolean expression in form of '(<annotator_engine_name1><boolean operator><annotator_engine_name2>)'
                 analysis_type (string value of: 'entity', 'cui', 'full') used to filter set of reference and system annotations 
        :return: dictionary with values needed for confusion matrix
    """
    sentence = Sentence(boolean_expression)   

    pt = make_parse_tree(sentence.sentence)

    r = process_sentence(pt, sentence, analysis_type, corpus)
    
    system_n = len(r.system_merges)
    reference_n = get_ref_n(analysis_type, corpus, filter_semtype)

    reference_only, system_only, reference_system_match, match_set = SetTotals(reference_n, system_n, r.results).get_ref_sys()

    # get overall TP/TF and various other counts for running confusion matrix metric analysis
    return merge_eval(reference_only, system_only, reference_system_match, system_n, reference_n)

def get_merge_data(boolean_expression: str, analysis_type: str, corpus: str):
    """
    Traverse binary parse tree representation of Boolean sentence
        :params: boolean expression in form of '(<annotator_engine_name1><boolean operator><annotator_engine_name2>)'
                 analysis_type (string value of: 'entity', 'cui', 'full') used to filter set of reference and system annotations 
        :return: dictionary with values needed for confusion matrix
    """
    sentence = Sentence(boolean_expression)   

    pt = make_parse_tree(sentence.sentence)

    r = process_sentence(pt, sentence, analysis_type, corpus)
    
    #print('len sys merges:', len(r.system_merges))
    system_n = len(r.system_merges)
    reference_n = get_ref_n(analysis_type, corpus)

    reference_only, system_only, reference_system_match, match_set = SetTotals(reference_n, system_n, r.results).get_ref_sys()

    print(merge_eval(reference_only, system_only, reference_system_match, system_n, reference_n))
    # get matched data from merge
    return r.system_merges # merge_eval(reference_only, system_only, reference_system_match, system_n, reference_n)

In [50]:
# control filter_semtype in get_sys_data, get_fef_n and generate_metrics. TODO consolidate. 

# # run single statement
# statement = '(ctakes&clamp)'
# analysis_type = 'entity'
# corpus = 'fairview'
# #matches = get_merge_data(statement, analysis_type, corpus)
# #print(matches)



In [51]:
# import spacy
# nlp = spacy.load('en')

# sql = "select distinct note_id, sofa from concepts.sofas where corpus = 'fairview'"

# docs = pd.read_sql(sql, con=engine)

# d = {}

# for row in docs.itertuples():
#     d[row.note_id] = row.sofa
    
# print(len(d))

# test = matches[matches['note_id'] == '0000200926']
# print(len(test))

# doc = nlp(d['0000200926'])

# for row in test.itertuples():
#     my_str = [token.text.strip('\n').lower() for token in doc if token.idx >= (row.begin) and token.idx <= (row.end)]
#     if 'diabetes' in my_str:
#         print(my_str)

In [52]:
# generate all combinations of given list of annotators:
def partly_unordered_permutations(lst, k):
    elems = set(lst)
    for c in combinations(lst, k):
        for d in permutations(elems - set(c)):
            yield c + d
            
def expressions(l, n):
    for (operations, *operands), operators in product(
            combinations(l, n), product(('&', '|'), repeat=n - 1)):
        for operation in zip(operators, operands):
            operations = [operations, *operation]
        yield operations
        
#def run_ensemble(l, analysis_type, corpus):
def run_ensemble(systems, analysis_type, corpus):

    metrics = pd.DataFrame()
    
    for l in partly_unordered_permutations(systems, 2):
        print('processing merge combo:', l)
        for i in range(1, len(l)+1):
            test = list(expressions(l, i))
            for t in test:
                if i > 1:
                    # format Boolean sentence for parse tree 
                    t = '(' + " ".join(str(x) for x in t).replace('[','(').replace(']',')').replace("'","").replace(",","").replace(" ","") + ')'

                d = get_metrics(t, analysis_type, corpus)
                d['merge'] = t
                d['n_terms'] = i
                frames = [metrics, pd.DataFrame(d, index=[0]) ]
                metrics = pd.concat(frames, ignore_index=True, sort=False) 
                
#     for i in range(1, len(l)+1):
#         test = list(expressions(l, i))
#         for t in test:
#             if i > 1:
#                 # format Boolean sentence for parse tree 
#                 t = '(' + " ".join(str(x) for x in t).replace('[','(').replace(']',')').replace("'","").replace(",","").replace(" ","") + ')'

#             d = get_metrics(t, analysis_type, corpus)
#             d['merge'] = t
#             frames = [metrics, pd.DataFrame(d, index=[0]) ]
#             metrics = pd.concat(frames, ignore_index=True, sort=False) 
    
    now = datetime.now()
    timestamp = datetime.timestamp(now)
    
    file_name = corpus + '_all_merge_metrics_'
    
    metrics = metrics.drop_duplicates()
    metrics = metrics.sort_values(by=['n_terms', 'merge'])
    
    geometric_mean(metrics).to_csv(analysisConf.data_dir + file_name + analysis_type + '_' + str(timestamp) + '.csv')
    print(geometric_mean(metrics))

In [53]:
#TESTS -> ensemble:
def test_match_consistency(matches, ref_only, ref_n, sys):
    """test for reference only/match set consistency:
        params: match, system and reference only sets"""
   
    print('len', len(sys), len(matches), len(matches.union(sys)), len(matches.intersection(sys)))
    assert len(matches.union(ref_only)) == ref_n, 'Reference annotation mismatch union'
    assert len(matches.intersection(sys)) == len(matches), 'System annotation mismatch intersect'
    assert len(matches.union(sys)) == len(sys), 'System annotation mismatch union'
    assert len(matches.intersection(ref_only)) == 0, 'Reference annotation mismatch intersect'

def test_systems(analysis_type, systems, corpus):
    sys = df_to_set(get_sys_data(systems[0], analysis_type, corpus), analysis_type)
    test_match_consistency(*get_system_matches(systems[0], analysis_type, corpus), get_ref_n(analysis_type), sys)
    print('Match consistency:', len(sys),get_ref_n(analysis_type))

def test_metrics(ref, sys_m, match_m):
    test = True
    reference_n = len(ref)
    system_n = len(sys_m)

    print('Test metrics:', type(reference_n), type(system_n), type(match_m))

    reference_only, system_only, reference_system_match, match_set = SetTotals(reference_n, system_n, match_m).get_ref_sys()
    F, recall, precision, _, _, _, _, _ = Metrics(system_only, reference_only, reference_system_match, system_n).get_confusion_metrics()
    F_, recall_, precision_, _, _, _, _, _ = Metrics(system_only, reference_only, reference_system_match, system_n).get_confusion_metrics(test)

    assert F[1] == F_, 'F1 issue'
    assert recall[1] == recall_, 'recall issue'
    assert precision[1] == precision_, 'precision issue'
    print(F[1], F_)
    print(recall[1], recall_)
    print(precision[1], precision_)

def test_count(analysis_type, corpus):
    # test match counts:
    ctakes, _ = get_system_matches('ctakes', analysis_type, corpus)
    clamp, _ = get_system_matches('clamp', analysis_type, corpus)
    b9, _ = get_system_matches('biomedicus', analysis_type, corpus)
    mm, _ = get_system_matches('metamap', analysis_type, corpus)

    print('count:', len(mm.intersection(b9.intersection(clamp.intersection(ctakes)))))
    
def test_ensemble(analysis_type, corpus):
    
    print('ensemble:')
    # Get mixed system_n
    ref_ann, data = get_metric_data(analysis_type, corpus)

    names = ['ctakes', 'biomedicus', 'metamap', 'clamp']
    if 'entity' in analysis_type: 
        cols_to_keep = ['begin', 'end', 'note_id']
    elif 'cui' in analysis_type:
        cols_to_keep = ['cui', 'note_id']
    elif 'full' in analysis_type:
        cols_to_keep = ['begin', 'end', 'cui', 'note_id']

    biomedicus = data[data["system"]=='biomedicus'][cols_to_keep].copy()
    ctakes = data[data["system"]=='ctakes'][cols_to_keep].copy()
    clamp = data[data["system"]=='clamp'][cols_to_keep].copy()
    metamap = data[data["system"]=='metamap'][cols_to_keep].copy()
    quickumls = data[data["system"]=='quick_umls'][cols_to_keep].copy()

    print('systems:', len(biomedicus), len(clamp), len(ctakes), len(metamap), len(quickumls))

    b9 = set()
    cl = set()
    ct = set()
    mm = set()
    qu = set()

    b9 = df_to_set(get_sys_data('biomedicus', analysis_type, corpus), analysis_type)
    print(len(b9))

    ct = df_to_set(get_sys_data('ctakes', analysis_type, corpus), analysis_type)
    print(len(ct))

    cl = df_to_set(get_sys_data('clamp', analysis_type, corpus), analysis_type)
    print(len(cl))

    mm = df_to_set(get_sys_data('metamap', analysis_type, corpus), analysis_type)
    print(len(mm))

    qu = df_to_set(get_sys_data('quick_umls', analysis_type, corpus), analysis_type)
    print(len(qu))
    
    print('various merges:')
    print(len(b9), len(cl), len(ct), len(mm), len(qu))
    print(len(mm.intersection(b9.intersection(cl.intersection(ct)))))
    print(len(mm.union(b9.intersection(cl.intersection(ct)))))
    print(len(mm.union(b9.union(cl.intersection(ct)))))
    print(len(mm.union(b9.union(cl.union(ct)))))
    print(len(b9.intersection(ct)))

    sys_m = b9.intersection(ct.intersection(qu))
    print('sys_m:', len(sys_m))

    # Get match merges:
    ct, _ = get_system_matches('ctakes', analysis_type, corpus)
    cl, _ = get_system_matches('clamp', analysis_type, corpus)
    b9, _ = get_system_matches('biomedicus', analysis_type, corpus)
    mm, _ = get_system_matches('metamap', analysis_type, corpus)
    qu, _ = get_system_matches('quick_umls', analysis_type, corpus)

    match_m = b9.intersection(ct.intersection(qu))
    print('match_m:', len(match_m))
    # reference df to set
    if 'entity' in analysis_type: 
        cols_to_keep = ['end', 'start','file']
    elif 'cui' in analysis_type:
        cols_to_keep = ['value','file']
    elif 'full' in analysis_type:
        cols_to_keep = ['end', 'start', 'value','file']

    ref = df_to_set(ref_ann[cols_to_keep], analysis_type, 'ref')

    print('ref:', len(ref))

    # test difference:
    print('FP:', len(sys_m - match_m), len(sys_m - ref))
    assert len(sys_m - match_m) == len(sys_m - ref), 'FP mismatch'
    print('FN:', len(ref - match_m), len(ref - sys_m))
    assert len(ref - match_m) == len(ref - sys_m), 'FN mismatch'
    
    test_metrics(ref, sys_m, match_m)

In [54]:
#%%time
def main():
    
    #rtype = int(input("Run: 1->Single systems; 2->Ensemble; 3->Tests; 4-> MM Test"))
   
    '''
        corpora: i2b2, mipacq, fv017
        analyses: entity only (exact span), cui by document, full (aka (entity and cui on exaact span/exact cui)
        systems: ctakes, biomedicus, clamp, metamap, quick_umls
        
        TODO -> Vectorization (entity only and full):
                add switch for use of TN on single system performance evaluations 
                add switch for overlap matching versus exact span
             -> Other tasks besides concept extraction
        
    ''' 
    analysisConf =  AnalysisConfig()
    print(analysisConf.systems, analysisConf.corpus_config())
    
    if (rtype == 1):
        generate_metrics(analysis_type, corpus, filter_semtype)
    elif (rtype == 2):
        
        systems = ['ctakes','biomedicus','clamp','metamap','quick_umls']
        
        #for corpus in corpora:
        #for l in partly_unordered_permutations(systems, 2):
        #print('corpus:', corpus, 'systems', l)
        print('corpus:', corpus, 'systems:', systems)
            #run_ensemble(l, analysis_type, corpus) 
        run_ensemble(systems, analysis_type, corpus) 
            
    elif (rtype == 3):
        systems = ['biomedicus']
        t = ['concept_jaccard_score_false']
        test_systems(analysis_type, systems, corpus)  
        test_count(analysis_type, corpus)
        test_ensemble(analysis_type, corpus)
    elif (rtype == 4):
        generate_metrics_test(analysis_type, corpus)

if __name__ == '__main__':
    %prun main()

['biomedicus', 'clamp', 'ctakes', 'metamap', 'quick_umls'] ('analytical_fairview.csv', 'concepts.fairview_all')
corpus: fairview systems: ['ctakes', 'biomedicus', 'clamp', 'metamap', 'quick_umls']
processing merge combo: ('ctakes', 'biomedicus', 'metamap', 'clamp', 'quick_umls')
ctakes




biomedicus
metamap
clamp
quick_umls
 ( ctakes & biomedicus ) 
 ( ctakes | biomedicus ) 
 ( ctakes & metamap ) 
 ( ctakes | metamap ) 
 ( ctakes & clamp ) 
 ( ctakes | clamp ) 
 ( ctakes & quick_umls ) 
 ( ctakes | quick_umls ) 
 ( biomedicus & metamap ) 
 ( biomedicus | metamap ) 
 ( biomedicus & clamp ) 
 ( biomedicus | clamp ) 
 ( biomedicus & quick_umls ) 
 ( biomedicus | quick_umls ) 
 ( metamap & clamp ) 
 ( metamap | clamp ) 
 ( metamap & quick_umls ) 
 ( metamap | quick_umls ) 
 ( clamp & quick_umls ) 
 ( clamp | quick_umls ) 
 ( ( ctakes & biomedicus ) & metamap ) 
 ( ( ctakes & biomedicus ) | metamap ) 
 ( ( ctakes | biomedicus ) & metamap ) 
 ( ( ctakes | biomedicus ) | metamap ) 
 ( ( ctakes & biomedicus ) & clamp ) 
 ( ( ctakes & biomedicus ) | clamp ) 
 ( ( ctakes | biomedicus ) & clamp ) 
 ( ( ctakes | biomedicus ) | clamp ) 
 ( ( ctakes & biomedicus ) & quick_umls ) 
 ( ( ctakes & biomedicus ) | quick_umls ) 
 ( ( ctakes | biomedicus ) & quick_umls ) 
 ( ( ctakes | biome

         219221699 function calls (216814210 primitive calls) in 534.383 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
    51120   99.281    0.002   99.556    0.002 {method 'factorize' of 'pandas._libs.hashtable.Int64Factorizer' objects}
    34104   57.985    0.002   58.597    0.002 {method 'factorize' of 'pandas._libs.hashtable.Int64HashTable' objects}
    17040   24.516    0.001   24.605    0.001 {method 'factorize' of 'pandas._libs.hashtable.Factorizer' objects}
    17054   24.213    0.001   25.067    0.001 {method 'factorize' of 'pandas._libs.hashtable.StringHashTable' objects}
    59697   20.219    0.000   20.219    0.000 {method 'copy' of 'numpy.ndarray' objects}
   122486   18.887    0.000   18.887    0.000 {built-in method numpy.concatenate}
    17049   17.327    0.001   17.327    0.001 {pandas._libs.hashtable.duplicated_int64}
89040/6960   14.128    0.000  478.871    0.069 <ipython-input-48-7fc1303490c8>:27(evalu