In [1]:
import gevent
import pandas as pd
import numpy as np
import math
import pymysql
import time 
import functools as ft
import glob, os   
import operator as op
import shelve
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
from pandas.api.types import is_numeric_dtype
from pathlib import Path
from itertools import combinations, product, permutations
from sqlalchemy.engine import create_engine
from datetime import datetime
from ast import literal_eval
from scipy import stats  
from scipy.stats.mstats import gmean
from pythonds.basic.stack import Stack
from pythonds.trees.binaryTree import BinaryTree
from collections import defaultdict
import collections
from typing import List, Set, Tuple 

In [2]:
# STEP-1: CHOOSE YOUR CORPUS
# TODO: get working with list of corpora
#corpora = ['mipacq','i2b2','fairview'] #options for concept extraction include 'fairview', 'mipacq' OR 'i2b2'

corpus = 'fairview'
#corpora = ['i2b2','fairview']

# STEP-2: CHOOSE YOUR DATA DIRECTORY; this is where output data will be saved on your machine
data_directory = '/Users/gms/development/nlp/nlpie/data/ensembling-u01/output/' 

# STEP-3: CHOOSE WHICH SYSTEMS YOU'D LIKE TO EVALUATE AGAINST THE CORPUS REFERENCE SET
systems = ['biomedicus', 'clamp', 'ctakes', 'metamap', 'quick_umls']
#systems = ['biomedicus', 'clamp']
#systems = ['biomedicus', 'clamp']
#systems = ['clamp', 'quick_umls', 'biomedicus']

# STEP-4: CHOOSE TYPE OF RUN
rtype = 2      # OPTIONS INCLUDE: 1->Single systems; 2->Ensemble; 3->Tests; 4 -> generate signiicance test data;  
               # 5 -> signiicance testing
               # The Ensemble can include the max system set ['ctakes','biomedicus','clamp','metamap','quick_umls']
    
# STEP-5: CHOOSE WHAT TYPE OF ANALYSIS YOU'D LIKE TO RUN ON THE CORPUS
analysis_type = 'entity' #options include 'entity' OR 'full'

# STEP-(6A): ENTER DETAILS FOR ACCESSING MANUAL ANNOTATION DATA
database_type = 'mysql+pymysql' # We use mysql+pymql as default
database_username = 'gms'
database_password = 'nej123' 
database_url = 'localhost' # HINT: use localhost if you're running database on your local machine
database_name = 'concepts' # Enter database name

def ref_data(corpus):
    return corpus + '_all' # Enter the table within the database where your reference data is stored

table_name = ref_data(corpus)

# STEP-(6B): ENTER DETAILS FOR ACCESSING SYSTEM ANNOTATION DATA

def sys_data(corpus):
    return 'analytical_'+corpus+'.csv' # OPTIONS include 'analytical_cui_mipacq_concepts.csv' OR 'analytical_cui_i2b2_concepts.csv' 

system_annotation = sys_data(corpus)

# STEP-7: WE'LL CREATE A 'SYSTEM OUTPUT' DIRECTORY FOR YOU INSIDE THE DIRECTORY YOU SPECIFIED IN (STEP 2)
single_sys_dir = Path(data_directory + "single_system_out")
single_sys_dir.mkdir(parents=True, exist_ok=True)
dir_out = Path(data_directory + 'single_system_out/')

# STEP-8: CREATE A DB CONNECTION POOL
engine_request = str(database_type)+'://'+database_username+':'+database_password+"@"+database_url+'/'+database_name
engine = create_engine(engine_request, pool_pre_ping=True, pool_size=20, max_overflow=30)

# STEP-(9A): FILTER BY SEMTYPE
filter_semtype = True

# STEP-(9B): IF STEP-(9A) == True -> GET REFERENCE SEMTYPES

### Fairview -> ['drug', 'finding', 'anatomy', 'procedure']
### i2b2 -> ['test, treatment', 'problem']
### MiPACQ -> ['procedures', 'disorders, sign_symptom', 'anatomy', 'chemicals_and_drugs']

# semtypes = ['Anatomy']
def ref_semtypes(filter_semtype, corpus):
    if filter_semtype:
        if corpus == 'fairview':
            semtypes = ['Drug', 'Finding', 'Anatomy', 'Procedure']
        elif corpus == 'i2b2':
            semtypes = ['test,treatment', 'problem']
        elif corpus == 'mipacq':
            semtypes = ['Procedures', 'Disorders,Sign_Symptom', 'Anatomy', 'Chemicals_and_drugs']
        
        return semtypes

semtypes = ref_semtypes(filter_semtype, corpus)

# STEP-10: Set data directory/table for source documents for vectorization
src_table = 'sofa'

# STEP-11: Specificy match type from {'exact', 'overlap'}
run_type = 'overlap'

In [3]:
#if filter_semtype:
#    print(semtypes)

In [4]:
# config class for analysis
class AnalysisConfig():
    """
    Configuration object:
    systems to use
    notes by corpus
    paths by output, gold and system location
    """
    def __init__(self):
        self = self    
        self.systems = systems
        self.data_dir = data_directory
    
    def corpus_config(self): 
        usys_data = system_annotation
        ref_data = database_name+'.'+table_name
        return usys_data, ref_data

analysisConf =  AnalysisConfig()
#usys, ref = analysisConf.corpus_config()

In [5]:
class SemanticTypes(object):
    '''
    Filter semantic types based on: https://metamap.nlm.nih.gov/SemanticTypesAndGroups.shtml
    :params: semtypes list from corpus, system to query
    :return: list of equivalent system semtypes 
    '''
    
    def __init__(self, semtypes, corpus):
        self = self
        
        sql = "SELECT st.tui, abbreviation, clamp_name, ctakes_name FROM concepts.semantic_groups sg join semantic_types st on sg.tui = st.tui where " + corpus + "_name in ({})"\
           .format(', '.join(['%s' for _ in semtypes]))  
        
        stypes = pd.read_sql(sql, params=[semtypes], con=engine) 
       
        if len(stypes['tui'].tolist()) > 0:
            self.biomedicus_types = set(stypes['tui'].tolist())
            self.qumls_types = set(stypes['tui'].tolist())
        else:
            self.biomedicus_types = None
            self.qumls_types = None
        
        if stypes['clamp_name'].dropna(inplace=True) or len(stypes['clamp_name']) == 0:
            self.clamp_types = None
        else:
            self.clamp_types = set(stypes['clamp_name'].tolist()[0].split(','))
            
        if len(stypes['ctakes_name'].tolist()) > 0:
            self.ctakes_types = set(stypes['ctakes_name'].tolist()[0].split(','))
        else:
            self.ctakes_types = None
            
        if len(stypes['abbreviation'].tolist()) > 0:
            self.metamap_types = set(stypes['abbreviation'].tolist())
        else:
            self.metamap_types = None
            
        self.reference_types =  set(semtypes)
    
    def get_system_type(self, system):  
        
        if system == 'biomedicus':
            semtypes = self.biomedicus_types
        elif system == 'ctakes':
            semtypes = self.ctakes_types
        elif system == 'clamp':
            semtypes = self.clamp_types
        elif system == 'metamap':
            semtypes = self.metamap_types
        elif system == 'quick_umls':
            semtypes = self.qumls_types
        elif system == 'reference':
            semtypes = self.reference_types
            
        return semtypes
    
# print(SemanticTypes(['Drug'], corpus).get_system_type('biomedicus'))
# print(SemanticTypes(['Drug'], corpus).get_system_type('quick_umls'))
# print(SemanticTypes(['Anatomy'], corpus).get_system_type('clamp'))
#print(SemanticTypes(['test,treatment'], 'i2b2').get_system_type('clamp'))

In [6]:
# semtypes = ['test,treatment']
# semtypes = ['problem']
# corpus = 'i2b2'
# sys = 'clamp'

# is semantic type in particular system
def system_semtype_check(sys, semtype, corpus):
    st = SemanticTypes([semtype], corpus).get_system_type(sys)
    if st:
        return sys
    else:
        return None

# print(system_semtype_check(sys, semtypes, corpus))

In [7]:
# annotation class for systems
class AnnotationSystems():
    """   
    System annotations of interest for UMLS concept extraction
    NB: ctakes combines all "mentions" annotation types
    
    """
    def __init__(self):
        
        """ 
        annotation base types
        """   
        
        self.biomedicus_types = ["biomedicus.v2.UmlsConcept"]
        self.clamp_types = ["edu.uth.clamp.nlp.typesystem.ClampNameEntityUIMA"]
        self.ctakes_types = ["ctakes_mentions"]
        self.metamap_types = ["org.metamap.uima.ts.Candidate"]
        self.qumls_types = ["concept_jaccard_score_False"]
       
    def get_system_type(self, system):
        
        """
        return system types
        """
        
        if system == "biomedicus":
            view = "Analysis"
        else:
            view = "_InitialView"

        if system == 'biomedicus':
            types = self.biomedicus_types

        elif system == 'clamp':
            types = self.clamp_types

        elif system == 'ctakes':
            types = self.ctakes_types

        elif system == 'metamap':
            types = self.metamap_types
        
        elif system == "quick_umls":
            types = self.qumls_types
            
        return types, view
    
annSys = AnnotationSystems()

In [8]:
%reload_ext Cython

In [9]:
%%cython

import numpy as np # access to Numpy from Python layer
import math

class Metrics(object):
    """
    metrics class:
    returns an instance with confusion matrix metrics
    """
    def __init__(self, system_only, gold_only, gold_system_match, system_n, neither = 0): # neither: no sys or manual annotation

        self = self    
        self.system_only = system_only
        self.gold_only = gold_only
        self.gold_system_match = gold_system_match
        self.system_n = system_n
        self.neither = neither
        
    def get_confusion_metrics(self, corpus = None, test = False):
        
        """
        compute confusion matrix measures, as per  
        https://stats.stackexchange.com/questions/51296/how-do-you-calculate-precision-and-recall-for-multiclass-classification-using-co
        """
        cdef:
            int TP, FP, FN
            double TM

        TP = self.gold_system_match
        FP = self.system_only
        FN = self.gold_only
        
        TM = TP/math.sqrt(self.system_n) # TigMetric
       
        if not test:
            
            if corpus == 'casi':
                recall = TP/(TP + FN)
                precision = TP/(TP + FP)
                F = 2*(precision*recall)/(precision + recall)
            else:
                if self.neither == 0:
                    confusion = [[0, self.system_only],[self.gold_only,self.gold_system_match]]
                else:
                    confusion = [[self.neither, self.system_only],[self.gold_only,self.gold_system_match]]
                c = np.asarray(confusion)
                recall = np.diag(c) / np.sum(c, axis = 1)
                precision = np.diag(c) / np.sum(c, axis = 0)
                F = 2*(precision*recall)/(precision + recall)
        else:
            precision = TP/(TP+FP)
            recall = TP/(TP+FN)
            F = 2*(precision*recall)/(precision + recall)
        
        # Tignanelli Metric
        if FN == 0:
            TP_FN_R = TP
        elif FN > 0:
            TP_FN_R = TP/FN
 
        return F, recall, precision, TP, FP, FN, TP_FN_R, TM

In [10]:
def write_out(name: str, analysis_type: str, c: object):
   
    """
    write matching and reference-only sets to file for use in merging combinations:
    exact match only
    """
    
    # write output to file
    dir_out = analysisConf.data_dir + 'single_system_out/'
    with open(dir_out + name + '_' + analysis_type + '_' + c.corpus + '_matches.txt', 'w') as f:
        for item in list(c.matches):
            f.write("%s\n" % str(item))

    # write to file
    with open(dir_out + name + '_' + analysis_type + '_' + c.corpus + '_ref_only.txt', 'w') as f:
        for item in list(c.false_negatives):
            f.write("%s\n" % str(item))

In [11]:
def df_to_set(df, analysis_type = 'entity', df_type = 'sys', corpus = None):
    
    # get values for creation of series of type tuple
    if 'entity' in analysis_type: 
        if corpus == 'casi':
            arg = df.case, df.overlap
        else:    
            if df_type == 'sys':
                arg = df.begin, df.end, df.note_id
            else:
                arg = df.start, df.end, df.file
            
    elif 'cui' in analysis_type:
        if df_type == 'sys':
            arg = df.cui, df.note_id
        else:
            arg = df.value, df.file
    elif 'full' in analysis_type:
        if df_type == 'sys':
            arg = df.begin, df.end, df.cui, df.note_id
        else:
            arg = df.start, df.end, df.value, df.file
    
    return set(list(zip(*arg)))

In [12]:
%%cython 

from __main__ import write_out, df_to_set, engine
import numpy as np 
import pandas as pd

def get_cooccurences(ref, sys, analysis_type: str, corpus: str, single_sys = True, name = None):
    """
    get coocurences between system and reference; exact match; TODO: add relaxed -> done in single system evals during ensemble run
    """
    # cooccurences
    class Coocurences(object):
        
        def __init__(self):
            self.ref_system_match = 0
            self.ref_only = 0
            self.system_only = 0
            self.system_n = 0
            self.ref_n = 0
            self.matches = set()
            self.false_negatives = set()
            self.corpus = corpus

    c = Coocurences()
    
    if c.corpus != 'casi':
        if 'entity' in analysis_type and single_sys: # mipacq n -> 16793
            cols_to_keep = ['begin', 'end', 'note_id']
            sys = sys[cols_to_keep].drop_duplicates()
            ref = ref[['start', 'end', 'file']].drop_duplicates()
            sys.name = name
        elif 'cui' in analysis_type and single_sys: # mipacq n -> 10799
            cols_to_keep = ['cui', 'note_id']
            sys = sys[cols_to_keep].drop_duplicates()
            # do not overestimate FP
            sys = sys[~sys['cui'].isnull()] 
            ref = ref[['value', 'file']].drop_duplicates()
            ref = ref[~ref['value'].isnull()]
            sys.name = name
        elif 'full' in analysis_type and single_sys: # mipacq n -> 17393
            cols_to_keep = ['begin', 'end', 'cui', 'note_id']
            sys = sys[cols_to_keep].drop_duplicates()
            sys = sys[~sys['cui'].isnull()]
            ref = ref[['start', 'end', 'value', 'file']].drop_duplicates()
            ref = ref[~ref['value'].isnull()]
            sys.name = name

        # matches via inner join
        matches = pd.merge(sys, ref, how = 'inner', left_on=['begin','end','note_id'], right_on = ['start','end','file']) 
        # reference-only via left outer join
        fn = pd.merge(ref, sys, how = 'left', left_on=['start','end','file'], right_on = ['begin','end','note_id']) 

        fn = fn[fn['begin'].isnull()] # get as outer join with no match

        if 'entity' in analysis_type:
            cols_to_keep = ['start', 'end', 'file']
        else:
            cols_to_keep = ['start', 'end', 'value', 'file']

        matches = matches[cols_to_keep]
        fn = fn[cols_to_keep]

        # use for metrics 
        c.matches = c.matches.union(df_to_set(matches, analysis_type, 'ref'))
        c.false_negatives = c.false_negatives.union(df_to_set(fn, analysis_type, 'ref'))
        c.ref_system_match = len(c.matches)
        c.system_only = len(sys) - len(c.matches)
        c.system_n = len(sys)
        c.ref_n = len(ref)
        c.ref_only = len(c.false_negatives)
        
    else:
        sql = "select `case` from test.amia_2019_analytical_v where overlap = 1 and `system` = %(sys.name)s"  
        
        matches = pd.read_sql(sql, params={"sys.name":sys.name}, con=engine)
        
        sql = "select `case` from test.amia_2019_analytical_v where (overlap = 0 or overlap is null) and `system` = %(sys.name)s"  
        
        fn = pd.read_sql(sql, params={"sys.name":sys.name}, con=engine)
        
        c.matches = df_to_set(matches, 'entity', 'sys', 'casi')
        c.fn = df_to_set(fn, 'entity', 'sys', 'casi')
        c.ref_system_match = len(c.matches)
        c.system_only = len(sys) - len(c.matches)
        c.system_n = len(matches) + len(fn)
        c.ref_n = len(matches) + len(fn)
        c.ref_only = len(fn)
        
    # sanity check
    if len(ref) - c.ref_system_match < 0:
        print('Error: ref_system_match > len(ref)!')
    if len(ref) != c.ref_system_match + c.ref_only:
        print('Error: ref count mismatch!')
   
    # save TP/FN
    if single_sys and corpus != 'casi':
        print(analysis_type)
        write_out(sys.name, analysis_type, c)
    return c 

In [13]:
def label_vector(doc: str, ann: List[int], labels: List[str]) -> np.array:

    v = np.zeros(doc)
    labels = list(labels)
    
    for (i, lab) in enumerate(labels):
        i += 1  # 0 is reserved for no label
        idxs = [np.arange(a.begin, a.end) for a in ann if a.label == lab]
        idxs = [j for mask in idxs for j in mask]
        v[idxs] = i

    return v

# test confusion matrix elements for vectorized annotation set; includes TN
# https://kawahara.ca/how-to-compute-truefalse-positives-and-truefalse-negatives-in-python-for-binary-classification-problems/
def confused(sys1, ann1):
    TP = np.sum(np.logical_and(ann1 == 1, sys1 == 1))

    # True Negative (TN): we predict a label of 0 (negative), and the true label is 0.
    TN = np.sum(np.logical_and(ann1 == 0, sys1 == 0))

    # False Positive (FP): we predict a label of 1 (positive), but the true label is 0.
    FP = np.sum(np.logical_and(ann1 == 0, sys1 == 1))

    # False Negative (FN): we predict a label of 0 (negative), but the true label is 1.
    FN = np.sum(np.logical_and(ann1 == 1, sys1 == 0))
    
    return TP, TN, FP, FN

@ft.lru_cache(maxsize=None)
def vectorized_coocurences(r: object, analysis_type: str, corpus: str, filter_semtype, semtype = None) -> np.int64:
    docs = get_docs(corpus)
    if filter_semtype:
        ann = get_ref_ann(analysis_type, corpus, filter_semtype, semtype)
    else: 
        ann = get_ref_ann(analysis_type, corpus, filter_semtype)
        
    sys = get_sys_ann(r)
    cvals = []
    labels = ["concept"]

    for n in range(len(docs)):
        a1 = list(ann.loc[ann["case"] == docs[n][0]].itertuples(index=False))
        s1 = list(sys.loc[sys["case"] == docs[n][0]].itertuples(index=False))
        ann1 = label_vector(docs[n][1], a1, labels)
        sys1 = label_vector(docs[n][1], s1, labels)

        TP, TN, FP, FN = confused(sys1, ann1)
        cvals.append([TP, TN, FP, FN])

    return np.sum(cvals, axis=0)

In [14]:
def cm_dict(ref_only: int, system_only: int, ref_system_match: int, system_n: int, ref_n: int) -> dict:
    """
    Generate dictionary of confusion matrix params and measures
    :params: ref_only, system_only, reference_system_match -> sets
    matches, system_n, reference_n -> counts
    :return: dictionary object
    """

    if ref_only + ref_system_match != ref_n:
        print('ERROR!')

    # get evaluation metrics
    F, recall, precision, TP, FP, FN, TP_FN_R, TM  = Metrics(system_only, ref_only, ref_system_match, system_n).get_confusion_metrics()

    d = {
         'F': F[1], 
         'precision': precision[1], 
         'recall': recall[1], 
         'TP': TP, 
         'FN': FN, 
         'FP': FP, 
         'TP/FN': TP_FN_R,
         'n_gold': ref_n, 
         'n_sys': system_n, 
         'TM': TM
    }
    
    if system_n - FP != TP:
        print('inconsistent system n!')

    return d

In [15]:
@ft.lru_cache(maxsize=None)
def get_metric_data(analysis_type: str, corpus: str):
   
    usys_file, ref_table = AnalysisConfig().corpus_config()
    systems = AnalysisConfig().systems
    
    sys_ann = pd.read_csv(analysisConf.data_dir + usys_file, dtype={'note_id': str})
    
    sql = "SELECT * FROM " + ref_table  
    
    ref_ann = pd.read_sql(sql, con=engine)
    sys_ann = sys_ann.drop_duplicates()
    
    return ref_ann, sys_ann

In [16]:
%%cython

import pandas as pd
from scipy import stats
from scipy.stats.mstats import gmean

def geometric_mean(metrics):
    """
    1. Get rank average of F1, TP/FN, TM
        http://www.datasciencemadesimple.com/rank-dataframe-python-pandas-min-max-dense-rank-group/
        https://stackoverflow.com/questions/46686315/in-pandas-how-to-create-a-new-column-with-a-rank-according-to-the-mean-values-o?rq=1
    2. Take geomean of 2.
        https://stackoverflow.com/questions/42436577/geometric-mean-applied-on-row
    """
    
    data = pd.DataFrame() 

    metrics['F1 rank']=metrics['F'].rank(ascending=0,method='average')
    metrics['TP/FN rank']=metrics['TP/FN'].rank(ascending=0,method='average')
    metrics['TM rank']=metrics['TM'].rank(ascending=0,method='average')
    metrics['Gmean'] = gmean(metrics.iloc[:,-3:],axis=1)

    return metrics  

In [17]:
def generate_metrics(analysis_type: str, corpus: str, filter_semtype, single_sys = None):
    start = time.time()

    systems = AnalysisConfig().systems
    metrics = pd.DataFrame()

    ref_ann, sys_ann = get_metric_data(analysis_type, corpus)
    
    for sys in systems:
        
        if filter_semtype:
            st = SemanticTypes(semtypes, corpus).get_system_type(sys)
            ref_ann = ref_ann[ref_ann['semtype'].isin(SemanticTypes(semtypes, corpus).get_system_type('reference'))]
            
        types, _ = AnnotationSystems().get_system_type(sys) # system types for iterable
        for t in types:
            print(t)

            if filter_semtype:
                system_annotations = sys_ann[sys_ann['semtypes'].isin(st)].copy()
            else:
                system_annotations = sys_ann.copy()

            system = system_annotations[system_annotations['type'] == str(t)]

            if sys == 'quick_umls':
                system = system[system.score.astype(float) >= .8]

            if sys == 'metamap':
                system = system[system.score.abs().astype(int) >= 800]

            system = system.drop_duplicates()
            system.name = sys

            c = get_cooccurences(ref_ann, system, analysis_type, corpus, True, system.name) # get matches, FN, etc.

        if c.ref_system_match > 0: # compute confusion matrix metrics and write to dictionary -> df
            if corpus == 'casi':
                if sys == 'biomedicus':
                    t = 'biomedicus.v2.Acronym'
            
            # get dictionary of confusion matrix metrics
            d = cm_dict(c.ref_only, c.system_only, c.ref_system_match, c.system_n, c.ref_n)
            d['system'] = sys
            d['type'] = t
                
            data = pd.DataFrame(d,  index=[0])
            metrics = pd.concat([metrics, data], ignore_index=True)
            metrics.drop_duplicates(keep='last', inplace=True)
        else:
            print("NO EXACT MATCHES FOR", t)
        elapsed = (time.time() - start)
        print("elapsed:", sys, elapsed)
     
    elapsed = (time.time() - start)
    print(geometric_mean(metrics))
    
    now = datetime.now()
    timestamp = datetime.timestamp(now)
    
    if single_sys is None:
        file_name = 'metrics_'
    
    metrics.to_csv(analysisConf.data_dir + corpus + '_' + file_name + analysis_type + '_' + str(timestamp) + '.csv')
    
    print("total elapsed time:", elapsed) 

In [18]:
@ft.lru_cache(maxsize=None)
def get_ref_n(analysis_type: str, corpus: str, filter_semtype) -> int:
    
    ref_ann, _ = get_metric_data(analysis_type, corpus)
    
    if filter_semtype:
        ref_ann = ref_ann[ref_ann['semtype'].isin(SemanticTypes(semtypes, corpus).get_system_type('reference'))]
            
    if corpus == 'casi':
        return len(ref_ann)
        
    else:
        # do not overestimate fn
        if 'entity' in analysis_type:
            ref_ann = ref_ann[['start', 'end', 'file']].drop_duplicates()
        elif 'cui' in analysis_type:
            ref_ann = ref_ann[['value', 'file']].drop_duplicates()
        elif 'full' in analysis_type:
            ref_ann = ref_ann[['start', 'end', 'value', 'file']].drop_duplicates()
        else:
            pass

        ref_n = len(ref_ann.drop_duplicates())

        return ref_n
    
@ft.lru_cache(maxsize=None)
def get_sys_data(system: str, analysis_type: str, corpus: str, filter_semtype, semtype = None) -> pd.DataFrame:
   
    _, data = get_metric_data(analysis_type, corpus)
    
    out = data[data['system'] == system].copy()
    
    if filter_semtype:
        st = SemanticTypes([semtype], corpus).get_system_type(system)
    
    if corpus == 'casi':
        cols_to_keep = ['case', 'overlap'] 
        out = out[cols_to_keep].drop_duplicates()
        return out
        
    else:
        if filter_semtype:
            out = out[out['semtypes'].isin(st)].copy()
            
        else:
            out = out[out['system']== system].copy()
        
        if system == 'quick_umls':
            out = out[(out.score.astype(float) >= 0.8) & (out["type"] == 'concept_jaccard_score_False')]
            # fix for leading spacxe on semantic type field
            out = out.apply(lambda x: x.str.strip() if x.dtype == "object" else x) 
        
        if system == 'metamap':
            out = out[out.score.abs().astype(int) >= 800]
            
        if 'entity' in analysis_type:
            cols_to_keep = ['begin', 'end', 'note_id']
        elif 'cui' in analysis_type:
            cols_to_keep = ['cui', 'note_id']
        elif 'full' in analysis_type:
            cols_to_keep = ['begin', 'end', 'cui', 'note_id']

        #out = out[cols_to_keep]

        return out.drop_duplicates()

# read in system/reference matches from file
@ft.lru_cache(maxsize=None)
def get_system_matches(system: str, analysis_type: str, corpus: str):
   
    if corpus == 'casi':
        
        sql = "select `case`, overlap from test.amia_2019_cases where overlap = 1 and `system` = %(system)s"  
        data_matches = df_to_set(pd.read_sql(sql, params={"system":system}, con=engine), 'entity', 'sys', 'casi')
        
        sql = "select `case`, overlap from test.amia_2019_cases where (overlap = 0 or overlap is null) and `system` = %(system)s"  
        data_fn = df_to_set(pd.read_sql(sql, params={"system":system}, con=engine), 'entity', 'sys', 'casi')
        
    else:
        
        dir_test = analysisConf.data_dir + 'single_system_out/'

        file = dir_test + system + '_' + analysis_type + '_' + corpus + '_matches.txt'
        data_matches = set(literal_eval(line.strip()) for line in open(file))

        file = dir_test + system + '_' + analysis_type + '_' + corpus + '_ref_only.txt'
        data_fn = set(literal_eval(line.strip()) for line in open(file)) 

    return data_matches, data_fn

In [19]:
class SetTotals(object):
    """ 
    returns an instance with merged match set numbers using either union or intersection of elements in set 
    """
    def __init__(self, ref_n, sys_n, match_set):

        self = self    
        self.ref_ann = ref_n
        self.sys_n = sys_n
        self.match_set = match_set

    def get_ref_sys(self):

        ref_only = self.ref_ann - len(self.match_set)
        sys_only = self.sys_n - len(self.match_set)

        return ref_only, sys_only, len(self.match_set), self.match_set

In [20]:
@ft.lru_cache(maxsize=None)
def process_sentence(pt, sentence, analysis_type, corpus, filter_semtype, semtype = None):
    """
    Recursively evaluate parse tree, 
    with check for existence before build
       :param sentence: to process
       :return class of merged annotations, boolean operated system df 
    """
    
    class Results(object):
        def __init__(self):
            self.results = set()
            self.system_merges = pd.DataFrame()
            
    r = Results()
    
    if 'entity' in analysis_type and corpus != 'casi': 
        cols_to_keep = ['begin', 'end', 'note_id'] # entity only
    elif 'full' in analysis_type: 
        cols_to_keep = ['cui', 'begin', 'end', 'note_id'] # entity only
    elif 'cui' in analysis_type:
        cols_to_keep = ['cui', 'note_id'] # entity only
    elif corpus == 'casi':
        cols_to_keep = ['case', 'overlap']
    
    def evaluate(parseTree):
        oper = {'&': op.and_, '|': op.or_}
        
        if parseTree:
            leftC = gevent.spawn(evaluate, parseTree.getLeftChild())
            rightC = gevent.spawn(evaluate, parseTree.getRightChild())
            
            if leftC.get() and rightC.get():
                query = set()
                system_query = pd.DataFrame()
                fn = oper[parseTree.getRootVal()]
                
                if isinstance(leftC.get(), str):
                    # get system as leaf node 
                    left, _ = get_system_matches(leftC.get(), analysis_type, corpus)
                    if filter_semtype:
                        left_sys = get_sys_data(leftC.get(), analysis_type, corpus, filter_semtype, semtype)
                    else:
                        left_sys = get_sys_data(leftC.get(), analysis_type, corpus, filter_semtype)
                
                elif isinstance(leftC.get(), tuple):
                    left = leftC.get()[0]
                    l_sys = leftC.get()[1]
                
                if isinstance(rightC.get(), str):
                    # get system as leaf node
                    right, _ = get_system_matches(rightC.get(), analysis_type, corpus)
                    if filter_semtype:
                        right_sys = get_sys_data(rightC.get(), analysis_type, corpus, filter_semtype, semtype)
                    else:
                        right_sys = get_sys_data(rightC.get(), analysis_type, corpus, filter_semtype)
                    
                elif isinstance(rightC.get(), tuple):
                    right = rightC.get()[0]
                    r_sys = rightC.get()[1]
                    
                # create match set based on boolean operation
                match_set = fn(left, right)
               
                if fn == op.or_:
                    r.results = r.results.union(match_set)

                    if isinstance(leftC.get(), str) and isinstance(rightC.get(), str):
                        frames = [left_sys, right_sys]
                        df = pd.concat(frames,  ignore_index=True)
                        df = df[cols_to_keep].drop_duplicates(subset=cols_to_keep)

                    elif isinstance(leftC.get(), str) and isinstance(rightC.get(), tuple):
                        frames = [left_sys, r_sys]
                        df = pd.concat(frames,  ignore_index=True)
                        df = df[cols_to_keep].drop_duplicates(subset=cols_to_keep)

                    elif isinstance(leftC.get(), tuple) and isinstance(rightC.get(), str):
                        frames = [l_sys, right_sys]
                        df = pd.concat(frames,  ignore_index=True)
                        df = df[cols_to_keep].drop_duplicates(subset=cols_to_keep)

                    elif isinstance(leftC.get(), tuple) and isinstance(rightC.get(), tuple):
                        frames = [l_sys, r_sys]
                        df = pd.concat(frames,  ignore_index=True)
                        df = df[cols_to_keep].drop_duplicates(subset=cols_to_keep)

                if fn == op.and_:
                    if len(r.results) == 0:
                        r.results = match_set
                    r.results = r.results.intersection(match_set)

                    if isinstance(leftC.get(), str) and isinstance(rightC.get(), str):
                        df = left_sys.merge(right_sys, on=cols_to_keep, how='inner')
                        df = df[cols_to_keep].drop_duplicates(subset=cols_to_keep)

                    elif isinstance(leftC.get(), str) and isinstance(rightC.get(), tuple):
                        df = left_sys.merge(r_sys, on=cols_to_keep, how='inner')
                        df = df[cols_to_keep].drop_duplicates(subset=cols_to_keep)

                    elif isinstance(leftC.get(), tuple) and isinstance(rightC.get(), str):
                        df = l_sys.merge(right_sys, on=cols_to_keep, how='inner')
                        df = df[cols_to_keep].drop_duplicates(subset=cols_to_keep)

                    elif isinstance(leftC.get(), tuple) and isinstance(rightC.get(), tuple):
                        df = l_sys.merge(r_sys, on=cols_to_keep, how='inner')
                        df = df[cols_to_keep].drop_duplicates(subset=cols_to_keep)
                
                # get matched results
                query.update(r.results)
                
                # get combined system results
                r.system_merges = df
                
                if len(df) > 0:
                    system_query = system_query.append(df)
                else:
                    print('wtf!')
                    
                return query, system_query
            else:
                return parseTree.getRootVal()
    
    if sentence.n_or > 0 or sentence.n_and > 0:
        evaluate(pt)  
    
    # trivial case
    elif sentence.n_or == 0 and sentence.n_and == 0:
        r.results, _ = get_system_matches(sentence.sentence, analysis_type, corpus)
        if filter_semtype:
            r.system_merges = get_sys_data(sentence.sentence, analysis_type, corpus, filter_semtype, semtype)
        else:
            r.system_merges = get_sys_data(sentence.sentence, analysis_type, corpus, filter_semtype)
            
        #print('trivial:', sentence.sentence, len(r.results), len(r.system_merges))
    
    return r

In [21]:
"""
Incoming Boolean sentences are parsed into a binary tree.

Test expressions to parse:

sentence = '((((A&B)|C)|D)&E)'

sentence = '(E&(D|(C|(A&B))))'

sentence = '(((A|(B&C))|(D&(E&F)))|(H&I))'

"""
# build parse tree from passed sentence using grammatical rules of Boolean logic
def buildParseTree(fpexp):
    """
       Iteratively build parse tree from passed sentence using grammatical rules of Boolean logic
       :param fpexp: sentence to parse
       :return eTree: parse tree representation
       Incoming Boolean sentences are parsed into a binary tree.
       Test expressions to parse:
       sentence = '(A&B)'
       sentence = '(A|B)'
       sentence = '((A|B)&C)'
       
    """
    fplist = fpexp.split()
    pStack = Stack()
    eTree = BinaryTree('')
    pStack.push(eTree)
    currentTree = eTree

    for i in fplist:

        if i == '(':
            currentTree.insertLeft('')
            pStack.push(currentTree)
            currentTree = currentTree.getLeftChild()
        elif i not in ['&', '|', ')']:
            currentTree.setRootVal(i)
            parent = pStack.pop()
            currentTree = parent
        elif i in ['&', '|']:
            currentTree.setRootVal(i)
            currentTree.insertRight('')
            pStack.push(currentTree)
            currentTree = currentTree.getRightChild()
        elif i == ')':
            currentTree = pStack.pop()
        else:
            raise ValueError

    return eTree

def make_parse_tree(payload):
    """
    Ensure data to create tree are in correct form
    :param sentence: sentence to preprocess
    :return pt, parse tree graph
            sentence, processed sentence to build tree
            a: order
    """
    def preprocess_sentence(sentence):
        # prepare statement for case when a boolean AND/OR is given
        sentence = payload.replace('(', ' ( '). \
            replace(')', ' ) '). \
            replace('&', ' & '). \
            replace('|', ' | '). \
            replace('  ', ' ')
        return sentence

    sentence = preprocess_sentence(payload)
    print(sentence)
    
    pt = buildParseTree(sentence)
    #pt.postorder() 
    
    return pt

class Sentence(object):
    '''
    Details about boolean expression -> number operators and expression
    '''
    def __init__(self, sentence):
        self = self
        self.n_and = sentence.count('&')
        self.n_or = sentence.count('|')
        self.sentence = sentence

@ft.lru_cache(maxsize=None)
def get_docs(corpus):
    sql = 'select distinct note_id, sofa from sofas where corpus = %(corpus)s order by note_id'
    df = pd.read_sql(sql, params={"corpus":corpus}, con=engine)
    df.drop_duplicates()
    df['len_doc'] = df['sofa'].apply(len)
    
    subset = df[['note_id', 'len_doc']]
    docs = [tuple(x) for x in subset.to_numpy()]
    
    return docs

@ft.lru_cache(maxsize=None)
def get_ref_ann(analysis_type, corpus, filter_semtype, semtype = None):
    
    if filter_semtype:
        if ',' in semtype:
            semtype = semtype.split(',')
        else:
            semtype = [semtype]
        
    ann, _ = get_metric_data(analysis_type, corpus)
    ann = ann.rename(index=str, columns={"start": "begin", "file": "case"})
    
    if filter_semtype:
        ann = ann[ann['semtype'].isin(semtype)]
        
    ann["label"] = 'concept'
    cols_to_keep = ['begin', 'end', 'case', 'label']
    ann = ann[cols_to_keep]
    
    return ann

@ft.lru_cache(maxsize=None)
def get_sys_ann(r):
    sys = r.system_merges    
    sys = sys.rename(index=str, columns={"note_id": "case"})
    sys["label"] = 'concept'
    cols_to_keep = ['begin', 'end', 'case', 'label']
    sys = sys[cols_to_keep]

    return sys

@ft.lru_cache(maxsize=None)
def get_metrics(boolean_expression: str, analysis_type: str, corpus: str, run_type: str, filter_semtype, semtype = None):
    """
    Traverse binary parse tree representation of Boolean sentence
        :params: boolean expression in form of '(<annotator_engine_name1><boolean operator><annotator_engine_name2>)'
                 analysis_type (string value of: 'entity', 'cui', 'full') used to filter set of reference and system annotations 
        :return: dictionary with values needed for confusion matrix
    """
    
    sentence = Sentence(boolean_expression)   
    pt = make_parse_tree(sentence.sentence)
    
    if filter_semtype:
        r = process_sentence(pt, sentence, analysis_type, corpus, filter_semtype, semtype)
    else:
        r = process_sentence(pt, sentence, analysis_type, corpus, filter_semtype)
        
    # vectorize merges using i-o labeling
    if run_type == 'overlap':
        if filter_semtype:
            TP, TN, FP, FN = vectorized_coocurences(r, analysis_type, corpus, filter_semtype, semtype)
        else:
            TP, TN, FP, FN = vectorized_coocurences(r, analysis_type, corpus, filter_semtype)

        # TODO: validate against ann1/sys1 where val = 1
        # total by number chars
        system_n = TP + FP
        reference_n = TP + FN

        d = cm_dict(FN, FP, TP, system_n, reference_n)
        
        d['TN'] = TN
        
        # return full metrics
        return d

    elif run_type == 'exact':
        # total by number spans
        system_n = len(r.system_merges)
        reference_n = get_ref_n(analysis_type, corpus, filter_semtype)

        reference_only, system_only, reference_system_match, match_set = SetTotals(reference_n, system_n, r.results).get_ref_sys()
        # get overall TP/TF and various other counts for running confusion matrix metric analysis
        return cm_dict(reference_only, system_only, reference_system_match, system_n, reference_n)

In [22]:
# generate all combinations of given list of annotators:
def partly_unordered_permutations(lst, k):
    elems = set(lst)
    for c in combinations(lst, k):
        for d in permutations(elems - set(c)):
            yield c + d
            
def expressions(l, n):
    for (operations, *operands), operators in product(
            combinations(l, n), product(('&', '|'), repeat=n - 1)):
        for operation in zip(operators, operands):
            operations = [operations, *operation]
        yield operations

# get list of systems with a semantic type in grouping
def get_valid_systems(systems, semtype):
    test = []
    for sys in systems:
        st = system_semtype_check(sys, semtype, corpus)
        if st:
            test.append(sys)

    return test

# permute system combinations and evaluate system merges for performance
def run_ensemble(systems, analysis_type, corpus, filter_semtype, semtype = None):
    metrics = pd.DataFrame()
    
    expression_type = 'nested_with_singleton'

    if expression_type == 'nested':
        for l in partly_unordered_permutations(systems, 2):
            print('processing merge combo:', l)
            for i in range(1, len(l)+1):
                test = list(expressions(l, i))
                for t in test:
                    if i > 1:
                        # format Boolean sentence for parse tree 
                        t = '(' + " ".join(str(x) for x in t).replace('[','(').replace(']',')').replace("'","").replace(",","").replace(" ","") + ')'

                    if filter_semtype:
                        d = get_metrics(t, analysis_type, corpus, run_type, filter_semtype, semtype)
                    else:
                        d = get_metrics(t, analysis_type, corpus, run_type, filter_semtype)

                    d['merge'] = t
                    d['n_terms'] = i

                    frames = [metrics, pd.DataFrame(d, index=[0])]
                    metrics = pd.concat(frames, ignore_index=True, sort=False) 
                    
    elif expression_type == 'nested_with_singleton':
        
        nested = list(expressions(systems, 3))
        test = list(expressions(systems, 2))
        to_do_terms = []
    
        for n in nested:
            # format Boolean sentence for parse tree 
            n = '(' + " ".join(str(x) for x in n).replace('[','(').replace(']',')').replace("'","").replace(",","").replace(" ","") + ')'

            for t in test:
                t = '(' + " ".join(str(x) for x in t).replace('[','(').replace(']',')').replace("'","").replace(",","").replace(" ","") + ')'

                new_and = '(' + n +'&'+ t + ')'
                new_or = '(' + n +'|'+ t + ')'

                if new_and.count('biomedicus') != 2 and new_and.count('clamp') != 2 and new_and.count('ctakes') != 2 and new_and.count('metamap') != 2 and new_and.count('quick_umls') != 2:

                    if new_and.count('&') != 4 and new_and.count('|') != 4:
                        #print(new_and)
                        #print(new_or)
                        to_do_terms.append(new_or)
                        to_do_terms.append(new_and)
        
        print('nested_with_singleton', len(to_do_terms))
        for t in to_do_terms:
             if filter_semtype:
                    d = get_metrics(t, analysis_type, corpus, run_type, filter_semtype, semtype)
                else:
                    d = get_metrics(t, analysis_type, corpus, run_type, filter_semtype)

                d['merge'] = t
                d['n_terms'] = i

                frames = [metrics, pd.DataFrame(d, index=[0])]
                metrics = pd.concat(frames, ignore_index=True, sort=False) 
                        
    elif expression_type == 'paired':
        m = list(expressions(systems, 2))

        test = list(expressions(m, 2))

        to_do_terms = []
        for t in test:
            # format Boolean sentence for parse tree 
            t = '(' + " ".join(str(x) for x in t).replace('[','(').replace(']',')').replace("'","").replace(",","").replace(" ","") + ')'
            if t.count('biomedicus') != 2 and t.count('clamp') != 2 and t.count('ctakes') != 2 and t.count('metamap') != 2 and t.count('quick_umls') != 2:
                if t.count('&') != 3 and t.count('|') != 3:
                    print(t)
                    to_do_terms.append(t)
                    for i in systems:
                        if i not in t:
                            print('('+t+'&'+i+')')
                            print('('+t+'|'+i+')')
                            new_and = '('+t+'&'+i+')'
                            new_or = '('+t+'|'+i+')'
                            to_do_terms.append(new_and)
                            to_do_terms.append(new_or)
    #         for t in to_do_terms:
    #             print(t)
        print(len(to_do_terms))
        
    return metrics

# write to file
def generate_ensemble_metrics(metrics, analysis_type, corpus, filter_semtype, semtype = None):
    now = datetime.now()
    timestamp = datetime.timestamp(now)
    
    file_name = corpus + '_all_merge_metrics_'
   
    # drop exact matches:
    metrics = metrics.drop_duplicates()
    metrics = metrics.sort_values(by=['n_terms', 'merge'])
    #metrics = metrics.drop_duplicates(subset=['TP', 'FN', 'FP', 'n_sys', 'precision', 'recall', 'F', 'TM', 'TP/FN', 'TM', 'n_terms'])

    file = file_name + analysis_type + '_' + run_type +'_'
    
    if filter_semtype:
        file += semtype
    
    geometric_mean(metrics).to_csv(analysisConf.data_dir + file + str(timestamp) + '.csv')
    print(geometric_mean(metrics))
    
# control ensemble run
def ensemble_control(systems, analysis_type, corpus, run_type, filter_semtype, semtypes = None):
    
    print(semtypes, systems)
    if filter_semtype:
        for semtype in semtypes:
            test = get_valid_systems(systems, semtype)
            print('SYSYEMS FOR SEMTYPE', semtype, 'ARE', test)
            metrics = run_ensemble(test, analysis_type, corpus, filter_semtype, semtype)
            generate_ensemble_metrics(metrics, analysis_type, corpus, filter_semtype, semtype)
            
            
    else:
        metrics = run_ensemble(systems, analysis_type, corpus, filter_semtype)
        generate_ensemble_metrics(metrics, analysis_type, corpus, filter_semtype)

In [23]:
# ad hoc query
def get_merge_data(boolean_expression: str, analysis_type: str, corpus: str, filter_semtype, semtype = None):
    """
    Traverse binary parse tree representation of Boolean sentence
        :params: boolean expression in form of '(<annotator_engine_name1><boolean operator><annotator_engine_name2>)'
                 analysis_type (string value of: 'entity', 'cui', 'full') used to filter set of reference and system annotations 
        :return: dictionary with values needed for confusion matrix
    """
    sentence = Sentence(boolean_expression)   

    pt = make_parse_tree(sentence.sentence)

    r = process_sentence(pt, sentence, analysis_type, corpus, filter_semtype, semtype)
    
    system_n = len(r.system_merges)
    reference_n = get_ref_n(analysis_type, corpus, filter_semtype)

    reference_only, system_only, reference_system_match, match_set = SetTotals(reference_n, system_n, r.results).get_ref_sys()

    print(cm_dict(reference_only, system_only, reference_system_match, system_n, reference_n))
    # get matched data from merge
    return r.system_merges # merge_eval(reference_only, system_only, reference_system_match, system_n, reference_n)

In [24]:
#%%time
def main():
    '''
        corpora: i2b2, mipacq, fv017
        analyses: entity only (exact span), cui by document, full (aka (entity and cui on exaact span/exact cui)
        systems: ctakes, biomedicus, clamp, metamap, quick_umls
        
        TODO -> Vectorization (entity only) -> done:
                add switch for use of TN on single system performance evaluations -> done
                add switch for overlap matching versus exact span -> done
             -> Other tasks besides concept extraction
        
    ''' 
    analysisConf =  AnalysisConfig()
    print(analysisConf.systems, analysisConf.corpus_config())
    
    if (rtype == 1):
        generate_metrics(analysis_type, corpus, filter_semtype)
    elif (rtype == 2):
        print('run_type:', run_type)
        # TODO -> list of corpora!
#         for corpus in corpora: 
#             table_name = ref_data(corpus)
#             system_annotation = sys_data(corpus)
#             semtypes = ref_semtypes(filter_semtype, corpus)
#             analysisConf =  AnalysisConfig()
#             usys, ref = analysisConf.corpus_config(system_annotation, table_name)
#             print('Using:', usys, ref)
        if filter_semtype:
            ensemble_control(analysisConf.systems, analysis_type, corpus, run_type, filter_semtype, semtypes)
        else:
            ensemble_control(analysisConf.systems, analysis_type, corpus, run_type, None)
    elif (rtype == 3):
        t = ['concept_jaccard_score_false']
        test_systems(analysis_type, analysisConf.systems, corpus)  
        test_count(analysis_type, corpus)
        test_ensemble(analysis_type, corpus)
        
if __name__ == '__main__':
    %prun main()
#    pass
#    main()

['biomedicus', 'clamp', 'ctakes', 'metamap', 'quick_umls'] ('analytical_fairview.csv', 'concepts.fairview_all')
run_type: overlap
['Drug', 'Finding', 'Anatomy', 'Procedure'] ['biomedicus', 'clamp', 'ctakes', 'metamap', 'quick_umls']
SYSYEMS FOR SEMTYPE Drug ARE ['biomedicus', 'clamp', 'ctakes', 'metamap', 'quick_umls']
((biomedicus&clamp)|(ctakes&metamap))
(((biomedicus&clamp)|(ctakes&metamap))&quick_umls)
(((biomedicus&clamp)|(ctakes&metamap))|quick_umls)
((biomedicus&clamp)&(ctakes|metamap))
(((biomedicus&clamp)&(ctakes|metamap))&quick_umls)
(((biomedicus&clamp)&(ctakes|metamap))|quick_umls)
((biomedicus&clamp)|(ctakes|metamap))
(((biomedicus&clamp)|(ctakes|metamap))&quick_umls)
(((biomedicus&clamp)|(ctakes|metamap))|quick_umls)
((biomedicus&clamp)|(ctakes&quick_umls))
(((biomedicus&clamp)|(ctakes&quick_umls))&metamap)
(((biomedicus&clamp)|(ctakes&quick_umls))|metamap)
((biomedicus&clamp)&(ctakes|quick_umls))
(((biomedicus&clamp)&(ctakes|quick_umls))&metamap)
(((biomedicus&clamp)&(ct

KeyError: 'n_terms'

In [None]:
# RESULTS

def get_results():
    results = pd.DataFrame()
    for fname in glob.glob(data_directory + '/submission/*.csv'):

        #print(fname)
        t = os.path.basename(fname)
        print(t)
        corpus = t.split('_')[0]
        #print(corpus)
        semtypes = t.split('_')[-2:-1][0]
        #print(semtypes)

        temp = pd.read_csv(fname) 

        temp['corpus'] = corpus
        temp['semtypes'] = semtypes
        temp['file'] = t

        frames = [ temp, results ]
        results = pd.concat(frames)


    merges = results.copy()
    merges = merges.rename(index=str, columns={"n_gold": "n_ref"})

    # sfingle system evaluation
    cols_to_keep = ['merge', 'corpus', 'semtypes', 'F', 'precision', 'recall', 'n_sys', 'n_gold']    
    #print(results[cols_to_keep][results['merge'].isin(['biomedicus', 'clamp', 'ctakes', 'metamap', 'quick_umls'])].sort_values(by=['corpus', 'merge', 'semtypes']).rename(index=str, columns={"merge": "system", "n_gold": "n_ref"}))
    df = results[cols_to_keep][results['merge'].isin(['biomedicus', 'clamp', 'ctakes', 'metamap', 'quick_umls'])].sort_values(by=['corpus', 'merge', 'semtypes']).rename(index=str, columns={"merge": "system", "n_gold": "n_ref"}).copy()
    df.to_csv(data_directory + '/submission/single_system_summary.csv')

    merges.reset_index(inplace=True)
    merges = merges[~merges['merge'].isin(['biomedicus', 'clamp', 'ctakes', 'metamap', 'quick_umls'])]
    max_f = merges.copy()
    max_p = merges.copy()
    max_r = merges.copy()
    #print(len(max_f), len(max_p), len(max_r))
    
    # https://datascience.stackexchange.com/questions/26308/after-grouping-to-minimum-value-in-pandas-how-to-display-the-matching-row-resul
    cols_to_keep = ['merge', 'corpus', 'semtypes', 'F', 'n_sys', 'n_ref']    
    f = max_f.loc[max_f.groupby(['corpus','semtypes'])['F'].idxmax()]
    f = f[cols_to_keep].sort_values(by=['corpus','semtypes'])

    cols_to_keep = ['merge', 'corpus', 'semtypes', 'precision', 'n_sys', 'n_ref']    
    p = max_p.loc[max_p.groupby(['corpus','semtypes'])['precision'].idxmax()]
    p = p[cols_to_keep].sort_values(by=['corpus','semtypes'])

    cols_to_keep = ['merge', 'corpus', 'semtypes', 'recall', 'n_sys', 'n_ref']    
    r = max_r.loc[max_r.groupby(['corpus','semtypes'])['recall'].idxmax()]
    r = r[cols_to_keep].sort_values(by=['corpus','semtypes'])
    #print(len(f), len(p), len(r))
    #print(f, p, r)

    writer = pd.ExcelWriter(data_directory + '/submission/max_merge_summary.xlsx', engine='xlsxwriter')
    f.to_excel(writer, sheet_name='max F-score')
    p.to_excel(writer, sheet_name='max precision')
    r.to_excel(writer, sheet_name='max recall')
    writer.save()
    
#get_results()

In [None]:
'''
df = pd.read_csv(data_directory + '/submission/single_system_summary.csv')
#print(df)

out = pd.pivot_table(df, values = ['precision', 'recall', 'F'], index=['corpus','semtypes'], columns = 'system').reset_index()

out.to_csv(data_directory + '/submission/single_system_out.csv')
'''
#df = pd.read_csv(data_directory + '/submission/test.csv')
# #print(df)

#pd.pivot_table(df, values = ['clinical_m1', 'clinical_m2'], index=['test_number','clinical_type'], columns = 'system').reset_index()

In [None]:
'''
from functools import reduce
f = pd.read_excel(open(data_directory + '/submission/max_merge_summary.xlsx', 'rb'), sheet_name='max F-score')
#print(f)

p = pd.read_excel(open(data_directory + '/submission/max_merge_summary.xlsx', 'rb'), sheet_name='max precision')
#print(p)

r = pd.read_excel(open(data_directory + '/submission/max_merge_summary.xlsx', 'rb'), sheet_name='max recall')
#print(r)

data_frames = [f, p, r]
df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['corpus', 'semtypes'], how='inner'), data_frames)
df_merged = df_merged.rename(index=str, columns={"merge_x": "merge_f", "merge_y": "merge_p", "merge": 'merge_r', 'n_ref_x': 'n_ref'})

#print(df_merged)
cols_to_keep = ['merge_f', 'merge_p', 'merge_r', 'F', 'precision', 'recall', 'n_ref', 'corpus', 'semtypes']
print(df_merged[cols_to_keep])

df_merged[cols_to_keep].to_csv(data_directory + '/submission/max_merge_out.csv')
'''

In [None]:
'''
from itertools import combinations, product
def expressions_test(l, n):
    for (operations, *operands), operators in product(combinations(l, n), product(('&', '|'), repeat=n-1)):
        #print(list(zip(operators, operands)))    
        for operation in zip(operators, operands):
            operations = [operations, *operation]
        yield operations
        
l = ['biomedicus', 'clamp', 'ctakes', 'metamap', 'quick_umls']
m = list(expressions_test(l, 2))

test = list(expressions_test(m, 2))

to_do_4_terms = []
to_do_5_terms = []
for t in test:
    # format Boolean sentence for parse tree 
    t = '(' + " ".join(str(x) for x in t).replace('[','(').replace(']',')').replace("'","").replace(",","").replace(" ","") + ')'
    if t.count('biomedicus') != 2 and t.count('clamp') != 2 and t.count('ctakes') != 2 and t.count('metamap') != 2 and t.count('quick_umls') != 2:
        if t.count('&') != 3 and t.count('|') != 3:
            print(t)
            to_do_4_terms.append(t)
            for i in l:
                if i not in t:
                    print('('+t+'&'+i+')')
                    print('('+t+'|'+i+')')
                    to_do_5_terms.append('('+t+'&'+i+')')
                    to_do_5_terms.append('('+t+'|'+i+')')
                    
print(len(to_do_4_terms), len(to_do_5_terms))
'''

In [None]:
'''
nested = list(expressions_test(l, 3))


test = list(expressions_test(l, 2))

to_do_or_terms = []
to_do_and_terms = []
for n in nested:
    # format Boolean sentence for parse tree 
    n = '(' + " ".join(str(x) for x in n).replace('[','(').replace(']',')').replace("'","").replace(",","").replace(" ","") + ')'
   
    for t in test:
        t = '(' + " ".join(str(x) for x in t).replace('[','(').replace(']',')').replace("'","").replace(",","").replace(" ","") + ')'
        
        new_and = '(' + n +'&'+ t + ')'
        new_or = '(' + n +'|'+ t + ')'
    
        if new_and.count('biomedicus') != 2 and new_and.count('clamp') != 2 and new_and.count('ctakes') != 2 and new_and.count('metamap') != 2 and new_and.count('quick_umls') != 2:
            
            if new_and.count('&') != 4 and new_and.count('|') != 4:
                print(new_and)
                print(new_or)
                to_do_or_terms.append(new_or)
                to_do_and_terms.append(new_and)
                    
print(len(to_do_and_terms), len(to_do_or_terms))
'''

In [None]:
# q = get_sys_data('quick_umls', analysis_type, corpus, filter_semtype)
# q = q.sort_values(by=['note_id', 'begin'])
# print(q.head(20))

# b = get_sys_data('biomedicus', analysis_type, corpus, filter_semtype)
# b = b.sort_values(by=['note_id', 'begin'])
# print(b.head(20))

# df = pd.read_csv('/Users/gms/development/nlp/nlpie/data/ensembling-u01/output/analytical_fairview.csv')

# ref_ann, _ = get_metric_data(analysis_type, corpus)
# ref_ann = ref_ann[ref_ann['semtype'] == 'Drug']
# #ref_ann.sort_values(by=['file', 'start']).head(20)

In [None]:
# import random

# merge = 'biomedicus' 
# sys_ann = get_contingency_table(analysis_type, corpus, semtypes[0], merge)  
# ref_ann, _ = get_metric_data(analysis_type, corpus)
# #print(len(ref_ann))
# ref_ann = ref_ann[ref_ann['semtype'].isin(SemanticTypes(semtypes).get_system_type('reference'))]
# #print(len(ref_ann))

# def get_rand_idx(ref_ann, sys_ann):
#     r_idx = ref_ann.index.values.tolist() 
#     s_idx = sys_ann.index.values.tolist()
    
#     n = int(len(r_idx)/(1.33))
#     r = random.sample(r_idx, k=n)
#     n = int(len(s_idx)/(1.33))
#     s = random.sample(s_idx, k=n)
    
#     return r, s

# metrics = pd.DataFrame()
# for i in range(1, 5):
#     r, s = get_rand_idx(ref_ann, sys_ann)
#     ref = ref_ann.ix[r]
#     sys = sys_ann.ix[s]
    
#     c = get_cooccurences(ref, sys, analysis_type, corpus, False)
#     #print(c.ref_n, c.ref_only, c.system_n, c.system_only, c.ref_system_match)
    
#     d = cm_dict(c.ref_only, c.system_only, c.ref_system_match, c.system_n, c.ref_n)
    
#     frames = [metrics, pd.DataFrame(d, index=[0])]
#     metrics = pd.concat(frames, ignore_index=True, sort=False) 
    
# print(geometric_mean(metrics))

In [None]:
# control filter_semtype in get_sys_data, get_ref_n and generate_metrics. TODO consolidate. 
 
# # run single statement
statement = '((ctakes&clamp)|(biomedicus&metamap)'
analysis_type = 'entity'
corpus = 'fairview'
matches = get_merge_data(statement, analysis_type, corpus, True, semtypes[0])
print(matches)

# import spacy
# nlp = spacy.load('en')

# sql = "select distinct note_id, sofa from concepts.sofas where corpus = 'fairview'"

# docs = pd.read_sql(sql, con=engine)

# d = {}

# for row in docs.itertuples():
#     d[row.note_id] = row.sofa
    
# print(len(d))

# test = matches[matches['note_id'] == '0000200926']
# print(len(test))

# doc = nlp(d['0000200926'])

# for row in test.itertuples():
#     my_str = [token.text.strip('\n').lower() for token in doc if token.idx >= (row.begin) and token.idx <= (row.end)]
#     if 'diabetes' in my_str:
#         print(my_str)

In [None]:
# #TESTS -> ensemble:
# def test_match_consistency(matches, ref_only, ref_n, sys):
#     """test for reference only/match set consistency:
#         params: match, system and reference only sets"""
   
#     print('len', len(sys), len(matches), len(matches.union(sys)), len(matches.intersection(sys)))
#     assert len(matches.union(ref_only)) == ref_n, 'Reference annotation mismatch union'
#     assert len(matches.intersection(sys)) == len(matches), 'System annotation mismatch intersect'
#     assert len(matches.union(sys)) == len(sys), 'System annotation mismatch union'
#     assert len(matches.intersection(ref_only)) == 0, 'Reference annotation mismatch intersect'

# def test_systems(analysis_type, systems, corpus):
#     sys = df_to_set(get_sys_data(systems[0], analysis_type, corpus), analysis_type)
#     test_match_consistency(*get_system_matches(systems[0], analysis_type, corpus), get_ref_n(analysis_type), sys)
#     print('Match consistency:', len(sys),get_ref_n(analysis_type))

# def test_metrics(ref, sys_m, match_m):
#     test = True
#     reference_n = len(ref)
#     system_n = len(sys_m)

#     print('Test metrics:', type(reference_n), type(system_n), type(match_m))

#     reference_only, system_only, reference_system_match, match_set = SetTotals(reference_n, system_n, match_m).get_ref_sys()
#     F, recall, precision, _, _, _, _, _ = Metrics(system_only, reference_only, reference_system_match, system_n).get_confusion_metrics()
#     F_, recall_, precision_, _, _, _, _, _ = Metrics(system_only, reference_only, reference_system_match, system_n).get_confusion_metrics(test)

#     assert F[1] == F_, 'F1 issue'
#     assert recall[1] == recall_, 'recall issue'
#     assert precision[1] == precision_, 'precision issue'
#     print(F[1], F_)
#     print(recall[1], recall_)
#     print(precision[1], precision_)

# def test_count(analysis_type, corpus):
#     # test match counts:
#     ctakes, _ = get_system_matches('ctakes', analysis_type, corpus)
#     clamp, _ = get_system_matches('clamp', analysis_type, corpus)
#     b9, _ = get_system_matches('biomedicus', analysis_type, corpus)
#     mm, _ = get_system_matches('metamap', analysis_type, corpus)

#     print('count:', len(mm.intersection(b9.intersection(clamp.intersection(ctakes)))))
    
# def test_ensemble(analysis_type, corpus):
    
#     print('ensemble:')
#     # Get mixed system_n
#     ref_ann, data = get_metric_data(analysis_type, corpus)

#     names = ['ctakes', 'biomedicus', 'metamap', 'clamp']
#     if 'entity' in analysis_type: 
#         cols_to_keep = ['begin', 'end', 'note_id']
#     elif 'cui' in analysis_type:
#         cols_to_keep = ['cui', 'note_id']
#     elif 'full' in analysis_type:
#         cols_to_keep = ['begin', 'end', 'cui', 'note_id']

#     biomedicus = data[data["system"]=='biomedicus'][cols_to_keep].copy()
#     ctakes = data[data["system"]=='ctakes'][cols_to_keep].copy()
#     clamp = data[data["system"]=='clamp'][cols_to_keep].copy()
#     metamap = data[data["system"]=='metamap'][cols_to_keep].copy()
#     quickumls = data[data["system"]=='quick_umls'][cols_to_keep].copy()

#     print('systems:', len(biomedicus), len(clamp), len(ctakes), len(metamap), len(quickumls))

#     b9 = set()
#     cl = set()
#     ct = set()
#     mm = set()
#     qu = set()

#     b9 = df_to_set(get_sys_data('biomedicus', analysis_type, corpus), analysis_type)
#     print(len(b9))

#     ct = df_to_set(get_sys_data('ctakes', analysis_type, corpus), analysis_type)
#     print(len(ct))

#     cl = df_to_set(get_sys_data('clamp', analysis_type, corpus), analysis_type)
#     print(len(cl))

#     mm = df_to_set(get_sys_data('metamap', analysis_type, corpus), analysis_type)
#     print(len(mm))

#     qu = df_to_set(get_sys_data('quick_umls', analysis_type, corpus), analysis_type)
#     print(len(qu))
    
#     print('various merges:')
#     print(len(b9), len(cl), len(ct), len(mm), len(qu))
#     print(len(mm.intersection(b9.intersection(cl.intersection(ct)))))
#     print(len(mm.union(b9.intersection(cl.intersection(ct)))))
#     print(len(mm.union(b9.union(cl.intersection(ct)))))
#     print(len(mm.union(b9.union(cl.union(ct)))))
#     print(len(b9.intersection(ct)))

#     sys_m = b9.intersection(ct.intersection(qu))
#     print('sys_m:', len(sys_m))

#     # Get match merges:
#     ct, _ = get_system_matches('ctakes', analysis_type, corpus)
#     cl, _ = get_system_matches('clamp', analysis_type, corpus)
#     b9, _ = get_system_matches('biomedicus', analysis_type, corpus)
#     mm, _ = get_system_matches('metamap', analysis_type, corpus)
#     qu, _ = get_system_matches('quick_umls', analysis_type, corpus)

#     match_m = b9.intersection(ct.intersection(qu))
#     print('match_m:', len(match_m))
#     # reference df to set
#     if 'entity' in analysis_type: 
#         cols_to_keep = ['end', 'start','file']
#     elif 'cui' in analysis_type:
#         cols_to_keep = ['value','file']
#     elif 'full' in analysis_type:
#         cols_to_keep = ['end', 'start', 'value','file']

#     ref = df_to_set(ref_ann[cols_to_keep], analysis_type, 'ref')

#     print('ref:', len(ref))

#     # test difference:
#     print('FP:', len(sys_m - match_m), len(sys_m - ref))
#     assert len(sys_m - match_m) == len(sys_m - ref), 'FP mismatch'
#     print('FN:', len(ref - match_m), len(ref - sys_m))
#     assert len(ref - match_m) == len(ref - sys_m), 'FN mismatch'
    
#     test_metrics(ref, sys_m, match_m)
