In [1]:
# annotation class for UIMA systems
class AnnotationSystems(object):
    """
    CAS XMI annotations of interest.

    Holds, for each supported system (biomedicus, clamp, ctakes, metamap),
    the name of its output directory and the fully qualified annotation
    type names that should be extracted from its XMI files.
    """

    def __init__(self):
        """
        Set the output directory and annotation base types of every system.
        """

        self.biomedicus_dir = "biomedicus_out/"
        self.biomedicus_types = ["biomedicus.v2.UmlsConcept",
                                 "biomedicus.v2.Negated",
                                 "biomedicus.v2.Acronym"]

        self.clamp_dir = "clamp_out/"
        self.clamp_types = ["edu.uth.clamp.nlp.typesystem.ClampNameEntityUIMA",
                            "edu.uth.clamp.nlp.typesystem.ClampRelationUIMA"]

        self.ctakes_dir = "ctakes_out/"
        self.ctakes_types = ["org.apache.ctakes.typesystem.type.textsem.DiseaseDisorderMention",
                             "org.apache.ctakes.typesystem.type.textsem.MedicationMention",
                             "org.apache.ctakes.typesystem.type.textsem.ProcedureMention",
                             "org.apache.ctakes.typesystem.type.textsem.SignSymptomMention",
                             "org.apache.ctakes.typesystem.type.textsem.AnatomicalSiteMention"]

        self.metamap_dir = "metamap_out/"
        self.metamap_types = ["org.metamap.uima.ts.Candidate",
                              "org.metamap.uima.ts.CuiConcept",
                              "org.metamap.uima.ts.Negation"]

    def get_system_type(self, system):
        """
        Return the settings of one system.

        :param system: one of 'biomedicus', 'clamp', 'ctakes' or 'metamap'
        :return: tuple ``(types, view, output)`` -- the list of annotation
                 type names, the name of the CAS view to read and the
                 output directory name
        :raises ValueError: if ``system`` is not one of the known systems
        """

        if system == 'biomedicus':
            # biomedicus is the only system read from the "Analysis" view
            view = "Analysis"
            types = self.biomedicus_types
            output = self.biomedicus_dir

        elif system == 'clamp':
            view = "_InitialView"
            types = self.clamp_types
            output = self.clamp_dir

        elif system == 'ctakes':
            view = "_InitialView"
            types = self.ctakes_types
            output = self.ctakes_dir

        elif system == 'metamap':
            view = "_InitialView"
            types = self.metamap_types
            output = self.metamap_dir

        else:
            # fail with a clear message instead of an UnboundLocalError at the return
            raise ValueError("unknown annotation system: %r" % (system,))

        return types, view, output
    
# shared module-level instance; main() looks up per-system settings through it
annSys = AnnotationSystems()

# extract attributes from cas Annotation object
def get_attribs(v):
    """
    Collect the non-dunder attribute names exposed by the annotations in ``v``.

    Names are gathered in the order ``__dir__()`` reports them.  Scanning of
    an annotation stops at the first non-dunder name that has already been
    collected, so names listed after it are not looked at for that annotation.

    :param v: iterable of annotation objects
    :return: list of attribute names without duplicates
    """
    collected = []
    for annotation in v:
        for name in annotation.__dir__():
            # skip dunder members
            if '__' in name:
                continue
            # an already known name ends the scan of this annotation
            if name in collected:
                break
            collected.append(name)

    return collected

def get_cols_to_keep(system, t):
    """
    Return the dataframe columns to keep for an annotation type of a system.

    The type is recognised by a substring of its name (e.g. 'Umls',
    'Disease', 'Candidate').  When several substrings of a system match,
    the last matching one wins.

    :param system: 'biomedicus', 'clamp', 'ctakes' or 'metamap'
    :param t: annotation type name
    :return: list of column names; an empty list when either the system or
             the type is not recognised
    """

    # default for an unknown system as well as for a known system with an
    # unrecognised type (which previously ended in an UnboundLocalError)
    cols_to_keep = []

    if system == 'biomedicus':

        # umlsconcept
        if 'Umls' in t:
            cols_to_keep = ['begin', 'confidence', 'cui', 'end', 'source', 'sui', 'tui',
                            'type', 'xmiID', 'system', 'note_id', 'corpus',
                            'filename']

        # acronym
        if 'Acronym' in t:
            cols_to_keep = ['begin', 'end', 'hasSpaceAfter', 'score', 'text', 'type',
                            'xmiID', 'system', 'note_id', 'corpus', 'filename']

        # negated
        if 'Negated' in t:
            cols_to_keep = ['begin', 'cueTerms', 'end', 'type', 'xmiID',
                           'system', 'note_id', 'corpus', 'filename']

    elif system == 'clamp':
        #NE uima
        if 'NameEntityUIMA' in t:
            cols_to_keep = ['assertion', 'attr1', 'attr2', 'attr3', 'attr4', 'attribute', 'begin',
                           'cui', 'end', 'semanticTag', 'type',
                           'xmiID', 'system', 'note_id', 'corpus', 'filename', 'umlsCuiDesc', 'concept_prob', 'sentence_prob']

        # relation uima: TODO -> deal with named tuples!
        if 'Relation' in t:
            cols_to_keep = ['attr1', 'attr2', 'attr3', 'attr4', 'attribute', 'entFrom', 'entTo',
                           'semanticTag', 'type', 'xmiID', 'system', 'note_id',
                           'corpus', 'filename']

    elif system == 'ctakes':
        # disease
        if 'Disease' in t:
            cols_to_keep = ['alleviatingFactor', 'associatedSignSymptom', 'begin', 'bodyLaterality',
                           'bodyLocation', 'bodySide', 'conditional', 'confidence', 'course',
                           'discoveryTechnique', 'duration', 'end', 'endTime', 'event',
                           'exacerbatingFactor', 'generic', 'historyOf', 'id',
                           'ontologyConceptArr', 'polarity', 'relativeTemporalContext', 'severity',
                           'startTime', 'subject', 'type', 'typeID', 'uncertainty',
                           'xmiID', 'system', 'note_id', 'corpus', 'filename', 'cui', 'preferredText']

        # medication
        if 'Medication' in t:
            cols_to_keep = ['begin', 'conditional', 'confidence', 'discoveryTechnique', 'end',
                           'endDate', 'event', 'generic', 'historyOf', 'id', 'medicationAllergy',
                           'medicationDosage', 'medicationDuration', 'medicationForm',
                           'medicationFrequency', 'medicationRoute', 'medicationStatusChange',
                           'medicationStrength', 'ontologyConceptArr', 'polarity',
                           'relativeTemporalContext', 'startDate', 'subject', 'type',
                           'typeID', 'uncertainty', 'xmiID', 'system',
                           'note_id', 'corpus', 'filename', 'cui', 'preferredText']

        # proc
        if 'Procedure' in t:
            cols_to_keep = ['begin', 'bodyLaterality', 'bodyLocation', 'bodySide', 'conditional',
                           'confidence', 'discoveryTechnique', 'duration', 'end', 'endTime',
                           'event', 'generic', 'historyOf', 'id', 'method', 'ontologyConceptArr',
                           'polarity', 'procedureDevice', 'relativeTemporalContext',
                           'startTime', 'subject', 'type', 'typeID', 'uncertainty',
                           'xmiID', 'system', 'note_id', 'corpus', 'filename', 'cui', 'preferredText']

        # sign
        if 'SignSymptom' in t:
            cols_to_keep = ['alleviatingFactor', 'begin', 'bodyLaterality', 'bodyLocation',
                           'bodySide', 'conditional', 'confidence', 'course', 'discoveryTechnique',
                           'duration', 'end', 'endTime', 'event', 'exacerbatingFactor', 'generic',
                           'historyOf', 'id', 'ontologyConceptArr', 'polarity',
                           'relativeTemporalContext', 'severity', 'startTime', 'subject',
                           'type', 'typeID', 'uncertainty', 'xmiID', 'system',
                           'note_id', 'corpus', 'filename', 'cui', 'preferredText']

        # anatomy
        if 'Anatomical' in t:
            cols_to_keep = ['begin', 'bodyLaterality', 'bodySide', 'conditional', 'confidence',
                           'discoveryTechnique', 'end', 'entity', 'generic', 'historyOf', 'id',
                           'ontologyConceptArr', 'polarity', 'subject', 'type', 'typeID',
                           'uncertainty', 'xmiID', 'system', 'note_id',
                           'corpus', 'filename', 'cui', 'preferredText']

    elif system == 'metamap':
        #candidate
        if 'Candidate' in t:
            cols_to_keep = ['begin', 'concept', 'cui', 'end', 'head', 'matchMap', 'matchedwords',
                           'overmatch', 'preferred', 'score', 'semanticTypes', 'sources',
                           'spans', 'type', 'xmiID', 'system', 'note_id',
                           'corpus', 'filename']

        #cuiconcept
        if 'CuiConcept' in t:
            cols_to_keep = ['id', 'negExConcept', 'negExCui', 'type', 'xmiID',
                           'system', 'note_id', 'corpus', 'filename']

        #negation
        if 'Negation' in t:
            cols_to_keep = ['begin', 'cuiConcepts', 'end', 'id', 'ncSpans', 'negTrigger', 'negType',
                           'ntSpans', 'type', 'xmiID', 'system',
                           'note_id', 'corpus', 'filename']

    return cols_to_keep


In [2]:
def init_cassis(system, typesystem):
    """
    Add extra types/features to a loaded type system for one system.

    For 'metamap' a SourceDocumentInformation type with four features is
    created.  For 'ctakes' a patientIdentifier feature is added to the
    Metadata type and a common set of features is added to each of the five
    mention types; ProcedureMention is the one mention type that is not
    given an ontologyConceptArr feature here.  Other systems are left
    untouched.

    :param system: system name; it is also printed
    :param typesystem: type system object offering create_type, get_type
                       and add_feature
    """

    print(system)

    # types for metamap
    if system == 'metamap':
        doc_info = typesystem.create_type(name='org.apache.uima.examples.SourceDocumentInformation', supertypeName='uima.tcas.Annotation')
        doc_info_features = (
            ('uri', 'uima.cas.String'),
            ("offsetInSource", "uima.cas.Integer"),
            ("documentSize", "uima.cas.Integer"),
            ("lastSegment", "uima.cas.Integer"),
        )
        for feature_name, range_type in doc_info_features:
            typesystem.add_feature(doc_info, name=feature_name, rangeTypeName=range_type)

    # features for ctakes
    if system == 'ctakes':
        metadata = typesystem.get_type('org.apache.ctakes.typesystem.type.structured.Metadata')
        typesystem.add_feature(metadata, name='patientIdentifier', rangeTypeName='uima.cas.String')

        mention_prefix = 'org.apache.ctakes.typesystem.type.textsem.'
        mention_names = (
            'DiseaseDisorderMention',
            'SignSymptomMention',
            'MedicationMention',
            'ProcedureMention',
            'AnatomicalSiteMention',
        )
        # features shared by the mention types, in the order they are added
        mention_features = (
            ('id', 'uima.cas.Integer'),
            ('ontologyConceptArr', 'uima.cas.FSArray'),
            ('subject', 'uima.cas.String'),
            ('typeID', 'uima.cas.Integer'),
            ('discoveryTechnique', 'uima.cas.Integer'),
            ('confidence', 'uima.cas.Double'),
            ('polarity', 'uima.cas.Integer'),
            ('uncertainty', 'uima.cas.Integer'),
            ('conditional', 'uima.cas.Boolean'),
            ('generic', 'uima.cas.Boolean'),
            ('historyOf', 'uima.cas.Integer'),
        )

        for mention_name in mention_names:
            mention_type = typesystem.get_type(mention_prefix + mention_name)

            for feature_name, range_type in mention_features:
                if feature_name != 'ontologyConceptArr':
                    typesystem.add_feature(mention_type, name=feature_name, rangeTypeName=range_type)
                elif mention_name != 'ProcedureMention':
                    # the concept array is the only feature carrying an element type
                    typesystem.add_feature(mention_type, name=feature_name, rangeTypeName=range_type, elementType='org.apache.ctakes.typesystem.type.refsem.UmlsConcept')


In [6]:
#%%time
# parse system annotations
def main():
    """
    Parse system XMI annotations into SQL tables, or export stored ones to CSV.

    With ``parse_to_sql`` set (the hard-coded default), every ``*.xmi`` file
    of each corpus/system pair is loaded with cassis; its sofa text is
    appended to the ``sofas`` table (unless the note is already there) and
    each annotation type of interest is appended to its own table.

    Otherwise the per-type tables are read back from the database, reduced to
    the span/provenance columns, de-duplicated and written to one CSV file
    per corpus.
    """
    import os, glob
    import pymysql
    import pandas as pd
    import json
    #from superjson import json

    from sqlalchemy.engine import create_engine
    from sqlalchemy.sql import text
    from cassis import load_typesystem, load_cas_from_xmi

    # connection string
    # NOTE(review): database credentials are hard-coded; consider moving them to configuration
    engine = create_engine('mysql+pymysql://gms:nej123@localhost/concepts', pool_pre_ping=True)
    systems = ["metamap", "ctakes", "biomedicus", "clamp"]
    #systems = ["clamp"]

    corpora = ["fairview", "mipacq", "i2b2"]
    parse_to_sql = True

    if parse_to_sql:

        for corpus in corpora:
            print("CORPUS:", corpus)

            for system in systems:

                print("SYSTEM:", system)

                types, view_, output = annSys.get_system_type(system)

                dir_test = '/Users/gms/development/nlp/nlpie/scripts/jupyter/ensembling/typesystems/' + system + '/'

                with open(dir_test + 'TypeSystem.xml', 'rb') as f:
                    typesystem = load_typesystem(f)

                init_cassis(system, typesystem)

                # parse directory
                if corpus in ["i2b2", "mipacq"]:
                    directory_to_parse = '/Users/gms/development/nlp/nlpie/data/ensembling-u01/' + corpus + '/rerun-november-2019/' +  system + '_out/' #data_to_analyze/all/'

                elif corpus == 'fairview':
                    directory_to_parse = '/Users/gms/development/nlp/nlpie/data/ensembling-u01/' + corpus + '/system_annotations/data_in_preprocessed/' +  system + '_out/'#data_to_analyze/all/'

                else:
                    directory_to_parse = '/Users/gms/development/nlp/nlpie/data/mimic/'

                print(directory_to_parse)

                for fname in glob.glob(directory_to_parse + '/*.xmi'):
                #for fname in glob.glob(directory_to_parse + '/527982345.txt.xmi'):

                    file = os.path.basename(fname)
                    # note id = file name up to the first dot
                    u = file.split('.')[0]

                    print(u)

                    # load cas
                    with open(directory_to_parse + file, 'rb') as f:
                        cas = load_cas_from_xmi(f, typesystem=typesystem)

                    # load view
                    view = cas.get_view(view_)

                    # sofa -> db
                    def write_sofa(u):
                        d = {}
                        d["note_id"] = str(u)
                        d["sofa"] = view.sofa_string
                        d["corpus"] = corpus

                        # does it exist?
                        # NOTE(review): the existence check runs against the engine's
                        # database while the lookup reads test.sofas -- confirm that
                        # both are meant to address the same table
                        if engine.dialect.has_table(engine, "sofas"):
                            sql = text("SELECT * FROM test.sofas WHERE note_id = :e1")
                            resp = engine.execute(sql, e1=u).fetchall()
                        else:
                            resp = []

                        if len(resp) == 0:
                            pd.DataFrame(d, index=[0]).to_sql("sofas", engine, if_exists="append")

                    write_sofa(u)

                    for t in types:
                        print("TYPE:", t)

                        # get list for filtering df
                        cols_to_keep = get_cols_to_keep(system, t)

                        attribs = get_attribs(view.select(t))
                        annotation_type = t
                        x = t.split('.')
                        # e.g. met_org_Candidate: system prefix, first and last part of the type name
                        table_name = system[0:3] + '_' + x[0] + '_' + x[-1]

                        # Annotation object -> dataframe
                        def get_df(v, attribs):
                            d = {}
                            df = pd.DataFrame()

                            # only parse if type exists in file
                            if view.select(t):
                                for sentence in view.select(t):

                                    for key in attribs:
                                        val = sentence.__getattribute__(key)

                                        # TODO: convert to list and unstack
                                        if key == 'ontologyConceptArr':
                                            pass

                                        # clamp stores extra values as JSON in 'attribute'
                                        if system == 'clamp' and key == 'attribute':
                                            if val:
                                                parsed = json.loads(val)

                                                if "umlsCuiDesc" in parsed:
                                                    d['umlsCuiDesc'] = parsed["umlsCuiDesc"].lower()
                                                else:
                                                    d['umlsCuiDesc'] = None

                                                if "concept_prob" in parsed:
                                                    d['concept_prob'] = parsed["concept_prob"]
                                                else:
                                                    d['concept_prob'] = None

                                                # read the sentence probability itself
                                                # (concept_prob used to be stored here)
                                                if "sentence_prob" in parsed:
                                                    d['sentence_prob'] = parsed["sentence_prob"]
                                                else:
                                                    d['sentence_prob'] = None

                                            else:
                                                d['umlsCuiDesc'] = None
                                                d['concept_prob'] = None
                                                d['sentence_prob'] = None

                                        # relation end points are annotations; serialise them to JSON
                                        if key in ['entFrom', 'entTo']:
                                            if val.attribute:
                                                attribute = json.loads(val.attribute)
                                            else:
                                                attribute = None

                                            val = json.dumps({"begin": val.begin, "end": val.end, "cui": val.cui, "attribute": attribute})

                                        if isinstance(val, list):
                                            val = ' '.join(map(str,val)) # flatten list to space delimited variable

                                        d[key] = val

                                    # all attributes of this annotation collected -> add one row
                                    if attribs:
                                        frames = [ df, pd.DataFrame(d, index=[0]) ]
                                        df = pd.concat(frames, ignore_index=True)

                            df["system"] = system
                            df["type"] = annotation_type
                            df["note_id"] = u
                            df["corpus"] = corpus
                            df["filename"] = fname
                            return df

                        annotations = get_df(view.select(t), attribs)

                        # write to database
                        if not annotations.empty:
                            # handle extraction of elements from named tuple converted to str
                            if 'Mention' in t:
                                # stack string delimited by ") o" into multiple rows (initially represented by list of named tuples)
                                # raw string: '\)' is not a valid escape in a normal string literal
                                b = pd.DataFrame(annotations.ontologyConceptArr.str.split(r'\) o').tolist(), index=annotations.ontologyConceptArr).stack()
                                b = b.reset_index()[[0, 'ontologyConceptArr']] # var1 variable is currently labeled 0
                                b.columns = ['concept', 'ontologyConceptArr'] # renaming var1

                                c = pd.merge(annotations, b)

                                c['cui'] = c['concept'].str.extract(pat = 'cui=(.[^,]+)').apply(lambda s:s.str.replace("'", ""))
                                c['preferredText'] = c['concept'].str.extract(pat = 'preferredText=(.[^,]+)').apply(lambda s:s.str.replace("'", ""))

                                annotations = c

                            annotations = annotations[cols_to_keep]
                            annotations.to_sql(table_name, engine, if_exists="append")

    # write out annotations for non-cui tables
    else:

        sys_ann_other = pd.DataFrame()
        for system in systems:

            types, view_, output = annSys.get_system_type(system)
            print("SYSTEM:", system)

            for t in types:

                x = t.split('.')
                table_name = system[0:3] + '_' + x[0] + '_' + x[-1]

                sql = "SELECT * FROM test." + table_name
                df = pd.read_sql(sql, engine)

                cols_to_keep = ['begin', 'end', 'type', 'system', 'note_id', 'corpus', 'filename']
                #print(system, t, table_name, list(df[cols_to_keep].columns.values))

                frames = [ sys_ann_other, df[cols_to_keep] ]
                sys_ann_other = pd.concat(frames, ignore_index=True)

        deduplicated = sys_ann_other.drop_duplicates()
        print(deduplicated)

        # `corpus` is never bound in this branch, so the file name used to fail;
        # write one file per corpus, selected through the 'corpus' column
        for corpus in corpora:
            deduplicated[deduplicated['corpus'] == corpus].to_csv('/Users/gms/development/nlp/nlpie/data/amia-2019/output/analytical_' + corpus + '.csv')

    print("done!")

# run the parser only when this code is executed as a script, not when it is imported
if __name__ == '__main__':
    main()

CORPUS: fairview
SYSTEM: metamap
metamap
/Users/gms/development/nlp/nlpie/data/ensembling-u01/fairview/system_annotations/data_in_preprocessed/metamap_out/
0002204202
TYPE: org.metamap.uima.ts.Candidate
TYPE: org.metamap.uima.ts.CuiConcept
TYPE: org.metamap.uima.ts.Negation
0000513005
TYPE: org.metamap.uima.ts.Candidate
TYPE: org.metamap.uima.ts.CuiConcept
TYPE: org.metamap.uima.ts.Negation
0000200926
TYPE: org.metamap.uima.ts.Candidate
TYPE: org.metamap.uima.ts.CuiConcept
TYPE: org.metamap.uima.ts.Negation
0000202738
TYPE: org.metamap.uima.ts.Candidate
TYPE: org.metamap.uima.ts.CuiConcept
TYPE: org.metamap.uima.ts.Negation
0029014353
TYPE: org.metamap.uima.ts.Candidate
TYPE: org.metamap.uima.ts.CuiConcept
TYPE: org.metamap.uima.ts.Negation
0002518956
TYPE: org.metamap.uima.ts.Candidate
TYPE: org.metamap.uima.ts.CuiConcept
TYPE: org.metamap.uima.ts.Negation
0001157129
TYPE: org.metamap.uima.ts.Candidate
TYPE: org.metamap.uima.ts.CuiConcept
TYPE: org.metamap.uima.ts.Negation
0000089745


In [None]:
import pandas as pd
from sqlalchemy.engine import create_engine

# ad-hoc look at the ontologyConceptArr column of the cTAKES disease/disorder table
sql = 'select ontologyConceptArr from concepts.cta_org_DiseaseDisorderMention'
# NOTE(review): database credentials are hard-coded; consider moving them to configuration
engine = create_engine('mysql+pymysql://gms:nej123@localhost/concepts', pool_pre_ping=True)


pd.read_sql(sql, con=engine)

In [None]:
# create tables for cTAKES UMLS concept -> mention table
# https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows

def mk_ctakes_concepts(sql, table_name):
    """
    Explode the space-delimited ``cuiConcepts`` column into one row per id.

    The rows selected by ``sql`` are read from the database, every entry of
    ``cuiConcepts`` is split on spaces, and the resulting ids are merged back
    onto the source rows as ``linked_id``.  The merged frame replaces the
    table ``table_name`` and its first two rows are printed.

    :param sql: query selecting the source rows (must yield ``cuiConcepts``)
    :param table_name: name of the table to (re)create
    """
    import pymysql
    import pandas as pd
    from sqlalchemy.engine import create_engine

    engine = create_engine('mysql+pymysql://gms:nej123@localhost/test', pool_pre_ping=True)

    source = pd.read_sql(sql, engine)

    # MetaMap: one column per split position, indexed by the original string,
    # then stacked into a long series
    split_ids = source.cuiConcepts.str.split(' ').tolist()
    exploded = pd.DataFrame(split_ids, index=source.cuiConcepts).stack()
    exploded = exploded.reset_index()[[0, 'cuiConcepts']]  # the stacked values are labeled 0
    exploded.columns = ['linked_id', 'cuiConcepts']

    # cTAKES tables: the same steps apply with ontologyConceptArr in place of cuiConcepts

    merged = pd.merge(source, exploded)

    merged.to_sql(table_name, engine, if_exists="replace")

    print(merged[0:2])

''' TABLES to make
sql = "SELECT * FROM test.met_org_negation"
table_name = 'mm_negation'

mk_ctakes_concepts(sql, table_name)

'''
'''
sql = "SELECT * FROM test.cta_disease"
table_name = 'cTAKES_disease'

mk_ctakes_concepts(sql, table_name)

sql = "SELECT * FROM test.cta_med"
table_name = 'cTAKES_medication'

mk_ctakes_concepts(sql, table_name)


sql = "SELECT * FROM test.cta_proc"
table_name = 'cTAKES_procedure'
mk_ctakes_concepts(sql, table_name)


sql = "SELECT * FROM test.cta_sign_symptom"
table_name = 'cTAKES_sign_symptom'

mk_ctakes_concepts(sql, table_name)

sql = "SELECT * FROM test.cta_anatomical_site"
table_name = 'cTAKES_anatomical_site'

mk_ctakes_concepts(sql, table_name)
'''


In [None]:
# fix XMI, adding "id" for those with linked features

def add_attribute_sys_type(regexpatterns, file_glob="/Users/gms/development/nlp/nlpie/data/amia-2019/analysis/mipacq/rerun_post_validation/metamap_out/*.txt.xmi"):
    """
    Write a copy of each XMI file with ``id="0"`` added after every match.

    For every file matched by ``file_glob`` a sibling file named
    ``<name up to the first dot>-v2.txt.xmi`` is written in which each match
    of each pattern is followed by `` id="0"``.

    :param regexpatterns: iterable of regular expressions to look for
    :param file_glob: glob pattern selecting the files to process
    """
    import re, os, glob

    def _append_id(match):
        # keep the matched text itself, so patterns with regex syntax work too
        return match.group(0) + ' id="0"'

    # glob.glob builds the list up front, so the -v2 files written into the
    # same directory are not picked up during this run
    for fname in glob.glob(file_glob):

        # get filename and use for processed output filename
        t = os.path.basename(fname)
        cd = os.path.dirname(fname)
        u = t.split('.')[0] + '-v2.txt.xmi'

        with open(fname) as f, open(os.path.join(cd, u), 'w') as f2:
            for line in f:
                for r in regexpatterns:
                    line = re.sub(r, _append_id, line)
                f2.write(line)
    

#regexpatterns = [r"refsem:UmlsConcept"]
#regexpatterns = [r"ts2:CuiConcept"]
#regexpatterns = [r"ts2:Negation"]

#add_attribute_sys_type(regexpatterns)

In [None]:
# test dkpro-cassis
from cassis import *
def test_dkpro(fname, dir_test, ts_test, view_name):
    """
    Load one XMI file with dkpro-cassis and print every annotation of a view.

    :param fname: XMI file name, appended to ``dir_test``
    :param dir_test: directory prefix of the XMI file
    :param ts_test: directory prefix holding ``TypeSystem.xml``
    :param view_name: name of the CAS view to print
    """
    with open(ts_test + 'TypeSystem.xml', 'rb') as ts_file:
        typesystem = load_typesystem(ts_file)

    with open(dir_test + fname, 'rb') as xmi_file:
        cas = load_cas_from_xmi(xmi_file, typesystem=typesystem)

    selected_view = cas.get_view(view_name)
    # a single type can be inspected with selected_view.select("<type name>") instead
    print(list(selected_view.select_all()))
    #print([x for x in view.select("org.apache.ctakes.typesystem.type.refsem.UmlsConcept")])


# dir_test = '/Users/gms/development/nlp/nlpie/data/amia-2019/analysis/mipacq/metamap_out/'
# view_name = "_InitialView"
# ts_test = "/Users/gms/development/nlp/nlpie/data/amia-2019/typesystems/metamap/"
# fname = '3283236649-v1.txt.xmi'
# test_dkpro(fname, dir_test, ts_test, view_name)


In [None]:
# load the result of a SQL query into a DataFrame (used to build the analytical tables)
# https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows

def get_analytical_set(sql):
    """
    Run ``sql`` against the local ``test`` MySQL database.

    :param sql: query to execute
    :return: the query result as a pandas DataFrame
    """
    import pymysql
    import pandas as pd
    from sqlalchemy.engine import create_engine

    connection = create_engine('mysql+pymysql://gms:nej123@localhost/test', pool_pre_ping=True)

    return pd.read_sql(sql, connection)

#GENERATE analytical tables
'''
analytical_cui = pd.DataFrame()

sql = "SELECT * FROM test.biomedicus_cui where corpus = 'mipacq'"

print(get_analytical_set(sql)[0:1])

frames = [ analytical_cui, get_analytical_set(sql) ]
analytical_cui = pd.concat(frames, ignore_index=True, sort=False) 

sql = "SELECT * FROM test.clamp_all_cui where corpus = 'mipacq'"

print(get_analytical_set(sql)[0:1])

frames = [ analytical_cui, get_analytical_set(sql) ]
analytical_cui = pd.concat(frames, ignore_index=True, sort=False) 

sql = "SELECT * FROM test.ctakes_all_types_cui where corpus = 'mipacq'"

print(get_analytical_set(sql)[0:1])

frames = [ analytical_cui, get_analytical_set(sql) ]
analytical_cui = pd.concat(frames, ignore_index=True, sort=False) 

sql = "SELECT * FROM test.metamap_all_cui where corpus = 'mipacq'"

print(get_analytical_set(sql)[0:1])
frames = [ analytical_cui, get_analytical_set(sql) ]
analytical_cui = pd.concat(frames, ignore_index=True, sort=False) 

print(analytical_cui[0:1])
analytical_cui.to_csv('/Users/gms/development/nlp/nlpie/data/amia-2019/output/analytical_cui_mipacq_concepts.csv')
'''

In [None]:
# clean up cui list in clamp
#df['stridx']=df.index


#print(df[df['cui'].str.contains(',')==True])

#df['new_cui' ] = np.where(df.cui.str.contains(','), df['cui'].str.split(r'\s*,\s*|\s*\.\s*').str[0], df['cui'])

#print(df['cui'])
#print(df[df['cui'].str.contains(',')==False])

#new = df.rename(columns={'cui': 'old_cui', 'new_cui': 'cui'}).copy() 
#new.to_csv('/Users/gms/development/nlp/nlpie/data/amia-2019/output/analytical_cui_mipacq_concepts_new.csv')
#writer = pd.ExcelWriter('/Users/gms/development/nlp/nlpie/data/amia-2019/output/ensemble/merged_metrics.xlsx')