In [2]:
%load_ext autoreload
%autoreload 1

In [3]:
from collections import defaultdict
import os, sys
sys.path.insert(0, "/mnt/f/dev/git/miRExplore/python/")
import pickle

import scispacy
import spacy
from spacy import displacy

from textmining.MirGeneRelCheck import SentenceRelationChecker, SentenceRelationClassifier
from utils.tmutils import normalize_gene_names
from collections import Counter
from lxml import etree
from sklearn.metrics import multilabel_confusion_matrix, classification_report
from itertools import chain, combinations



In [4]:
spacy.__version__

'2.2.1'

In [5]:
! ls /mnt/f/spacy

en_core_sci_lg-0.2.4  en_core_web_lg-2.2.0  en_ner_bionlp13cg_md-0.2.4
en_core_sci_lg-0.2.5  en_core_web_lg-2.3.1  en_ner_bionlp13cg_md-0.2.5


In [6]:
nlp_weblg = spacy.load('/mnt/f/spacy/en_core_web_lg-2.2.0/en_core_web_lg/en_core_web_lg-2.2.0/')

In [7]:

nlp = spacy.load('/mnt/f/spacy/en_core_sci_lg-0.2.4/en_core_sci_lg/en_core_sci_lg-0.2.4/')
nlp_ent = spacy.load("/mnt/f/spacy/en_ner_bionlp13cg_md-0.2.4/en_ner_bionlp13cg_md/en_ner_bionlp13cg_md-0.2.4")


In [9]:
nlp.meta['name']

'core_sci_lg'

In [10]:
%aimport textmining.MirGeneRelCheck

scaiBase = "/mnt/d/owncloud/data/miRExplore/scai_corpus/"

normGeneSymbols= normalize_gene_names(path=scaiBase + "/../obodir/" + "/hgnc_no_withdrawn.syn")

def loadRelations(relsFile):
    reldir2new = {}
    reldir2new["TARGET"] = "NEU"
    reldir2new["INVCORR"] = "NEU"
    reldir2new["REGULATE"] = "NEU"
    relClasses = {}
    with open(os.path.join(scaiBase, relsFile)) as fin:
        
        print("Loading relations", relsFile)
        
        for lidx, line in enumerate(fin):

            if lidx == 0:
                continue

            line = line.strip().split("\t")

            interactionID = line[0].strip()
            relType = line[1].strip()
            relRDir = line[2].strip()
            relIndirect = line[3].strip()
            relPassive = line[4].strip()

            if relRDir in reldir2new:
                relRDir = reldir2new[relRDir]

            relClasses[interactionID] = (relType, relRDir, relIndirect, relPassive)
            
        print("Interactions", len(relClasses))
        print("Non NA Interactions", len([x for x in relClasses if not relClasses[x][0] == "NA"]))
            
    return relClasses

def all_subsets(ss):
    
    return chain(*map(lambda x: combinations(ss, x), range(0, len(ss)+1)))

def runCheck(numelems, base, nlp, nlp_ent, subset_interactions=False, subset_classification=False):

    relChecker = SentenceRelationChecker(None)
    relClassifier = SentenceRelationClassifier()
    
    if base == "TRAIN":
        scaiFile = "miRNA_train_fixed.xml"
        relsFile = "scai_train_rels.tsv"
    elif base == "TEST":
        scaiFile = "miRNA_test_fixed.xml"
        relsFile = "scai_test_rels.tsv"
    
    relClasses = loadRelations(relsFile)
    
    check2results = {}
    checkInteractionSubsets = [x for x in all_subsets(relChecker.relCheck.checks_available)]
    checkClassifierSubsets = [x for x in all_subsets(relClassifier.major_checks)]
    #checkSubsets = [checkSubsets[-1]]
    
    if not subset_interactions:
        checkInteractionSubsets = [checkInteractionSubsets[-1]]
        
    if not subset_classification:
        checkClassifierSubsets = [checkClassifierSubsets[-1]]
    
    print("Will test interaction: {} subsets.".format(len(checkInteractionSubsets)))
    print("Will test classification: {} subsets.".format(len(checkClassifierSubsets)))
    
    for subset_checks_interaction in checkInteractionSubsets:
        for subset_checks_classification in checkClassifierSubsets:

            print("Testing checks", subset_checks_interaction)
            print("Testing checks", subset_checks_classification)
            
            relChecker = SentenceRelationChecker(nlp, nlp_ent=nlp_ent, active_checks=subset_checks_interaction)
            relClassifier = SentenceRelationClassifier(active_checks=subset_checks_classification)

            with open(os.path.join(scaiBase, scaiFile), 'r') as fin:
                tree = etree.parse(fin)
                root = tree.getroot()
                scaiPairs = []
                totalChecks = 0
                correctIdentified = 0
                incorrectIdentified = 0

                incorrectClassified = 0
                correctClassified = 0
                totalClassified = 0
                correctInteractClassified = 0
                totalValidClassified = 0
                totalClassifiable = 0
                totalMirGeneDownClassified = 0

                errorByDetect = Counter()
                classifiedByDetect = Counter()
                elemCaseCounter = Counter()
                incorrectClass = Counter()

                classifyTrue = []
                classifyPred = []

                for elem in root.findall(".//document"):

                    pmid = elem.attrib['origId']

                    for sentElem in elem:

                        allEntities = sentElem.findall(".//entity")
                        allPairs = sentElem.findall(".//pair")

                        sentText = sentElem.attrib["text"]

                        entId2elem = {}

                        for entity in allEntities:
                            entId = entity.attrib['id']
                            entText = entity.attrib['text']
                            entType = entity.attrib['type']
                            entOffset = tuple([int(x) for x in entity.attrib['charOffset'].split("-")])

                            if entType in ["Specific_miRNAs", "Genes/Proteins"]:

                                if "Genes" in entType:
                                    if entText in normGeneSymbols:
                                        entText = normGeneSymbols[entText]
                                    elif entText.upper() in normGeneSymbols:
                                        gene = normGeneSymbols[entText.upper()]
                                else:
                                    try:
                                        entText = miRNA(entText).getStringFromParts([miRNAPART.MATURE, miRNAPART.ID, miRNAPART.PRECURSOR])
                                    except:
                                        pass

                                entTuple = (entText, entType, (entOffset[0], entOffset[1]+1))
                                entId2elem[entId] = entTuple


                                sentEntText = sentText[entTuple[2][0]:entTuple[2][1]]

                        for pair in allPairs:

                            validInteraction = pair.attrib['interaction'].lower() == "true"
                            pairE1 = pair.attrib['e1']
                            pairE2 = pair.attrib['e2']

                            #if pairInt == 'true':
                            if pairE1 in entId2elem and pairE2 in entId2elem:

                                totalChecks += 1

                                e1 = entId2elem[pairE1]
                                e2 = entId2elem[pairE2]

                                if not e1[1] in ["Specific_miRNAs"]:

                                    tmp=e1
                                    e1=e2
                                    e2=tmp


                                relRes = relChecker.check_sentence(sentText
                                                                    , {"entity_type": "mirna", "entity_type_token": "e1", "entity_location": e1[2]}
                                                                    , {"entity_type": "gene", "entity_type_token": "e2", "entity_location": e2[2]}
                                                                    , fix_special_chars=False
                                                                    , relClassifier=relClassifier.classify, verbose=False
                                                                    )


                                fullsentence = relRes['full_sentence']
                                acceptInteraction = relRes['accept_relation']

                                if not acceptInteraction == validInteraction:

                                    incorrectIdentified += 1

                                    """
                                    relResV=relChecker.check_sentence(sentText
                                                                    , {"entity_type": "mirna", "entity_type_token": "e1", "entity_location": e1[2]}
                                                                    , {"entity_type": "gene", "entity_type_token": "e2", "entity_location": e2[2]}
                                                                    , fix_special_chars=False
                                                                    , relClassifier=relClassifier.classify, verbose=True
                                                                    )

                                    print(relResV)
                                    """
                                else:
                                    correctIdentified += 1

                                elemCaseCounter[(validInteraction, acceptInteraction)]+=1

                                totalClassified += 1

                                if validInteraction:# and acceptInteraction:
                                    #print(relRes["check_results"])   
                                    totalValidClassified += 1

                                    foundClasses = relRes["check_results"]["classification"] #{'regulation_dir': 'NEU', 'interaction_dir': 'MIR_GENE', 'regulation_keys': set()}
                                    foundTuple = (foundClasses["interaction_dir"], foundClasses["regulation_dir"])                                
                                    relationID = pair.attrib["id"]
                                    testTuple = relClasses[relationID]

                                    classifyTrue.append((testTuple[0], testTuple[1]))
                                    classifyPred.append((foundTuple[0], foundTuple[1]))

                                    if testTuple[0] == "MIR_GENE" and testTuple[1] == "DOWN":
                                        totalMirGeneDownClassified += 1

                                    classifiedByDetect[foundClasses["reg_detect"]] += 1

                                    if testTuple[0] != foundTuple[0] or testTuple[1] != foundTuple[1]:


                                        #print(sentText)
                                        #print(e1, e2)
                                        #print(foundTuple, foundClasses["reg_detect"], "passive?", foundClasses["passive"])
                                        #print(testTuple)

                                        #print()
                                        #print()
                                        #print()

                                        #print(relationID)
                                        #print("IS: ", foundTuple)
                                        #print("SB: ", testTuple)
                                        #print(relRes)

                                        #relResV=relChecker.check_sentence(sentText
                                        #                            , {"entity_type": "mirna", "entity_type_token": "e1", "entity_location": e1[2]}
                                        #                            , {"entity_type": "gene", "entity_type_token": "e2", "entity_location": e2[2]}
                                        #                            , fix_special_chars=False
                                        #                            , relClassifier=relClassifier.classify, verbose=True
                                        #                            )

                                        #print(relResV)

                                        #print()
                                        #print()
                                        #print()

                                        errorByDetect[foundClasses["reg_detect"]] += 1



                                        incorrectClassified += 1



                                        #numelems -= 1
                                        #if numelems == 0:
                                        #    return
                                    else:

                                        if foundClasses["reg_detect_major"] == "return":
                                            pass
                                            #print(foundClasses)
                                            #print(sentText)
                                            #print(e1, e2)
                                            #print(foundTuple, foundClasses["reg_detect"], "passive?", foundClasses["passive"])
                                            #print(testTuple)
                                            #print()
                                            #print()
                                            #print()
                                        correctClassified += 1

                                        if acceptInteraction:
                                            correctInteractClassified += 1



                def printStats(outfile):
                    print("Total:     ", totalChecks, file=outfile)
                    print("Correct:   ", correctIdentified, correctIdentified/totalChecks, file=outfile)
                    print("Incorrect: ", incorrectIdentified, incorrectIdentified/totalChecks, file=outfile)
                    print("classes", incorrectClass, file=outfile)

                    precision = elemCaseCounter[(True, True)] / (elemCaseCounter[(True, True)]+elemCaseCounter[(True, False)])
                    recall = elemCaseCounter[(True, True)] / (elemCaseCounter[(True, True)]+elemCaseCounter[(False, True)])

                    f1 = 2* precision * recall / (precision+recall)

                    if (elemCaseCounter[(True, False)] + elemCaseCounter[(False, False)]) == 0:
                        specificity = 0
                    else:
                        specificity = elemCaseCounter[(False, False)] / (elemCaseCounter[(True, False)] + elemCaseCounter[(False, False)])

                    print("precision", precision, file=outfile)
                    print("recall", recall, file=outfile)
                    print("specificity", specificity, file=outfile)
                    print("f1", f1, file=outfile)

                    print("Correct classified:     ", correctClassified, correctClassified/totalClassified, correctClassified/totalValidClassified,file=outfile)
                    print("Incorrect classified:   ", incorrectClassified, incorrectClassified/totalClassified, incorrectClassified/totalValidClassified,file=outfile)
                    print("Random classified: ", totalMirGeneDownClassified, totalMirGeneDownClassified / totalClassified, totalMirGeneDownClassified/totalValidClassified, file=outfile)
                    print(errorByDetect)

                    print("Correct interaction&classified:     ", correctInteractClassified, correctInteractClassified/totalClassified, correctInteractClassified/totalValidClassified,file=outfile)

                    print("Classified by: ", classifiedByDetect)

                    lClassifyTrue = ["_".join(x) for x in classifyTrue]
                    lClassifyPred = ["_".join(x) for x in classifyPred]

                    allLabels = sorted(set(lClassifyTrue+lClassifyPred))
                    matrix = multilabel_confusion_matrix(lClassifyTrue, lClassifyPred, labels=allLabels)
                    print(matrix)
                    print(classification_report(lClassifyTrue,lClassifyPred))

                    return {'interaction': {'precision': precision, 'recall': recall, 'specificity': specificity, 'f1': f1}, 'regulation': classification_report(lClassifyTrue, lClassifyPred, output_dict=True) }

                rep = printStats(sys.stdout)
                check2results[(subset_checks_interaction, subset_checks_classification)] = rep
        #printStats(sys.stderr)
    return check2results
        


In [11]:
%aimport textmining.MirGeneRelCheck

relClassifier = SentenceRelationClassifier()

results_test_bionlp_bionlp_subseti = runCheck(-1, "TEST", nlp_ent, nlp_ent, subset_interactions=True, subset_classification=False)
results_train_bionlp_bionlp_subseti = runCheck(-1, "TRAIN", nlp_ent, nlp_ent, subset_interactions=True, subset_classification=False)

Loading relations scai_test_rels.tsv
Interactions 232
Non NA Interactions 138
Will test interaction: 32 subsets.
Will test classification: 1 subsets.
Testing checks ()
Testing checks ('compartment', 'between', 'counts', 'return')
Total:      232
Correct:    138 0.5948275862068966
Incorrect:  94 0.4051724137931034
classes Counter()
precision 1.0
recall 0.5948275862068966
specificity 0
f1 0.745945945945946
Correct classified:      127 0.5474137931034483 0.9202898550724637
Incorrect classified:    11 0.04741379310344827 0.07971014492753623
Random classified:  52 0.22413793103448276 0.37681159420289856
Counter({'counts between': 10, 'between mir gene': 1})
Correct interaction&classified:      127 0.5474137931034483 0.9202898550724637
Classified by:  Counter({'counts between': 66, 'counts between equal': 10, 'between gene mir': 10, 'counts opp': 8, 'compartment mir gene by': 8, 'return': 7, 'between mir gene': 6, 'compartment mir gene': 6, 'compartment gene mir': 5, 'between mir gene reg co

  _warn_prf(average, modifier, msg_start, len(result))


Total:      232
Correct:    144 0.6206896551724138
Incorrect:  88 0.3793103448275862
classes Counter()
precision 0.9130434782608695
recall 0.6237623762376238
specificity 0.6
f1 0.7411764705882353
Correct classified:      127 0.5474137931034483 0.9202898550724637
Incorrect classified:    11 0.04741379310344827 0.07971014492753623
Random classified:  52 0.22413793103448276 0.37681159420289856
Counter({'counts between': 10, 'between mir gene': 1})
Correct interaction&classified:      115 0.4956896551724138 0.8333333333333334
Classified by:  Counter({'counts between': 66, 'counts between equal': 10, 'between gene mir': 10, 'counts opp': 8, 'compartment mir gene by': 8, 'return': 7, 'between mir gene': 6, 'compartment mir gene': 6, 'compartment gene mir': 5, 'between mir gene reg corr': 5, 'compartment mir gene neg corr': 4, 'compartment gene mir by': 2, 'counts between alternating': 1})
[[[132   2]
  [  0   4]]

 [[137   0]
  [  0   1]]

 [[ 82   4]
  [  5  47]]

 [[ 54   3]
  [  6  75]]



In [12]:
%aimport textmining.MirGeneRelCheck

relClassifier = SentenceRelationClassifier()

results_test_scilg_bionlp_subseti = runCheck(-1, "TEST", nlp, nlp_ent, subset_interactions=True, subset_classification=False)
results_train_scilg_bionlp_subseti = runCheck(-1, "TRAIN", nlp, nlp_ent, subset_interactions=True, subset_classification=False)

Loading relations scai_test_rels.tsv
Interactions 232
Non NA Interactions 138
Will test interaction: 32 subsets.
Will test classification: 1 subsets.
Testing checks ()
Testing checks ('compartment', 'between', 'counts', 'return')
Total:      232
Correct:    138 0.5948275862068966
Incorrect:  94 0.4051724137931034
classes Counter()
precision 1.0
recall 0.5948275862068966
specificity 0
f1 0.745945945945946
Correct classified:      127 0.5474137931034483 0.9202898550724637
Incorrect classified:    11 0.04741379310344827 0.07971014492753623
Random classified:  52 0.22413793103448276 0.37681159420289856
Counter({'counts between': 10, 'between mir gene': 1})
Correct interaction&classified:      127 0.5474137931034483 0.9202898550724637
Classified by:  Counter({'counts between': 65, 'counts between equal': 10, 'between gene mir': 10, 'counts opp': 9, 'compartment mir gene by': 8, 'return': 7, 'between mir gene': 6, 'compartment mir gene': 6, 'compartment gene mir': 5, 'compartment gene mir by

In [13]:
%aimport textmining.MirGeneRelCheck

relClassifier = SentenceRelationClassifier()

results_test_spcylg_bionlp_subseti = runCheck(-1, "TEST", nlp_weblg, nlp_ent, subset_interactions=True, subset_classification=False)
results_train_spcylg_bionlp_subseti = runCheck(-1, "TRAIN", nlp_weblg, nlp_ent, subset_interactions=True, subset_classification=False)

Loading relations scai_test_rels.tsv
Interactions 232
Non NA Interactions 138
Will test interaction: 32 subsets.
Will test classification: 1 subsets.
Testing checks ()
Testing checks ('compartment', 'between', 'counts', 'return')
Total:      232
Correct:    138 0.5948275862068966
Incorrect:  94 0.4051724137931034
classes Counter()
precision 1.0
recall 0.5948275862068966
specificity 0
f1 0.745945945945946
Correct classified:      111 0.47844827586206895 0.8043478260869565
Incorrect classified:    27 0.11637931034482758 0.1956521739130435
Random classified:  52 0.22413793103448276 0.37681159420289856
Counter({'counts between': 24, 'between mir gene': 2, 'compartment mir gene': 1})
Correct interaction&classified:      111 0.47844827586206895 0.8043478260869565
Classified by:  Counter({'counts between': 76, 'between gene mir': 10, 'compartment mir gene by': 8, 'return': 7, 'between mir gene': 7, 'compartment mir gene': 7, 'counts between equal': 5, 'counts opp': 5, 'compartment gene mir': 

In [14]:
%aimport textmining.MirGeneRelCheck

relClassifier = SentenceRelationClassifier()

results_test_scilg_bionlp = runCheck(-1, "TEST", nlp, nlp_ent, subset_interactions=False, subset_classification=False)
results_train_scilg_bionlp = runCheck(-1, "TRAIN", nlp, nlp_ent, subset_interactions=False, subset_classification=False)

Loading relations scai_test_rels.tsv
Interactions 232
Non NA Interactions 138
Will test interaction: 1 subsets.
Will test classification: 1 subsets.
Testing checks ('conj', 'sdp', 'compartment', 'context', 'entity')
Testing checks ('compartment', 'between', 'counts', 'return')
Total:      232
Correct:    217 0.9353448275862069
Incorrect:  15 0.06465517241379311
classes Counter()
precision 0.927536231884058
recall 0.9624060150375939
specificity 0.898989898989899
f1 0.9446494464944649
Correct classified:      127 0.5474137931034483 0.9202898550724637
Incorrect classified:    11 0.04741379310344827 0.07971014492753623
Random classified:  52 0.22413793103448276 0.37681159420289856
Counter({'counts between': 10, 'between mir gene': 1})
Correct interaction&classified:      118 0.5086206896551724 0.855072463768116
Classified by:  Counter({'counts between': 65, 'counts between equal': 10, 'between gene mir': 10, 'counts opp': 9, 'compartment mir gene by': 8, 'return': 7, 'between mir gene': 6,

In [15]:
%aimport textmining.MirGeneRelCheck

relClassifier = SentenceRelationClassifier()

results_test_scilg_scilg = runCheck(-1, "TEST", nlp, nlp, subset_interactions=False, subset_classification=False)
results_train_scilg_scilg = runCheck(-1, "TRAIN", nlp, nlp, subset_interactions=False, subset_classification=False)

Loading relations scai_test_rels.tsv
Interactions 232
Non NA Interactions 138
Will test interaction: 1 subsets.
Will test classification: 1 subsets.
Testing checks ('conj', 'sdp', 'compartment', 'context', 'entity')
Testing checks ('compartment', 'between', 'counts', 'return')
Total:      232
Correct:    217 0.9353448275862069
Incorrect:  15 0.06465517241379311
classes Counter()
precision 0.927536231884058
recall 0.9624060150375939
specificity 0.898989898989899
f1 0.9446494464944649
Correct classified:      127 0.5474137931034483 0.9202898550724637
Incorrect classified:    11 0.04741379310344827 0.07971014492753623
Random classified:  52 0.22413793103448276 0.37681159420289856
Counter({'counts between': 10, 'between mir gene': 1})
Correct interaction&classified:      118 0.5086206896551724 0.855072463768116
Classified by:  Counter({'counts between': 65, 'counts between equal': 10, 'between gene mir': 10, 'counts opp': 9, 'compartment mir gene by': 8, 'return': 7, 'between mir gene': 6,

In [16]:
%aimport textmining.MirGeneRelCheck

relClassifier = SentenceRelationClassifier()

results_test_bionlp_bionlp = runCheck(-1, "TEST", nlp_ent, nlp_ent, subset_interactions=False, subset_classification=False)
results_train_bionlp_bionlp = runCheck(-1, "TRAIN", nlp_ent, nlp_ent, subset_interactions=False, subset_classification=False)

Loading relations scai_test_rels.tsv
Interactions 232
Non NA Interactions 138
Will test interaction: 1 subsets.
Will test classification: 1 subsets.
Testing checks ('conj', 'sdp', 'compartment', 'context', 'entity')
Testing checks ('compartment', 'between', 'counts', 'return')
Total:      232
Correct:    183 0.7887931034482759
Incorrect:  49 0.21120689655172414
classes Counter()
precision 0.8043478260869565
recall 0.8345864661654135
specificity 0.7272727272727273
f1 0.8191881918819188
Correct classified:      127 0.5474137931034483 0.9202898550724637
Incorrect classified:    11 0.04741379310344827 0.07971014492753623
Random classified:  52 0.22413793103448276 0.37681159420289856
Counter({'counts between': 10, 'between mir gene': 1})
Correct interaction&classified:      102 0.4396551724137931 0.7391304347826086
Classified by:  Counter({'counts between': 66, 'counts between equal': 10, 'between gene mir': 10, 'counts opp': 8, 'compartment mir gene by': 8, 'return': 7, 'between mir gene':

In [17]:
%aimport textmining.MirGeneRelCheck

relClassifier = SentenceRelationClassifier()

results_test_spcylg_bionlp = runCheck(-1, "TEST", nlp_weblg, nlp_ent, subset_interactions=False, subset_classification=False)
results_train_spcylg_bionlp = runCheck(-1, "TRAIN", nlp_weblg, nlp_ent, subset_interactions=False, subset_classification=False)

Loading relations scai_test_rels.tsv
Interactions 232
Non NA Interactions 138
Will test interaction: 1 subsets.
Will test classification: 1 subsets.
Testing checks ('conj', 'sdp', 'compartment', 'context', 'entity')
Testing checks ('compartment', 'between', 'counts', 'return')
Total:      232
Correct:    173 0.7456896551724138
Incorrect:  59 0.2543103448275862
classes Counter()
precision 0.7463768115942029
recall 0.8110236220472441
specificity 0.6666666666666666
f1 0.7773584905660377
Correct classified:      111 0.47844827586206895 0.8043478260869565
Incorrect classified:    27 0.11637931034482758 0.1956521739130435
Random classified:  52 0.22413793103448276 0.37681159420289856
Counter({'counts between': 24, 'between mir gene': 2, 'compartment mir gene': 1})
Correct interaction&classified:      89 0.38362068965517243 0.644927536231884
Classified by:  Counter({'counts between': 76, 'between gene mir': 10, 'compartment mir gene by': 8, 'return': 7, 'between mir gene': 7, 'compartment mir

In [18]:
%aimport textmining.MirGeneRelCheck

relClassifier = SentenceRelationClassifier()

results_test_scilg_bionlp_subsetc = runCheck(-1, "TEST", nlp, nlp_ent, subset_interactions=False, subset_classification=True)
results_train_scilg_bionlp_subsetc = runCheck(-1, "TRAIN", nlp, nlp_ent, subset_interactions=False, subset_classification=True)

Loading relations scai_test_rels.tsv
Interactions 232
Non NA Interactions 138
Will test interaction: 1 subsets.
Will test classification: 16 subsets.
Testing checks ('conj', 'sdp', 'compartment', 'context', 'entity')
Testing checks ()
Total:      232
Correct:    217 0.9353448275862069
Incorrect:  15 0.06465517241379311
classes Counter()
precision 0.927536231884058
recall 0.9624060150375939
specificity 0.898989898989899
f1 0.9446494464944649
Correct classified:      62 0.2672413793103448 0.4492753623188406
Incorrect classified:    76 0.3275862068965517 0.5507246376811594
Random classified:  52 0.22413793103448276 0.37681159420289856
Counter({'static': 76})
Correct interaction&classified:      53 0.22844827586206898 0.38405797101449274
Classified by:  Counter({'static': 128, 'between gene mir': 10})
[[[134   0]
  [  4   0]]

 [[137   0]
  [  1   0]]

 [[ 10  76]
  [  0  52]]

 [[ 57   0]
  [ 71  10]]]
               precision    recall  f1-score   support

 GENE_MIR_NEU       0.00      0

  _warn_prf(average, modifier, msg_start, len(result))


Total:      232
Correct:    217 0.9353448275862069
Incorrect:  15 0.06465517241379311
classes Counter()
precision 0.927536231884058
recall 0.9624060150375939
specificity 0.898989898989899
f1 0.9446494464944649
Correct classified:      85 0.36637931034482757 0.6159420289855072
Incorrect classified:    53 0.22844827586206898 0.38405797101449274
Random classified:  52 0.22413793103448276 0.37681159420289856
Counter({'static': 53})
Correct interaction&classified:      76 0.3275862068965517 0.5507246376811594
Classified by:  Counter({'static': 101, 'between gene mir': 10, 'compartment mir gene by': 8, 'compartment mir gene': 6, 'compartment gene mir': 5, 'compartment gene mir by': 4, 'compartment mir gene neg corr': 4})
[[[134   0]
  [  4   0]]

 [[137   0]
  [  1   0]]

 [[ 33  53]
  [  0  52]]

 [[ 57   0]
  [ 48  33]]]
               precision    recall  f1-score   support

 GENE_MIR_NEU       0.00      0.00      0.00         4
  GENE_MIR_UP       0.00      0.00      0.00         1
MIR_G

In [20]:
results_test_scilg_bionlp[('compartment',
  'between',
  'counts',
  'return')]

KeyError: ('compartment', 'between', 'counts', 'return')

In [19]:
! ls ..

BERTanalysis		 relexfiles.zip
allrels.csv		 scai_corpus
analysis		 scai_eval
bk_pickles		 scai_res
create_synonym_files.sh  scai_test_results.pickle
database		 scai_test_spacy_bionlp13cg_results.pickle
dbs			 scai_test_spacy_large_results.pickle
hgnc_no_withdrawn.syn	 scai_train_results.pickle
mirtex_benchmark	 scai_train_spacy_bionlp13cg_results.pickle
mongodb			 scai_train_spacy_large_results.pickle
nbs			 server
neutrophils		 sym2ens
plots			 synonymes
prediction		 textdb
pubmed			 textmining
relation_extraction	 utils
relexfiles


In [43]:
! mkdir ../scai_eval

In [21]:
with open("../scai_eval/results_test_scilg_bionlp.pickle", "wb") as fout:
    pickle.dump(results_test_scilg_bionlp, fout)   

with open("../scai_eval/results_train_scilg_bionlp.pickle", "wb") as fout:
    pickle.dump(results_train_scilg_bionlp, fout)

In [22]:
with open("../scai_eval/results_test_scilg_scilg.pickle", "wb") as fout:
    pickle.dump(results_test_scilg_scilg, fout)   

with open("../scai_eval/results_train_scilg_scilg.pickle", "wb") as fout:
    pickle.dump(results_train_scilg_scilg, fout)

In [23]:
with open("../scai_eval/results_test_bionlp_bionlp.pickle", "wb") as fout:
    pickle.dump(results_test_bionlp_bionlp, fout)   

with open("../scai_eval/results_train_bionlp_bionlp.pickle", "wb") as fout:
    pickle.dump(results_train_bionlp_bionlp, fout)

In [24]:
with open("../scai_eval/results_test_spcylg_bionlp.pickle", "wb") as fout:
    pickle.dump(results_test_spcylg_bionlp, fout)   

with open("../scai_eval/results_train_spcylg_bionlp.pickle", "wb") as fout:
    pickle.dump(results_train_spcylg_bionlp, fout)

In [26]:
! ls ../scai_eval

results_test_bionlp_bionlp.pickle
results_test_scilg_bionlp.pickle
results_test_scilg_bionlp_subsetc.pickle
results_test_scilg_bionlp_subseti.pickle
results_test_scilg_scilg.pickle
results_test_spcylg_bionlp.pickle
results_test_spcylg_bionlp_subseti.pickle
results_train_bionlp_bionlp.pickle
results_train_scilg_bionlp.pickle
results_train_scilg_bionlp_subsetc.pickle
results_train_scilg_bionlp_subseti.pickle
results_train_scilg_scilg.pickle
results_train_spcylg_bionlp.pickle
results_train_spcylg_bionlp_subseti.pickle


In [25]:
with open("../scai_eval/results_test_scilg_bionlp_subseti.pickle", "wb") as fout:
    pickle.dump(results_test_scilg_bionlp_subseti, fout)   

with open("../scai_eval/results_train_scilg_bionlp_subseti.pickle", "wb") as fout:
    pickle.dump(results_train_scilg_bionlp_subseti, fout)

In [26]:
with open("../scai_eval/results_test_scilg_bionlp_subsetc.pickle", "wb") as fout:
    pickle.dump(results_test_scilg_bionlp_subsetc, fout)   

with open("../scai_eval/results_train_scilg_bionlp_subsetc.pickle", "wb") as fout:
    pickle.dump(results_train_scilg_bionlp_subsetc, fout)

In [27]:
with open("../scai_eval/results_test_spcylg_bionlp_subseti.pickle", "wb") as fout:
    pickle.dump(results_test_spcylg_bionlp_subseti, fout)   

with open("../scai_eval/results_train_spcylg_bionlp_subseti.pickle", "wb") as fout:
    pickle.dump(results_train_spcylg_bionlp_subseti, fout)

In [28]:
with open("../scai_eval/results_test_bionlp_bionlp_subseti.pickle", "wb") as fout:
    pickle.dump(results_test_bionlp_bionlp_subseti, fout)   

with open("../scai_eval/results_train_bionlp_bionlp_subseti.pickle", "wb") as fout:
    pickle.dump(results_train_bionlp_bionlp_subseti, fout)

In [8]:
%aimport textmining.MirGeneRelCheck

scaiBase = "/mnt/d/owncloud/data/miRExplore/scai_corpus/"

normGeneSymbols= normalize_gene_names(path=scaiBase + "/../obodir/" + "/hgnc_no_withdrawn.syn")

def loadRelations(relsFile):
    reldir2new = {}
    reldir2new["TARGET"] = "NEU"
    reldir2new["INVCORR"] = "NEU"
    reldir2new["REGULATE"] = "NEU"
    relClasses = {}
    with open(os.path.join(scaiBase, relsFile)) as fin:
        
        print("Loading relations", relsFile)
        
        for lidx, line in enumerate(fin):

            if lidx == 0:
                continue

            line = line.strip().split("\t")

            interactionID = line[0].strip()
            relType = line[1].strip()
            relRDir = line[2].strip()
            relIndirect = line[3].strip()
            relPassive = line[4].strip()

            if relRDir in reldir2new:
                relRDir = reldir2new[relRDir]

            relClasses[interactionID] = (relType, relRDir, relIndirect, relPassive)
            
        print("Interactions", len(relClasses))
        print("Non NA Interactions", len([x for x in relClasses if not relClasses[x][0] == "NA"]))
            
    return relClasses

def all_subsets(ss):
    
    return chain(*map(lambda x: combinations(ss, x), range(0, len(ss)+1)))

def runCheck(numelems, base, nlp, nlp_ent, subset_interactions=False, subset_classification=False):

    relChecker = SentenceRelationChecker(None)
    relClassifier = SentenceRelationClassifier()
    
    if base == "TRAIN":
        scaiFile = "miRNA-Train-Corupus.xml"
        relsFile = "scai_train_rels.tsv"
    elif base == "TEST":
        scaiFile = "miRNA-Test-Corpus.xml"
        relsFile = "scai_test_rels.tsv"
    
    relClasses = loadRelations(relsFile)
    
    check2results = {}
    checkInteractionSubsets = [x for x in all_subsets(relChecker.relCheck.checks_available)]
    checkClassifierSubsets = [x for x in all_subsets(relClassifier.major_checks)]
    #checkSubsets = [checkSubsets[-1]]
    
    if not subset_interactions:
        checkInteractionSubsets = [checkInteractionSubsets[-1]]
        
    if not subset_classification:
        checkClassifierSubsets = [checkClassifierSubsets[-1]]
    
    print("Will test interaction: {} subsets.".format(len(checkInteractionSubsets)))
    print("Will test classification: {} subsets.".format(len(checkClassifierSubsets)))
    
    for subset_checks_interaction in checkInteractionSubsets:
        for subset_checks_classification in checkClassifierSubsets:

            print("Testing checks", subset_checks_interaction)
            print("Testing checks", subset_checks_classification)
            
            relChecker = SentenceRelationChecker(nlp, nlp_ent=nlp_ent, active_checks=subset_checks_interaction)
            relClassifier = SentenceRelationClassifier(active_checks=subset_checks_classification)

            with open(os.path.join(scaiBase, scaiFile), 'r') as fin:
                tree = etree.parse(fin)
                root = tree.getroot()
                scaiPairs = []
                totalChecks = 0
                correctIdentified = 0
                incorrectIdentified = 0

                incorrectClassified = 0
                correctClassified = 0
                totalClassified = 0
                correctInteractClassified = 0
                totalValidClassified = 0
                totalClassifiable = 0
                totalMirGeneDownClassified = 0

                errorByDetect = Counter()
                classifiedByDetect = Counter()
                elemCaseCounter = Counter()
                incorrectClass = Counter()

                classifyTrue = []
                classifyPred = []

                for elem in root.findall(".//document"):

                    pmid = elem.attrib['origId']

                    for sentElem in elem:

                        allEntities = sentElem.findall(".//entity")
                        allPairs = sentElem.findall(".//pair")

                        sentText = sentElem.attrib["text"]

                        entId2elem = {}

                        for entity in allEntities:
                            entId = entity.attrib['id']
                            entText = entity.attrib['text']
                            entType = entity.attrib['type']
                            entOffset = tuple([int(x) for x in entity.attrib['charOffset'].split("-")])

                            if entType in ["Specific_miRNAs", "Genes/Proteins"]:

                                if "Genes" in entType:
                                    if entText in normGeneSymbols:
                                        entText = normGeneSymbols[entText]
                                    elif entText.upper() in normGeneSymbols:
                                        gene = normGeneSymbols[entText.upper()]
                                else:
                                    try:
                                        entText = miRNA(entText).getStringFromParts([miRNAPART.MATURE, miRNAPART.ID, miRNAPART.PRECURSOR])
                                    except:
                                        pass

                                entTuple = (entText, entType, (entOffset[0], entOffset[1]+1))
                                entId2elem[entId] = entTuple


                                sentEntText = sentText[entTuple[2][0]:entTuple[2][1]]

                        for pair in allPairs:

                            validInteraction = pair.attrib['interaction'].lower() == "true"
                            pairE1 = pair.attrib['e1']
                            pairE2 = pair.attrib['e2']

                            #if pairInt == 'true':
                            if pairE1 in entId2elem and pairE2 in entId2elem:

                                totalChecks += 1

                                e1 = entId2elem[pairE1]
                                e2 = entId2elem[pairE2]

                                if not e1[1] in ["Specific_miRNAs"]:

                                    tmp=e1
                                    e1=e2
                                    e2=tmp


                                relRes = relChecker.check_sentence(sentText
                                                                    , {"entity_type": "mirna", "entity_type_token": "e1", "entity_location": e1[2]}
                                                                    , {"entity_type": "gene", "entity_type_token": "e2", "entity_location": e2[2]}
                                                                    , fix_special_chars=False
                                                                    , relClassifier=relClassifier.classify, verbose=False
                                                                    )


                                fullsentence = relRes['full_sentence']
                                acceptInteraction = relRes['accept_relation']

                                if not acceptInteraction == validInteraction:

                                    incorrectIdentified += 1

                                    """
                                    relResV=relChecker.check_sentence(sentText
                                                                    , {"entity_type": "mirna", "entity_type_token": "e1", "entity_location": e1[2]}
                                                                    , {"entity_type": "gene", "entity_type_token": "e2", "entity_location": e2[2]}
                                                                    , fix_special_chars=False
                                                                    , relClassifier=relClassifier.classify, verbose=True
                                                                    )

                                    print(relResV)
                                    """
                                else:
                                    correctIdentified += 1

                                elemCaseCounter[(validInteraction, acceptInteraction)]+=1

                                totalClassified += 1

                                if validInteraction:# and acceptInteraction:
                                    #print(relRes["check_results"])   
                                    totalValidClassified += 1

                                    foundClasses = relRes["check_results"]["classification"] #{'regulation_dir': 'NEU', 'interaction_dir': 'MIR_GENE', 'regulation_keys': set()}
                                    foundTuple = (foundClasses["interaction_dir"], foundClasses["regulation_dir"])                                
                                    relationID = pair.attrib["id"]
                                    testTuple = relClasses[relationID]

                                    classifyTrue.append((testTuple[0], testTuple[1]))
                                    classifyPred.append((foundTuple[0], foundTuple[1]))

                                    if testTuple[0] == "MIR_GENE" and testTuple[1] == "DOWN":
                                        totalMirGeneDownClassified += 1

                                    classifiedByDetect[foundClasses["reg_detect"]] += 1

                                    if testTuple[0] != foundTuple[0] or testTuple[1] != foundTuple[1]:


                                        #print(sentText)
                                        #print(e1, e2)
                                        #print(foundTuple, foundClasses["reg_detect"], "passive?", foundClasses["passive"])
                                        #print(testTuple)

                                        #print()
                                        #print()
                                        #print()

                                        #print(relationID)
                                        #print("IS: ", foundTuple)
                                        #print("SB: ", testTuple)
                                        #print(relRes)

                                        #relResV=relChecker.check_sentence(sentText
                                        #                            , {"entity_type": "mirna", "entity_type_token": "e1", "entity_location": e1[2]}
                                        #                            , {"entity_type": "gene", "entity_type_token": "e2", "entity_location": e2[2]}
                                        #                            , fix_special_chars=False
                                        #                            , relClassifier=relClassifier.classify, verbose=True
                                        #                            )

                                        #print(relResV)

                                        #print()
                                        #print()
                                        #print()

                                        errorByDetect[foundClasses["reg_detect"]] += 1



                                        incorrectClassified += 1



                                        #numelems -= 1
                                        #if numelems == 0:
                                        #    return
                                    else:

                                        if foundClasses["reg_detect_major"] == "return":
                                            pass
                                            #print(foundClasses)
                                            #print(sentText)
                                            #print(e1, e2)
                                            #print(foundTuple, foundClasses["reg_detect"], "passive?", foundClasses["passive"])
                                            #print(testTuple)
                                            #print()
                                            #print()
                                            #print()
                                        correctClassified += 1

                                        if acceptInteraction:
                                            correctInteractClassified += 1



                def printStats(outfile):
                    print("Total:     ", totalChecks, file=outfile)
                    print("Correct:   ", correctIdentified, correctIdentified/totalChecks, file=outfile)
                    print("Incorrect: ", incorrectIdentified, incorrectIdentified/totalChecks, file=outfile)
                    print("classes", incorrectClass, file=outfile)

                    precision = elemCaseCounter[(True, True)] / (elemCaseCounter[(True, True)]+elemCaseCounter[(True, False)])
                    recall = elemCaseCounter[(True, True)] / (elemCaseCounter[(True, True)]+elemCaseCounter[(False, True)])

                    f1 = 2* precision * recall / (precision+recall)

                    if (elemCaseCounter[(True, False)] + elemCaseCounter[(False, False)]) == 0:
                        specificity = 0
                    else:
                        specificity = elemCaseCounter[(False, False)] / (elemCaseCounter[(True, False)] + elemCaseCounter[(False, False)])

                    print("precision", precision, file=outfile)
                    print("recall", recall, file=outfile)
                    print("specificity", specificity, file=outfile)
                    print("f1", f1, file=outfile)

                    print("Correct classified:     ", correctClassified, correctClassified/totalClassified, correctClassified/totalValidClassified,file=outfile)
                    print("Incorrect classified:   ", incorrectClassified, incorrectClassified/totalClassified, incorrectClassified/totalValidClassified,file=outfile)
                    print("Random classified: ", totalMirGeneDownClassified, totalMirGeneDownClassified / totalClassified, totalMirGeneDownClassified/totalValidClassified, file=outfile)
                    print(errorByDetect)

                    print("Correct interaction&classified:     ", correctInteractClassified, correctInteractClassified/totalClassified, correctInteractClassified/totalValidClassified,file=outfile)

                    print("Classified by: ", classifiedByDetect)

                    lClassifyTrue = ["_".join(x) for x in classifyTrue]
                    lClassifyPred = ["_".join(x) for x in classifyPred]

                    allLabels = sorted(set(lClassifyTrue+lClassifyPred))
                    matrix = multilabel_confusion_matrix(lClassifyTrue, lClassifyPred, labels=allLabels)
                    print(matrix)
                    print(classification_report(lClassifyTrue,lClassifyPred))

                    return {'interaction': {'precision': precision, 'recall': recall, 'specificity': specificity, 'f1': f1}, 'regulation': classification_report(lClassifyTrue, lClassifyPred, output_dict=True) }

                rep = printStats(sys.stdout)
                check2results[(subset_checks_interaction, subset_checks_classification)] = rep
        #printStats(sys.stderr)
    return check2results
        


In [9]:
%aimport textmining.MirGeneRelCheck

relClassifier = SentenceRelationClassifier()

orig_scai_results_test_scilg_bionlp_subseti = runCheck(-1, "TEST", nlp, nlp_ent, subset_interactions=False, subset_classification=False)

Loading relations scai_test_rels.tsv
Interactions 232
Non NA Interactions 138
Will test interaction: 1 subsets.
Will test classification: 1 subsets.
Testing checks ('conj', 'sdp', 'compartment', 'context', 'entity')
Testing checks ('compartment', 'between', 'counts', 'return')
Total:      232
Correct:    204 0.8793103448275862
Incorrect:  28 0.1206896551724138
classes Counter()
precision 0.926829268292683
recall 0.8571428571428571
specificity 0.9090909090909091
f1 0.890625
Correct classified:      109 0.4698275862068966 0.8861788617886179
Incorrect classified:    14 0.0603448275862069 0.11382113821138211
Random classified:  41 0.17672413793103448 0.3333333333333333
Counter({'counts between': 10, 'between mir gene': 1, 'counts opp': 1, 'counts between equal': 1, 'between gene mir': 1})
Correct interaction&classified:      106 0.45689655172413796 0.8617886178861789
Classified by:  Counter({'counts between': 62, 'between gene mir': 10, 'counts opp': 8, 'counts between equal': 8, 'compartm

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
