In [1]:
%load_ext autoreload
%autoreload 1

In [21]:
import os, sys
sys.path.insert(0, str("../"))

from collections import defaultdict, Counter
from lxml import etree
from utils.tmutils import normalize_gene_names



In [None]:
import scispacy
import spacy
from spacy import displacy

# Model locations default to the original local install, but can be overridden
# through environment variables so the notebook is not tied to one machine.
SCI_MODEL_PATH = os.environ.get(
    "SCISPACY_SCI_MODEL",
    '/mnt/f/spacy/en_core_sci_lg-0.2.5/en_core_sci_lg/en_core_sci_lg-0.2.5/',
)
NER_MODEL_PATH = os.environ.get(
    "SCISPACY_NER_MODEL",
    "/mnt/f/spacy/en_ner_bionlp13cg_md-0.2.5/en_ner_bionlp13cg_md/en_ner_bionlp13cg_md-0.2.5",
)

# analyse() uses nlp for the token/dependency view and nlp_ent for the entities.
nlp = spacy.load(SCI_MODEL_PATH)
nlp_ent = spacy.load(NER_MODEL_PATH)

import textmining.MirGeneRelCheck
from intervaltree import Interval, IntervalTree

In [33]:

def analyse(testCase, dist=80):
    """Inspect one sentence: dump its parse, run the relation check, render it.

    Args:
        testCase: Sequence ``(sentence, tokenIdxA, tokenIdxB)``. The two
            indices address tokens in the ``nlp`` parse of ``sentence``.
        dist: Value handed to displacy as the ``distance`` render option.

    Returns:
        IntervalTree whose intervals are the character spans of the entities
        found by ``nlp_ent``; each interval carries a ``(label, text)`` tuple.
    """
    sentence = testCase[0]
    relCheck = textmining.MirGeneRelCheck.MirGeneRelCheck()

    doc = nlp(sentence)
    # The NER pipeline is run only once; the same document is rendered below.
    entDoc = nlp_ent(sentence)

    enttree = IntervalTree()
    for ent in entDoc.ents:
        enttree.addi(ent.start_char, ent.end_char, (ent.label_, ent.text))

    lWord = doc[testCase[1]]
    rWord = doc[testCase[2]]

    # One line per token: char span, index, text, POS, dependency, head and
    # the entities overlapping the token's character span.
    for t in doc:
        tokenEnd = t.idx + len(t.text)
        ents = {interval.data for interval in enttree.overlap(t.idx, tokenEnd)}

        print(t.idx, tokenEnd, t.i, t, t.pos_, t.dep_, t.head, t.head.idx, ents)

    print(lWord, rWord)

    # Swap the two tokens when the first one does not look like a miRNA
    # (its text contains neither "mir" nor "let").
    leftText = str(lWord).lower()
    if "mir" not in leftText and "let" not in leftText:
        lWord, rWord = rWord, lWord

    print(lWord, rWord)

    checkResults = relCheck.checkRelation(doc, lWord, rWord, verbose=True)
    print(checkResults)

    renderOptions = {"compact": True, "distance": dist}
    displacy.render(doc, style="dep", options=renderOptions)
    displacy.render(entDoc, style="ent", options=renderOptions)

    return enttree

In [22]:
# Gene-name normalisation table built from the synonym file; scoreAnalysis
# looks entity texts up in it (membership test followed by indexing).
normGeneSymbols = normalize_gene_names(path="../hgnc_no_withdrawn.syn")

In [10]:
# Name of the corpus split; both file names below are derived from it.
testfile = "test"

relexFile = "scai_{}_relex.out".format(testfile)
scaiFile = "miRNA_{}_fixed.xml".format(testfile)

In [11]:
# Sentence IDs for which a line follows a "#RELATIONS:" header in the relex
# output (presumably that line holds a reported relation - verify against the
# relex output format).
relexAccepted = []

with open("../relexfiles/"+relexFile) as fin:

    curSentID = None
    wasRelation = False

    for line in fin:

        # A ">" line starts a new record; the rest of the line is its ID.
        if line.startswith(">"):
            curSentID = line.strip()[1:]

        isRelationHeader = line.startswith("#RELATIONS:")

        # Only the line directly after a header counts, and a header that
        # directly follows another header does not.
        if wasRelation and not isRelationHeader:
            relexAccepted.append(curSentID)

        wasRelation = isRelationHeader

    print("Relex Hits", len(relexAccepted))

Relex Hits 75


In [9]:
def printStats(outfile, totalChecks=None, elemCaseCounter=None):
    """Write the confusion counts and derived scores to ``outfile``.

    Args:
        outfile: Writable text stream, used as ``print(..., file=outfile)``.
        totalChecks: Number of evaluated pairs. When omitted, the module-level
            name ``totalChecks`` is used.
        elemCaseCounter: Mapping from ``(accepted, gold)`` boolean tuples to
            counts; it is indexed with all four combinations. When omitted,
            the module-level name ``elemCaseCounter`` is used.

    Raises:
        NameError: If an argument is omitted and no module-level variable of
            that name exists.

    A score whose denominator is zero is printed as ``nan`` instead of
    raising ``ZeroDivisionError``.
    """
    def _moduleLevel(name):
        # The parameters shadow the module-level names, so look them up explicitly.
        try:
            return globals()[name]
        except KeyError:
            raise NameError("name '{}' is not defined".format(name))

    def _ratio(numerator, denominator):
        return numerator / denominator if denominator else float("nan")

    if totalChecks is None:
        totalChecks = _moduleLevel("totalChecks")
    if elemCaseCounter is None:
        elemCaseCounter = _moduleLevel("elemCaseCounter")

    # Keys are (accepted by relex, interaction according to the corpus).
    truePos = elemCaseCounter[(True, True)]
    falsePos = elemCaseCounter[(True, False)]
    falseNeg = elemCaseCounter[(False, True)]
    trueNeg = elemCaseCounter[(False, False)]

    print("Total:     ", totalChecks, file=outfile)
    print(file=outfile)
    print(file=outfile)
    print(file=outfile)
    print(file=outfile)
    print("T,T", truePos, file=outfile)
    print("T,F", falsePos, file=outfile)
    print("F,T", falseNeg, file=outfile)
    print("F,F", trueNeg, file=outfile)

    precision = _ratio(truePos, truePos + falsePos)
    recall = _ratio(truePos, truePos + falseNeg)

    f1 = _ratio(2 * precision * recall, precision + recall)

    specificity = _ratio(trueNeg, falsePos + trueNeg)

    print("precision", precision, file=outfile)
    print("recall", recall, file=outfile)
    print("specificity", specificity, file=outfile)
    print("f1", f1, file=outfile)

In [35]:

def scoreAnalysis(printCase=(True, False)):
    """Compare the relex decisions with the interactions annotated in the corpus.

    Every annotated pair whose two entities are both of type
    ``Specific_miRNAs`` or ``Genes/Proteins`` is numbered consecutively.
    Pair ``n`` counts as accepted when the ID ``"n.0.0"`` occurs in
    ``relexAccepted``. The ``(accepted, annotated)`` outcome of each pair is
    tallied, pairs whose outcome equals ``printCase`` are printed, and the
    summary is written to stdout through ``printStats``.

    Args:
        printCase: ``(accepted, annotated)`` tuple selecting the pairs to print.

    Side effects:
        Rebinds the module-level names ``totalChecks`` and ``elemCaseCounter``
        to the counts of this run.
    """
    # printStats reads these two names from module scope. As plain locals they
    # were invisible to it, so the summary showed stale values (or raised
    # NameError on a fresh kernel).
    global totalChecks, elemCaseCounter

    totalChecks = 0
    elemCaseCounter = Counter()
    relationNum = 0

    # Set membership instead of scanning the list once per pair.
    acceptedIds = set(relexAccepted)

    with open(os.path.join("../scai_corpus", scaiFile), 'r') as fin:
        tree = etree.parse(fin)

    for elem in tree.getroot().findall(".//document"):

        for sentElem in elem:

            sentText = sentElem.attrib["text"]

            # Entity id -> (text, type, (start, end)) with an exclusive end.
            entId2elem = {}

            for entity in sentElem.findall(".//entity"):
                entId = entity.attrib['id']
                entText = entity.attrib['text']
                entType = entity.attrib['type']
                entOffset = tuple(int(x) for x in entity.attrib['charOffset'].split("-"))

                if entType not in ("Specific_miRNAs", "Genes/Proteins"):
                    continue

                if "Genes" in entType:
                    if entText in normGeneSymbols:
                        entText = normGeneSymbols[entText]
                    elif entText.upper() in normGeneSymbols:
                        # Previously assigned to an unused name, which dropped
                        # the normalised symbol.
                        entText = normGeneSymbols[entText.upper()]
                else:
                    try:
                        entText = miRNA(entText).getStringFromParts([miRNAPART.MATURE, miRNAPART.ID, miRNAPART.PRECURSOR])
                    except Exception:
                        # Best effort: keep the raw text when normalisation fails.
                        # NOTE(review): miRNA / miRNAPART are not imported in the
                        # visible cells; if they are undefined, the NameError is
                        # swallowed here and no miRNA name is ever normalised.
                        pass

                # charOffset end is shifted by one to get an exclusive slice end.
                entId2elem[entId] = (entText, entType, (entOffset[0], entOffset[1]+1))

            for pair in sentElem.findall(".//pair"):

                validInteraction = pair.attrib['interaction'].lower() == "true"
                pairE1 = pair.attrib['e1']
                pairE2 = pair.attrib['e2']

                if pairE1 not in entId2elem or pairE2 not in entId2elem:
                    continue

                totalChecks += 1

                e1 = entId2elem[pairE1]
                e2 = entId2elem[pairE2]

                # Put the miRNA first when the pair lists it second.
                if e1[1] != "Specific_miRNAs":
                    e1, e2 = e2, e1

                # Pairs are matched to relexAccepted by running number, not by
                # the pair's own id attribute.
                relationID = "{}.0.0".format(relationNum)
                relationNum += 1

                e1Word = sentText[e1[2][0]:e1[2][1]]
                e2Word = sentText[e2[2][0]:e2[2][1]]

                acceptInteraction = relationID in acceptedIds

                elemCase = (acceptInteraction, validInteraction)
                elemCaseCounter[elemCase] += 1

                if elemCase == printCase:
                    print(elemCase, str((sentText.strip(), e1Word, e2Word)))

    printStats(sys.stdout)


In [38]:
# List the pairs that relex accepted although the corpus does not mark them as an interaction.
scoreAnalysis(printCase=(True, False))

(True, False) ('Moreover, the levels of the apl-1 transcription are modulated by the activity of let-7 family microRNAs.', 'let-7', 'apl-1')
(True, False) ('C. elegans apl-1 shows significant genetic interactions with let-7 family microRNAs and let-7-targeted heterochronic genes, hbl-1, lin-41 and lin-42.', 'let-7', 'apl-1')
(True, False) ('C. elegans apl-1 shows significant genetic interactions with let-7 family microRNAs and let-7-targeted heterochronic genes, hbl-1, lin-41 and lin-42.', 'let-7', 'hbl-1')
(True, False) ('C. elegans apl-1 shows significant genetic interactions with let-7 family microRNAs and let-7-targeted heterochronic genes, hbl-1, lin-41 and lin-42.', 'let-7', 'lin-41')
(True, False) ('The data strongly suggested that a regulatory loop between miR-21 and STAT3 might provide an insight into the mechanism of modulating EGFR/STAT3 signaling.', 'miR-21', 'EGFR')
(True, False) ('The data strongly suggested that a regulatory loop between miR-21 and STAT3 might provide an

In [34]:
# Test case = (sentence, token index, token index); per the output below,
# token 0 is "miR-206" and token 9 is "histone".
enttree = analyse(
('miR-206 mediates these effects at least in part through histone deacetylase 4 and fibroblast growth factor signaling pathways.', 0, 9)
)

0 7 0 miR-206 NOUN nsubj mediates 8 set()
8 16 1 mediates VERB ROOT mediates 8 set()
17 22 2 these DET det effects 23 set()
23 30 3 effects NOUN dobj mediates 8 set()
31 33 4 at ADP advmod part 43 set()
34 39 5 least ADJ mwe at 31 set()
40 42 6 in ADP case part 43 set()
43 47 7 part NOUN nmod mediates 8 set()
48 55 8 through ADP case deacetylase 64 set()
56 63 9 histone NOUN compound deacetylase 64 {('GENE_OR_GENE_PRODUCT', 'histone deacetylase 4')}
64 75 10 deacetylase NOUN nmod part 43 {('GENE_OR_GENE_PRODUCT', 'histone deacetylase 4')}
76 77 11 4 NUM nummod deacetylase 64 {('GENE_OR_GENE_PRODUCT', 'histone deacetylase 4')}
78 81 12 and CCONJ cc deacetylase 64 set()
82 92 13 fibroblast NOUN compound pathways 117 {('GENE_OR_GENE_PRODUCT', 'fibroblast growth factor')}
93 99 14 growth NOUN compound pathways 117 {('GENE_OR_GENE_PRODUCT', 'fibroblast growth factor')}
100 106 15 factor NOUN compound pathways 117 {('GENE_OR_GENE_PRODUCT', 'fibroblast growth factor')}
107 116 16 signaling NO

In [39]:
# List the pairs that the corpus marks as an interaction but relex did not accept.
scoreAnalysis(printCase=(False, True))

(False, True) ('Our data suggests that down-regulation of miRNA-128 may contribute to glioma and GBM, in part, by coordinately up-regulating ARP5 (ANGPTL6), Bmi-1 and E2F-3a, resulting in the proliferation of undifferentiated GBM cells.', 'miRNA-128', 'ARP5')
(False, True) ('Our data suggests that down-regulation of miRNA-128 may contribute to glioma and GBM, in part, by coordinately up-regulating ARP5 (ANGPTL6), Bmi-1 and E2F-3a, resulting in the proliferation of undifferentiated GBM cells.', 'miRNA-128', 'ANGPTL6')
(False, True) ('Our data suggests that down-regulation of miRNA-128 may contribute to glioma and GBM, in part, by coordinately up-regulating ARP5 (ANGPTL6), Bmi-1 and E2F-3a, resulting in the proliferation of undifferentiated GBM cells.', 'miRNA-128', 'Bmi-1')
(False, True) ("Addition of exogenous miRNA-128 to CRL-1690 and CRL-2610 GBM cell lines (a) restored 'homeostatic' ARP5 (ANGPTL6), Bmi-1 and E2F-3a expression, and (b) significantly decreased the proliferation of CRL

In [41]:
# Test case = (sentence, token index, token index); per the output below,
# token 3 is "miRNA-433" and token 7 is "FGF20".
enttree = analyse(
('Variation in the miRNA-433 binding site of FGF20 confers risk for Parkinson disease by overexpression of alpha-synuclein.', 3, 7)
)

0 9 0 Variation NOUN nsubj confers 49 set()
10 12 1 in ADP case site 35 set()
13 16 2 the DET det site 35 set()
17 26 3 miRNA-433 NOUN amod site 35 set()
27 34 4 binding NOUN compound site 35 set()
35 39 5 site NOUN nmod Variation 0 set()
40 42 6 of ADP case FGF20 43 set()
43 48 7 FGF20 NOUN nmod site 35 {('GENE_OR_GENE_PRODUCT', 'FGF20')}
49 56 8 confers VERB ROOT confers 49 set()
57 61 9 risk NOUN dobj confers 49 set()
62 65 10 for ADP case disease 76 set()
66 75 11 Parkinson NOUN compound disease 76 set()
76 83 12 disease NOUN nmod risk 57 set()
84 86 13 by ADP case overexpression 87 set()
87 101 14 overexpression NOUN nmod confers 49 set()
102 104 15 of ADP case alpha-synuclein 105 set()
105 120 16 alpha-synuclein NOUN nmod overexpression 87 {('GENE_OR_GENE_PRODUCT', 'alpha-synuclein')}
120 121 17 . PUNCT punct confers 49 set()
miRNA-433 FGF20
miRNA-433 FGF20
Conjunctions
SDP sent: Variation in the miRNA-433 binding site of FGF20 confers risk for Parkinson disease by overexpression

In [42]:
# Test case = (sentence, token index, token index); per the output below,
# token 9 is "miR-107" and token 15 is "GRN".
enttree = analyse(
('Glucose supplementation in cultured cells that leads to increased miR-107 levels also results in decreased GRN expression, including changes in cell compartmentation and decreased secretion of GRN protein.', 9, 15 )
)

0 7 0 Glucose NOUN compound supplementation 8 {('SIMPLE_CHEMICAL', 'Glucose')}
8 23 1 supplementation NOUN nsubj results 86 set()
24 26 2 in ADP case cells 36 set()
27 35 3 cultured VERB amod cells 36 set()
36 41 4 cells NOUN nmod supplementation 8 {('CELL', 'cells')}
42 46 5 that DET nsubj leads 47 set()
47 52 6 leads VERB acl:relcl cells 36 set()
53 55 7 to PART case levels 74 set()
56 65 8 increased VERB amod levels 74 set()
66 73 9 miR-107 ADJ compound levels 74 {('GENE_OR_GENE_PRODUCT', 'miR-107')}
74 80 10 levels NOUN nmod leads 47 set()
81 85 11 also ADV advmod results 86 set()
86 93 12 results VERB ROOT results 86 set()
94 96 13 in ADP case expression 111 set()
97 106 14 decreased VERB amod expression 111 set()
107 110 15 GRN NOUN compound expression 111 set()
111 121 16 expression NOUN nmod results 86 set()
121 122 17 , PUNCT punct expression 111 set()
123 132 18 including VERB case changes 133 set()
133 140 19 changes NOUN nmod expression 111 set()
141 143 20 in ADP case comp

In [44]:
# Test case = (sentence, token index, token index); per the output below,
# token 6 is "miRNA-128" and token 20 is "ARP5".
enttree = analyse(
('Our data suggests that down-regulation of miRNA-128 may contribute to glioma and GBM, in part, by coordinately up-regulating ARP5 (ANGPTL6), Bmi-1 and E2F-3a, resulting in the proliferation of undifferentiated GBM cells.', 6, 20)
)

0 3 0 Our DET nmod:poss data 4 set()
4 8 1 data NOUN nsubj suggests 9 set()
9 17 2 suggests VERB ROOT suggests 9 set()
18 22 3 that SCONJ mark contribute 56 set()
23 38 4 down-regulation NOUN nsubj contribute 56 set()
39 41 5 of ADP case miRNA-128 42 set()
42 51 6 miRNA-128 NOUN nmod down-regulation 23 set()
52 55 7 may VERB aux contribute 56 set()
56 66 8 contribute VERB ccomp suggests 9 set()
67 69 9 to PART case glioma 70 set()
70 76 10 glioma NOUN nmod contribute 56 {('CANCER', 'glioma')}
77 80 11 and CCONJ cc glioma 70 set()
81 84 12 GBM NOUN conj glioma 70 {('CANCER', 'GBM')}
84 85 13 , PUNCT punct glioma 70 set()
86 88 14 in ADP case part 89 set()
89 93 15 part NOUN nmod contribute 56 set()
93 94 16 , PUNCT punct contribute 56 set()
95 97 17 by ADP mark up-regulating 111 set()
98 110 18 coordinately ADV advmod up-regulating 111 set()
111 124 19 up-regulating VERB advcl contribute 56 set()
125 129 20 ARP5 NOUN dobj up-regulating 111 {('GENE_OR_GENE_PRODUCT', 'ARP5')}
130 131 21 (