## Run this set of cells once after restarting the kernel

In [None]:
# Enable IPython's autoreload extension in mode 2 for this kernel.
%load_ext autoreload
%autoreload 2
import os
import sys
# Must set SNORKELDB before importing SnorkelSession.
# NOTE(review): nothing in this cell sets it, so it is presumably exported in the
# environment that launched the notebook -- confirm.
from snorkel import SnorkelSession
from snorkel.parser import TextDocPreprocessor
from snorkel.parser import CorpusParser
from snorkel.models import Document, Sentence
# Session object used by the queries and viewers in the cells below.
session = SnorkelSession()

In [2]:
print "Documents:", session.query(Document).count()
print "Sentences:", session.query(Sentence).count()

Documents: 400
Sentences: 95656


In [3]:
from snorkel.models import candidate_subclass

# Two-slot candidate with 'gene' and 'pheno' arguments.
GenePhenoPair = candidate_subclass(
    'GenePhenoPair',
    ['gene', 'pheno']
)
# Two-slot candidate with 'descriptor' and 'entity' arguments.
# NOTE(review): the registered class name is 'Phenotypes' while the Python
# variable is PhenoPair -- confirm this is intentional before renaming either.
PhenoPair = candidate_subclass(
    'Phenotypes',
    ['descriptor', 'entity']
)

In [4]:
import cPickle

# Stop scanning once this many TAIR-labelled documents have been collected.
MAX_TAIR_DOCS = 10
# Per-split document quotas; scanning also stops once all three are reached.
MAX_TRAIN_DOCS, MAX_DEV_DOCS, MAX_TEST_DOCS = 40, 10, 10

# Load the predefined train/dev/test document-id lists.
# NOTE: unpickling executes arbitrary code -- only load a trusted pmcids_400.pkl.
with open('small_data/pmcids_400.pkl', 'rb') as f:
    sent_dicts = cPickle.load(f)
train_ids, dev_ids, test_ids = set(sent_dicts['train']), set(sent_dicts['dev']), set(sent_dicts['test'])
all_ids = train_ids.union(dev_ids).union(test_ids)

train_sents, dev_sents, test_sents, all_sents = set(), set(), set(), set()
all_docs, train_docs, dev_docs, test_docs = set(), set(), set(), set()
docs = session.query(Document).order_by(Document.name).all()

# Document ids listed in the TAIR label file, one per line.
with open('small_data/tair_labels.txt', 'rb') as f:
    tair = set(line.strip() for line in f)

# Maps the document's position in `docs` to the set of its sentences.
doc_sents = dict()
for doc_num, doc in enumerate(docs):
    if len(all_docs) >= MAX_TAIR_DOCS:
        break
    if (len(train_docs) >= MAX_TRAIN_DOCS and len(dev_docs) >= MAX_DEV_DOCS
            and len(test_docs) >= MAX_TEST_DOCS):
        break

    sentences = set(doc.sentences)
    doc_sents[doc_num] = sentences
    if not sentences:
        # A document without sentences contributes nothing to any split.
        continue

    # The id is the part of the document name before the first '-'; it is the
    # same for every sentence, so resolve the split once per document.
    name = doc.name.split('-')[0]
    if name in train_ids:
        split_docs, split_sents = train_docs, train_sents
    elif name in dev_ids:
        split_docs, split_sents = dev_docs, dev_sents
    elif name in test_ids:
        split_docs, split_sents = test_docs, test_sents
    else:
        # Fail before touching any result set so a bad id leaves them consistent.
        raise Exception('ID <{0}> not found in any id set'.format(doc.name))

    if name in tair:
        all_docs.add(name)
        all_sents.update(sentences)
    split_docs.add(name)
    split_sents.update(sentences)

set(['PMC1557525', 'PMC3921138', 'PMC3504057', 'PMC2156172', 'PMC5384131', 'PMC2935865', 'PMC3002950', 'PMC2577656', 'PMC2935866', 'PMC5132069', 'PMC3139611', 'PMC3935568', 'PMC2905201', 'PMC3481204', 'PMC4283595', 'PMC3040660', 'PMC2444027', 'PMC3879159', 'PMC2803217', 'PMC2323576', 'PMC3878228', 'PMC3017837', 'PMC3527283', 'PMC3542051', 'PMC2803208', 'PMC2893204', 'PMC1779593', 'PMC2877095', 'PMC3178619', 'PMC2999534', 'PMC2607017', 'PMC2910046', 'PMC2790454', 'PMC2566869', 'PMC3913338', 'PMC3883281', 'PMC2920754', 'PMC3138417PMC3143138', 'PMC4283981', 'PMC3511161', 'PMC3503873', 'PMC2557145', 'PMC2791118', 'PMC2736899', 'PMC4955218', 'PMC2295253', 'PMC2295252', 'PMC2791111', 'PMC3193000', 'PMC2478689', 'PMC3467300', 'PMC2633612', 'PMC3504491', 'PMC5156837', 'PMC3043248', 'PMC113751', 'PMC2803206', 'PMC3481211', 'PMC4340836', 'PMC4949643', 'PMC2785485', 'PMC2990762', 'PMC2955748', 'PMC3542049', 'PMC2803491', 'PMC3914372', 'PMC2890954', 'PMC2265440', 'PMC2908147', 'PMC4944173', 'PMC47

In [5]:
print len(all_sents), len(train_sents), len(dev_sents), len(test_sents)
#print all_sents

1665 1083 285 297


## Run the next four cells for Gene-Pheno Pair Extraction

### These cells can be skipped if you only need phenotype extraction

In [6]:
from candidate_extraction import PM, GM
from snorkel.candidates import Ngrams, CandidateExtractor

# Candidate span spaces: n_max=5 for the gene slot, n_max=15 for the phenotype slot.
gene_ngrams, pheno_ngrams = Ngrams(n_max=5), Ngrams(n_max=15)

# Pair each span space with its matcher (GM for genes, PM for phenotypes),
# in the same order as the GenePhenoPair arguments.
cand_extractor = CandidateExtractor(
    GenePhenoPair,
    [gene_ngrams, pheno_ngrams],
    [GM, PM],
    symmetric_relations=True,
    nested_relations=False
)

KeyboardInterrupt: 

In [95]:
print "Extracting Candidates..."

cand_extractor.apply(dev_sents, split=0)
dev_cands = session.query(GenePhenoPair).filter(GenePhenoPair.split==0).all()
print "Number of dev candidates:", len(dev_cands)

Extracting Candidates...
Clearing existing...
Running UDF...

Number of dev candidates: 559


In [96]:
from snorkel.viewer import SentenceNgramViewer

# Viewer over the dev gene-phenotype candidates, created with annotator_name 'gold'.
sv = SentenceNgramViewer(
    dev_cands,
    session,
    annotator_name='gold'
)

<IPython.core.display.Javascript object>

In [97]:
# Bare expression as the cell's last statement so the notebook renders the viewer.
sv

## Run these next four cells for Pheno extraction

In [23]:
#from revised_extraction import PM
from pheno_candidates import PATO, OBO
from snorkel.candidates import Ngrams, CandidateExtractor

# Both PhenoPair slots draw their spans from the same n-gram space (n_max=15).
pheno_ngrams = Ngrams(n_max=15)

# First slot is matched by PATO and second by OBO, in the same order as the
# PhenoPair arguments.
pheno_extractor = CandidateExtractor(
    PhenoPair,
    [pheno_ngrams, pheno_ngrams],
    [PATO, OBO],
    nested_relations=True,
    self_relations=True
)

  return [pheno.lower() for pheno in terms if pheno.lower() not in blacklist and len(pheno)>1]


In [24]:
print "Extracting Candidates..."

pheno_extractor.apply(all_sents, split=0)
pheno_cands = session.query(PhenoPair).filter(PhenoPair.split==0).all()
print "Number of dev candidates:", len(pheno_cands)

Extracting Candidates...
Clearing existing...
Running UDF...

Number of dev candidates: 4082


In [27]:
from snorkel.viewer import SentenceNgramViewer

# Viewer over the phenotype-pair candidates, created with annotator_name 'gold'
# and a height of 400.
svp = SentenceNgramViewer(
    pheno_cands,
    session,
    annotator_name='gold',
    height=400
)

<IPython.core.display.Javascript object>

In [28]:
# Bare expression as the cell's last statement so the notebook renders the viewer.
svp