## Phenotype Candidate Extraction 

#### Run the first three cells once after restarting the kernel

In [None]:
# Enable IPython's autoreload extension so edited local modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
import os
import sys
# Must set SNORKELDB before importing SnorkelSession
from set_env import set_env
set_env()
from snorkel import SnorkelSession
from snorkel.parser import TextDocPreprocessor
from snorkel.parser import CorpusParser
from snorkel.models import Document, Sentence
# Database session shared by every query and extraction step in the cells below.
session = SnorkelSession()

#### For the small-data environment, you should see 400 documents and 95656 sentences

In [None]:
print "Documents:", session.query(Document).count()
print "Sentences:", session.query(Sentence).count()

In [None]:
from snorkel.models import candidate_subclass

# Candidate type pairing a 'descriptor' span with an 'entity' span; it is
# registered with Snorkel under the name 'ComplexPhenotypes'.
PhenoPairComplex = candidate_subclass(
    'ComplexPhenotypes',
    ['descriptor', 'entity'],
)

## RUN THIS CELL TO GET ALL DOCS LABELED BY TANYA 
### We also need to decide how to split the data into dev and test sets, and whether to do that here or somewhere else. If it is done here, the BRAT import must be updated to accommodate different splits, because right now it only looks for split=0. The good news is that applying the labels is agnostic to the split, so we just have to make sure we run the same process for each split.

In [None]:
import cPickle

# Load the train/dev/test ID lists for the small set of 400 documents.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# from a trusted source.
with open('small_data/pmcids_400.pkl', 'rb') as f:
    sent_dicts = cPickle.load(f)

train_ids, dev_ids, test_ids = set(sent_dicts['train']), set(sent_dicts['dev']), set(sent_dicts['test'])
all_ids = train_ids.union(dev_ids).union(test_ids)
all_sents = set()

docs = session.query(Document).order_by(Document.name).all()

# Get PMCIDs for BRAT labeled documents (one ID per line). Blank lines are
# skipped so an empty string never ends up in the ID set.
with open('small_data/tair_labels.txt', 'rb') as f:
    tair = set(line.strip() for line in f if line.strip())

# Map each labeled document ID to the set of its sentences, and collect every
# selected sentence in all_sents.
doc_splits = {}
for doc in docs:
    # The ID is the part of the document name before the first '-'.
    name = doc.name.split('-')[0]
    if name not in tair:
        continue
    sentences = doc.sentences
    # setdefault keeps sentences already collected under this ID: several
    # documents can share the same prefix, and re-assigning a fresh set would
    # silently drop the earlier ones while all_sents still contained them.
    doc_splits.setdefault(name, set()).update(sentences)
    all_sents.update(sentences)

In [None]:
print 'Total Sentences:', len(all_sents)
print 'Showing number of sentences per document...'
for key in doc_splits.keys():
    print key, len(doc_splits[key])
#print all_sents

## Phenotype extraction

In [None]:
from pheno_candidates import PATO, OBO
from snorkel.candidates import Ngrams, CandidateExtractor

# Candidate spans are n-grams of up to 15 tokens.
pheno_ngrams = Ngrams(n_max=15)

# Both arguments of the relation use the same n-gram space; PATO and OBO
# (imported from pheno_candidates) are passed as the per-argument matchers.
pheno_extractor_complex = CandidateExtractor(
    PhenoPairComplex,
    [pheno_ngrams, pheno_ngrams],
    [PATO, OBO],
    nested_relations=True,
    self_relations=True,
)

#### We expect 53846 candidates 

In [None]:
print "Extracting Candidates..."
#clear dev and test splits
pheno_extractor_complex.clear(session, split=4)
pheno_extractor_complex.clear(session, split=5)

#extract all cands in a single split
pheno_extractor_complex.apply(all_sents, split=3)
pheno_cands_complex = session.query(PhenoPairComplex).filter(PhenoPairComplex.split==3).all()
print "Number of dev candidates:", len(pheno_cands_complex)

In [None]:
from snorkel.viewer import SentenceNgramViewer

# Viewer over the extracted candidates, using the 'gold_complex' annotator name.
svp = SentenceNgramViewer(
    pheno_cands_complex,
    session,
    annotator_name='gold_complex',
    height=400,
)

In [None]:
# Leaving the viewer as the last expression of the cell makes the notebook render it.
svp