# Split Data

In [8]:
import pandas as pd
from ast import literal_eval

life = pd.read_csv("data/life_bio_tagged_sentences.csv")
openstax = pd.read_csv("data/openstax_bio_tagged_sentences.csv")
data = pd.concat([life, openstax])
data

Unnamed: 0,textbook,chapter,section,sentence,text,tokens,term1,term1_location,term2,term2_location
0,Life Biology,1,intro,2,Scientists are studying how corals are affecte...,"['Scientists', 'are', 'studying', 'how', 'cora...",global climate change,"(8, 11)",study,"(2, 3)"
1,Life Biology,1,intro,2,Scientists are studying how corals are affecte...,"['Scientists', 'are', 'studying', 'how', 'cora...",study,"(2, 3)",global climate change,"(8, 11)"
2,Life Biology,1,intro,2,Scientists are studying how corals are affecte...,"['Scientists', 'are', 'studying', 'how', 'cora...",global climate change,"(8, 11)",scientist,"(0, 1)"
3,Life Biology,1,intro,2,Scientists are studying how corals are affecte...,"['Scientists', 'are', 'studying', 'how', 'cora...",scientist,"(0, 1)",global climate change,"(8, 11)"
4,Life Biology,1,intro,2,Scientists are studying how corals are affecte...,"['Scientists', 'are', 'studying', 'how', 'cora...",global climate change,"(8, 11)",coral,"(4, 5)"
...,...,...,...,...,...,...,...,...,...,...
534699,OpenStax Biology 2e,47,7,46,Habitat restoration has the potential to resto...,"['Habitat', 'restoration', 'has', 'the', 'pote...",ecosystem,"(7, 8)",extinct,"(15, 16)"
534700,OpenStax Biology 2e,47,7,46,Habitat restoration has the potential to resto...,"['Habitat', 'restoration', 'has', 'the', 'pote...",specie,"(13, 14)",ecosystem,"(7, 8)"
534701,OpenStax Biology 2e,47,7,46,Habitat restoration has the potential to resto...,"['Habitat', 'restoration', 'has', 'the', 'pote...",ecosystem,"(7, 8)",specie,"(13, 14)"
534702,OpenStax Biology 2e,47,7,47,Examples of restoration include reintroduction...,"['Examples', 'of', 'restoration', 'include', '...",keystone specie,"(6, 8)",river,"(13, 14)"


In [14]:
for col in ['tokens', 'term1_location', 'term2_location']:
    data[col] = data[col].apply(literal_eval)

In [29]:
dev_sec = [('OpenStax Biology 2e', 4), ('OpenStax Biology 2e', 10), 
       ('Life Biology', 5), ('Life Biology', 11)]
test_sec = [('Life Biology', 39), ('OpenStax Biology 2e', 33)]

In [30]:
data['filter'] = list(zip(data['textbook'], data['chapter']))

In [31]:
test_f = data['filter'].isin(test_sec)
test = data.loc[test_f]

In [33]:
dev_f = data['filter'].isin(dev_sec)
dev = data.loc[dev_f] 

In [37]:
train_f = (~data['filter'].isin(dev_sec)) & (~data['filter'].isin(test_sec))

In [39]:
train = data.loc[train_f]

In [40]:
train

Unnamed: 0,textbook,chapter,section,sentence,text,tokens,term1,term1_location,term2,term2_location,filter
0,Life Biology,1,intro,2,Scientists are studying how corals are affecte...,"[Scientists, are, studying, how, corals, are, ...",global climate change,"(8, 11)",study,"(2, 3)","(Life Biology, 1)"
1,Life Biology,1,intro,2,Scientists are studying how corals are affecte...,"[Scientists, are, studying, how, corals, are, ...",study,"(2, 3)",global climate change,"(8, 11)","(Life Biology, 1)"
2,Life Biology,1,intro,2,Scientists are studying how corals are affecte...,"[Scientists, are, studying, how, corals, are, ...",global climate change,"(8, 11)",scientist,"(0, 1)","(Life Biology, 1)"
3,Life Biology,1,intro,2,Scientists are studying how corals are affecte...,"[Scientists, are, studying, how, corals, are, ...",scientist,"(0, 1)",global climate change,"(8, 11)","(Life Biology, 1)"
4,Life Biology,1,intro,2,Scientists are studying how corals are affecte...,"[Scientists, are, studying, how, corals, are, ...",global climate change,"(8, 11)",coral,"(4, 5)","(Life Biology, 1)"
...,...,...,...,...,...,...,...,...,...,...,...
534699,OpenStax Biology 2e,47,7,46,Habitat restoration has the potential to resto...,"[Habitat, restoration, has, the, potential, to...",ecosystem,"(7, 8)",extinct,"(15, 16)","(OpenStax Biology 2e, 47)"
534700,OpenStax Biology 2e,47,7,46,Habitat restoration has the potential to resto...,"[Habitat, restoration, has, the, potential, to...",specie,"(13, 14)",ecosystem,"(7, 8)","(OpenStax Biology 2e, 47)"
534701,OpenStax Biology 2e,47,7,46,Habitat restoration has the potential to resto...,"[Habitat, restoration, has, the, potential, to...",ecosystem,"(7, 8)",specie,"(13, 14)","(OpenStax Biology 2e, 47)"
534702,OpenStax Biology 2e,47,7,47,Examples of restoration include reintroduction...,"[Examples, of, restoration, include, reintrodu...",keystone specie,"(6, 8)",river,"(13, 14)","(OpenStax Biology 2e, 47)"


In [41]:
train.to_pickle('data/train.pkl')

In [42]:
dev.to_pickle('data/dev.pkl')

In [43]:
test.to_pickle('data/test.pkl')

# Label Function Preprocessing 

In [21]:
import pandas as pd
import spacy
nlp = spacy.load("en_core_web_sm")
pd.set_option('display.max_colwidth', -1)
dev = pd.read_pickle("data/dev.pkl")
train = pd.read_pickle("data/train.pkl")

In [22]:
dev_docs = [nlp(row.text) for _, row in dev.iterrows()]
dev['doc'] = dev_docs

In [None]:
train_docs = [nlp(row.text) for _, row in train.iterrows()]
train['doc'] = docs

# Label Analysis 

In [None]:
%load_ext autoreload
%autoreload 2
#from snorkel_functions.snorkel_preprocessors import *
from snorkel_functions.taxonomy_labelers import *
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
import pickle
import pandas as pd
pd.set_option('display.max_colwidth', -1)

with open("data/kb_bio101_relations_db.pkl", 'rb') as fid:
    relations = pickle.load(fid)
with open("data/kb_bio101_terms.pkl", 'rb') as fid:
    terms = pickle.load(fid)

lfs = [isa_pattern, suchas_pattern, including_pattern, called_pattern, especially_pattern,
       appo_pattern, other_pattern, knownas_pattern, kb_bio101_ds, term_pos, term_subset]
applier = PandasLFApplier(lfs=lfs)

In [None]:
ltrain = applier.apply(df=train)

In [None]:
LFAnalysis(L=ltrain, lfs=lfs).lf_summary()

In [20]:
dev[(ldev[:, 7] == HYPONYM)].sample(1)

Unnamed: 0,textbook,chapter,section,sentence,text,tokens,term1,term1_location,term2,term2_location,filter,doc
34184,Life Biology,5,0,71,The maintenance of a constant internal environment (known as homeostasis) is a key characteristic of life and will be discussed in detail in Chapter 40.,"[The, maintenance, of, a, constant, internal, environment, (, known, as, homeostasis, ), is, a, key, characteristic, of, life, and, will, be, discussed, in, detail, in, Chapter, 40, .]",homeostasis,"(10, 11)",environment,"(6, 7)","(Life Biology, 5)","(The, maintenance, of, a, constant, internal, environment, (, known, as, homeostasis, ), is, a, key, characteristic, of, life, and, will, be, discussed, in, detail, in, Chapter, 40, .)"


In [177]:
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
text = "Such organisms, like the bacteria that are abundant on you, in you, and all around you, are called prokaryotes (Figure 1.3B)."
text = "Fecal samples were analyzed for composition of the microbe communities, and the blood was tested for triglycerides and other nutrients."
text = "This is the situation faced by individuals with a condition also known as Down syndrome, which affects 1 out of every 700 children born in the United States."
doc = nlp(text)
#[tok for tok in doc]

In [178]:
displacy.serve(doc, style="dep")

  "__main__", mod_spec)



Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [109]:
def including_pattern(doc, term1_idx, term2_idx):
    if term1_idx[0] < term2_idx[0]:
        end = doc[term1_idx[1] - 1]
        start = doc[term2_idx[1] - 1]
    else:
        end = [term2_idx[1] - 1]
        start = [term1_idx[1] - 1]
    
    print(start.text)
    while start.text != 'including':
        if start.dep_ == 'conj' or start.dep_ == 'pobj':
            start = start.head
        else:
            break
    
    if start.text == 'including':
        # if 'such' in [ch.text for ch in start.children]
        if start.head.text == end.text:
            if term1_idx[0] < term2_idx[0]:
                return 1
            else:
                return 0
        else:
            return -1
    else:
        return -1
suchas_pattern(doc, (1, 5), (15, -3))

TypeError: __call__() takes 2 positional arguments but 4 were given

In [34]:
tokens = [i for i, tok in enumerate(doc) if tok.text == 'as']
print(ix)
def such_as_pattern(doc, term1_ix, term2_ix):
    pass 

[0, 7]


In [71]:
def suchas_pattern(doc, term1_idx, term2_idx):
    if term1_idx[0] < term2_idx[0]:
        end = doc[term1_idx[1] - 1]
        start = doc[term2_idx[1] - 1]
    else:
        end = [term2_idx[1] - 1]
        start = [term1_idx[1] - 1]
    
    print(start.text)
    while start.text != 'as':
        if start.dep_ == 'conj' or start.dep_ == 'pobj':
            start = start.head
        else:
            break
    
    if start.text == 'as':
        # if 'such' in [ch.text for ch in start.children]
        if start.head.text == end.text:
            if term1_idx[0] < term2_idx[0]:
                return 1
            else:
                return 0
        else:
            return -1
    else:
        return -1
suchas_pattern(doc, (1, 5), (15, -3))

mitochondria


1

In [11]:
import pickle
db = pickle.load(open('data/kb_bio101_relations_db.pkl', 'rb'))

In [12]:
terms = pickle.load(open('data/kb_bio101_terms.pkl', 'rb'))

{'subclass-of': {('k+', 'monoatomic ion'),
  ('adenine at dna strand opposite to rna strand', 'adenine at dna strand'),
  ('segment of body', 'anatomical structure'),
  ('evaporative cool in pond', 'evaporative cooling of water'),
  ('S', 'non metal atom'),
  ('regulation of body temperature in insect for flight',
   'regulation of animal temperature'),
  ('pacman mechanism', 'intracellular process'),
  ('control endocytosis', 'regulate'),
  ('hydrogen 3', 'hydrogen isotope'),
  ('courtship', 'reproductive behavior'),
  ('bend', 'change shape'),
  ('monogamous mating', 'rodent'),
  ('meiosis in male', 'undergo meiosis'),
  ('development relate intercellular process', 'intercellular process'),
  ('augment biologically', 'ecological process'),
  ('heart disease', 'disorder'),
  ('repeat reproduction', 'life history'),
  ('phage t4', 'virulent virus'),
  ('relay a signal', 'signal transduction with Ca2 plus'),
  ('maize', 'fruit'),
  ('chloroplast membranous system', 'whole_thing'),
  ('f

In [8]:
import numpy as np

data = np.load("data/label/train_label_model_labels.npz")
datal = np.load("data/label/train_label_fn_labels.npz")
datam = np.load("data/label/train_majority_vote_labels.npz")

In [14]:
data['data'][datam['data'] == 1]

array([[1.52743633e-01, 6.77837154e-01, 1.69419214e-01],
       [1.52743633e-01, 6.77837154e-01, 1.69419214e-01],
       [1.52743633e-01, 6.77837154e-01, 1.69419214e-01],
       ...,
       [1.52743633e-01, 6.77837154e-01, 1.69419214e-01],
       [1.52743633e-01, 6.77837154e-01, 1.69419214e-01],
       [8.74392616e-05, 7.62378559e-01, 2.37534002e-01]])

In [10]:
datal['data']

array([[-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ...,  0, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       ...,
       [-1, -1, -1, ..., -1, -1,  0],
       [-1, -1, -1, ..., -1, -1,  0],
       [-1, -1, -1, ..., -1, -1,  0]])

In [11]:
datam['data']

array([-1,  0, -1, ...,  0,  0,  0])