Move over to sagemaker for the training.

We can do the preliminary stuff here. Creating data sets for BioBERT, correctly formatted,   and with multiple sentences which contain the entities (so it doesn't classify everything as OTHER).

1. [Define entities of interest.](#fyrsta)
2. [Load the newest tagged set.](#annað)
3. [Select fully labelled sentences.](#þriðja)
4. [Create an imbalanced data set, to solve class imbalance problem](#fimmta)
4. [Add synthetic data to a the newest tagged set.](#fjórða)

1. Define classes and update the newest tagged set <a name="fyrsta"></a>

In [1]:
# define entities of interest
interesting = ['cell_line', 'science_word', 'reagent', 'gene']

In [2]:

def count_unknowns(path_to_csv):

    unknowns = []

    with open(path_to_csv) as labelled_data:


        for l in labelled_data.readlines():
            items = l.split(',')

            if len(items) == 5:
                # skip strange rows 

                new_sentence_nr = items[1]
                word = items[2]
                category = items[4].strip()

                if category == 'X':
                    unknowns.append(word)


    unknowns = sorted(list(set(unknowns)))

    return unknowns


len(count_unknowns('SPACY NER/wiki_unambiguous'))

857

### 2. Get the marked sentences <a name="annað"></a>

In [3]:
### 2. Take the marked sentences from the categories of interest <a name="annað"></a>

import os

unlabelled = count_unknowns('SPACY NER/imbalanced')

### Get unlabelled and labelled

sentences_with_unlabelled = []
sentence_ids = [] 
with open('SPACY NER/imbalanced') as cells:
    
    # skip the header 
    for l in cells.readlines()[1:]:
        items = l.split(',')
        
        for i in unlabelled:
            if i in l:
                sentences_with_unlabelled.append(items[1])
#                print(items[1])

        if len(items) == 5:
            sentence_ids.append(items[1])
                

ready_ids = sorted(list(set([int(x) for x in sentence_ids if x not in sentences_with_unlabelled])))

In [5]:
ready_words = []
with open('SPACY NER/imbalanced') as f:
    
    for l in f.readlines()[1:]:
        items = l.split(',')
        if len(items) == 5:
            if int(items[1]) in ready_ids:
                ready_words.append(items)
                

In [6]:
# filter for fully labelled sentences containing the entities 

current_sentence = '0'
ready_s = [[]]
ready_labels = [[]] 

for w in ready_words:
    if w[1] == current_sentence:
        ready_s[-1].append(w[2])
        ready_labels[-1].append(w[4].strip())

    else:
        current_sentence = w[1]
        ready_s.append([])
        ready_labels.append([])
        
        

### 3. Put data into format for BERT

Read csv,   check distribution 

In [11]:
import pandas as pd


data_file_address = "imba_test"

# Fillna method can make same sentence with same sentence name
df_data = pd.read_csv(data_file_address,sep=",",encoding="latin1").fillna(method='ffill')


In [18]:
df_data[500:]

Unnamed: 0.1,Unnamed: 0,sentence_nr,word,pos,category
500,63255,3030,Fells,NNP,O
501,63256,3030,?,.,O
502,63257,3030,],-RRB-,O
503,63258,3030,right,JJ,O
504,63259,3030,[,-LRB-,O
505,63260,3030,fells,NNS,O
506,63261,3030,?,.,O
507,63262,3030,],-RRB-,O
508,63264,3030,Housekeeping,NNP,O
509,63266,3030,1,CD,O


In [9]:
df_data.head(n=20)

# Have a look TAG cat
df_data.category.unique()

# Analyse the Tag distribution
df_data.category.value_counts()

O               243708
cell_line         3131
science_word      2683
gene              2636
reagent           1682
Name: category, dtype: int64

In [10]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["word"].values.tolist(),
                                                           s["pos"].values.tolist(),
                                                           s["category"].values.tolist())]
        self.grouped = self.data.groupby("sentence_nr").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [11]:
# Get full document data struce
getter = SentenceGetter(df_data)

# Get sentence data
sentences = [[s[0] for s in sent] for sent in getter.sentences]
sentences[0]

# Get tag labels data
labels = [[s[2] for s in sent] for sent in getter.sentences]
print(labels[0])

['O', 'O', 'O', 'O', 'O']


In [12]:
tags_vals = list(set(df_data["category"].values))
tag2idx = {t: i for i, t in enumerate(tags_vals)}

In [13]:
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam



from tqdm import tqdm, trange


Using TensorFlow backend.


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


#####  Load BIOBERT

In [None]:
!export BERT_BASE_DIR=/Users/valdimareggertsson/Documents/Valdi/Vetrarönn\ 2019/NER/biobert_v1.1_pubmed.tar.gz
pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch $BERT_BASE_DIR/model.ckpt-1000000 $BERT_BASE_DIR/bert_config.json $BERT_BASE_DIR/pytorch_model.bin


In [14]:
model = BertForTokenClassification.from_pretrained('biobert_v1.1_pubmed', num_labels=len(tag2idx))


In [15]:
# We want casing since it is NER 
tokenizer = BertTokenizer.from_pretrained('biobert_v1.1_pubmed', do_lower_case=False)


In [16]:
MAX_LEN = 75
bs = 32
sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences]

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
for i,t in enumerate(tokenized_texts):
  if len(t) > 512:

    labels.remove(labels[i])
    tokenized_texts.remove(t)


In [17]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["O"], padding="post",
                     dtype="long", truncating="post")
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]


In [18]:
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags, 
                                                            random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

In [19]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [20]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

In [21]:
### Er allt hingað til yfirfæranlegt á biobert?

import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam

#### No full finetuning, because not on a GPU

In [22]:
FULL_FINETUNING = False

if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

In [23]:
from seqeval.metrics import f1_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:
device = torch.device("cuda:0")



tr_inputs,val_inputs = tr_inputs.to(device),val_inputs.to(device)
tr_tags,val_tags = tr_tags.to(device),val_tags.to(device)
tr_masks,val_masks = tr_masks.to(device),val_masks.to(device)

model = model.to(device)

In [53]:
import numpy as np 

epochs = 3
max_grad_norm = 1.0

for _ in trange(epochs, desc="Epoch"):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # forward pass
        loss = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    # VALIDATION on validation set
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                  attention_mask=b_input_mask, labels=b_labels)
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.append(label_ids)
        
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    eval_loss = eval_loss/nb_eval_steps
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
    valid_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
    print("F1-Score: {}".format(f1_score(pred_tags, valid_tags)))



Epoch:   0%|          | 0/3 [00:00<?, ?it/s][A[A

Train loss: 2.9266848727448345




Epoch:  33%|███▎      | 1/3 [13:30<27:00, 810.43s/it][A[A

Validation loss: 2.667662435107761
Validation Accuracy: 0.89679012345679
F1-Score: 0.014099783080260303
Train loss: 2.414177362232992




Epoch:  67%|██████▋   | 2/3 [27:26<13:38, 818.24s/it][A[A

Validation loss: 2.1815133094787598
Validation Accuracy: 0.9332561728395061
F1-Score: 0.004724409448818898
Train loss: 1.9560490144442206




Epoch: 100%|██████████| 3/3 [41:30<00:00, 825.90s/it][A[A

Validation loss: 1.7712080876032512
Validation Accuracy: 0.9334413580246914
F1-Score: 0


In [25]:

# VALIDATION on validation set
model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
predictions , true_labels = [], []
for batch in valid_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                              attention_mask=b_input_mask, labels=b_labels)
        logits = model(b_input_ids, token_type_ids=None,
                       attention_mask=b_input_mask)
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    true_labels.append(label_ids)

    tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy

    nb_eval_examples += b_input_ids.size(0)
    nb_eval_steps += 1
eval_loss = eval_loss/nb_eval_steps
print("Validation loss: {}".format(eval_loss))
print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
valid_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
print("F1-Score: {}".format(f1_score(pred_tags, valid_tags)))

NameError: name 'device' is not defined

In [56]:
for t in pred_tags:
    if t != 'O':
        print(t)

gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene
gene


###   Vil búa til sérstakt cell,reagent,gene sett fyrir biobert því eins og er reynir hann að flokka alla 30 flokkana

In [58]:
interesting = ['cell_line', 'science_word', 'reagent', 'gene']

Skrifa í sérstaka skrá þar sem allt nema 'interesting' er 'O'

In [81]:
# mark unambiguous terms from  from wikilabelled



new_file = open('SPACY NER/cell_gene_reagent', 'w') 


with open('SPACY NER/wiki_ambiguous') as cells:

    # label everything else with an 'O'
    j = 0 
    for l in cells.readlines():
        items = l.split(',')
        if j > 0:
            if len(items) == 5:
                # skip strange rows 
                word = items[2]
                cat = items[4].strip()

                if cat not in interesting:
                    # breyti categoryunni í O ef ekki í interesting
                    cat = 'O'    
                    items[4] = cat
                    l = ','.join(items) + '\n'

        new_file.write(l)
        j += 1

            

Hleð inn í pandas df:

In [82]:
data_path = "SPACY NER/" 
data_file_address = "SPACY NER/cell_gene_reagent"

# Fillna method can make same sentence with same sentence name
df_data = pd.read_csv(data_file_address,sep=",",encoding="latin1").fillna(method='ffill')


In [83]:
df_data

Unnamed: 0.1,Unnamed: 0,sentence_nr,word,pos,category
0,0,0,A1,NNP,O
1,1,0,66056,CD,O
2,2,0,.,.,O
3,3,0,A2,NN,O
4,4,0,5.11,CD,O
5,6,1,EXP,NNP,O
6,7,1,2,CD,O
7,8,1,.,.,O
8,9,1,Cm,NNP,O
9,11,1,R,NN,O


Make sure it is the way we want:

In [84]:
df_data.head(n=20)

# Have a look TAG cat
df_data.category.unique()

# Analyse the Tag distribution
print(df_data.category.value_counts())

O               52758
cell_line         161
science_word      133
gene              131
reagent            92
Name: category, dtype: int64


Næ í setningarnar og labelin:

In [85]:

getter = SentenceGetter(df_data)

# Get sentence data
sentences = [[s[0] for s in sent] for sent in getter.sentences]
sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences]

# Get tag labels data
labels = [[s[2] for s in sent] for sent in getter.sentences]
print(labels[0])

['O', 'O', 'O', 'O', 'O']


In [86]:

tags_vals = list(set(df_data["category"].values))
tag2idx = {t: i for i, t in enumerate(tags_vals)}

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

for i,t in enumerate(tokenized_texts):
  if len(t) > 512:
    
    labels.remove(labels[i])
    tokenized_texts.remove(t)


Get the data ready for Bert:

In [87]:
MAX_LEN = 75
bs = 32

input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["O"], padding="post",
                     dtype="long", truncating="post")
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]


tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags, 
                                                            random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

### TRAIN 

In [None]:
FULL_FINETUNING = True

if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)


In [None]:

epochs = 4
max_grad_norm = 1.0

for _ in trange(epochs, desc="Epoch"):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # forward pass
        loss = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    # VALIDATION on validation set
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                  attention_mask=b_input_mask, labels=b_labels)
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.append(label_ids)
        
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    eval_loss = eval_loss/nb_eval_steps
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
    valid_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
    print("F1-Score: {}".format(f1_score(pred_tags, valid_tags)))

### Make imbalanced dataset, with multiple sentences that have words from the categories of interest

In [47]:


interesting = ['cell_line', 'reagent', 'gene', 'science_word']







interesting_sents = []
with open('SPACY NER/cell_gene_reagent') as cells:

    # label everything else with an 'O'
    j = 0 
    for l in cells.readlines():
        items = l.split(',')
        if j > 0:
            if len(items) == 5:
                # skip strange rows 
                word = items[2]
                cat = items[4].strip()

                if cat in interesting:

                    interesting_sents.append(items[1])

        j += 1

            

##### Add the interesting sentences 

around the document,  add the copied sentences to solve imbalanced classes

In [3]:
interesting_sents

['3',
 '9',
 '10',
 '11',
 '11',
 '14',
 '17',
 '20',
 '30',
 '37',
 '38',
 '40',
 '40',
 '42',
 '45',
 '49',
 '55',
 '57',
 '66',
 '67',
 '67',
 '70',
 '73',
 '81',
 '82',
 '89',
 '97',
 '98',
 '98',
 '98',
 '114',
 '116',
 '116',
 '123',
 '129',
 '135',
 '160',
 '160',
 '162',
 '201',
 '208',
 '216',
 '231',
 '235',
 '236',
 '238',
 '251',
 '252',
 '252',
 '254',
 '254',
 '278',
 '297',
 '299',
 '300',
 '306',
 '306',
 '310',
 '320',
 '320',
 '324',
 '338',
 '342',
 '342',
 '342',
 '345',
 '345',
 '350',
 '360',
 '365',
 '386',
 '392',
 '392',
 '392',
 '396',
 '399',
 '408',
 '421',
 '425',
 '426',
 '438',
 '461',
 '482',
 '488',
 '496',
 '498',
 '504',
 '533',
 '543',
 '543',
 '545',
 '551',
 '553',
 '557',
 '564',
 '566',
 '570',
 '570',
 '570',
 '590',
 '607',
 '607',
 '607',
 '607',
 '610',
 '610',
 '610',
 '615',
 '624',
 '629',
 '643',
 '649',
 '651',
 '651',
 '654',
 '654',
 '654',
 '654',
 '654',
 '654',
 '654',
 '654',
 '654',
 '654',
 '654',
 '656',
 '664',
 '664',
 '677',


In [5]:
line_index = -999
index_after_interesting_sentence = []
with open('SPACY NER/cell_gene_reagent') as cells:

    # label everything else with an 'O'
    j = 0 
    for l in cells.readlines():
        items = l.split(',')
        if j > 0:
            if len(items) == 5:
                # skip strange rows 

                if items[1] in interesting_sents:
                    print(l)
                    line_index = items[0]
                else:
                    # not any more 
                    if int(items[0]) == int(line_index) + 1:
                        # we are in the following line after an interesting sentence 
                        # get safnað indexunum og skrifað síðan frá aftasta og upp án þess að skemma
                        print(l)
                        index_after_interesting_sentence.append(items[0])
                        
        
                # find if the interesting sentence just ended
                # fetch the row index, to add to ]
            
            
                    
        j+=1

116,3,20,CD,O

117,3,µl,CD,O

118,3,2,CD,O

119,3,N,NNP,O

120,3,NaOH,NNP,reagent

121,3,into,IN,O

122,3,dilution,NN,O

123,3,.,.,O

124,4,Substrate,NNP,O

198,9,I,PRP,O

199,9,decided,VBD,O

200,9,to,TO,O

201,9,put,VB,O

202,9,2000000,CD,O

203,9,[,-LRB-,O

204,9,inaudible,JJ,O

205,9,],-RRB-,O

206,9,cells,NNS,O

207,9,into,IN,O

208,9,10µl,NN,O

209,9,of,IN,O

210,9,RPMI,NNP,O

211,9,media,NNS,O

212,9,and,CC,O

213,9,they,PRP,O

214,9,will,MD,O

215,9,be,VB,O

216,9,cultured,VBN,O

217,9,for,IN,O

218,9,the,DT,O

219,9,next,JJ,O

220,9,2,CD,O

221,9,to,TO,O

222,9,3,CD,O

223,9,days,NNS,O

224,9,.,.,O

225,9,I,PRP,O

226,9,will,MD,O

227,9,monitor,VB,O

228,9,the,DT,O

229,9,growth,NN,O

230,9,probably,RB,O

231,9,on,IN,O

232,9,Sunday,NNP,O

233,9,quickly,RB,O

234,9,and,CC,O

235,9,see,VB,O

236,9,if,IN,O

237,9,I,PRP,O

238,9,need,VBP,O

239,9,to,TO,O

240,9,exchange,VB,O

241,9,the,DT,O

242,9,media,NNS,O

243,9,.,.,O

244,9,Jokerit,NNP,cell_line

245,9,cells,NNS,O

246,9,are

13823,649,5,CD,O

13824,649,µm,NN,O

13825,649,,,O

13826,649,500,CD,O

13827,649,angstroms,NNS,O

13828,649,,,O

13829,649,4.6,CD,O

13830,649,*,SYM,O

13831,649,300,CD,O

13832,649,mm,NN,O

13833,649,.,.,O

13834,650,Fraction,NN,O

13856,651,So,RB,O

13857,651,puri,VB,gene

13858,651,for,IN,O

13859,651,the,DT,O

13860,651,purification,NN,O

13861,651,for,IN,O

13862,651,the,DT,O

13863,651,samples,NNS,O

13864,651,that,WDT,O

13865,651,I,PRP,O

13866,651,made,VBD,O

13867,651,a,DT,O

13868,651,mistake,NN,O

13869,651,previously,RB,O

13870,651,in,IN,O

13871,651,during,IN,O

13872,651,the,DT,O

13873,651,.,.,O

13874,651,Um,UH,O

13875,651,preamplifications,NNS,science_word

13876,651,step,NN,O

13877,651,that,WDT,O

13878,651,I,PRP,O

13879,651,have,VBP,O

13880,651,now,RB,O

13881,651,increase,VB,O

13882,651,the,DT,O

13883,651,volume,NN,O

13884,651,of,IN,O

13885,651,the,DT,O

13886,651,beads,NNS,O

13887,651,I,PRP,O

13888,651,start,VBP,O

13889,651,at,IN,O

13890,651,25,CD,O


25365,1178,1,CD,O

25366,1178,,,O

25367,1178,6,CD,O

25368,1178,,,O

25369,1178,7,CD,O

25370,1178,and,CC,O

25371,1178,12,CD,O

25372,1178,.,.,O

25373,1179,Going,VBG,O

25510,1187,Sample,NNP,O

25511,1187,HSD,NNP,gene

25512,1187,1012,CD,O

25513,1187,.,.,O

25514,1188,Repeat,NN,O

25515,1188,test,NN,O

25516,1188,PLT,NNP,gene

25517,1188,legend,NN,O

25518,1188,with,IN,O

25519,1188,loud,JJ,O

25520,1188,English,NNP,O

25521,1188,talking,NN,O

25522,1188,in,IN,O

25523,1188,the,DT,O

25524,1188,background,NN,O

25525,1188,.,.,O

25528,1188,Extraction,NN,O

25529,1188,.,.,O

25532,1188,The,DT,O

25533,1188,biphasic,JJ,O

25534,1188,solution,NN,O

25535,1188,was,VBD,O

25536,1188,stirred,VBN,O

25537,1188,at,IN,O

25538,1188,room,NN,O

25539,1188,temp,NN,O

25540,1188,for,IN,O

25541,1188,20,CD,O

25542,1188,min,NN,O

25543,1188,and,CC,O

25544,1188,then,RB,O

25545,1188,transferred,VBD,O

25546,1188,into,IN,O

25547,1188,a,DT,O

25548,1188,separation,NN,O

25549,1188,funnel,NN,O

25


34759,1596,noted,VBD,O

34760,1596,that,IN,O

34761,1596,the,DT,O

34762,1596,signal,NN,O

34763,1596,for,IN,O

34764,1596,creatinine,NN,O

34765,1596,was,VBD,O

34766,1596,almost,RB,O

34767,1596,0,CD,O

34768,1596,despite,IN,O

34769,1596,normal,JJ,O

34770,1596,sample,NN,O

34771,1596,and,CC,O

34772,1596,everything,NN,O

34773,1596,looked,VBD,O

34774,1596,okay,JJ,O

34775,1596,.,.,O

34776,1596,But,CC,O

34777,1596,after,IN,O

34778,1596,cleaning,VBG,O

34779,1596,all,PDT,O

34780,1596,the,DT,O

34781,1596,cones,NNS,O

34782,1596,,,O

34783,1596,the,DT,O

34784,1596,signal,NN,O

34785,1596,increase,NN,O

34786,1596,to,IN,O

34787,1596,1,CD,O

34788,1596,million,CD,O

34789,1596,peak,NN,O

34790,1596,height,NN,O

34791,1596,,,O

34792,1596,meaning,VBG,O

34793,1596,10,CD,O

34794,1596,thousand,CD,O

34795,1596,times,NNS,O

34796,1596,more,RBR,O

34797,1596,sensitive,JJ,O

34798,1596,system,NN,O

34799,1596,after,IN,O

34800,1596,cleaning,VBG,O

34801,1596,all,PDT,O

34802,1596,the


46067,2150,SPR,NNP,cell_line

46068,2150,buffer,NN,O

46071,2150,20,CD,O

46072,2150,mM,NN,O

46073,2150,Tris,NNP,O

46074,2150,,,O

46075,2150,150,CD,O

46076,2150,mM,NNP,O

46077,2150,NaCl,NNP,O

46078,2150,,,O

46080,2150,TCEP,NNP,O

46081,2150,,,O

46083,2150,EDTA,NN,O

46084,2150,,,O

46085,2150,0.005,CD,O

46086,2150,%,NN,O

46087,2150,DMSO,NNP,reagent

46088,2150,AND,CC,O

46117,2154,Preincubating,VBG,science_word

46118,2154,new,JJ,O

46119,2154,slide,NN,O

46120,2154,fibrinogen,NNS,O

46121,2154,coated,VBN,O

46122,2154,with,IN,O

46123,2154,100,CD,O

46124,2154,µl,NN,O

46125,2154,medium,NN,O

46126,2154,.,.,O

46127,2154,Yield,NN,O

46128,2154,wild,JJ,O

46129,2154,type,NN,O

46130,2154,endothelial,JJ,O

46131,2154,cells,NNS,O

46132,2154,,,O

46133,2154,yield,NN,O

46134,2154,Lyst,NNP,O

46135,2154,endothelial,JJ,O

46136,2154,cells,NNS,O

46137,2154,,,O

46138,2154,cells,NNS,O

46139,2154,.,.,O

46140,2155,Result,NN,O

46174,2157,Plasmid,NN,science_word

46175,2157,number


56819,2709,.,.,O

56820,2709,Light,JJ,O

56821,2709,stimulus,NN,O

56822,2709,Plexon,NNP,O

56823,2709,,,O

56824,2709,LED,VBN,O

56825,2709,driver,NN,O

56826,2709,,,O

56827,2709,LD-1,NNP,O

56828,2709,,,O

56829,2709,maximum,JJ,O

56830,2709,voltage,NN,O

56832,2709,,,O

56833,2709,maximum,JJ,O

56834,2709,current,JJ,O

56836,2709,A,NN,O

56837,2709,473,CD,O

56838,2709,nanometer,NN,O

56839,2709,wavelength,NN,science_word

56840,2709,.,.,O

56841,2710,Sample,NN,O

56944,2712,I,PRP,O

56945,2712,pipetted,VBD,science_word

56946,2712,5,CD,O

56947,2712,µl,XX,O

56948,2712,into,IN,O

56949,2712,a,DT,O

56950,2712,Falcon,NNP,O

56951,2712,tube,NN,O

56952,2712,.,.,O

56953,2713,Then,RB,O

57254,2725,AB,NNP,O

57255,2725,-,HYPH,O

57256,2725,0273,CD,O

57257,2725,-,HYPH,O

57258,2725,M14459,NNP,gene

57259,2725,-,HYPH,O

57260,2725,WS,NNP,O

57261,2725,.,.,O

57262,2726,Add,VB,O

57279,2728,There,EX,O

57280,2728,are,VBP,O

57281,2728,23,CD,O

57282,2728,mils,NNS,O

57283,2728,of,IN,O



66514,3185,diluted,VBD,O

66515,3185,300,CD,O

66516,3185,µl,XX,O

66517,3185,of,IN,O

66518,3185,at,IN,O

66519,3185,the,DT,O

66520,3185,siRNA,NNP,gene

66521,3185,DharmaFECT,NNP,O

66522,3185,mixture,NN,O

66523,3185,in,IN,O

66524,3185,1.2,CD,O

66525,3185,ML,NNP,O

66526,3185,of,IN,O

66527,3185,appropriate,JJ,O

66528,3185,media,NNS,O

66529,3185,for,IN,O

66530,3185,each,DT,O

66531,3185,cell,NN,O

66532,3185,type,NN,O

66533,3185,.,.,O

66534,3186,Dan,NNP,O

66771,3201,Today,NN,O

66772,3201,,,O

66773,3201,he,PRP,O

66774,3201,running,VBG,O

66775,3201,the,DT,O

66776,3201,first,JJ,O

66777,3201,time,NN,O

66778,3201,RNA,NN,O

66779,3201,seq,NN,O

66780,3201,preamplification,NN,science_word

66781,3201,protocol,NN,O

66782,3201,.,.,O

66783,3201,2,CD,O

66784,3201,in,IN,O

66785,3201,order,NN,O

66786,3201,to,TO,O

66787,3201,make,VB,O

66788,3201,everything,NN,O

66789,3201,'s,VBZ,O

66790,3201,not,RB,O

66791,3201,.,.,O

66792,3201,I,PRP,O

66793,3201,need,VBP,O

66794,3201

In [9]:
index_after_interesting_sentence

['124',
 '316',
 '406',
 '472',
 '564',
 '725',
 '847',
 '884',
 '921',
 '956',
 '991',
 '1111',
 '1171',
 '1384',
 '1414',
 '1473',
 '1583',
 '1661',
 '1887',
 '2183',
 '2232',
 '2300',
 '2374',
 '2496',
 '2959',
 '3028',
 '3785',
 '3868',
 '4011',
 '4428',
 '4538',
 '4566',
 '4824',
 '4845',
 '5522',
 '5846',
 '5870',
 '5975',
 '6101',
 '6308',
 '6464',
 '6842',
 '6919',
 '6983',
 '7137',
 '7384',
 '7446',
 '7934',
 '8070',
 '8158',
 '8316',
 '8553',
 '8743',
 '8869',
 '9032',
 '9962',
 '10288',
 '10395',
 '10688',
 '10723',
 '10907',
 '11400',
 '11616',
 '11639',
 '11724',
 '11741',
 '11787',
 '11882',
 '11979',
 '12050',
 '12636',
 '12988',
 '13062',
 '13188',
 '13407',
 '13448',
 '13730',
 '13834',
 '13919',
 '14073',
 '14089',
 '14351',
 '14847',
 '14915',
 '15075',
 '15153',
 '15184',
 '15344',
 '15393',
 '15538',
 '15624',
 '15642',
 '15903',
 '16004',
 '16018',
 '16037',
 '16407',
 '16434',
 '16887',
 '17131',
 '17302',
 '17707',
 '17928',
 '17989',
 '18282',
 '18391',
 '18597

In [28]:
f = open("SPACY NER/cell_gene_reagent", "r")
contents = f.readlines()
f.close()

#contents.insert(index, value)

# f = open("path_to_file", "w")
# contents = "".join(contents)
# f.write(contents)
# f.close()

In [None]:
contents

In [None]:
import os
os.getcwd()

In [51]:
# collect the interesting sentences (not just the indices), to write to file 

sents_to_add = []
with open('SPACY NER/cell_gene_reagent') as cells:

    # label everything else with an 'O'
    j = 0 
    for l in cells.readlines():
        items = l.split(',')
        if j > 0:
            if j < 20400:
                # don't add the last 5000 lines because they are for testing
                if len(items) == 5:
                    # skip strange rows 
                    if items[1] in interesting_sents:
                        sents_to_add.append((items[1],l))
            
        j+=1


In [52]:
sents_to_add[-2]

('969', '20702,969,this,DT,O\n')

## Take the last 320 sentences from cell_reagent aside for testing




In [69]:
f = open("SPACY NER/cell_gene_reagent", "r")
contents = f.readlines()
f.close()

with open('imba_small_test','w') as f:
    for c in contents[-6003:]:
        f.write(c)
#        print(c)

In [68]:
contents

',sentence_nr,word,pos,category\n0,0,A1,NNP,O\n1,0,66056,CD,O\n2,0,.,.,O\n3,0,A2,NN,O\n4,0,5.11,CD,O\n6,1,EXP,NNP,O\n7,1,2,CD,O\n8,1,.,.,O\n9,1,Cm,NNP,O\n11,1,R,NN,O\n12,1,=,SYM,O\n13,1,2.5,CD,O\n14,1,.,.,O\n15,1,Nice,JJ,O\n16,1,calcium,NN,O\n17,1,current,NN,O\n18,1,and,CC,O\n19,1,transient,JJ,O\n20,1,.,.,O\n21,1,Caffeine,NN,O\n22,1,was,VBD,O\n23,1,beautiful,JJ,O\n24,1,.,.,O\n26,1,Isoprenaline,NNP,O\n27,1,was,VBD,O\n28,1,also,RB,O\n29,1,done,VBN,O\n31,1,but,CC,O\n32,1,it,PRP,O\n33,1,\'s,VBZ,O\n34,1,important,JJ,O\n35,1,to,TO,O\n36,1,note,VB,O\n37,1,that,IN,O\n38,1,the,DT,O\n39,1,isoprenaline,JJ,O\n40,1,stock,NN,O\n41,1,was,VBD,O\n42,1,made,VBN,O\n43,1,from,IN,O\n44,1,the,DT,O\n45,1,old,JJ,O\n46,1,stock,NN,O\n48,1,which,WDT,O\n49,1,might,MD,O\n50,1,reflect,VB,O\n51,1,in,IN,O\n52,1,the,DT,O\n53,1,reduced,VBN,O\n54,1,activation,NN,O\n55,1,of,IN,O\n56,1,beta,JJ,O\n57,1,adrenergic,JJ,O\n58,1,receptors,NNS,O\n59,1,.,.,O\n60,1,Leak,NN,O\n61,1,was,VBD,O\n62,1,also,RB,O\n63,1,attempted,VBN,O\n6

['19797,922,Washing,NN,O\n',
 '19798,922,at,IN,O\n',
 '19799,922,5ML,CD,O\n',
 '19800,922,/,SYM,O\n',
 '19801,922,min,NN,O\n',
 '19802,922,,,O\n',
 '19803,922,initially,RB,O\n',
 '19804,922,at,IN,O\n',
 '19805,922,0,CD,O\n',
 '19806,922,%,NN,O\n',
 '19808,923,The,DT,O\n',
 '19809,923,reaction,NN,O\n',
 '19810,923,mixture,NN,O\n',
 '19811,923,to,IN,O\n',
 '19812,923,Lynn,NNP,O\n',
 '19813,923,Slash,NNP,O\n',
 '19814,923,Water,NNP,O\n',
 '19815,923,121,CD,O\n',
 '19816,923,was,VBD,O\n',
 '19817,923,directly,RB,O\n',
 '19818,923,used,VBN,O\n',
 '19819,923,in,IN,O\n',
 '19820,923,knew,VBN,O\n',
 '19821,923,experiment,NN,O\n',
 '19822,923,.,.,O\n',
 '19823,923,The,DT,O\n',
 '19824,923,biphasic,NN,O\n',
 '19825,923,system,NN,O\n',
 '19826,923,was,VBD,O\n',
 '19827,923,cooled,VBN,O\n',
 '19828,923,down,RP,O\n',
 '19829,923,to,IN,O\n',
 '19830,923,50,CD,O\n',
 '19831,923,℃,NN,O\n',
 '19832,923,and,CC,O\n',
 '19833,923,sodium,NN,O\n',
 '19834,923,32,CD,O\n',
 '19835,923,%,NN,O\n',
 '19836,923,w

In [44]:
f = open('SPACY NER/imba_small', "w")
contents = "".join(contents[:20127])
f.write(contents)
f.close()

### Skrifa sents to add aftast í imbalanced skjalið

Bæti við nýju orða indexi (línan) og nýju setningar indexi...

byrja með max setningar indexið +1 
svo alltaf þegar setninga indexið breytist þá hækka það um einn. 

In [55]:
sents_to_add[0]

('3', '116,3,20,CD,O\n')

In [57]:
word_index = 20428+1

sentence_number = 950+1

In [59]:
# initialise with first sentence:
sentence_nr_being_read = sents_to_add[0][0]

with open('SPACY NER/imba_small', 'a') as file:
    
    # add everything that has an interesting term 10 times 
    for j in range(15):
        for s in sents_to_add:

            if s[0] != sentence_nr_being_read:
                # a new sentence is being read
                sentence_number += 1 

            
            # the actual sentence number (which is being duplicated)
            sentence_nr_being_read = s[0]
            
            item = s[1].split(',')

            # mark with new indices, so it belongs at the bottom and is consistent with the rest 
            item[0] = str(word_index)
            item[1] = str(sentence_number)
            print(item)
            file.write(','.join(item))
            
            word_index += 1
            

['74969', '3020', '20', 'CD', 'O\n']
['74970', '3020', 'µl', 'CD', 'O\n']
['74971', '3020', '2', 'CD', 'O\n']
['74972', '3020', 'N', 'NNP', 'O\n']
['74973', '3020', 'NaOH', 'NNP', 'reagent\n']
['74974', '3020', 'into', 'IN', 'O\n']
['74975', '3020', 'dilution', 'NN', 'O\n']
['74976', '3020', '.', '.', 'O\n']
['74977', '3021', 'I', 'PRP', 'O\n']
['74978', '3021', 'decided', 'VBD', 'O\n']
['74979', '3021', 'to', 'TO', 'O\n']
['74980', '3021', 'put', 'VB', 'O\n']
['74981', '3021', '2000000', 'CD', 'O\n']
['74982', '3021', '[', '-LRB-', 'O\n']
['74983', '3021', 'inaudible', 'JJ', 'O\n']
['74984', '3021', ']', '-RRB-', 'O\n']
['74985', '3021', 'cells', 'NNS', 'O\n']
['74986', '3021', 'into', 'IN', 'O\n']
['74987', '3021', '10µl', 'NN', 'O\n']
['74988', '3021', 'of', 'IN', 'O\n']
['74989', '3021', 'RPMI', 'NNP', 'O\n']
['74990', '3021', 'media', 'NNS', 'O\n']
['74991', '3021', 'and', 'CC', 'O\n']
['74992', '3021', 'they', 'PRP', 'O\n']
['74993', '3021', 'will', 'MD', 'O\n']
['74994', '3021',

['78340', '3148', 'million', 'CD', 'O\n']
['78341', '3148', 'cells', 'NNS', 'O\n']
['78342', '3148', '.', '.', 'O\n']
['78343', '3149', 'The', 'DT', 'O\n']
['78344', '3149', '96', 'CD', 'O\n']
['78345', '3149', 'h', 'NN', 'O\n']
['78346', '3149', 'time', 'NN', 'O\n']
['78347', '3149', 'point', 'NN', 'O\n']
['78348', '3149', 'was', 'VBD', 'O\n']
['78349', '3149', 'washed', 'VBN', 'O\n']
['78350', '3149', '1', 'CD', 'O\n']
['78351', '3149', 'x', 'NN', 'O\n']
['78352', '3149', 'with', 'IN', 'O\n']
['78353', '3149', 'PBS', 'NNP', 'reagent\n']
['78354', '3149', '.', '.', 'O\n']
['78355', '3149', 'The', 'DT', 'O\n']
['78356', '3149', 'PBS', 'NNP', 'reagent\n']
['78357', '3149', 'was', 'VBD', 'O\n']
['78358', '3149', 'removed', 'VBN', 'O\n']
['78359', '3149', 'and', 'CC', 'O\n']
['78360', '3149', 'the', 'DT', 'O\n']
['78361', '3149', 'plate', 'NN', 'O\n']
['78362', '3149', 'was', 'VBD', 'O\n']
['78363', '3149', 'placed', 'VBN', 'O\n']
['78364', '3149', 'at', 'IN', 'O\n']
['78365', '3149', '-8

['81825', '3281', 'of', 'IN', 'O\n']
['81826', '3281', 'the', 'DT', 'O\n']
['81827', '3281', '5', 'CD', 'O\n']
['81828', '3281', 'M', 'NN', 'O\n']
['81829', '3282', 'Cell', 'NN', 'O\n']
['81830', '3282', 'culture', 'NN', 'O\n']
['81831', '3282', '.', '.', 'O\n']
['81832', '3282', 'Yesterday', 'NN', 'O\n']
['81833', '3282', 'I', 'PRP', 'O\n']
['81834', '3282', 'A549', 'NNP', 'cell_line\n']
['81835', '3282', 'transduced', 'VBD', 'science_word\n']
['81836', '3282', 'with', 'IN', 'O\n']
['81837', '3282', 'IFITM', 'NNP', 'O\n']
['81838', '3282', '.', '.', 'O\n']
['81839', '3282', 'I', 'PRP', 'O\n']
['81840', '3282', 'split', 'VBD', 'O\n']
['81841', '3282', 'one', 'CD', 'O\n']
['81842', '3282', 'in', 'IN', 'O\n']
['81843', '3282', 'ten', 'CD', 'O\n']
['81844', '3282', 'and', 'CC', 'O\n']
['81845', '3282', 'harvest', 'VB', 'O\n']
['81846', '3282', 'a', 'DT', 'O\n']
['81847', '3282', 'cell', 'NN', 'O\n']
['81848', '3282', 'pellet', 'NN', 'O\n']
['81849', '3282', 'for', 'IN', 'O\n']
['81850', '

['84953', '3395', '', '', 'O\n']
['84954', '3395', 'no', 'UH', 'O\n']
['84955', '3395', '', '', 'O\n']
['84956', '3395', 'I', 'PRP', 'O\n']
['84957', '3395', "'ve", 'VB', 'O\n']
['84958', '3395', 'decided', 'VBN', 'O\n']
['84959', '3395', 'to', 'TO', 'O\n']
['84960', '3395', 'use', 'VB', 'O\n']
['84961', '3395', 'channel', 'NN', 'O\n']
['84962', '3395', '3', 'CD', 'O\n']
['84963', '3395', 'for', 'IN', 'O\n']
['84964', '3395', 'ATP', 'NNP', 'gene\n']
['84965', '3395', '', '', 'O\n']
['84966', '3395', 'Channel', 'NNP', 'O\n']
['84967', '3395', '4', 'CD', 'O\n']
['84968', '3395', 'for', 'IN', 'O\n']
['84969', '3395', 'substrate', 'NN', 'O\n']
['84970', '3395', 'and', 'CC', 'O\n']
['84971', '3395', 'again', 'RB', 'O\n']
['84972', '3395', 'channel', 'NN', 'O\n']
['84973', '3395', '3', 'CD', 'O\n']
['84974', '3395', 'for', 'IN', 'O\n']
['84975', '3395', 'both', 'DT', 'O\n']
['84976', '3395', 'enzymes', 'NNS', 'O\n']
['84977', '3395', '.', '.', 'O\n']
['84978', '3396', 'D7A9', 'NN', 'O\n']
['

['88435', '3527', '70', 'CD', 'O\n']
['88436', '3527', '%', 'NN', 'O\n']
['88437', '3527', 'acetonitrile', 'NN', 'O\n']
['88438', '3527', 'is', 'VBZ', 'O\n']
['88439', '3527', 'added', 'VBN', 'O\n']
['88440', '3527', 'to', 'IN', 'O\n']
['88441', '3527', 'the', 'DT', 'O\n']
['88442', '3527', 'spin', 'NN', 'O\n']
['88443', '3527', 'filters', 'NNS', 'O\n']
['88444', '3527', 'and', 'CC', 'O\n']
['88445', '3527', 'centrifuged', 'VBN', 'O\n']
['88446', '3527', 'at', 'IN', 'O\n']
['88447', '3527', '1500', 'CD', 'O\n']
['88448', '3527', 'RCF', 'NNP', 'O\n']
['88449', '3527', 'for', 'IN', 'O\n']
['88450', '3527', '1', 'CD', 'O\n']
['88451', '3527', 'min', 'NN', 'O\n']
['88452', '3527', 'at', 'IN', 'O\n']
['88453', '3527', 'room', 'NN', 'O\n']
['88454', '3527', 'temperature', 'NN', 'O\n']
['88455', '3527', 'and', 'CC', 'O\n']
['88456', '3527', 'this', 'DT', 'O\n']
['88457', '3527', 'step', 'NN', 'O\n']
['88458', '3527', 'is', 'VBZ', 'O\n']
['88459', '3527', 'repeated', 'VBN', 'O\n']
['88460', '3

['91848', '3661', 'plasmid', 'NN', 'O\n']
['91849', '3661', 'G', 'NNP', 'O\n']
['91850', '3661', 'PGTF2', 'NNP', 'O\n']
['91851', '3661', '78', 'CD', 'O\n']
['91852', '3661', 'BL21', 'NNP', 'cell_line\n']
['91853', '3661', 'D', 'NNP', 'O\n']
['91854', '3661', 'free', 'JJ', 'O\n']
['91855', '3661', 'P', 'NN', 'O\n']
['91856', '3661', 'Grow', 'VBP', 'O\n']
['91857', '3661', '7', 'CD', 'O\n']
['91858', '3661', 'ID7B', 'NNP', 'O\n']
['91859', '3661', 'BL21', 'NNP', 'cell_line\n']
['91860', '3661', 'D', 'NNP', 'O\n']
['91861', '3661', 'free', 'JJ', 'O\n']
['91862', '3661', 'P', 'NN', 'O\n']
['91863', '3661', 'Grow', 'VBP', 'O\n']
['91864', '3661', '7', 'CD', 'O\n']
['91865', '3661', 'ID', 'NNP', 'O\n']
['91866', '3661', '10A', 'CD', 'cell_line\n']
['91867', '3661', 'BL21', 'NNP', 'cell_line\n']
['91868', '3661', 'D', 'NNP', 'O\n']
['91869', '3661', 'free', 'JJ', 'O\n']
['91870', '3661', 'and', 'CC', 'O\n']
['91871', '3661', 'chaperone', 'NN', 'O\n']
['91872', '3661', 'plasmid', 'NN', 'O\n']

['95370', '3798', 'for', 'IN', 'O\n']
['95371', '3798', 'the', 'DT', 'O\n']
['95372', '3798', 'samples', 'NNS', 'O\n']
['95373', '3798', 'that', 'WDT', 'O\n']
['95374', '3798', 'I', 'PRP', 'O\n']
['95375', '3798', 'made', 'VBD', 'O\n']
['95376', '3798', 'a', 'DT', 'O\n']
['95377', '3798', 'mistake', 'NN', 'O\n']
['95378', '3798', 'previously', 'RB', 'O\n']
['95379', '3798', 'in', 'IN', 'O\n']
['95380', '3798', 'during', 'IN', 'O\n']
['95381', '3798', 'the', 'DT', 'O\n']
['95382', '3798', '.', '.', 'O\n']
['95383', '3798', 'Um', 'UH', 'O\n']
['95384', '3798', 'preamplifications', 'NNS', 'science_word\n']
['95385', '3798', 'step', 'NN', 'O\n']
['95386', '3798', 'that', 'WDT', 'O\n']
['95387', '3798', 'I', 'PRP', 'O\n']
['95388', '3798', 'have', 'VBP', 'O\n']
['95389', '3798', 'now', 'RB', 'O\n']
['95390', '3798', 'increase', 'VB', 'O\n']
['95391', '3798', 'the', 'DT', 'O\n']
['95392', '3798', 'volume', 'NN', 'O\n']
['95393', '3798', 'of', 'IN', 'O\n']
['95394', '3798', 'the', 'DT', 'O\n'

['98904', '3932', '2nd', 'JJ', 'O\n']
['98905', '3932', 'A3', 'NN', 'O\n']
['98906', '3932', '.', '.', 'O\n']
['98907', '3932', 'Pour', 'VB', 'O\n']
['98908', '3932', 'these', 'DT', 'O\n']
['98909', '3932', 'in', 'IN', 'O\n']
['98910', '3932', 'water', 'NN', 'O\n']
['98911', '3932', 'and', 'CC', 'O\n']
['98912', '3932', 'add', 'VB', 'O\n']
['98913', '3932', 'Tris', 'NNP', 'O\n']
['98914', '3932', 'buffer', 'NN', 'O\n']
['98915', '3932', 'to', 'TO', 'O\n']
['98916', '3932', 'get', 'VB', 'O\n']
['98917', '3932', 'pH', 'NN', 'O\n']
['98918', '3932', '8.3', 'CD', 'O\n']
['98919', '3932', '.', '.', 'O\n']
['98920', '3933', 'K562', 'NN', 'cell_line\n']
['98921', '3933', 'viability', 'NN', 'O\n']
['98922', '3933', '83.7', 'CD', 'O\n']
['98923', '3933', '%', 'NN', 'O\n']
['98924', '3933', '.', '.', 'O\n']
['98925', '3933', 'Live', 'JJ', 'O\n']
['98926', '3934', 'Test', 'VB', 'O\n']
['98927', '3934', 'with', 'IN', 'O\n']
['98928', '3934', 'difficult', 'JJ', 'O\n']
['98929', '3934', 'words', 'NN

['102435', '4067', 'h.', 'NN', 'O\n']
['102436', '4067', 'But', 'CC', 'O\n']
['102437', '4067', 'the', 'DT', 'O\n']
['102438', '4067', 'control', 'NN', 'O\n']
['102439', '4067', 'with', 'IN', 'O\n']
['102440', '4067', 'GSP', 'NNP', 'gene\n']
['102441', '4067', 'looked', 'VBD', 'O\n']
['102442', '4067', 'after', 'IN', 'O\n']
['102443', '4067', '24', 'CD', 'O\n']
['102444', '4067', 'h', 'NN', 'O\n']
['102445', '4067', 'not', 'RB', 'O\n']
['102446', '4067', 'so', 'RB', 'O\n']
['102447', '4067', 'good', 'JJ', 'O\n']
['102448', '4067', 'so', 'RB', 'O\n']
['102449', '4067', 'after', 'IN', 'O\n']
['102450', '4067', '48', 'CD', 'O\n']
['102451', '4067', 'h', 'NN', 'O\n']
['102452', '4067', 'all', 'DT', 'O\n']
['102453', '4067', 'cells', 'NNS', 'O\n']
['102454', '4067', 'are', 'VBP', 'O\n']
['102455', '4067', 'green', 'JJ', 'O\n']
['102456', '4067', 'means', 'NNS', 'O\n']
['102457', '4067', 'they', 'PRP', 'O\n']
['102458', '4067', 'are', 'VBP', 'O\n']
['102459', '4067', 'transduce', 'NN', 'scie

['105963', '4202', '.', '.', 'O\n']
['105964', '4202', 'To', 'TO', 'O\n']
['105965', '4202', 'pipette', 'VB', 'O\n']
['105966', '4202', 'the', 'DT', 'O\n']
['105967', '4202', 'I', 'PRP', 'O\n']
['105968', '4202', "'ve", 'VB', 'O\n']
['105969', '4202', 'pipetted', 'VBN', 'science_word\n']
['105970', '4202', '4000', 'CD', 'O\n']
['105971', '4202', 'µl', 'IN', 'O\n']
['105972', '4202', 'of', 'IN', 'O\n']
['105973', '4202', '4', 'CD', 'O\n']
['105974', '4202', 'milliters', 'NNS', 'O\n']
['105975', '4202', 'and', 'CC', 'O\n']
['105976', '4202', 'to', 'TO', 'O\n']
['105977', '4202', 'pipette', 'VB', 'O\n']
['105978', '4202', 'the', 'DT', 'O\n']
['105979', '4202', 'remaining', 'VBG', 'O\n']
['105980', '4202', 'amount', 'NN', 'O\n']
['105981', '4202', 'I', 'PRP', 'O\n']
['105982', '4202', 'did', 'VBD', 'O\n']
['105983', '4202', 'the', 'DT', 'O\n']
['105984', '4202', 'P1000C', 'NNP', 'O\n']
['105985', '4202', 'blue', 'JJ', 'O\n']
['105986', '4202', '', '', 'O\n']
['105987', '4202', 'serial', 'J

['109584', '4340', 'use', 'VBP', 'O\n']
['109585', '4340', 'the', 'DT', 'O\n']
['109586', '4340', 'PAL', 'NN', 'O\n']
['109587', '4340', 'pipette', 'NN', 'O\n']
['109588', '4340', 'pro', 'NN', 'O\n']
['109589', '4340', 'and', 'CC', 'O\n']
['109590', '4340', 'a', 'DT', 'O\n']
['109591', '4340', '5', 'CD', 'O\n']
['109592', '4340', 'ml', 'NNS', 'O\n']
['109593', '4340', 'stripette', 'NN', 'O\n']
['109594', '4340', 'to', 'TO', 'O\n']
['109595', '4340', 'pipette', 'VB', 'O\n']
['109596', '4340', 'my', 'PRP$', 'O\n']
['109597', '4340', '5000', 'CD', 'O\n']
['109598', '4340', 'volume', 'NN', 'O\n']
['109599', '4340', '.', '.', 'O\n']
['109600', '4340', 'To', 'TO', 'O\n']
['109601', '4340', 'pipette', 'VB', 'O\n']
['109602', '4340', 'the', 'DT', 'O\n']
['109603', '4340', 'I', 'PRP', 'O\n']
['109604', '4340', "'ve", 'VB', 'O\n']
['109605', '4340', 'pipetted', 'VBN', 'science_word\n']
['109606', '4340', '4000', 'CD', 'O\n']
['109607', '4340', 'µl', 'IN', 'O\n']
['109608', '4340', 'of', 'IN', 'O

['113201', '4477', 'the', 'DT', 'O\n']
['113202', '4477', 'protocol', 'NN', 'O\n']
['113203', '4477', 'because', 'IN', 'O\n']
['113204', '4477', 'that', 'DT', 'O\n']
['113205', '4477', 'one', 'NN', 'O\n']
['113206', '4477', 'already', 'RB', 'O\n']
['113207', '4477', 'has', 'VBZ', 'O\n']
['113208', '4477', 'extra', 'JJ', 'O\n']
['113209', '4477', 'reactions', 'NNS', 'O\n']
['113210', '4477', 'for', 'IN', 'O\n']
['113211', '4477', 'pipetting', 'VBG', 'science_word\n']
['113212', '4477', 'error', 'NN', 'O\n']
['113213', '4477', '.', '.', 'O\n']
['113214', '4478', 'To', 'TO', 'O\n']
['113215', '4478', 'pipette', 'VB', 'O\n']
['113216', '4478', 'my', 'PRP$', 'O\n']
['113217', '4478', 'volume', 'NN', 'O\n']
['113218', '4478', '', '', 'O\n']
['113219', '4478', 'I', 'PRP', 'O\n']
['113220', '4478', 'use', 'VBP', 'O\n']
['113221', '4478', 'the', 'DT', 'O\n']
['113222', '4478', 'PAL', 'NN', 'O\n']
['113223', '4478', 'pipette', 'NN', 'O\n']
['113224', '4478', 'pro', 'NN', 'O\n']
['113225', '4478'

['116652', '4603', 'stocks', 'NNS', 'O\n']
['116653', '4603', 'with', 'IN', 'O\n']
['116654', '4603', 'POBC', 'UH', 'cell_line\n']
['116655', '4603', '.', '.', 'O\n']
['116656', '4604', '3.86', 'CD', 'O\n']
['116657', '4604', 'µl', 'IN', 'O\n']
['116658', '4604', 'RSB', 'NNP', 'cell_line\n']
['116659', '4604', 'all', 'DT', 'O\n']
['116660', '4604', '8', 'CD', 'O\n']
['116661', '4604', 'lanes', 'NNS', 'O\n']
['116662', '4604', '.', '.', 'O\n']
['116663', '4605', 'No', 'DT', 'O\n']
['116664', '4605', 'change', 'NN', 'O\n']
['116665', '4605', 'in', 'IN', 'O\n']
['116666', '4605', 'CO2', 'NNP', 'cell_line\n']
['116667', '4605', '.', '.', 'O\n']
['116668', '4605', 'Let', 'VB', 'O\n']
['116669', '4605', "'s", 'PRP', 'O\n']
['116670', '4605', 'start', 'VB', 'O\n']
['116671', '4605', 'occlusion', 'NN', 'O\n']
['116672', '4606', 'Send', 'VB', 'O\n']
['116673', '4606', 'reminder', 'NN', 'O\n']
['116674', '4606', 'in', 'IN', 'O\n']
['116675', '4606', '20', 'CD', 'O\n']
['116676', '4606', 'min', '

['119984', '4730', 'eppys', 'NN', 'O\n']
['119985', '4730', '..', '.', 'O\n']
['119986', '4731', 'Continue', 'VB', 'O\n']
['119987', '4731', 'project', 'NN', 'O\n']
['119988', '4731', 'PMS', 'NNP', 'O\n']
['119989', '4731', '6', 'CD', 'O\n']
['119990', '4731', '-', 'HYPH', 'O\n']
['119991', '4731', '1-UC', 'CD', 'O\n']
['119992', '4731', ':', ':', 'O\n']
['119993', '4731', 'Digest', 'VB', 'O\n']
['119994', '4731', 'with', 'IN', 'O\n']
['119995', '4731', 'DPN1', 'NNP', 'cell_line\n']
['119996', '4731', 'restriction', 'NN', 'O\n']
['119997', '4731', 'enzyme', 'NN', 'O\n']
['119998', '4731', '.', '.', 'O\n']
['119999', '4732', 'I', 'PRP', 'O\n']
['120000', '4732', 'pipetted', 'VBD', 'science_word\n']
['120001', '4732', '5', 'CD', 'O\n']
['120002', '4732', 'µl', 'XX', 'O\n']
['120003', '4732', '.', '.', 'O\n']
['120004', '4733', 'Let', 'VB', 'O\n']
['120005', '4733', 'me', 'PRP', 'O\n']
['120006', '4733', 'introduce', 'VB', 'O\n']
['120007', '4733', 'this', 'DT', 'O\n']
['120008', '4733', 

['123363', '4856', 'buffer', 'NN', 'O\n']
['123364', '4856', '20', 'CD', 'O\n']
['123365', '4856', 'mM', 'NNP', 'O\n']
['123366', '4856', 'HEPES', 'NNP', 'O\n']
['123367', '4856', '', '', 'O\n']
['123368', '4856', '150', 'CD', 'O\n']
['123369', '4856', 'mM', 'CD', 'O\n']
['123370', '4856', 'sodium', 'NN', 'O\n']
['123371', '4856', 'chloride', 'NN', 'O\n']
['123372', '4856', '', '', 'O\n']
['123373', '4856', '1', 'CD', 'O\n']
['123374', '4856', 'mM', 'NNP', 'O\n']
['123375', '4856', 'TCEP', 'NNP', 'O\n']
['123376', '4856', '0.05', 'CD', 'O\n']
['123377', '4856', '%', 'NN', 'O\n']
['123378', '4856', 'Tween20', 'NNP', 'O\n']
['123379', '4856', 'pH', 'NN', 'O\n']
['123380', '4856', '7.5', 'CD', 'O\n']
['123381', '4856', '.', '.', 'O\n']
['123382', '4857', 'Adding', 'VBG', 'O\n']
['123383', '4857', 'one', 'CD', 'O\n']
['123384', '4857', 'ml', 'NN', 'O\n']
['123385', '4857', 'of', 'IN', 'O\n']
['123386', '4857', 'MACS', 'NNP', 'gene\n']
['123387', '4857', 'buffer', 'NN', 'O\n']
['123388', '4

['126881', '4990', 'J11', 'NNP', 'O\n']
['126882', '4990', '', '', 'O\n']
['126883', '4990', 'J12', 'NNP', 'O\n']
['126884', '4990', '', '', 'O\n']
['126885', '4990', 'K12', 'NNP', 'O\n']
['126886', '4990', '', '', 'O\n']
['126887', '4990', 'O12', 'NNP', 'O\n']
['126888', '4991', 'I', 'PRP', 'O\n']
['126889', '4991', 'decided', 'VBD', 'O\n']
['126890', '4991', 'to', 'TO', 'O\n']
['126891', '4991', 'split', 'VB', 'O\n']
['126892', '4991', 'the', 'DT', 'O\n']
['126893', '4991', 'Jurkat', 'NNP', 'cell_line\n']
['126894', '4991', 'cells', 'VBZ', 'O\n']
['126895', '4991', 'one', 'CD', 'O\n']
['126896', '4991', 'more', 'JJR', 'O\n']
['126897', '4991', 'time', 'NN', 'O\n']
['126898', '4991', '.', '.', 'O\n']
['126899', '4991', 'I', 'PRP', 'O\n']
['126900', '4991', 'did', 'VBD', 'O\n']
['126901', '4991', 'a', 'DT', 'O\n']
['126902', '4991', 'full', 'JJ', 'O\n']
['126903', '4991', 'split', 'NN', 'O\n']
['126904', '4991', 'of', 'IN', 'O\n']
['126905', '4991', 'the', 'DT', 'O\n']
['126906', '4991

In [50]:
for item in sents_to_add:
    print(','.join(item[1:]))

116,3,20,CD,O

117,3,µl,CD,O

118,3,2,CD,O

119,3,N,NNP,O

120,3,NaOH,NNP,reagent

121,3,into,IN,O

122,3,dilution,NN,O

123,3,.,.,O

198,9,I,PRP,O

199,9,decided,VBD,O

200,9,to,TO,O

201,9,put,VB,O

202,9,2000000,CD,O

203,9,[,-LRB-,O

204,9,inaudible,JJ,O

205,9,],-RRB-,O

206,9,cells,NNS,O

207,9,into,IN,O

208,9,10µl,NN,O

209,9,of,IN,O

210,9,RPMI,NNP,O

211,9,media,NNS,O

212,9,and,CC,O

213,9,they,PRP,O

214,9,will,MD,O

215,9,be,VB,O

216,9,cultured,VBN,O

217,9,for,IN,O

218,9,the,DT,O

219,9,next,JJ,O

220,9,2,CD,O

221,9,to,TO,O

222,9,3,CD,O

223,9,days,NNS,O

224,9,.,.,O

225,9,I,PRP,O

226,9,will,MD,O

227,9,monitor,VB,O

228,9,the,DT,O

229,9,growth,NN,O

230,9,probably,RB,O

231,9,on,IN,O

232,9,Sunday,NNP,O

233,9,quickly,RB,O

234,9,and,CC,O

235,9,see,VB,O

236,9,if,IN,O

237,9,I,PRP,O

238,9,need,VBP,O

239,9,to,TO,O

240,9,exchange,VB,O

241,9,the,DT,O

242,9,media,NNS,O

243,9,.,.,O

244,9,Jokerit,NNP,cell_line

245,9,cells,NNS,O

246,9,are,VBP,O

247,9,highly,RB


11851,564,number,NN,O

11852,564,of,IN,O

11853,564,reactions,NNS,O

11854,564,for,IN,O

11855,564,the,DT,O

11856,564,master,NN,O

11857,564,mix,NN,O

11858,564,has,VBZ,O

11859,564,to,TO,O

11860,564,be,VB,O

11861,564,exactly,RB,O

11862,564,that,DT,O

11863,564,of,IN,O

11864,564,the,DT,O

11865,564,one,NN,O

11866,564,is,VBZ,O

11867,564,stated,VBN,O

11868,564,on,IN,O

11869,564,the,DT,O

11870,564,protocol,NN,O

11871,564,because,IN,O

11872,564,that,DT,O

11873,564,one,NN,O

11874,564,already,RB,O

11875,564,has,VBZ,O

11876,564,extra,JJ,O

11877,564,reactions,NNS,O

11878,564,for,IN,O

11879,564,pipetting,VBG,science_word

11880,564,error,NN,O

11881,564,.,.,O

11927,566,To,TO,O

11928,566,pipette,VB,O

11929,566,my,PRP$,O

11930,566,volume,NN,O

11931,566,,,O

11932,566,I,PRP,O

11933,566,use,VBP,O

11934,566,the,DT,O

11935,566,PAL,NN,O

11936,566,pipette,NN,O

11937,566,pro,NN,O

11938,566,and,CC,O

11939,566,a,DT,O

11940,566,5,CD,O

11941,566,ml,NNS,O

11942,566,stripett


21026,985,first,JJ,O

21027,985,round,NN,O

21028,985,.,.,O

21029,985,Single,JJ,O

21030,985,chain,NN,O

21031,985,5D5,CD,cell_line

21032,985,25,CD,O

21033,985,mosquitoes,NNS,O

21034,985,.,.,O

21144,993,15,CD,O

21145,993,microchip,NN,science_word

21146,993,runs,NNS,O

21147,993,faster,RBR,O

21148,993,than,IN,O

21149,993,the,DT,O

21150,993,other,JJ,O

21151,993,ones,NNS,O

21152,993,.,.,O

21290,1000,Starting,VBG,O

21291,1000,a,DT,O

21292,1000,gel,NN,O

21293,1000,.,.,O

21294,1000,Taking,VBG,O

21295,1000,out,RP,O

21296,1000,a,DT,O

21297,1000,17,CD,O

21298,1000,lane,NN,O

21299,1000,gel,NN,O

21300,1000,from,IN,O

21301,1000,the,DT,O

21302,1000,Invitrogen,NNP,O

21303,1000,.,.,O

21304,1000,Nupage,NNP,O

21305,1000,4,CD,O

21306,1000,to,TO,O

21307,1000,12,CD,O

21308,1000,%,NN,O

21309,1000,.,.,O

21310,1000,Bis,NNP,O

21311,1000,Tris,NNP,O

21312,1000,gel,NN,O

21313,1000,.,.,O

21314,1000,Lot,NN,O

21315,1000,number,NN,O

21316,1000,19040470,CD,O

21317,1000,.,.,O



29108,1343,mg,NN,O

29109,1343,/,SYM,O

29110,1343,ml,NN,O

29111,1343,.,.,O

29112,1343,Therefore,RB,O

29113,1343,,,O

29114,1343,60,CD,O

29115,1343,µl,NN,O

29116,1343,of,IN,O

29117,1343,Alcohol,NNP,O

29118,1343,dehydrogenase,NN,O

29119,1343,were,VBD,O

29120,1343,mixed,VBN,O

29121,1343,with,IN,O

29122,1343,40,CD,O

29123,1343,µl,CD,O

29124,1343,of,IN,O

29125,1343,PBS,NNP,reagent

29126,1343,buffer,NN,O

29127,1343,.,.,O

29128,1343,Better,RB,O

29129,1343,I,PRP,O

29130,1343,'m,VBP,O

29131,1343,realizar,NN,O

29132,1343,with,IN,O

29133,1343,4,CD,O

29134,1343,mg,NN,O

29135,1343,/,SYM,O

29136,1343,ml,NNS,O

29137,1343,was,VBD,O

29138,1343,diluted,VBN,O

29139,1343,to,IN,O

29140,1343,3,CD,O

29141,1343,mg,NN,O

29142,1343,/,SYM,O

29143,1343,ml,NNS,O

29144,1343,in,IN,O

29145,1343,PBS,NNP,reagent

29146,1343,.,.,O

29147,1343,Therefore,RB,O

29148,1343,,,O

29149,1343,75,CD,O

29150,1343,µl,XX,O

29151,1343,of,IN,O

29152,1343,beta,NN,O

29153,1343,Amylase,NNP,O

29154

38551,1787,sonication,NN,science_word

38552,1787,was,VBD,O

38553,1787,finished,VBN,O

38554,1787,so,IN,O

38555,1787,the,DT,O

38556,1787,whole,JJ,O

38557,1787,procedure,NN,O

38558,1787,for,IN,O

38559,1787,6,CD,O

38560,1787,tubes,NNS,O

38561,1787,takes,VBZ,O

38562,1787,about,RB,O

38563,1787,1,CD,O

38564,1787,h,NN,O

38565,1787,,,O

38566,1787,but,CC,O

38567,1787,it,PRP,O

38568,1787,'s,VBZ,O

38569,1787,also,RB,O

38570,1787,a,DT,O

38571,1787,lot,NN,O

38572,1787,of,IN,O

38573,1787,incubation,NN,O

38574,1787,time,NN,O

38575,1787,with,IN,O

38576,1787,just,RB,O

38577,1787,resuspending,VBG,O

38578,1787,the,DT,O

38579,1787,pellets,NNS,O

38580,1787,.,.,O

38598,1789,Jurkat,NNP,cell_line

38599,1789,cells,NNS,O

38600,1789,were,VBD,O

38601,1789,at,IN,O

38602,1789,0.68,CD,O

38603,1789,and,CC,O

38604,1789,86,CD,O

38605,1789,%,NN,O

38606,1789,.,.,O

38607,1789,I,PRP,O

38608,1789,'m,VBP,O

38609,1789,going,VBG,O

38610,1789,to,TO,O

38611,1789,split,VB,O

38612,1789,th

50682,2379,the,DT,O

50683,2379,noise,NN,O

50684,2379,around,IN,O

50685,2379,me,PRP,O

50686,2379,this,DT,O

50687,2379,maybe,RB,O

50688,2379,influencing,VBG,O

50689,2379,the,DT,O

50690,2379,transcription,NN,O

50691,2379,.,.,O

50692,2379,The,DT,O

50693,2379,next,JJ,O

50694,2379,plans,NNS,O

50695,2379,are,VBP,O

50696,2379,to,TO,O

50697,2379,split,VB,O

50698,2379,the,DT,O

50699,2379,cells,NNS,O

50700,2379,SF9,NNP,cell_line

50701,2379,ESF,NNP,O

50702,2379,as,RB,O

50703,2379,well,RB,O

50704,2379,as,IN,O

50705,2379,SF9,NNP,cell_line

50706,2379,Lonza,NNP,O

50707,2379,to,IN,O

50708,2379,one,CD,O

50709,2379,million,CD,O

50710,2379,cells,NNS,O

50711,2379,per,IN,O

50712,2379,milliliter,NN,O

50713,2379,and,CC,O

50714,2379,later,RB,O

50715,2379,on,RB,O

50716,2379,also,RB,O

50717,2379,produce,VB,O

50718,2379,P1,NN,O

50719,2379,virus,NN,O

50720,2379,.,.,O

50721,2380,For,IN,O

50722,2380,paraformaldehyde,NN,O

50723,2380,fixation,NN,O

50724,2380,add,VBP,O

50725,2

59721,2848,2,CD,O

59722,2848,.,.,O

59723,2848,Caterpillar,NN,O

59724,2848,.,.,O

59730,2850,116,CD,O

59731,2850,cut,NN,O

59732,2850,TL1,NNP,cell_line

59733,2850,,,O

59734,2850,TL2,NNP,gene

59735,2850,,,O

59737,2850,TL3,NNP,O

59738,2850,,,O

59739,2850,TL4,NNP,gene

59740,2850,,,O

59741,2850,LB1,NNP,gene

59742,2850,,,O

59743,2850,LB3,NNP,O

59744,2850,.,.,O

59746,2850,This,DT,O

59747,2850,the,DT,O

59748,2850,only,JJ,O

59749,2850,incentive,NN,O

59750,2850,that,WDT,O

59751,2850,worked,VBD,O

59752,2850,for,IN,O

59753,2850,all,DT,O

59754,2850,of,IN,O

59755,2850,them,PRP,O

59756,2850,,,O

59757,2850,which,WDT,O

59758,2850,is,VBZ,O

59759,2850,NDE1,NNP,O

59760,2850,.,.,O

59761,2850,And,CC,O

59762,2850,cut,VB,O

59763,2850,TL4,NNP,gene

59764,2850,with,IN,O

59765,2850,BamH1,NNP,O

59766,2850,.,.,O

59767,2850,High,JJ,O

59768,2850,fidelity,NN,O

59769,2850,.,.,O

59770,2850,For,IN,O

59771,2850,3,CD,O

59772,2850,h,NN,O

59773,2850,then,RB,O

59774,2850,split,VBD,O


67376,3218,very,RB,O

67377,3218,fast,RB,O

67378,3218,.,.,O

67379,3218,Maybe,RB,O

67380,3218,I,PRP,O

67381,3218,have,VBP,O

67382,3218,to,TO,O

67383,3218,split,VB,O

67384,3218,them,PRP,O

67385,3218,higher,JJR,O

67386,3218,than,IN,O

67387,3218,1,CD,O

67388,3218,in,IN,O

67389,3218,10,CD,O

67390,3218,.,.,O

67538,3225,Transfection,NNP,O

67539,3225,of,IN,O

67540,3225,NPR,NNP,gene

67541,3225,2,CD,O

67542,3225,plates,NNS,O

67543,3225,.,.,O

67674,3232,Tip,NN,O

67675,3232,7,CD,O

67676,3232,was,VBD,O

67677,3232,used,VBN,O

67678,3232,for,IN,O

67679,3232,dispensing,VBG,O

67680,3232,10X,CD,O

67681,3232,substrate,NN,O

67682,3232,KN,NNP,gene

67683,3232,.,.,O

67784,3240,Remove,VB,O

67785,3240,10,CD,O

67786,3240,µl,CD,O

67787,3240,from,IN,O

67788,3240,B1,NN,O

67789,3240,through,IN,O

67790,3240,B12,NNP,O

67791,3240,and,CC,O

67792,3240,pipetted,VBN,science_word

67793,3240,for,IN,O

67794,3240,a,DT,O

67795,3240,total,NN,O

67796,3240,of,IN,O

67797,3240,40,CD,O

677

In [62]:
with open('SPACY NER/imba_small') as file:
    with open('imba_small_test') as t_file:
    
        for l in t_file.readlines():
            
            for l2 in file.readlines():
                if l[:3] == l2[:3]:
                    print(l)
                    print(l2)


64800,3104,Today,NN,O

648,25,1,CD,O

64800,3104,Today,NN,O

6480,325,up,RP,O

64800,3104,Today,NN,O

6481,325,a,DT,O

64800,3104,Today,NN,O

6482,325,ligation,NN,O

64800,3104,Today,NN,O

6483,325,reaction,NN,O

64800,3104,Today,NN,O

6484,325,using,VBG,O

64800,3104,Today,NN,O

6485,326,Miss,VBP,O

64800,3104,Today,NN,O

6486,326,molecular,JJ,O

64800,3104,Today,NN,O

6487,326,weight,NN,O

64800,3104,Today,NN,O

6488,326,213.25,CD,O

64800,3104,Today,NN,O

6489,326,.,.,O



In [65]:
with open('SPACY NER/imba_small2', 'w') as file2:

    with open('SPACY NER/imba_small') as file:
        for l in file.readlines():
            items = l.split(',')
            
            if len(items) == 5:
                file2.write(l)
        

### 4. Add synthetic data to the newest tagged set.<a name="fjórða"></a>

     - Check the synthetic data plan 
     - Check the already labelled dataset

In [None]:
# mark unambiguous terms from  from wikilabelled



write_to = open('SPACY NER/cell_gene_reagent')


with open('data/synthetic_sentences_normalized.txt', 'w')  as write_from:

    # label everything else with an 'O'
    j = 0 
    for l in cells.readlines():
        items = l.split(',')
        new_file.write(l)
        j += 1

            

In [49]:
import os
os.getcwd()

'/Users/valdimareggertsson/Documents/Valdi/Vetrarönn 2019/NER'