# spaCy Formatting
This notebook covers the process of preparing data for training a spaCy NER model. The code can be reused for any dataset whose documents are already tokenized into lists of tokens, each with a parallel list of IOB tags.



In [None]:
import pandas as pd
import numpy as np

# Run this cell to mount your Google Drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

# Work from the project's data folder on the mounted Drive so that the relative
# file names used by later cells (e.g. 'mitmovie.pickle') resolve.
os.chdir('/content/drive/My Drive/Colab Notebooks/DAAN888/data')
# Alternate data folder location, kept for reference.
#os.chdir('/content/drive/My Drive/DAAN888/data')
os.getcwd()

'/content/drive/My Drive/Colab Notebooks/DAAN888/data'

### Function for Converting to spaCy format

In [None]:
import spacy
from spacy.tokens import Doc
from spacy.gold import GoldParse
from spacy.gold import biluo_tags_from_offsets, iob_to_biluo, offsets_from_biluo_tags

# Load the 'en' pipeline with the tagger, parser and ner components disabled.
nlp = spacy.load('en', vectors=False, disable=['tagger', 'parser', 'ner'])
# Swap the tokenizer for tokens_from_list.
# NOTE(review): presumably so nlp() accepts already-tokenized input; spacy_formatting
# below only reads nlp.vocab, so confirm this override is still needed.
nlp.tokenizer = nlp.tokenizer.tokens_from_list


def spacy_formatting(docs, labels):
  ''' Convert pre-tokenized docs and their IOB tags into (text, annotations) pairs.

  docs and labels must both be lists of lists: one list of tokens per
  document and a parallel list of IOB tags for those tokens.

  Returns a list of (text, {'entities': [...]}) tuples where text is the
  document's tokens joined by single spaces and the entities list is
  whatever offsets_from_biluo_tags reports for that document.
  '''

  formatted = []
  for tokens, iob_tags in zip(docs, labels):
    # the offsets helper works on BILUO tags, so convert from IOB first
    biluo_tags = iob_to_biluo(iob_tags)

    # build the Doc directly from the given tokens (no re-tokenization)
    spacy_doc = Doc(nlp.vocab, words=tokens)

    annotations = {'entities': offsets_from_biluo_tags(spacy_doc, biluo_tags)}
    formatted.append((' '.join(tokens), annotations))

  return formatted

### MIT MOVIE CONVERSION

In [None]:
import pickle

# Load the pickled MIT Movie dataset from the current working directory.
# NOTE: pickle.load can execute arbitrary code; only open pickles from a trusted source.
with open('mitmovie.pickle', mode = 'rb') as handle:
  dataset = pickle.load(handle)

In [None]:
dataset.keys()

dict_keys(['train_tokens', 'train_labels', 'test_tokens', 'test_labels'])

In [None]:
train = spacy_formatting(dataset['train_tokens'], dataset['train_labels'])

In [None]:
# check a result
train[0], dataset['train_tokens'][0], dataset['train_labels'][0]

(('what movies star bruce willis', {'entities': [(17, 29, 'ACTOR')]}),
 ['what', 'movies', 'star', 'bruce', 'willis'],
 ['O', 'O', 'O', 'B-ACTOR', 'I-ACTOR'])

In [None]:
test = spacy_formatting(dataset['test_tokens'], dataset['test_labels'])

In [None]:
# check a result
test[0], dataset['test_tokens'][0], dataset['test_labels'][0]

(('are there any good romantic comedies out right now',
  {'entities': [(19, 36, 'GENRE'), (41, 50, 'YEAR')]}),
 ['are',
  'there',
  'any',
  'good',
  'romantic',
  'comedies',
  'out',
  'right',
  'now'],
 ['O', 'O', 'O', 'O', 'B-GENRE', 'I-GENRE', 'O', 'B-YEAR', 'I-YEAR'])

In [None]:
import json

# save as json
# NOTE: JSON has no tuple type, so the (text, annotations) pairs and the entity
# tuples are written as arrays and come back as lists when the file is reloaded.
with open('spacy_mitmovie_train.json', 'w') as handle:
  json.dump(train, handle)  

In [None]:
# Save the converted test split as json (tuples are written as JSON arrays).
with open('spacy_mitmovie_test.json', 'w') as handle:
  json.dump(test, handle)



---



---



### GMB DATASET CONVERSION

In [None]:
# Load the pickled GMB dataset; this rebinds `dataset`, replacing the MIT Movie data.
# NOTE: pickle.load can execute arbitrary code; only open pickles from a trusted source.
with open('gmb.pickle', mode = 'rb') as handle:
  dataset = pickle.load(handle)

In [None]:
dataset.keys()

dict_keys(['tokens', 'labels'])

In [None]:
gmb = spacy_formatting(dataset['tokens'], dataset['labels'])

In [None]:
gmb[0], list(zip(dataset['tokens'][0], dataset['labels'][0]))

(('Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .',
  {'entities': [(48, 54, 'geo'), (77, 81, 'geo'), (111, 118, 'gpe')]}),
 [('Thousands', 'O'),
  ('of', 'O'),
  ('demonstrators', 'O'),
  ('have', 'O'),
  ('marched', 'O'),
  ('through', 'O'),
  ('London', 'B-geo'),
  ('to', 'O'),
  ('protest', 'O'),
  ('the', 'O'),
  ('war', 'O'),
  ('in', 'O'),
  ('Iraq', 'B-geo'),
  ('and', 'O'),
  ('demand', 'O'),
  ('the', 'O'),
  ('withdrawal', 'O'),
  ('of', 'O'),
  ('British', 'B-gpe'),
  ('troops', 'O'),
  ('from', 'O'),
  ('that', 'O'),
  ('country', 'O'),
  ('.', 'O')])

In [None]:
# save as json
# (tuples in `gmb` are written as JSON arrays and reload as lists)

with open('spacy_gmb.json', 'w') as handle:
  json.dump(gmb, handle)

## Unused Code


In [None]:
def convert_to_spacy(tokens, tags):
  ''' Convert tokenized docs with B-/I-/O tags into spaCy span format.

  edited but referenced from: https://aihub.cloud.google.com/p/products%2F2290fc65-0041-4c87-a898-0289f59aa8ba 

  tokens: list of docs, each a list of word strings.
  tags:   list of tag lists parallel to tokens; each tag is 'O' or a
          prefixed label such as 'B-ACTOR' / 'I-ACTOR'.

  Returns (spacy_format, unique_labels):
    spacy_format  - one [sentence, {'entities': [(start, end, label), ...]}]
                    entry per non-empty doc, where sentence is the words
                    joined by single spaces and start/end are character
                    offsets into it (end exclusive).
    unique_labels - entity labels in order of first appearance.
  '''

  spacy_format, unique_labels = [], []

  for words, labels in zip(tokens, tags):
    doc = list(zip(words, labels))
    if not doc:
      continue # empty docs produce no entry

    entities = []
    current_annotation = None
    end = 0 # running character offset: start of the next word in the joined sentence

    for w, l in doc:
      label = l[2:] # strip the 'B-' / 'I-' prefix; empty for a bare 'O'
      label_type = l[0]
      word_start = end
      end += (len(w) + 1) # word plus the joining space

      if label_type != 'I' and current_annotation: # if at end of annotation
        # the open span ended one character (the space) before this word
        entities.append((start, word_start - 1, current_annotation))
        current_annotation = None # reset
      if label_type == 'B': # if beginning of an annotation
        start = word_start
        current_annotation = label
      elif label_type == 'I': # if annotation is multi-word
        if current_annotation is None:
          # 'I-' with no open span: start one here instead of reusing a
          # stale or undefined start offset
          start = word_start
        current_annotation = label

      # 'O' tags have an empty label after slicing, so test for that as
      # well; otherwise '' ends up in unique_labels
      if label and label != 'O' and label not in unique_labels:
        unique_labels.append(label)

    if current_annotation: # annotation still open at the end of the doc
      entities.append((start, end - 1, current_annotation))

    sentence = ' '.join([w for w, l in doc])
    spacy_format.append([sentence, {'entities' : entities}])

  return spacy_format, unique_labels

In [None]:
#!head engtrain.bio

In [None]:
# used for extracting from bio format
# https://aihub.cloud.google.com/p/products%2F2290fc65-0041-4c87-a898-0289f59aa8ba 

# def load_data_spacy(file_path):
#     ''' Converts data from:
#     label \t word \n label \t word \n \n label \t word
#     to: sentence, {entities : [(start, end, label), (start, end, label)]}
#     '''
#     file = open(file_path, 'r')
#     training_data, entities, sentence, unique_labels = [], [], [], []
#     current_annotation = None
#     end = 0 # initialize counter to keep track of start and end characters
#     for line in file:
#         line = line.strip("\n").split("\t")
#         # lines with len > 1 are words
#         if len(line) > 1:
#             label = line[0][2:]     # the .txt is formatted: label \t word, label[0:2] = label_type
#             label_type = line[0][0] # beginning of annotations - "B", intermediate - "I"
#             word = line[1]
#             sentence.append(word)
#             end += (len(word) + 1)  # length of the word + trailing space
           
#             if label_type != 'I' and current_annotation:  # if at the end of an annotation
#                 entities.append((start, end - 2 - len(word), current_annotation))  # append the annotation
#                 current_annotation = None                 # reset the annotation
#             if label_type == 'B':                         # if beginning new annotation
#                 start = end - len(word) - 1  # start annotation at beginning of word
#                 current_annotation = label   # append the word to the current annotation
#             if label_type == 'I':            # if the annotation is multi-word
#                 current_annotation = label   # append the word
           
#             if label != 'O' and label not in unique_labels:
#                 unique_labels.append(label)
 
#         # lines with len == 1 are breaks between sentences
#         if len(line) == 1:
#             if current_annotation:
#                 entities.append((start, end - 1, current_annotation))
#             sentence = " ".join(sentence)
#             training_data.append([sentence, {'entities' : entities}])
#             # reset the counters and temporary lists
#             end = 0            
#             entities, sentence = [], []
#             current_annotation = None
#     file.close()
#     return training_data, unique_labels            
           


In [None]:
#TRAIN_DATA, LABELS = load_data_spacy("engtrain.bio")

In [None]:
#TRAIN_DATA[4]

In [None]:
# python -m spacy convert [input_file] [output_dir] [--file-type] [--converter]
# [--n-sents] [--morphology] [--lang]

In [None]:
#!python -m spacy convert engtrain.bio -t json -c ner
# not working

In [None]:
# We should explore the pretraining option, which trains on the corpus of text you have and learns word embeddings (https://spacy.io/usage/vectors-similarity).
# It does not require annotations, and it might help the performance of our models.
# For now, though, we will just get the data formatted for entry into a spaCy model.