In [1]:
!pip install spacy-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy-transformers
  Downloading spacy_transformers-1.1.8-py2.py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.7 MB/s 
[?25hCollecting spacy-alignments<1.0.0,>=0.7.2
  Downloading spacy_alignments-0.8.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 29.6 MB/s 
Collecting transformers<4.22.0,>=3.4.0
  Downloading transformers-4.21.3-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 55.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 41.0 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 9.8 MB/s

In [2]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
from spacy.util import filter_spans
from spacy import displacy
import spacy_transformers
#spacy.require_gpu()

In [3]:
# annotations = [(text, label), (text, label)]
def buildEntity(text, annotations):
    """Convert (entity_text, label) pairs into character-offset annotations.

    Args:
        text: The full example sentence.
        annotations: Iterable of ``(entity_text, label)`` pairs. Each
            ``entity_text`` must occur verbatim in ``text``.

    Returns:
        ``{'content': text, 'annotations': [...]}`` where every annotation is
        ``{'start': int, 'end': int, 'tag_name': label}``. ``end`` is
        exclusive, and the offsets refer to the first occurrence of
        ``entity_text`` in ``text``.

    Raises:
        ValueError: If an ``entity_text`` does not occur in ``text``.
    """
    entity = {'content': text, 'annotations': []}
    for ent_text, ent_label in annotations:
        start = text.find(ent_text)
        if start < 0:
            # str.find signals "not found" with -1; using that as an offset
            # would silently produce a wrong (start=-1) annotation.
            raise ValueError(
                "entity text %r (%s) not found in %r" % (ent_text, ent_label, text)
            )
        entity['annotations'].append({
            'start': start,
            'end': start + len(ent_text),
            'tag_name': ent_label,
        })
    return entity

In [4]:
# Hand-annotated seed examples. 'start'/'end' are character offsets into
# 'content' ('end' is exclusive); an empty 'annotations' list marks a
# sentence that contains no entity.
data = dict(examples=[
    dict(
        content="HW 2 - Tuesday, September 27th",
        annotations=[
            dict(start=0, end=4, tag_name="ASSIGNMENT"),
            dict(start=16, end=30, tag_name="DATE"),
        ],
    ),
    dict(
        content="Exam #1 - 10/11",
        annotations=[
            dict(start=0, end=7, tag_name="ASSIGNMENT"),
            dict(start=10, end=15, tag_name="DATE"),
        ],
    ),
    dict(
        content="Reading Assignment 4: 3-16",
        annotations=[
            dict(start=0, end=20, tag_name="ASSIGNMENT"),
            dict(start=22, end=26, tag_name="DATE"),
        ],
    ),
    dict(
        content="Reading Assignment 4- 03-5-22",
        annotations=[
            dict(start=0, end=20, tag_name="ASSIGNMENT"),
            dict(start=22, end=29, tag_name="DATE"),
        ],
    ),
    dict(
        content="Exam 4 - 10/11/22",
        annotations=[
            dict(start=0, end=6, tag_name="ASSIGNMENT"),
            dict(start=9, end=17, tag_name="DATE"),
        ],
    ),
    dict(
        content="We will have Exam 4 on March 3rd, 2022",
        annotations=[
            dict(start=13, end=19, tag_name="ASSIGNMENT"),
            dict(start=23, end=32, tag_name="DATE"),
        ],
    ),
    dict(
        content="Week of September 9",
        annotations=[
            dict(start=8, end=19, tag_name="DATE"),
        ],
    ),
    dict(
        content="office hours: w 1:00 - 2:00pm, th 2:30 - 3:30pm, or by appointment",
        annotations=[],
    ),
    dict(
        content="time: t/th 1:15 - 2:30pm",
        annotations=[],
    ),
    dict(
        content="prerequisite: one of math 2810, 2820, or 3641; and one of math 2410, 2501, or 2600.",
        annotations=[],
    ),
    dict(
        content="HW 2 - Sep. 27",
        annotations=[
            dict(start=0, end=4, tag_name="ASSIGNMENT"),
            dict(start=7, end=14, tag_name="DATE"),
        ],
    ),
    dict(
        content="Final: May 5, 3:00pm",
        annotations=[
            dict(start=0, end=5, tag_name="ASSIGNMENT"),
            dict(start=7, end=12, tag_name="DATE"),
        ],
    ),
])

# Each row is [sentence, [(entity_text, label), ...]]; buildEntity turns the
# entity texts into character offsets.
# NOTE(review): the first row repeats a sentence that `data` already contains
# hand-annotated, so it ends up in the examples twice — confirm that is intended.
training = [
    ["Final: May 5, 3:00pm", [("Final", "ASSIGNMENT"), ("May 5", "DATE")]],
    ["7 th-9/15 5 quiz 2 , present worth analysis, industry applications  ", [("9/15", "DATE"), ("quiz 2", "ASSIGNMENT")]],
    ["6 tu-9/13 4 ch4 equivalence for repeated cash flows , cont’d  ", [("9/13", "DATE")]],
    ["9/8 ch4 equivalence for repeated cash flows", [("9/8", "DATE")]],
    ["tu-9/6 quiz 1, problem  session : ch1,2&3", [("9/6", "DATE"), ("quiz 1", "ASSIGNMENT")]],
    ["tu-9/27  exam 1", [("9/27", "DATE"), ("exam 1", "ASSIGNMENT")]],
    ["11 th-9/29 6 annual cash flow analysis", [("9/29", "DATE")]],
    ["15 tu-10/18 9 quiz 3, other analysis techniques", [("10/18", "DATE"), ("quiz 3", "ASSIGNMENT")]],
    ["18 th-10/27  review for exam 2", [("10/27", "DATE")]],
    ["19 tu-11/1 exam 2", [("11/1", "DATE"), ("exam 2", "ASSIGNMENT")]],
]
# Add the converted rows after the hand-annotated examples.
data['examples'].extend(
    buildEntity(sentence, tagged) for sentence, tagged in training
)
print(data['examples'])

[{'content': 'HW 2 - Tuesday, September 27th', 'annotations': [{'start': 0, 'end': 4, 'tag_name': 'ASSIGNMENT'}, {'start': 16, 'end': 30, 'tag_name': 'DATE'}]}, {'content': 'Exam #1 - 10/11', 'annotations': [{'start': 0, 'end': 7, 'tag_name': 'ASSIGNMENT'}, {'start': 10, 'end': 15, 'tag_name': 'DATE'}]}, {'content': 'Reading Assignment 4: 3-16', 'annotations': [{'start': 0, 'end': 20, 'tag_name': 'ASSIGNMENT'}, {'start': 22, 'end': 26, 'tag_name': 'DATE'}]}, {'content': 'Reading Assignment 4- 03-5-22', 'annotations': [{'start': 0, 'end': 20, 'tag_name': 'ASSIGNMENT'}, {'start': 22, 'end': 29, 'tag_name': 'DATE'}]}, {'content': 'Exam 4 - 10/11/22', 'annotations': [{'start': 0, 'end': 6, 'tag_name': 'ASSIGNMENT'}, {'start': 9, 'end': 17, 'tag_name': 'DATE'}]}, {'content': 'We will have Exam 4 on March 3rd, 2022', 'annotations': [{'start': 13, 'end': 19, 'tag_name': 'ASSIGNMENT'}, {'start': 23, 'end': 32, 'tag_name': 'DATE'}]}, {'content': 'Week of September 9', 'annotations': [{'start': 

In [5]:
# Reshape every example into a {'text', 'entities'} record, where each entity
# is a (start, end, LABEL) tuple with the label upper-cased.
training_data = {
    'classes': ['ASSIGNMENT', "DATE"],
    'annotations': [
        {
            'text': example['content'],
            'entities': [
                (ann['start'], ann['end'], ann['tag_name'].upper())
                for ann in example['annotations']
            ],
        }
        for example in data['examples']
    ],
}

print(training_data['annotations'])

[{'text': 'HW 2 - Tuesday, September 27th', 'entities': [(0, 4, 'ASSIGNMENT'), (16, 30, 'DATE')]}, {'text': 'Exam #1 - 10/11', 'entities': [(0, 7, 'ASSIGNMENT'), (10, 15, 'DATE')]}, {'text': 'Reading Assignment 4: 3-16', 'entities': [(0, 20, 'ASSIGNMENT'), (22, 26, 'DATE')]}, {'text': 'Reading Assignment 4- 03-5-22', 'entities': [(0, 20, 'ASSIGNMENT'), (22, 29, 'DATE')]}, {'text': 'Exam 4 - 10/11/22', 'entities': [(0, 6, 'ASSIGNMENT'), (9, 17, 'DATE')]}, {'text': 'We will have Exam 4 on March 3rd, 2022', 'entities': [(13, 19, 'ASSIGNMENT'), (23, 32, 'DATE')]}, {'text': 'Week of September 9', 'entities': [(8, 19, 'DATE')]}, {'text': 'office hours: w 1:00 - 2:00pm, th 2:30 - 3:30pm, or by appointment', 'entities': []}, {'text': 'time: t/th 1:15 - 2:30pm', 'entities': []}, {'text': 'prerequisite: one of math 2810, 2820, or 3641; and one of math 2410, 2501, or 2600.', 'entities': []}, {'text': 'HW 2 - Sep. 27', 'entities': [(0, 4, 'ASSIGNMENT'), (7, 14, 'DATE')]}, {'text': 'Final: May 5, 3

In [10]:
nlp = spacy.blank("en")  # blank English pipeline, used here via nlp.make_doc
doc_bin = DocBin()  # collects the annotated Docs that are written to disk


def _aligned_spans(doc, labels):
    """Build entity spans for `labels` on `doc`.

    Args:
        doc: The spaCy Doc the spans are created on.
        labels: Iterable of ``(start, end, label)`` character offsets.

    Returns:
        ``(spans, missed)``: the spans that could be aligned to token
        boundaries (``alignment_mode="contract"``), and the
        ``(start, end, label)`` triples for which ``char_span`` returned None.
    """
    spans, missed = [], []
    for start, end, label in labels:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            missed.append((start, end, label))
        else:
            spans.append(span)
    return spans, missed


# Turn every annotated text into a spaCy Doc with its entity spans set.
for training_example in tqdm(training_data['annotations']):
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text)
    ents, missed = _aligned_spans(doc, labels)
    if missed:
        # Inputs such as "th-9/12" do not align: we only want "9/12", but the
        # offsets fall inside a token. Replacing "-" with " " keeps the text
        # length (so the character offsets stay valid) and adds the missing
        # token boundary. The Doc itself is rebuilt from the normalized text
        # and ALL spans are recreated on it: entities are stored by token
        # index, so a span taken from a differently tokenized Doc would label
        # the wrong tokens. The stored text of such examples therefore has
        # spaces in place of hyphens.
        doc = nlp.make_doc(text.replace("-", " "))
        ents, missed = _aligned_spans(doc, labels)
    for _start, _end, label in missed:
        # Still unalignable: leave the entity out instead of passing None on
        # to filter_spans.
        print("Skipping entity:" + str(label))
    for span in ents:
        print(span)
    doc.ents = filter_spans(ents)  # drop overlapping spans before assignment
    doc_bin.add(doc)
doc_bin.to_disk("training_data.spacy")  # save the docbin object

100%|██████████| 22/22 [00:00<00:00, 1431.77it/s]

HW 2
September 27th
Exam #1
10/11
Reading Assignment 4
3-16
Reading Assignment
03-5-22
Exam 4
10/11/22
Exam 4
March 3rd
September 9
HW 2
Sep. 27
Final
May 5
Final
May 5
9/15
quiz 2
9/13
9/8
9/6
quiz 1
9/27
exam 1
9/29
10/18
quiz 3
10/27
11/1
exam 2





In [11]:
# Fill in every unspecified setting of base_config.cfg and write the complete
# result to config.cfg.
# NOTE(review): base_config.cfg is not created by any cell visible in this
# notebook — presumably it is uploaded to the runtime beforehand; confirm.
!python -m spacy init fill-config base_config.cfg config.cfg   

2022-11-03 14:54:34.452738: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [151]:
# Train the pipeline described by config.cfg on GPU 0 and write the output to
# the current directory.
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# evaluation during training runs on the training examples themselves.
!python3 -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy --gpu-id 0


[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2022-11-02 20:32:19,646] [INFO] Set up nlp object from config
INFO:spacy:Set up nlp object from config
[2022-11-02 20:32:19,657] [INFO] Pipeline: ['transformer', 'ner']
INFO:spacy:Pipeline: ['transformer', 'ner']
[2022-11-02 20:32:19,661] [INFO] Created vocabulary
INFO:spacy:Created vocabulary
[2022-11-02 20:32:19,662] [INFO] Finished initializing nlp object
INFO:spacy:Finished initializing nlp object
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initial

In [152]:
# Load the trained pipeline from the model-best directory (relative to the
# working directory, which is also the --output location of the training run).
nlp_ner = spacy.load("model-best")

In [179]:
# Run the trained pipeline on a sentence that is not among the training
# examples, as a quick sanity check of the predicted entities.
doc = nlp_ner("Quiz 3: August 24")

In [180]:
# Highlight the predicted entities inline, one colour per label.
colors = dict(ASSIGNMENT="#F67DE3", DATE="#7DF6D9")
options = dict(colors=colors)
displacy.render(doc, style="ent", options=options, jupyter=True)

In [159]:
# Bundle the model-best directory (recursively) into a single archive.
!zip -r model-best.zip model-best

  adding: model-best/ (stored 0%)
  adding: model-best/tokenizer (deflated 81%)
  adding: model-best/transformer/ (stored 0%)
  adding: model-best/transformer/cfg (stored 0%)
  adding: model-best/transformer/model (deflated 16%)
  adding: model-best/config.cfg (deflated 61%)
  adding: model-best/vocab/ (stored 0%)
  adding: model-best/vocab/strings.json (deflated 74%)
  adding: model-best/vocab/lookups.bin (stored 0%)
  adding: model-best/vocab/vectors (deflated 45%)
  adding: model-best/vocab/key2row (stored 0%)
  adding: model-best/vocab/vectors.cfg (stored 0%)
  adding: model-best/ner/ (stored 0%)
  adding: model-best/ner/cfg (deflated 33%)
  adding: model-best/ner/model (deflated 8%)
  adding: model-best/ner/moves (deflated 53%)
  adding: model-best/meta.json (deflated 61%)


In [None]:
# google.colab is only importable inside a Colab runtime, which is presumably
# why this import lives here rather than with the other imports — confirm.
from google.colab import files
files.download("model-best.zip")  # send the zipped model to the browser as a download

In [160]:
# Copy the archive to drive/MyDrive/Vanderbilt.
# NOTE(review): presumably Google Drive is mounted at ./drive; no mount cell
# is visible in this notebook — confirm before relying on this step.
!cp model-best.zip drive/MyDrive/Vanderbilt