In [1]:
import spacy
import json
import os

from spacy.tokens import DocBin
from tqdm import tqdm

from spacy.util import filter_spans

from spacy import displacy

# Relative directories used by the rest of the notebook (note the trailing
# slashes: later cells build file paths by plain string concatenation).
DATA_PATH = "data/"
MODELS_PATH = "models/"

# Blank English pipeline: used below only to tokenize the raw lines.
nlp = spacy.blank("en")

# Container that collects the annotated Doc objects for serialisation.
doc_bin = DocBin()

In [2]:
# Load the annotated training examples from train.json.
# The encoding is passed explicitly: without it open() decodes with the
# platform's locale encoding, which can mis-decode non-ASCII characters in the
# UTF-8 JSON and thereby shift the entity offsets applied to each line later.
with open(DATA_PATH + "train.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [3]:
# Convert the annotated examples into spaCy's binary training format.
# Every example must provide a 'line' string and an 'entities' list of
# (start, end, label) triples; start/end are passed to Doc.char_span, i.e.
# they are interpreted as character offsets into 'line'.

# Start from an empty DocBin so that re-running this cell does not append the
# same documents a second time to the container created in the setup cell.
doc_bin = DocBin()

for example in data:
    line = example['line']
    entities = example['entities']
    doc = nlp.make_doc(line)
    ents = []
    for start, end, label in entities:
        # char_span yields no usable span when the offsets cannot be aligned
        # to token boundaries; such annotations are reported and dropped.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span:
            ents.append(span)
        else:
            print(f"Skipping entity: {example}, ({start}, {end}, {label})")
    # doc.ents rejects overlapping spans, so resolve overlaps first.
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents
    doc_bin.add(doc)

# Make sure the output directory exists before writing; on a fresh checkout
# "models/" may not have been created yet.
os.makedirs(MODELS_PATH, exist_ok=True)
doc_bin.to_disk(MODELS_PATH + "train.spacy")

In [4]:
# Generate config.cfg once if it does not exist yet: uncomment and run the command below, then edit the paths in the generated file as needed.

# !python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency

In [5]:
# Train the pipeline described by config.cfg; output is written to ./models.
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# ENTS_F / ENTS_P / ENTS_R / SCORE columns logged during training are computed
# on the training data itself and do not measure generalisation. Consider
# evaluating against a held-out dev set instead.
!python -m spacy train config.cfg --output ./models --paths.train ./models/train.spacy --paths.dev ./models/train.spacy

[38;5;4mℹ Saving to output directory: models[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     58.09    0.00    0.00    0.00    0.00
  0     200        735.47   4241.75   83.21   83.20   83.22    0.83
  1     400        591.73   1126.11   95.34   95.33   95.35    0.95
  2     600        184.04    533.90   98.24   98.20   98.29    0.98
  3     800        171.56    331.40   98.49   98.49   98.49    0.98
  5    1000        342.60    373.59   98.80   98.80   98.80    0.99
  7    1200        364.76    321.78   99.58   99.58   99.58    1.00
  9    1400        255.88    221.44   99.62   99.62   99.62    1.00
 12    1600        449.58    258.02   99.71   99.71   99.71    1.00
 16    1800        442.81    227.76   99.73   

In [6]:
# Load the "model-best" pipeline from the models directory for inference.
best_model_dir = "models/model-best"
nlp_ner = spacy.load(best_model_dir)

# Hand-picked play descriptions used to eyeball the trained model's output.
# They are fed to the NER pipeline and rendered with displaCy further down.
test_lines = [
    "Trevor Lawrence pass complete short right to Luke Farrell for no gain (tackle by Micah Hyde). Penalty on Anton Harrison: Ineligible Downfield Pass, 5 yards (accepted) (no play)",
    "Nick Martin aborted snap, recovered by Deshaun Watson at LAC-21, touchdown Deshaun Watson pass complete deep right to Darren Fells for 16 yards, touchdown",
    "Marcus Mariota left end for 8 yards (tackle by Adam Jones). Penalty on Vontaze Burfict: Disqualification",
    "Kyler Murray right end for 16 yards (tackle by Donovan Wilson). Penalty on Tyrone Crawford: Taunting",
    "Ben Roethlisberger kneels for -2 yards",
    "Robert Griffin III pass complete short right to Pierre Garcon for 9 yards (tackle by Curtis Lofton and Corey White)",
    "Chandler Catanzaro 39 yard field goal no good, blocked by Bobby Wagner, recovered by Bobby Wagner",
    "Clyde Edwards-Helaire right end for no gain (tackle by Brandon Jones and Malik Reed)",
    "Two Point Attempt: Bryce Young pass incomplete, sacked by Baker Mayfield sacked by. Penalty on Joe Tryon-Shoyinka: Face Mask (15 Yards), 1 yard (accepted) (no play)",
    "Amon-Ra St. Brown left end for 6 yards (tackle by Derwin James)",
    "Two Point Attempt: Josh Allen pass incomplete intended for Josh Allen sacked by",
    "Aaron Rodgers pass complete short right to Equanimeous St. Brown for 7 yards (tackle by Adrian Amos)",
    "Patrick Mahomes aborted snap, recovered by Patrick Mahomes at SF-6 Patrick Mahomes pass incomplete",
    "Dan Bailey yard field goal no good blocked by Joshua Kalu, recovered by Tye Smith. Penalty on Joshua Kalu: Defensive Offside, 5 yards (no play)",
    ", recovered by Jalen Hurts at PHI-20 Jalen Hurts for no gain. Penalty on Deatrich Wise: Defensive Offside, 5 yards (accepted) (no play)",
    "Robert Turbin up the middle for no gain (tackle by Alec Ogletree). Penalty on Alec Ogletree: Unsportsmanlike Conduct, 6 yards, Penalty on Kendall Langford: Disqualification, 3 yards, Penalty on Kendall Langford: Unsportsmanlike Conduct, 1 yards",
    "Eli Manning pass complete short right to Odell Beckham for 6 yards (tackle by Alec Ogletree). Penalty on Alec Ogletree: Unsportsmanlike Conduct (Offsetting), Penalty on NYG: Unnecessary Roughness (Offsetting), Penalty on NYG: Unnecessary Roughness, 13 yards (Offsetting), Penalty on Preston Parker: Disqualification (Offsetting), Penalty on Damontre Moore: Disqualification (Offsetting), Penalty on William Hayes: Disqualification (Offsetting)",
    "Two Point Attempt: Jared Goff pass complete to to Quintez Cephus for no gain",
    "Ka'imi Fairbairn kicks off 66 yards, returned by Mecole Hardman for 19 yards (tackle by Xavier Crawford and Cullen Gillaspia). Penalty on Anthony Sherman: Illegal Double-Team Block, 9 yards",
    "Dorian Thompson-Robinson pass deep left (defended by Arthur Maulet) intended for Amari Cooper is intercepted by Brandon Stephens at BAL-38 and returned for 52 yards (tackle by Dorian Thompson-Robinson)",
    "Wan'Dale Robinson right end for -5 yards (tackle by Kurt Hinish and Jalen Pitre)",
]

# Highlight colour (hex) for each entity label, passed to displaCy's "ent"
# visualiser through the options dict below.
colors = {
    'PASSER': '#b0e0e6',
    'RECEIVER': '#00ff44',
    'RUSHER': '#ff7373',
    'TACKLER': '#003366',
    'PENALIZER': '#ffd700',
    'FUMBLE_RECOVERER': '#ff1493',
    'KICKER': '#d3ffce',
    'PUNTER': '#f0f8ff',
    'RETURNER': '#faebd7',
    'DEFENDER': '#20b2aa',
    'INTERCEPTOR': '#66cdaa',
    'SACKER': '#800080',
    'FUMBLER': '#ff00ff',
    'OTHER': '#f5f5f5',
    'TEAM_NAME': '#008000',
    'FUMBLE_FORCER': '#ee00ee',
    'LATERALER': '#ffff00',
    'BLOCKER': '#ffe7a6',
}
options = dict(colors=colors)

# Run the trained pipeline over every sample line and show the predicted
# entities inline in the notebook, one rendering per line.
render_kwargs = {"style": "ent", "options": options, "jupyter": True}
for line in test_lines:
    doc = nlp_ner(line)
    displacy.render(doc, **render_kwargs)