In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import spacy
from spacy import displacy
from spacy.tokens import DocBin, Doc, Span, Token
from spacy.matcher import Matcher
from spacy import Language
import random
from utils import string, punct, label_sent, label_many_sents, LabelHolder, my_sentencizer
# from text_to_num import alpha2digit # python -m pip install text2num

In [2]:
# Load the medium English pipeline, then remove its stock NER component;
# presumably so that doc.ents only ever holds the DEATH/TIME/LIGHT spans
# added by the Matcher callbacks defined further down.
nlp = spacy.load('en_core_web_md')
nlp.remove_pipe('ner')

('ner', <spacy.pipeline.ner.EntityRecognizer at 0x1a4f82c07b0>)

In [3]:
# Show the components left in the pipeline after 'ner' was removed.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x1a4f8457280>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x1a4f8457f40>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x1a4f82c0970>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x1a4f850d700>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x1a4f851c100>)]

In [4]:
# The three custom entity labels this notebook annotates.
iron_labels = ['DEATH', 'TIME', 'LIGHT']
# LabelHolder comes from the local utils module; presumably it is what the
# hand-labelling helper label_many_sents consumes -- TODO confirm in utils.py.
iron_label_holder = LabelHolder(iron_labels)

In [5]:
# One rule-based Matcher, sharing the pipeline's vocab, that collects every label rule added below.
matcher = Matcher(nlp.vocab)

In [6]:
# https://spacy.io/usage/rule-based-matching
with open('death_words.txt') as word_file:
    # Whitespace-separated lemmas that are always treated as DEATH.
    death_words = word_file.read().split()


def on_death_match(matcher, doc, i, matches):
    """Matcher callback: record the i-th match as a DEATH entity on ``doc``.

    A Span labelled 'DEATH' is built over the matched token range and appended
    to ``doc.ents``. If that assignment raises ValueError, the entities are
    left as they were.
    """
    _, span_start, span_end = matches[i]
    death_span = Span(doc, span_start, span_end, label='DEATH')
    try:
        doc.ents = doc.ents + (death_span,)
    except ValueError:
        # due to trying to add an entity where one already exists
        pass


# POS-restricted patterns first, then one single-token lemma pattern per listed word.
death_patterns = [
    [{'LEMMA': 'die', 'POS': 'VERB'}],    # because of "die" as in "dice"
    [{'LEMMA': 'grave', 'POS': 'NOUN'}],  # because of "grave" as in "grave accusation"
]
death_patterns.extend([{'LEMMA': word}] for word in death_words)
matcher.add('DEATH_RULES', death_patterns, on_match=on_death_match)

In [7]:
with open("time_words.txt") as word_file:
    # Whitespace-separated lemmas that are always treated as TIME.
    time_words = word_file.read().split()


def on_time_match(matcher, doc, i, matches):
    """Matcher callback: record the i-th match as a TIME entity on ``doc``.

    A Span labelled 'TIME' is built over the matched token range and appended
    to ``doc.ents``. If that assignment raises ValueError, the entities are
    left as they were.
    """
    _, span_start, span_end = matches[i]
    time_span = Span(doc, span_start, span_end, label='TIME')
    try:
        doc.ents = doc.ents + (time_span,)
    except ValueError:
        pass


# One single-token lemma pattern per listed word ...
time_patterns = [[{'LEMMA': word}] for word in time_words]
# ... plus noun-only "second", because of "second" as in "I second that emotion" or "a second bite"
time_patterns.append([{'LEMMA': 'second', 'POS': 'NOUN'}])
matcher.add('TIME_RULES', time_patterns, on_match=on_time_match)

In [8]:
with open('light_words.txt') as word_file:
    # Whitespace-separated lemmas that are always treated as LIGHT.
    light_words = word_file.read().split()


def on_light_match(matcher, doc, i, matches):
    """Matcher callback: record the i-th match as a LIGHT entity on ``doc``.

    A Span labelled 'LIGHT' is built over the matched token range and appended
    to ``doc.ents``. If that assignment raises ValueError, the entities are
    left as they were.
    """
    _, span_start, span_end = matches[i]
    light_span = Span(doc, span_start, span_end, label='LIGHT')
    try:
        doc.ents = doc.ents + (light_span,)
    except ValueError:
        pass


# One single-token lemma pattern per listed word ...
light_patterns = [[{'LEMMA': word}] for word in light_words]
# ... plus "light" as a noun or a verb only: have to exclude 'light' when used as an adjective
light_patterns.append([{'LEMMA': 'light', 'POS': 'NOUN'}])
light_patterns.append([{'LEMMA': 'light', 'POS': 'VERB'}])
matcher.add('LIGHT_RULES', light_patterns, on_match=on_light_match)

In [9]:
# @Language.component('light_dark_time')
# def light_dark_time_match(doc):
#     matcher(doc)
#     return doc
# nlp_special = spacy.load('en_core_web_md')
# nlp_special.add_pipe('light_dark_time', after='lemmatizer')
# nlp_special.remove_pipe('ner')

In [10]:
# Smoke test for the rules: parse a hand-written passage, run the matcher over it
# (its callbacks fill ex.ents) and print every (text, label) pair that was tagged.
ex = nlp('''He is dead, we're dying, roll a die, it's deadly, the soul dies.
After hundreds of years, the ancient dark demon died in seconds.
It took hours to climb its carcass.
His heart was light, but once the light failed, buzzards swooped out of the darkness of night''')
matcher(ex)
print([(e.text, e.label_) for e in ex.ents])

[('dead', 'DEATH'), ('dying', 'DEATH'), ('soul', 'DEATH'), ('dies', 'DEATH'), ('years', 'TIME'), ('ancient', 'TIME'), ('dark', 'LIGHT'), ('died', 'DEATH'), ('seconds', 'TIME'), ('hours', 'TIME'), ('light', 'LIGHT'), ('darkness', 'LIGHT'), ('night', 'LIGHT')]


In [11]:
# Load the tab-separated song table.
iron = pd.read_csv('iron_maiden_songs.tsv', sep='\t')

In [12]:
# Keep only the rows whose lyrics value is an actual string.
# there are four pure-instrumental songs, so there's no lyrics listed
# it wasn't a web-scraping error!
# .copy() makes the result an independent frame, so assigning a column to `iron`
# afterwards does not write into a filtered slice (pandas SettingWithCopyWarning).
iron = iron[iron.lyrics.apply(type) == str].copy()

In [13]:
def _lyrics_to_sentences(raw_lyrics):
    """Turn one raw lyrics string into whatever ``my_sentencizer`` returns for it.

    The string is split on the literal two-character sequence backslash + 'n',
    the last piece of that split is dropped, and each remaining line is split
    on whitespace. The resulting list of token lists is handed to
    ``my_sentencizer`` (from the local utils module).
    """
    token_lines = [line.split() for line in raw_lyrics.split('\\n')[:-1]]
    return my_sentencizer(token_lines)


iron['lyrics'] = iron.lyrics.apply(_lyrics_to_sentences)

In [14]:
# Peek at the first 300 characters of the first song on 'No More Lies - Dance Of Death'.
iron[iron.album == 'No More Lies - Dance Of Death'].lyrics.iat[0][:300]

"One, two, three, four...  Hahaha! Oh! They even got the music to go with it, thatâ\x80\x99s lovely! I can't be compromising in my thoughts no more, ha ha, oh yeah, ah. I can't remember whats-er-name the name my anger fills my heart. I can't be sympathising with a new lost fart. Hahaha! Ohâ\x80¦ Ohhh... I can"

In [15]:
# that's a live show where Bruce Dickinson is babbling about some random stuff
# .copy() so that later column assignments on `iron` act on an independent frame
# instead of a filtered slice (avoids pandas' SettingWithCopyWarning).
iron = iron[iron.album != 'No More Lies - Dance Of Death'].copy()

In [16]:
# Column order matters below: later cells select columns by position.
iron.columns

Index(['album', 'year', 'num', 'title', 'lyrics'], dtype='object')

In [17]:
# Distinct album names remaining after the filtering above.
albums = set(iron.album)

In [18]:
# Display the set of album names.
albums

{'A Matter Of Life And Death',
 'Be Quick Or Be Dead',
 'Brave New World',
 'Can I Play With Madness',
 'Dance Of Death',
 'Fear Of The Dark',
 'From Here For Eternity',
 'Iron Maiden',
 'Killers',
 'Man On The Edge',
 'No Prayer For The Dying',
 'Piece Of Mind',
 'Powerslave',
 'Rainmaker',
 'Running Free',
 'Senjutsu',
 'Seventh Son Of A Seventh Son',
 'Somewhere In Time',
 'The Book Of Souls',
 'The Final Frontier',
 'The Number Of The Beast',
 'The X-Factor',
 'Virtual XI',
 'Virus',
 'Wasted Years',
 'Wildest Dreams',
 'Women In Uniform'}

In [19]:
# One (lyrics, context) tuple per song, in the shape nlp.pipe(..., as_tuples=True) expects.
# The context dict holds every column except the last one of the row.
lyr_ctx = [
    (song.lyrics, song.iloc[:-1].to_dict())
    for _, song in iron.iterrows()
]

In [20]:
# Inspect one (lyrics, context) pair.
lyr_ctx[13]

("As I lay here lying on my bed, sweet voices come into my head. Oh what it is, I wanna know, please won't you tell me it's got to go. There's a feeling that's inside me, telling me to get away. But I'm so tired of living, I might as well end today. ",
 {'album': 'Killers', 'year': 1981, 'num': 4, 'title': 'Another Life'})

In [21]:
# Register the custom Doc attributes that carry song metadata (accessed as doc._.<name>).
# Doc.set_extension raises ValueError when the name is already registered, which
# happens as soon as this cell is re-run in the same kernel, so only register
# the names that are still missing.
for ext in ['album', 'year', 'song_num', 'title']:
    if not Doc.has_extension(ext):
        Doc.set_extension(ext, default=None)

In [22]:
# we first run this through the naive model that doesn't recognize entities
# just so that we can use its sentencizer
all_sents = []
for lyric_doc, song_ctx in nlp.pipe(lyr_ctx, as_tuples=True):
    # every sentence becomes its own (text, song context) pair
    all_sents.extend(
        (sentence.as_doc().text, song_ctx) for sentence in lyric_doc.sents
    )
# the song-level pairs are not used after this point
del lyr_ctx

In [23]:
# Shuffled sentence indices used to carve out the train/test/val splits.
# A dedicated, seeded generator keeps the split identical across kernel restarts
# and leaves the global `random` state untouched.
SPLIT_SEED = 42
train_test_inds = list(range(len(all_sents)))
random.Random(SPLIT_SEED).shuffle(train_test_inds)

In [24]:
# Split the shuffled indices: first 40% -> train, next 50% -> test, last 10% -> val.
n_sents = len(all_sents)
train_cutoff = int(n_sents * 0.4)
val_cutoff = int(n_sents * 0.9)

train_inds, test_inds, val_inds = (
    train_test_inds[:train_cutoff],
    train_test_inds[train_cutoff:val_cutoff],
    train_test_inds[val_cutoff:],
)

# Materialize the (text, context) pairs for each split.
train_sents = [all_sents[idx] for idx in train_inds]
test_sents = [all_sents[idx] for idx in test_inds]
val_sents = [all_sents[idx] for idx in val_inds]

In [25]:
# now we use our pre-defined labeling rules to label the training set
# we will also hand-label a lot of the training set to get some things that slipped through our initial rules
train_docs = []
for sent_doc, song_ctx in nlp.pipe(train_sents, as_tuples=True):
    # copy the song metadata onto the custom Doc extensions
    sent_doc._.album = song_ctx['album']
    sent_doc._.year = song_ctx['year']
    sent_doc._.song_num = song_ctx['num']
    sent_doc._.title = song_ctx['title']
    # the matcher callbacks write the DEATH/TIME/LIGHT spans into sent_doc.ents
    matcher(sent_doc)
    train_docs.append(sent_doc)

In [26]:
# Parse the test sentences, attach their song metadata and apply the same rule-based labels.
test_docs = []
for sent_doc, song_ctx in nlp.pipe(test_sents, as_tuples=True):
    # copy the song metadata onto the custom Doc extensions
    sent_doc._.album = song_ctx['album']
    sent_doc._.year = song_ctx['year']
    sent_doc._.song_num = song_ctx['num']
    sent_doc._.title = song_ctx['title']
    # the matcher callbacks write the DEATH/TIME/LIGHT spans into sent_doc.ents
    matcher(sent_doc)
    test_docs.append(sent_doc)

In [27]:
# Parse the validation sentences, attach their song metadata and apply the same rule-based labels.
val_docs = []
for sent_doc, song_ctx in nlp.pipe(val_sents, as_tuples=True):
    # copy the song metadata onto the custom Doc extensions
    sent_doc._.album = song_ctx['album']
    sent_doc._.year = song_ctx['year']
    sent_doc._.song_num = song_ctx['num']
    sent_doc._.title = song_ctx['title']
    # the matcher callbacks write the DEATH/TIME/LIGHT spans into sent_doc.ents
    matcher(sent_doc)
    val_docs.append(sent_doc)

In [28]:
# One parsed Doc per (album, year): the album's song lyrics are joined with
# newlines and run through nlp; rows are then ordered by release year.
docs_by_album = (
    iron.groupby(['album', 'year'])
        .apply(lambda album_songs: nlp('\n'.join(album_songs.lyrics)))
        .reset_index()
        .rename({0: 'lyrics'}, axis=1)
        .sort_values('year')
)

In [29]:
# Replace each song's lyrics string with the result of calling nlp on it.
# NOTE(review): this overwrites the 'lyrics' column in place. The album-level
# cell above calls '\n'.join on this column, so it must run before this one and
# looks unsafe to re-run afterwards without reloading the data -- confirm.
iron['lyrics'] = iron.lyrics.apply(lambda x: nlp(x))

In [30]:
# Lyrics of the first row titled 'The Alchemist'.
# Selecting the column by name instead of position -1 keeps this correct even
# if columns are later added to or reordered in `iron`.
alchemist = iron.loc[iron.title == 'The Alchemist', 'lyrics'].iat[0]

In [31]:
# Show the first two and last two rows of the year-sorted album table.
docs_by_album.iloc[[0,1,-2, -1], :]

Unnamed: 0,album,year,lyrics
26,Women In Uniform,1980,"(Beehive, hairdo, ,, 45, on, the, hip, ., Patr..."
14,Running Free,1980,"(So, you, think, you, can, own, me, ,, well, y..."
18,The Book Of Souls,2015,"(Here, is, the, soul, of, a, man, ., Here, in,..."
15,Senjutsu,2021,"(Beat, the, warning, the, sound, of, the, drum..."


In [32]:
# Pull out the album-level lyrics entry for two specific albums by name.
iron_doc = docs_by_album.loc[docs_by_album.album == 'Iron Maiden', 'lyrics'].iat[0]
senju_doc = docs_by_album.loc[docs_by_album.album == 'Senjutsu', 'lyrics'].iat[0]

In [33]:
# iron.to_pickle('all_songs_df__en_core_web_md.pkl.gz')

In [34]:
# docs_by_album.to_pickle('songs_by_album_df__en_core_web_md.pkl.gz')

In [53]:
# Hand-written sentences for checking the POS-restricted rules
# (e.g. "second" as a verb must not be tagged).
mysents = [
    nlp(text)
    for text in (
        "I am a light and i love flight!",
        "I eat light every second.",
        "Squirrels also eat a lot of death",
        "I second that emotion.",
    )
]
for sent in mysents:
    # the matcher callbacks populate sent.ents
    matcher(sent)
[s.ents for s in mysents]

[(light,), (light, second), (death,), ()]

In [36]:
# Split sizes, followed by the entities found in the first 60 docs of the train and test splits.
print(len(train_docs), len(test_docs), len(val_docs))
# to make sure the train sentences are being labeled
train_preview = [
    (ent.text, ent.label_) for labeled in train_docs[:60] for ent in labeled.ents
]
test_preview = [
    (ent.text, ent.label_) for labeled in test_docs[:60] for ent in labeled.ents
]
print('train\n', train_preview)
print('test\n', test_preview)

2671 3340 668
train
 [('time', 'TIME'), ('hour', 'TIME'), ('white', 'LIGHT'), ('light', 'LIGHT'), ('kill', 'DEATH'), ('dead', 'DEATH'), ('dead', 'DEATH'), ('Waiting', 'TIME')]
test
 [('Time', 'TIME'), ('waits', 'TIME'), ('dead', 'DEATH'), ('time', 'TIME'), ('time', 'TIME'), ('fates', 'DEATH'), ('dead', 'DEATH'), ('kill', 'DEATH'), ('Dark', 'LIGHT'), ('dark', 'LIGHT'), ('Kill', 'DEATH'), ('dead', 'DEATH'), ('graves', 'DEATH'), ('dark', 'LIGHT'), ('time', 'TIME'), ('time', 'TIME'), ('Waiting', 'TIME')]


In [37]:
# label_many_sents(mysents, ['animal', 'food'])
# [[(e.text, e.label_) for e in s.ents] for s in mysents]

In [38]:
# random_sents = random.sample(train_docs, 500)
# label_many_sents(random_sents, iron_label_holder)

In [39]:
# Serialize each split to a .spacy DocBin file.
# note that this is much faster than pickling the DataFrames
# NOTE(review): DocBin is built with its defaults; per spaCy's documentation the
# values of custom extension attributes are only written when
# store_user_data=True -- confirm whether album/year/song_num/title are needed
# when these files are read back.
train_doc_bin = DocBin(docs=train_docs)
train_doc_bin.to_disk('train_docs__en_core_web_md.spacy')

test_doc_bin = DocBin(docs=test_docs)
test_doc_bin.to_disk('test_docs__en_core_web_md.spacy')

val_doc_bin = DocBin(docs=val_docs)
val_doc_bin.to_disk('val_docs__en_core_web_md.spacy')

In [40]:
# Generate a training config for an English tagger + parser + ner pipeline and
# write it to ./config.cfg (-F forces overwriting an existing file).
!python -m spacy init config -F ./config.cfg --lang en --pipeline tagger,parser,ner

[i] Generated config template specific for your use case
- Language: en
- Pipeline: tagger, parser, ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


2022-05-03 14:37:04.403082: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2022-05-03 14:37:04.403119: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [41]:
# Train from the generated config, writing results to ./output.
# The train DocBin is the training corpus; the *test* DocBin is passed as
# spaCy's dev set (--paths.dev), so the scores logged below are computed on it.
!python -m spacy train ./config.cfg --output ./output --paths.train train_docs__en_core_web_md.spacy --paths.dev test_docs__en_core_web_md.spacy

[i] Saving to output directory: output

2022-05-03 14:37:11.791570: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2022-05-03 14:37:11.791608: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[2022-05-03 14:37:16,171] [INFO] Set up nlp object from config
[2022-05-03 14:37:16,194] [INFO] Pipeline: ['tok2vec', 'tagger', 'parser', 'ner']
[2022-05-03 14:37:16,202] [INFO] Created vocabulary
[2022-05-03 14:37:16,203] [INFO] Finished initializing nlp object
[2022-05-03 14:37:18,549] [INFO] Initialized pipeline components: ['tok2vec', 'tagger', 'parser', 'ner']



[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'tagger', 'parser', 'ner']
[i] Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS TAGGER  LOSS PARSER  LOSS NER  TAG_ACC  DEP_UAS  DEP_LAS  SENTS_F  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  -----------  -----------  --------  -------  -------  -------  -------  ------  ------  ------  ------
  0       0          0.00        90.89       252.60      7.64    32.27    23.83    11.94     0.04    0.00    0.00    0.00    0.17
  0     200       2158.39      8647.25     16985.99   1170.19    85.53    80.04    71.72    95.73   67.49   73.42   62.45    0.76
  2     400       3129.96      3653.84      9351.59    513.23    92.28    86.04    79.57    98.96   87.62   86.80   88.45    0.88
  3     600       3614.08      2100.48      7602.37    221.77    93.60    87.23    82.16    99.19   91.24   89.24   93.33    0.90
  5     800       4209.61      1638.17      6439.77    144.34    93.97    88.02    83.3