In [5]:
# cited from tutorial of pytorch: https://pytorch.org/tutorials/beginner/nlp/advanced_tutorial.html
START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = 5
HIDDEN_DIM = 4

In [6]:
# Make up some training data
training_data = [(
    "the wall street journal reported today that apple corporation made money".split(),
    "B I I I O O O B I O O".split()
), (
    "georgia tech is a university in georgia".split(),
    "B I O O O O B".split()
)]


In [48]:
word_to_ix = {}                                                                                                                                                                                        
for sentence, tags in training_data:
    for word in sentence:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
import json
print(json.dumps(word_to_ix, indent=2, ensure_ascii=False))

# tag_dict
tag_to_ix = {"B": 0, "I": 1, "O": 2, START_TAG: 3, STOP_TAG: 4}
id_to_tag = {0:"B", 1:"I", 2:"O", 3: START_TAG, 4:STOP_TAG}


{
  "the": 0,
  "wall": 1,
  "street": 2,
  "journal": 3,
  "reported": 4,
  "today": 5,
  "that": 6,
  "apple": 7,
  "corporation": 8,
  "made": 9,
  "money": 10,
  "georgia": 11,
  "tech": 12,
  "is": 13,
  "a": 14,
  "university": 15,
  "in": 16
}


In [25]:
from BiLSTM_CRF import BiLSTM_CRF
from BiLSTM_CRF import argmax,prepare_sequence,log_sum_exp
import torch.optim as optim
import torch

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM) # (11, tag_dict, 5, 4)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

# Check predictions before training
with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
    precheck_tags = torch.tensor([tag_to_ix[t] for t in training_data[0][1]], dtype=torch.long)
    print(model(precheck_sent))
    print(model(precheck_tags))
    

(tensor(14.0426), [2, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1])
(tensor(14.8678), [2, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1])


In [45]:
print(precheck_sent)
print(precheck_tags)
print(training_data[0][0])

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])
tensor([0, 1, 1, 1, 2, 2, 2, 0, 1, 2, 2])
['the', 'wall', 'street', 'journal', 'reported', 'today', 'that', 'apple', 'corporation', 'made', 'money']


In [46]:
# Make sure prepare_sequence from earlier in the LSTM section is loaded
for epoch in range(                                                                                                                                                                                    
        300):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad() # 为什么这里不是optimizer.zero_grad()  ; 解决: 这里model.zero_grad()跟optimizer.zero_grad()是等价的

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()


In [47]:
# Check predictions after training
with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
    print(model(precheck_sent))
    


(tensor(34.3337), [0, 1, 1, 1, 2, 2, 2, 0, 1, 2, 2])
