In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import logging
import sys
sys.path.append("./utils")

from utils.skseq.sequences.sequence import Sequence
from utils.skseq.readers import pos_corpus
from utils.skseq.sequences.id_feature import IDFeatures
from skseq.sequences.extended_feature import ExtendedFeatures
import utils.skseq.sequences.structured_perceptron as spc
from utils.utils import *

## Structured Perceptron

### Reading the data

Data has the following format:

```sentence_id```, ```words```, ```tags```

Some samples are shown below.
- 0,Thousands,O
- 0,London,B-geo
- 0,British,B-gpe
- 2,Saturday,B-tim
- 2,Taliban,B-org
- 4,Egeland,I-per

In [3]:
corpus = pos_corpus.PostagCorpus()

In [4]:
# Folder that holds the NER CSV splits; the trailing slash lets the file
# names below be appended by plain string concatenation.
data_path = "data/"

# Read the train split first, then the test split, through the same `corpus`
# reader object.
train_seq, test_seq = (
    corpus.read_sequence_list(data_path + csv_name)
    for csv_name in ("train_data_ner.csv", "test_data_ner.csv")
)

In [5]:
# Number of sequences in each split, one per line (train first, then test).
print(len(train_seq), len(test_seq), sep="\n")

38358
38359


In [6]:
corpus.tag_dict

{'O': 0,
 'B-geo': 1,
 'B-gpe': 2,
 'B-tim': 3,
 'B-org': 4,
 'I-geo': 5,
 'B-per': 6,
 'I-per': 7,
 'I-org': 8,
 'B-art': 9,
 'I-art': 10,
 'I-tim': 11,
 'I-gpe': 12,
 'B-nat': 13,
 'I-nat': 14,
 'B-eve': 15,
 'I-eve': 16}

In [7]:
# Size of the word dictionary (x_dict) attached to each split: (train, test).
tuple(len(split.x_dict) for split in (train_seq, test_seq))

(55143, 55143)

In [8]:
# number of possible labels
len(train_seq.y_dict)

17

In [9]:
train_seq[0]

0/0 1/0 2/0 3/0 4/0 5/0 6/1 7/0 8/0 9/0 10/0 11/0 12/1 13/0 14/0 9/0 15/0 1/0 16/2 17/0 18/0 19/0 20/0 21/0 

In [10]:
train_seq[0].to_words(sequence_list=train_seq)

'Thousands/O of/O demonstrators/O have/O marched/O through/O London/B-geo to/O protest/O the/O war/O in/O Iraq/B-geo and/O demand/O the/O withdrawal/O of/O British/B-gpe troops/O from/O that/O country/O ./O '

### Feature mapper (Default)

In [82]:
feature_mapper = IDFeatures(train_seq)
feature_mapper.feature_dict

{}

In [83]:
len(feature_mapper.dataset)

38358

In [84]:
# get features
feature_mapper.build_features()

len(feature_mapper.feature_dict), len(feature_mapper.feature_list)

(39801, 38358)

In [85]:
# show some features
list(feature_mapper.feature_dict)[0:10]

['init_tag:O',
 'id:Thousands::O',
 'id:of::O',
 'prev_tag:O::O',
 'id:demonstrators::O',
 'id:have::O',
 'id:marched::O',
 'id:through::O',
 'id:London::B-geo',
 'prev_tag:O::B-geo']

In [86]:
features_default = set(feature_mapper.feature_dict.keys())

In [87]:
len(features_default)

39801

In [16]:
sp = spc.StructuredPerceptron(corpus.word_dict, corpus.tag_dict, feature_mapper)
sp.num_epochs = 15

In [17]:
sp.state_labels

{'O': 0,
 'B-geo': 1,
 'B-gpe': 2,
 'B-tim': 3,
 'B-org': 4,
 'I-geo': 5,
 'B-per': 6,
 'I-per': 7,
 'I-org': 8,
 'B-art': 9,
 'I-art': 10,
 'I-tim': 11,
 'I-gpe': 12,
 'B-nat': 13,
 'I-nat': 14,
 'B-eve': 15,
 'I-eve': 16}

In [18]:
sp.get_num_states(), sp.get_num_observations()

(17, 55143)

In [19]:
len(sp.parameters)

39801

In [20]:
%%time
# default features
num_epochs = 15
sp.fit(feature_mapper.dataset, num_epochs)

Epoch: 0 Accuracy: 0.893740
Epoch: 1 Accuracy: 0.932009
Epoch: 2 Accuracy: 0.940784
Epoch: 3 Accuracy: 0.946467
Epoch: 4 Accuracy: 0.949568
Epoch: 5 Accuracy: 0.952513
Epoch: 6 Accuracy: 0.954342
Epoch: 7 Accuracy: 0.955855
Epoch: 8 Accuracy: 0.957418
Epoch: 9 Accuracy: 0.958337
Epoch: 10 Accuracy: 0.959105
Epoch: 11 Accuracy: 0.960058
Epoch: 12 Accuracy: 0.960956
Epoch: 13 Accuracy: 0.961745
Epoch: 14 Accuracy: 0.961832
Wall time: 1h 37min 9s


### Save model

In [23]:
sp.parameters

array([ 11.        ,   4.5       ,   2.58333333, ..., -10.5       ,
         1.41666667,   1.41666667])

In [24]:
models_path = 'fitted_models/'

sp.save_model(models_path + "perceptron_{}_iter".format(num_epochs))

### Feature mapper (Extended)

To train this model, we used a server that two of our team members are using for their master's theses.

In [88]:
# Rebind `feature_mapper` to the extended mapper (the default configuration
# was: feature_mapper = IDFeatures(train_seq)). Every later cell that uses
# `feature_mapper` therefore sees the ExtendedFeatures instance.
feature_mapper = ExtendedFeatures(train_seq)

# Display the feature dictionary right after construction, i.e. before
# build_features() is called in a later cell.
feature_mapper.feature_dict

{}

In [89]:
len(feature_mapper.dataset)

38358

In [90]:
# get features
feature_mapper.build_features()

len(feature_mapper.feature_dict), len(feature_mapper.feature_list)

(54286, 38358)

In [91]:
# show some features
list(feature_mapper.feature_dict)[0:10]

['init_tag:O',
 'id:Thousands::O',
 'suffix:s::O',
 'suffix:ds::O',
 'suffix:nds::O',
 'id:of::O',
 'suffix:f::O',
 'prev_tag:O::O',
 'id:demonstrators::O',
 'suffix:rs::O']

Let's see which features are added when we use the extended features:

In [92]:
features_extended = set(feature_mapper.feature_dict.keys())

Features from default configuration:

In [78]:
# Distinct feature types in the default configuration: the text before the
# first ":" of every feature name.
{name.split(":", 1)[0] for name in features_default}

{'final_prev_tag', 'id', 'init_tag', 'prev_tag'}

Features from extended configuration:

In [79]:
# Distinct feature types in the extended configuration: the text before the
# first ":" of every feature name.
{name.split(":", 1)[0] for name in features_extended}

{'final_prev_tag', 'id', 'init_tag', 'prev_tag', 'suffix'}

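Let's list the suffix features added by the extended configuration. Each entry is reduced to `suffix:<suffix>`, so a suffix that was observed with several tags appears several times: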

In [77]:
# Keep only the features whose type prefix is "suffix" and rebuild each one
# as "<first part>:<second part>". Every feature name is split once and the
# pieces are reused.
[
    ":".join((parts[0], parts[1]))
    for parts in (name.split(":") for name in features_extended)
    if parts[0] == "suffix"
]

['suffix:sos',
 'suffix:ghe',
 'suffix:van',
 'suffix:eal',
 'suffix:van',
 'suffix:sab',
 'suffix:uen',
 'suffix:uty',
 'suffix:ng.',
 'suffix:nes',
 'suffix:uiz',
 'suffix:ica',
 'suffix:wi',
 'suffix:eah',
 'suffix:ny',
 'suffix:ko',
 'suffix:ic',
 'suffix:ooq',
 'suffix:und',
 'suffix:76"',
 'suffix:put',
 'suffix:lis',
 'suffix:ges',
 'suffix:FTA',
 'suffix:0',
 'suffix:ny',
 'suffix:tar',
 'suffix:NIN',
 'suffix:cs',
 'suffix:ao',
 'suffix:uff',
 'suffix:329',
 'suffix:ope',
 'suffix:ms',
 'suffix:ry',
 'suffix:38',
 'suffix:v',
 'suffix:out',
 'suffix:fer',
 'suffix:rdi',
 'suffix:nse',
 'suffix:uba',
 'suffix:age',
 'suffix:mid',
 'suffix:it',
 'suffix:yaz',
 'suffix:iot',
 'suffix:sip',
 'suffix:oid',
 'suffix:tus',
 'suffix:96"',
 'suffix:bov',
 'suffix:ao',
 'suffix:ows',
 'suffix:oum',
 'suffix:ico',
 'suffix:gbe',
 'suffix:tur',
 'suffix:ces',
 'suffix:C',
 'suffix:0.7',
 'suffix:t"',
 'suffix:hen',
 'suffix:lla',
 'suffix:ean',
 'suffix:mp',
 'suffix:kez',
 'suffix:own',


In [80]:
# Number of entries in features_extended whose type prefix is "suffix"
# (a suffix that occurs in several feature names is counted each time).
len([
    ":".join((parts[0], parts[1]))
    for parts in (name.split(":") for name in features_extended)
    if parts[0] == "suffix"
])

14485

Number of unique suffixes:

In [81]:
# Number of distinct "suffix:<suffix>" strings among the suffix features;
# the set comprehension collapses repeats of the same suffix.
len({
    ":".join((parts[0], parts[1]))
    for parts in (name.split(":") for name in features_extended)
    if parts[0] == "suffix"
})

4588

In [16]:
sp = spc.StructuredPerceptron(corpus.word_dict, corpus.tag_dict, feature_mapper)
sp.num_epochs = 10

In [17]:
sp.state_labels

{'O': 0,
 'B-geo': 1,
 'B-gpe': 2,
 'B-tim': 3,
 'B-org': 4,
 'I-geo': 5,
 'B-per': 6,
 'I-per': 7,
 'I-org': 8,
 'B-art': 9,
 'I-art': 10,
 'I-tim': 11,
 'I-gpe': 12,
 'B-nat': 13,
 'I-nat': 14,
 'B-eve': 15,
 'I-eve': 16}

In [18]:
sp.get_num_states(), sp.get_num_observations()

(17, 55143)

In [60]:
# Features that exist only in the extended configuration. `feature_diff` was
# not defined by any cell of this notebook, so on a fresh kernel this cell
# raised NameError; it is now derived from the two feature-name sets built in
# earlier cells.
feature_diff = features_extended - features_default

# Print the type prefix of every added feature that is NOT a suffix feature.
# No output means the extended mapper only adds "suffix" features.
feature_types = [f.split(":")[0] for f in feature_diff]
for f in feature_types:
    if f != "suffix":
        print(f)

In [29]:
len(sp.parameters)

54286

In [30]:
%%time
# extended features
num_epochs = 10
sp.fit(feature_mapper.dataset, num_epochs)

Epoch: 0 Accuracy: 0.919682
Epoch: 1 Accuracy: 0.940374
Epoch: 2 Accuracy: 0.945737
Epoch: 3 Accuracy: 0.949033
Epoch: 4 Accuracy: 0.951819
Epoch: 5 Accuracy: 0.953552
Epoch: 6 Accuracy: 0.955185
Epoch: 7 Accuracy: 0.956131
Epoch: 8 Accuracy: 0.957545
Epoch: 9 Accuracy: 0.958140
CPU times: user 34min 47s, sys: 19.4 s, total: 35min 6s
Wall time: 34min 41s


### Save model

In [31]:
sp.parameters

array([10.6,  2.8,  2.6, ..., -9.6,  1.9,  1.9])

In [32]:
models_path = 'fitted_models/'

sp.save_model(models_path + "perceptron_{}_iter_extended".format(num_epochs))

In [25]:
num_epochs

10

## Bidirectional LSTM

To train this model, we used a server that two of our team members are using for their master's theses.

In [3]:
# logging.basicConfig(filename=f"ner_lstm.log",
#                     format='%(asctime)s - %(message)s', level=logging.INFO)

In [8]:
# labels = dict(train_seq.y_dict)
# START_TAG = "<START>"
# STOP_TAG = "<STOP>"
# labels[START_TAG] = 17
# labels[STOP_TAG] = 18

# EMBEDDING_DIM = 5
# HIDDEN_DIM = 4

In [9]:
# model = BiLSTM_CRF_v2(len(train_seq.x_dict), labels, EMBEDDING_DIM, HIDDEN_DIM)
# optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [10]:
# NOTE(review): this training loop is kept commented out; the log lines shown
# after it are presumably from the run on the remote server — confirm.
# NOTE(review): len(train_seq) is reported as 38358 earlier in this notebook,
# so range(38357) never visits the last training sentence — confirm whether
# range(len(train_seq)) was intended.
# NOTE(review): torch, optim and BiLSTM_CRF_v2 are not imported explicitly in
# this notebook; presumably they come from `from utils.utils import *` — verify.
# for epoch in range(15):
#     for i in range(38357):
#         # Step 1. Remember that Pytorch accumulates gradients.
#         # We need to clear them out before each instance
#         model.zero_grad()

#         # Step 2. Wrap the sentence's x (inputs) and y (targets) sequences
#         # as long tensors.
#         sentence_in = torch.tensor(train_seq[i].x, dtype=torch.long)
#         # sentence_in = sentence_in.to("cuda")
#         targets = torch.tensor(train_seq[i].y, dtype=torch.long)
#         # targets = targets.to("cuda")

#         # Step 3. Run our forward pass.
#         loss = model.neg_log_likelihood(sentence_in, targets)

#         # Step 4. Compute the loss, gradients, and update the parameters by
#         # calling optimizer.step()
#         loss.backward()
#         optimizer.step()

#     # Logged once per epoch: `loss` here is the value from the last sentence
#     # of the inner loop, not an average over the epoch.
#     logging.info(f"Epoch {epoch} loss: {loss.item()}")

2021-06-17 19:57:36,061 - Epoch 0 loss: 4.4257965087890625

2021-06-17 21:08:28,791 - Epoch 1 loss: 3.559356689453125

2021-06-17 22:19:16,304 - Epoch 2 loss: 2.704498291015625

2021-06-17 23:15:25,847 - Epoch 3 loss: 2.14013671875

2021-06-18 00:08:14,939 - Epoch 4 loss: 1.69354248046875

2021-06-18 01:00:55,245 - Epoch 5 loss: 1.399871826171875

2021-06-18 01:53:40,715 - Epoch 6 loss: 1.235626220703125

2021-06-18 02:46:17,931 - Epoch 7 loss: 1.08868408203125

2021-06-18 03:38:55,666 - Epoch 8 loss: 0.985626220703125

2021-06-18 04:31:39,916 - Epoch 9 loss: 0.869171142578125

2021-06-18 05:24:23,993 - Epoch 10 loss: 0.76190185546875

2021-06-18 06:16:57,598 - Epoch 11 loss: 0.679656982421875

2021-06-18 07:09:33,227 - Epoch 12 loss: 0.6939697265625

2021-06-18 08:02:15,899 - Epoch 13 loss: 0.67559814453125

2021-06-18 08:54:56,115 - Epoch 14 loss: 0.899566650390625

### Save model

In [11]:
# torch.save(model, "modelLSTM.pt")

  "type " + obj.__name__ + ". It won't be checked "
