In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Reload imported modules automatically before code is executed, so edits to the
# local project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [11]:
import sys
# Add ./utils to the module search path so that `skseq` can be imported as a top-level package.
sys.path.append("./utils")

from utils.skseq.sequences.sequence import Sequence
from utils.skseq.readers import pos_corpus
from utils.skseq.sequences.id_feature import IDFeatures
# NOTE(review): this import uses the `skseq` root while every other import uses `utils.skseq`;
# the same package may end up loaded under two different module names — confirm this is intended.
from skseq.sequences.extended_feature import ExtendedFeatures
import utils.skseq.sequences.structured_perceptron as spc

### Reading the data

Each row of the data files has three comma-separated fields:

```sentence_id```, ```words```, ```tags```

Some samples are shown below.
- 0,Thousands,O
- 0,London,B-geo
- 0,British,B-gpe
- 2,Saturday,B-tim
- 2,Taliban,B-org
- 4,Egeland,I-per

In [3]:
# Corpus reader; its word_dict / tag_dict attributes are used further down.
corpus = pos_corpus.PostagCorpus()

In [4]:
# Both CSV files live in the same directory. The generator is consumed in order,
# so the training split is read first and the test split second, both through
# the same corpus reader.
data_path = "data/"

train_seq, test_seq = (
    corpus.read_sequence_list(data_path + csv_name)
    for csv_name in ("train_data_ner.csv", "test_data_ner.csv")
)

In [5]:
# Number of sequences in the training and test splits, one count per line.
print(len(train_seq), len(test_seq), sep="\n")

38358
38359


In [6]:
# Tag -> integer id mapping held by the corpus (O plus B-/I- entity tags).
corpus.tag_dict

{'O': 0,
 'B-geo': 1,
 'B-gpe': 2,
 'B-tim': 3,
 'B-org': 4,
 'I-geo': 5,
 'B-per': 6,
 'I-per': 7,
 'I-org': 8,
 'B-art': 9,
 'I-art': 10,
 'I-tim': 11,
 'I-gpe': 12,
 'B-nat': 13,
 'I-nat': 14,
 'B-eve': 15,
 'I-eve': 16}

In [7]:
# Number of distinct words known to each sequence list. Both report the same size in
# the recorded output, so the two splits presumably share one dictionary — TODO confirm.
len(train_seq.x_dict), len(test_seq.x_dict)

(55143, 55143)

In [8]:
# Number of distinct labels known to the training sequence list.
len(train_seq.y_dict)

17

In [9]:
# First training sequence; its repr shows word_id/tag_id pairs.
train_seq[0]

0/0 1/0 2/0 3/0 4/0 5/0 6/1 7/0 8/0 9/0 10/0 11/0 12/1 13/0 14/0 9/0 15/0 1/0 16/2 17/0 18/0 19/0 20/0 21/0 

In [10]:
# Same sequence rendered with the actual words and tag names instead of integer ids.
train_seq[0].to_words(sequence_list=train_seq)

'Thousands/O of/O demonstrators/O have/O marched/O through/O London/B-geo to/O protest/O the/O war/O in/O Iraq/B-geo and/O demand/O the/O withdrawal/O of/O British/B-gpe troops/O from/O that/O country/O ./O '

### Feature mapper

In [12]:
# Alternative mapper (IDFeatures), left commented out:
#feature_mapper = IDFeatures(train_seq)
feature_mapper = ExtendedFeatures(train_seq)

# Still empty at this point; it is populated by build_features() further down.
feature_mapper.feature_dict

{}

In [13]:
# Size of the dataset held by the mapper (same count as len(train_seq) in the recorded output).
len(feature_mapper.dataset)

38358

In [14]:
# Build the features for the dataset; this fills feature_dict (empty before the call).
feature_mapper.build_features()

# Number of distinct features, and number of entries in feature_list
# (the latter equals the number of training sequences in the recorded output).
len(feature_mapper.feature_dict), len(feature_mapper.feature_list)

(54291, 38358)

In [15]:
# Peek at the first ten feature names; in the recorded output they look like
# 'init_tag:<tag>', 'id:<word>::<tag>', 'suffix:<chars>::<tag>' and 'prev_tag:<tag>::<tag>'.
list(feature_mapper.feature_dict)[0:10]

['init_tag:O',
 'id:Thousands::O',
 'suffix:s::O',
 'suffix:ds::O',
 'suffix:nds::O',
 'id:of::O',
 'suffix:f::O',
 'prev_tag:O::O',
 'id:demonstrators::O',
 'suffix:rs::O']

In [16]:
# Structured perceptron over the corpus word/tag dictionaries and the feature mapper.
sp = spc.StructuredPerceptron(corpus.word_dict, corpus.tag_dict, feature_mapper)
# NOTE(review): the fit() calls further down pass the epoch count explicitly;
# confirm whether this attribute is read anywhere.
sp.num_epochs = 15

In [17]:
# Label mapping stored on the model; identical to corpus.tag_dict in the recorded output.
sp.state_labels

{'O': 0,
 'B-geo': 1,
 'B-gpe': 2,
 'B-tim': 3,
 'B-org': 4,
 'I-geo': 5,
 'B-per': 6,
 'I-per': 7,
 'I-org': 8,
 'B-art': 9,
 'I-art': 10,
 'I-tim': 11,
 'I-gpe': 12,
 'B-nat': 13,
 'I-nat': 14,
 'B-eve': 15,
 'I-eve': 16}

In [18]:
# Model dimensions; in the recorded output they match the tag and word dictionary sizes.
sp.get_num_states(), sp.get_num_observations()

(17, 55143)

In [19]:
# Length of the parameter vector; equals len(feature_mapper.feature_dict) in the recorded output.
len(sp.parameters)

54291

In [20]:
%%time
# default features
# NOTE(review): the mapper in this notebook is built from ExtendedFeatures, not IDFeatures,
# so the "default features" label and the recorded output presumably come from an earlier
# run with the commented-out IDFeatures mapper — confirm.
num_epochs = 15
sp.fit(feature_mapper.dataset, num_epochs)

Epoch: 0 Accuracy: 0.893740
Epoch: 1 Accuracy: 0.932009
Epoch: 2 Accuracy: 0.940784
Epoch: 3 Accuracy: 0.946467
Epoch: 4 Accuracy: 0.949568
Epoch: 5 Accuracy: 0.952513
Epoch: 6 Accuracy: 0.954342
Epoch: 7 Accuracy: 0.955855
Epoch: 8 Accuracy: 0.957418
Epoch: 9 Accuracy: 0.958337
Epoch: 10 Accuracy: 0.959105
Epoch: 11 Accuracy: 0.960058
Epoch: 12 Accuracy: 0.960956
Epoch: 13 Accuracy: 0.961745
Epoch: 14 Accuracy: 0.961832
Wall time: 1h 37min 9s


In [22]:
%%time
# extended features
# NOTE(review): this calls fit() again on the same `sp` object that the previous cell already
# trained; whether training restarts or continues depends on StructuredPerceptron.fit — confirm.
# num_epochs is reassigned here and is later used to build the saved model's file name.
num_epochs = 10
sp.fit(feature_mapper.dataset, num_epochs)

Epoch: 0 Accuracy: 0.949127
Epoch: 1 Accuracy: 0.951563
Epoch: 2 Accuracy: 0.953484
Epoch: 3 Accuracy: 0.954870
Epoch: 4 Accuracy: 0.956329
Epoch: 5 Accuracy: 0.957094
Epoch: 6 Accuracy: 0.958117
Epoch: 7 Accuracy: 0.958711
Epoch: 8 Accuracy: 0.959152
Epoch: 9 Accuracy: 0.960370
Wall time: 1h 7min 21s


### Save model

In [23]:
# Weight vector of the trained model (displayed as an array in the recorded output).
sp.parameters

array([ 11.        ,   4.5       ,   2.58333333, ..., -10.5       ,
         1.41666667,   1.41666667])

In [24]:
# Persist the trained perceptron. The file name embeds the current value of
# num_epochs and marks the model as using the extended feature set.
# NOTE(review): `sp` went through two fit cells above while the name only records
# the latest num_epochs — confirm the name reflects the actual training.
models_path = 'fitted_models/'

sp.save_model("{}perceptron_{}_iter_extended".format(models_path, num_epochs))

In [25]:
# Epoch count that was embedded in the saved model's file name.
num_epochs

10