In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and use mode 2, which reloads imported modules
# before each cell runs, so edits to the local utils package are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append("./utils")

from utils.skseq.sequences.sequence import Sequence
from utils.skseq.readers import pos_corpus
from utils.skseq.sequences.id_feature import IDFeatures
import utils.skseq.sequences.structured_perceptron as spc

### Reading the data

The data files are comma-separated, with one token per row in the following format:

```sentence_id```, ```words```, ```tags```

Some samples are shown below.
- 0,Thousands,O
- 0,London,B-geo
- 0,British,B-gpe
- 2,Saturday,B-tim
- 2,Taliban,B-org
- 4,Egeland,I-per

In [3]:
# Corpus reader object; the same instance is used below to read both the
# training and the test file.
corpus = pos_corpus.PostagCorpus()

In [36]:
# Directory holding the NER CSV files (relative to the notebook).
data_path = "data/"

# Full paths of the two splits.
train_path = data_path + "train_data_ner.csv"
test_path = data_path + "test_data_ner.csv"

# Both splits are read through the same corpus object.
train_seq = corpus.read_sequence_list(train_path)
test_seq = corpus.read_sequence_list(test_path)

In [37]:
# Number of sequences in the training split, then in the test split (one per line).
print(len(train_seq), len(test_seq), sep="\n")

38358
38359


In [27]:
# Display the corpus' mapping from tag string to integer tag id.
corpus.tag_dict

{'O': 0,
 'B-geo': 1,
 'B-gpe': 2,
 'B-tim': 3,
 'B-org': 4,
 'I-geo': 5,
 'B-per': 6,
 'I-per': 7,
 'I-org': 8,
 'B-art': 9,
 'I-art': 10,
 'I-tim': 11,
 'I-gpe': 12,
 'B-nat': 13,
 'I-nat': 14,
 'B-eve': 15,
 'I-eve': 16}

In [38]:
# Number of distinct words in the x_dict of each sequence list.
# The recorded output shows the same count for both splits.
len(train_seq.x_dict), len(test_seq.x_dict)

(55143, 55143)

In [29]:
# Number of distinct labels (tags) in the y_dict of the training sequence list.
len(train_seq.y_dict)

17

In [30]:
# Show the first training sequence in its encoded form: it prints as
# space-separated word_id/tag_id pairs.
train_seq[0]

0/0 1/0 2/0 3/0 4/0 5/0 6/1 7/0 8/0 9/0 10/0 11/0 12/1 13/0 14/0 9/0 15/0 1/0 16/2 17/0 18/0 19/0 20/0 21/0 

In [31]:
# Show the same sequence as readable word/tag pairs, passing train_seq as the
# sequence list used for the conversion.
train_seq[0].to_words(sequence_list=train_seq)

'Thousands/O of/O demonstrators/O have/O marched/O through/O London/B-geo to/O protest/O the/O war/O in/O Iraq/B-geo and/O demand/O the/O withdrawal/O of/O British/B-gpe troops/O from/O that/O country/O ./O '

### Feature mapper

In [32]:
# Create the feature mapper over the training sequences.
feature_mapper = IDFeatures(train_seq)

# Display the feature dictionary right after construction; in the recorded
# output it is still empty at this point (build_features() has not run yet).
feature_mapper.feature_dict

{}

In [33]:
# Number of sequences held by the feature mapper; in the recorded output this
# equals len(train_seq).
len(feature_mapper.dataset)

38358

In [34]:
# Populate the feature mapper from its dataset.
feature_mapper.build_features()

# Sizes after building: number of entries in feature_dict and in feature_list.
# In the recorded output feature_list has as many entries as there are
# training sequences.
len(feature_mapper.feature_dict), len(feature_mapper.feature_list)

(39801, 38358)

In [35]:
# Preview the first ten feature names, in the dictionary's iteration order.
list(feature_mapper.feature_dict)[:10]

['init_tag:O',
 'id:Thousands::O',
 'id:of::O',
 'prev_tag:O::O',
 'id:demonstrators::O',
 'id:have::O',
 'id:marched::O',
 'id:through::O',
 'id:London::B-geo',
 'prev_tag:O::B-geo']

In [15]:
# Build the structured perceptron from the corpus' word and tag dictionaries
# and the feature mapper created above.
sp = spc.StructuredPerceptron(corpus.word_dict, corpus.tag_dict, feature_mapper)
# NOTE(review): the attribute is set to 5 here, while the training cell later
# in the notebook defines a separate num_epochs = 15 variable for fit();
# confirm which value is intended.
sp.num_epochs = 5

In [16]:
# Display the model's state labels; the recorded output is the same
# tag-to-id mapping as corpus.tag_dict.
sp.state_labels

{'O': 0,
 'B-geo': 1,
 'B-gpe': 2,
 'B-tim': 3,
 'B-org': 4,
 'I-geo': 5,
 'B-per': 6,
 'I-per': 7,
 'I-org': 8,
 'B-art': 9,
 'I-art': 10,
 'I-tim': 11,
 'I-gpe': 12,
 'B-nat': 13,
 'I-nat': 14,
 'B-eve': 15,
 'I-eve': 16}

In [17]:
# Number of states and number of observations known to the model; in the
# recorded output these match the tag count and the vocabulary size.
sp.get_num_states(), sp.get_num_observations()

(17, 55143)

In [18]:
# Length of the model's parameter vector; in the recorded output it equals
# the number of entries in feature_mapper.feature_dict.
len(sp.parameters)

39801

In [23]:
%%time
num_epochs = 15
#sp.fit(feature_mapper.dataset, num_epochs)

Wall time: 0 ns


### Save model

In [21]:
# Inspect the weight vector before saving it.
# NOTE(review): the recorded output shows zeros in every displayed entry,
# which suggests the model had not been trained when this ran — re-check
# after training.
sp.parameters

array([0., 0., 0., ..., 0., 0., 0.])

In [39]:
# Directory prefix under which fitted models are stored.
models_path = 'fitted_models/'

# Name the saved model after the number of training epochs.
model_name = "perceptron_{}_iter".format(num_epochs)
sp.save_model(models_path + model_name)

In [40]:
# Echo the epoch count that was used in the saved model's name.
num_epochs

15

In [41]:
# Echo the path prefix the model was saved under.
models_path

'fitted_models/'