In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append("./utils")

from utils.skseq.sequences.sequence import Sequence
from utils.skseq.readers import pos_corpus
from utils.skseq.sequences.id_feature import IDFeatures
import utils.skseq.sequences.structured_perceptron as spc

### Reading the data

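Each row of the CSV data files describes one token and has the following format:

```sentence_id```, ```words```, ```tags```

Rows that share a ```sentence_id``` belong to the same sentence, and the tags follow the BIO scheme (```B-```/```I-``` prefixes for entity tokens, ```O``` for tokens outside any entity). Some samples are shown below.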
- 0,Thousands,O
- 0,London,B-geo
- 0,British,B-gpe
- 2,Saturday,B-tim
- 2,Taliban,B-org
- 4,Egeland,I-per

In [3]:
# Corpus reader used to load the data files; its word and tag dictionaries
# (corpus.word_dict, corpus.tag_dict) are reused later when building the model.
corpus = pos_corpus.PostagCorpus()

In [4]:
# Directory that holds the NER CSV files, relative to the notebook.
data_path = "data/"

# Load the train and test splits through the same corpus reader.
train_seq = corpus.read_sequence_list(f"{data_path}train_data_ner.csv")
test_seq = corpus.read_sequence_list(f"{data_path}test_data_ner.csv")

In [5]:
# Number of sequences in each split, printed one count per line.
print(len(train_seq), len(test_seq), sep="\n")

38358
38359


In [6]:
# Tag-name -> integer-id mapping held by the corpus reader.
corpus.tag_dict

{'O': 0,
 'B-geo': 1,
 'B-gpe': 2,
 'B-tim': 3,
 'B-org': 4,
 'I-geo': 5,
 'B-per': 6,
 'I-per': 7,
 'I-org': 8,
 'B-art': 9,
 'I-art': 10,
 'I-tim': 11,
 'I-gpe': 12,
 'B-nat': 13,
 'I-nat': 14,
 'B-eve': 15,
 'I-eve': 16}

In [7]:
# Vocabulary size (number of entries in the word dictionary) of each sequence list.
# NOTE(review): both lists report the same size, presumably because they share
# the dictionary of the common corpus reader — TODO confirm.
len(train_seq.x_dict), len(test_seq.x_dict)

(55143, 55143)

In [8]:
# Number of entries in the label (tag) dictionary of each sequence list.
len(train_seq.y_dict),len(test_seq.y_dict)

(17, 17)

In [9]:
# First training sequence in its internal form: word_id/tag_id pairs.
train_seq[0]

0/0 1/0 2/0 3/0 4/0 5/0 6/1 7/0 8/0 9/0 10/0 11/0 12/1 13/0 14/0 9/0 15/0 1/0 16/2 17/0 18/0 19/0 20/0 21/0 

In [10]:
# First training sequence rendered as word/tag-name pairs.
train_seq[0].to_words(sequence_list=train_seq)

'Thousands/O of/O demonstrators/O have/O marched/O through/O London/B-geo to/O protest/O the/O war/O in/O Iraq/B-geo and/O demand/O the/O withdrawal/O of/O British/B-gpe troops/O from/O that/O country/O ./O '

### Feature mapper

In [11]:
# Feature mapper created from the training sequences.
feature_mapper = IDFeatures(train_seq)

# Still empty at this point; it is filled once build_features() has been called.
feature_mapper.feature_dict

{}

In [12]:
# Number of sequences in the mapper's dataset (matches len(train_seq)).
len(feature_mapper.dataset)

38358

In [13]:
# Populate feature_dict / feature_list from the mapper's dataset.
feature_mapper.build_features()

# (number of distinct features, number of entries in feature_list);
# the second value equals the number of training sequences.
len(feature_mapper.feature_dict), len(feature_mapper.feature_list)

(39801, 38358)

In [14]:
# Peek at the first ten feature names, in the dictionary's iteration order.
list(feature_mapper.feature_dict)[:10]

['init_tag:O',
 'id:Thousands::O',
 'id:of::O',
 'prev_tag:O::O',
 'id:demonstrators::O',
 'id:have::O',
 'id:marched::O',
 'id:through::O',
 'id:London::B-geo',
 'prev_tag:O::B-geo']

In [15]:
# Structured perceptron built from the corpus word/tag dictionaries and the
# feature mapper.
sp = spc.StructuredPerceptron(corpus.word_dict, corpus.tag_dict, feature_mapper)
# NOTE(review): the training cell passes its own epoch count (15) to sp.fit(),
# so this attribute presumably does not control the training length — TODO confirm.
sp.num_epochs = 5

In [16]:
# State labels of the model; same tag-name -> id mapping as corpus.tag_dict.
sp.state_labels

{'O': 0,
 'B-geo': 1,
 'B-gpe': 2,
 'B-tim': 3,
 'B-org': 4,
 'I-geo': 5,
 'B-per': 6,
 'I-per': 7,
 'I-org': 8,
 'B-art': 9,
 'I-art': 10,
 'I-tim': 11,
 'I-gpe': 12,
 'B-nat': 13,
 'I-nat': 14,
 'B-eve': 15,
 'I-eve': 16}

In [17]:
# (number of states, number of observations); these match the sizes of the
# tag and word dictionaries shown earlier.
sp.get_num_states(), sp.get_num_observations()

(17, 55143)

In [18]:
# Parameter count; equals the number of features in feature_mapper.feature_dict.
len(sp.parameters)

39801

In [22]:
%%time
# Train on the feature mapper's dataset for the given number of epochs;
# fit() prints an accuracy line after each epoch.
num_epochs = 15
sp.fit(feature_mapper.dataset, num_epochs)


Epoch: 0 Accuracy: 0.789581
Epoch: 1 Accuracy: 0.797753
Epoch: 2 Accuracy: 0.845761
Epoch: 3 Accuracy: 0.880490
Epoch: 4 Accuracy: 0.874362
Epoch: 5 Accuracy: 0.865169
Epoch: 6 Accuracy: 0.882533
Epoch: 7 Accuracy: 0.979571
Epoch: 8 Accuracy: 0.991828
Epoch: 9 Accuracy: 0.977528
Epoch: 10 Accuracy: 0.995914
Epoch: 11 Accuracy: 0.995914
Epoch: 12 Accuracy: 0.997957
Epoch: 13 Accuracy: 0.997957
Epoch: 14 Accuracy: 0.997957
Wall time: 4.31 s


In [39]:
def evaluate_corpus(sequences, sequences_predictions):
    """Compute label-level accuracy of predictions against the gold standard.

    Args:
        sequences: Iterable of gold sequences; each one exposes its labels
            through an indexable ``.y`` attribute.
        sequences_predictions: Indexable collection of predicted sequences,
            aligned position by position with ``sequences``; each one exposes
            its predicted labels through an iterable ``.y`` attribute.

    Returns:
        float: Fraction of predicted labels that equal the gold label at the
        same position. Returns ``0.0`` when there is no label to score
        (empty corpus or only empty sequences) instead of raising
        ``ZeroDivisionError``.

    Raises:
        IndexError: If there are fewer predictions than gold sequences, or a
            predicted sequence is longer than its gold counterpart.
    """
    total = 0
    correct = 0
    for i, sequence in enumerate(sequences):
        pred = sequences_predictions[i]
        for j, y_hat in enumerate(pred.y):
            if sequence.y[j] == y_hat:
                correct += 1
            total += 1

    # Nothing was scored: report 0.0 rather than dividing by zero.
    if total == 0:
        return 0.0
    return float(correct) / total

In [40]:
# Make predictions for the various sequences using the trained model.
# Only the train and test splits are loaded in this notebook; no dev split
# exists (`dev_seq` is never defined), so none is decoded here.
pred_train = sp.viterbi_decode_corpus(train_seq)
pred_test = sp.viterbi_decode_corpus(test_seq)

KeyboardInterrupt: 

In [None]:
# Evaluate and print accuracies.
# Only the train and test splits are loaded in this notebook (there is no
# dev split), so accuracy is reported for those two only.
eval_train = evaluate_corpus(train_seq.seq_list, pred_train)
eval_test = evaluate_corpus(test_seq.seq_list, pred_test)
print("SP -  Accuracy Train: %.3f Test: %.3f" % (eval_train, eval_test))