In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and use mode 2, so imported modules (such as
# the local utils.skseq package) are re-imported before each cell executes.
%load_ext autoreload
%autoreload 2

In [32]:
import sys
sys.path.append("./utils")

from utils.skseq.sequences.sequence import Sequence
from utils.skseq.readers import pos_corpus
from utils.skseq.sequences.id_feature import IDFeatures
import utils.skseq.sequences.structured_perceptron as spc

from sklearn.metrics import confusion_matrix, f1_score

### Reading the data

In [10]:
# Corpus reader used to load both splits; its word_dict and tag_dict are
# later handed to the structured perceptron.
corpus = pos_corpus.PostagCorpus()

In [11]:
# Directory that holds the train/test CSV files.
data_path = "data/"

# Resolve both file names first, then read each split with the shared reader.
train_file = data_path + "train_data_ner.csv"
test_file = data_path + "test_data_ner.csv"

train_seq = corpus.read_sequence_list(train_file)
test_seq = corpus.read_sequence_list(test_file)

In [12]:
# Number of sequences in the train and test splits, one per line.
print(len(train_seq), len(test_seq), sep="\n")

38358
38359


### Feature mapper

In [13]:
# Bind the ID feature mapper to the training sequences.
feature_mapper = IDFeatures(train_seq)

# Displayed as {} at this point: feature_dict stays empty until
# build_features() is called in the next cell.
feature_mapper.feature_dict

{}

In [15]:
# Populate feature_dict and feature_list on the mapper (both are filled by
# this call; feature_dict was empty before it).
feature_mapper.build_features()

# (number of entries in feature_dict, number of entries in feature_list).
# NOTE(review): the second value equals len(train_seq), so feature_list
# presumably holds one entry per training sequence -- confirm.
len(feature_mapper.feature_dict), len(feature_mapper.feature_list)

(39801, 38358)

In [16]:
# Peek at the first 10 keys of feature_dict. The names appear to follow the
# patterns 'init_tag:<tag>', 'id:<word>::<tag>' and 'prev_tag:<prev>::<tag>'.
list(feature_mapper.feature_dict)[0:10]

['init_tag:O',
 'id:Thousands::O',
 'id:of::O',
 'prev_tag:O::O',
 'id:demonstrators::O',
 'id:have::O',
 'id:marched::O',
 'id:through::O',
 'id:London::B-geo',
 'prev_tag:O::B-geo']

### Load model 

In [19]:
# Directory that holds the saved models.
models_path = 'fitted_models/'

# Build the perceptron on the corpus dictionaries and the feature mapper,
# then restore its state from the saved model location.
sp = spc.StructuredPerceptron(corpus.word_dict, corpus.tag_dict, feature_mapper)
model_location = models_path + "perceptron_15_iter"
sp.load_model(model_location)

In [20]:
# Inspect the perceptron's weight vector after load_model().
# NOTE(review): the truncated repr only shows zeros at both ends, which does
# not tell whether the weights were actually loaded -- check the number of
# nonzero entries to confirm.
sp.parameters

array([0., 0., 0., ..., 0., 0., 0.])

### Evaluation (accuracy, weighted F1 score)

In [21]:
def evaluate_corpus(sequences, sequences_predictions):
    """Token-level accuracy against the gold standard, ignoring "O" gold tags.

    Only positions whose gold tag differs from the string "O" are scored, so
    the result is the fraction of those positions whose predicted tag equals
    the gold tag. It is not plain accuracy over all tokens.

    Args:
        sequences: indexable collection of gold sequences; each item exposes
            its tags as ``.y``.
        sequences_predictions: predicted sequences aligned by index with
            ``sequences``; each item exposes its tags as ``.y``.

    Returns:
        float: correct / scored positions, or 0.0 when there is nothing to
        score (empty input, or every gold tag is "O").

    Raises:
        IndexError: if there are fewer predictions than gold sequences, or a
            predicted sequence is longer than its gold sequence.

    NOTE(review): the filter compares gold tags with the literal string "O".
    If ``.y`` holds integer tag ids rather than tag names, nothing is
    filtered and every token is scored -- confirm against the reader.
    """
    total = 0.0
    correct = 0.0
    for i, sequence in enumerate(sequences):
        pred = sequences_predictions[i]
        for j, y_hat in enumerate(pred.y):
            gold = sequence.y[j]
            if gold == "O":
                # Gold "O" positions are deliberately left out of the score.
                continue
            total += 1
            if gold == y_hat:
                correct += 1
    if total == 0:
        # Nothing was scored; avoid a ZeroDivisionError.
        return 0.0
    return correct / total

In [30]:
# Sanity check only: the gold training sequences are compared against
# themselves, so the score is 1.0 whenever any non-"O" tag exists. This does
# not evaluate the loaded model `sp`; pass its predictions as the second
# argument for that.
evaluate_corpus(train_seq.seq_list, train_seq.seq_list)

1.0

In [None]:
#y_true = ["cat", "ant", "cat", "cat", "ant", "bird"]
#y_pred = ["ant", "ant", "cat", "cat", "ant", "cat"]
#confusion_matrix(y_true, y_pred, labels=["ant", "bird", "cat"])

In [None]:
#f1_score(y_true, y_pred, average='weighted')