In [1]:
import numpy as np
import pandas as pd
from utils import preprocess, features, classifier

In [2]:
# Input file locations, relative to the notebook's working directory.
# Only train1_path and test1_path are read by the cells visible in this notebook;
# comp1_path, comp2_path and train2_path are defined but not used here.
# NOTE(review): presumably *.wtag files carry word/tag annotations and *.words
# files are untagged text — confirm against utils.preprocess.
comp1_path = 'data/comp1.words'
comp2_path = 'data/comp2.words'
test1_path = 'data/test1.wtag'
train1_path = 'data/train1.wtag'
train2_path = 'data/train2.wtag'

## Create FeatureGroup params

In [3]:
# Feature-group definitions handed to features.create_feature_vector.
# Each key is a lambda over (t2, t1, w, i, t) that returns the tuple identifying
# one feature of that group; each value is that group's threshold (an int, or
# None for the lowercase-word group).
# From how the lambdas index their arguments: w is the word sequence, i is the
# current position, and t2/t1/t are tag values.
# NOTE(review): presumably t2/t1 are the two previous tags and the threshold is a
# minimum count used when pruning=True — confirm in utils.features.
# NOTE(review): the saved output of the feature-statistics cell prints each group
# as "FeatureGroup(<lambda body text>)", so the exact source text of these
# lambdas may be significant — avoid reformatting them without checking.
group_thresholds = {
    # Tag-context groups (no word information).
    lambda t2, t1, w, i, t: tuple([t2, t1, t]):                               1,
    lambda t2, t1, w, i, t: tuple([t1, t]):                                   1,
    # Lowercased current word.
    lambda t2, t1, w, i, t: tuple([w[i].lower(), t]):                         None,
#     lambda t2, t1, w, i, t: tuple([w[i-1].lower(), t]):                       5,
#     lambda t2, t1, w, i, t: tuple([w[i+1].lower(), t]):                       5,
    # Lowercased prefixes of the current word (3 and 2 characters).
    lambda t2, t1, w, i, t: tuple([w[i][:3].lower(), t]):                     5,
    lambda t2, t1, w, i, t: tuple([w[i][:2].lower(), t]):                     5,
#     lambda t2, t1, w, i, t: tuple([w[i+1][:3].lower(), t]):                   5,
#     lambda t2, t1, w, i, t: tuple([w[i-1][:3].lower(), t]):                   5,
    # Lowercased suffixes of the current word (3 and 2 characters).
    lambda t2, t1, w, i, t: tuple([w[i][-3:].lower(), t]):                    5,
    lambda t2, t1, w, i, t: tuple([w[i][-2:].lower(), t]):                    5,
#     lambda t2, t1, w, i, t: tuple([w[i+1][-3:].lower(), t]):                  5,
#     lambda t2, t1, w, i, t: tuple([w[i-1][-3:].lower(), t]):                  5,
    # Boolean character-class tests on the current word (str.is* predicates).
    lambda t2, t1, w, i, t: tuple([w[i].isalnum(), t]):                       1,
    lambda t2, t1, w, i, t: tuple([w[i].isalpha(), t]):                       1,
    lambda t2, t1, w, i, t: tuple([w[i].isascii(), t]):                       1,
    lambda t2, t1, w, i, t: tuple([w[i].isdecimal(), t]):                     1,
    lambda t2, t1, w, i, t: tuple([w[i].isdigit(), t]):                       1,
    lambda t2, t1, w, i, t: tuple([w[i].islower(), t]):                       1,
    lambda t2, t1, w, i, t: tuple([w[i].isnumeric(), t]):                     1,
    lambda t2, t1, w, i, t: tuple([w[i].istitle(), t]):                       1,
    lambda t2, t1, w, i, t: tuple([w[i].isupper(), t]):                       1,
    # Word-length indicators (exactly 1, 2 or 3 characters).
    lambda t2, t1, w, i, t: tuple([len(w[i]) == 1, t]):                       1,
    lambda t2, t1, w, i, t: tuple([len(w[i]) == 2, t]):                       1,
    lambda t2, t1, w, i, t: tuple([len(w[i]) == 3, t]):                       1,
    # NOTE(review): w[i][0] raises IndexError if a word is the empty string.
    lambda t2, t1, w, i, t: tuple([w[i][0].islower(), t]):                    1,
    # Whether the current word contains any digit / any uppercase character.
    lambda t2, t1, w, i, t: tuple([any(char.isdigit() for char in w[i]), t]): 1,
    lambda t2, t1, w, i, t: tuple([any(char.isupper() for char in w[i]), t]): 1,
}

In [4]:
%%time
# Build the training dataset from train1_path and the validation dataset from
# test1_path (note: the file named "test1" is what serves as validation here).
train_dataset = preprocess.Dataset(train1_path)
val_dataset = preprocess.Dataset(test1_path)
# Create the feature vector from the training dataset only, using the feature
# groups and per-group thresholds declared in `group_thresholds`.
# NOTE(review): the meaning of pruning / get_stats / assertions / calls_counter
# is defined in utils.features and is not visible here — confirm before changing.
feature_vector = features.create_feature_vector(train_dataset,
                                                group_thresholds=group_thresholds,
                                                pruning=True,
                                                get_stats=False,
                                                assertions=True,
                                                calls_counter=False)

CPU times: user 1.59 s, sys: 31.2 ms, total: 1.62 s
Wall time: 1.64 s


In [6]:
# ---- Run configuration -------------------------------------------------------
# Everything a reader might tune lives here; `version` below is derived from
# these values and identifies the model on disk.
seed = 42
models_path = 'models'
save = True
load = True      # True: restore a saved model; False: build a fresh one from w0
train = False    # True: run model.train(...) after loading/creating the model
beam = 50
train_aprox = 0
val_aprox = 0
weight_decay = 1
batch_size = 5000
epochs = 50
tqdm_bar = False
version = f"beam-{beam}_train_aprox-{train_aprox}_val_aprox-{val_aprox}_" + \
          f"weight_decay-{weight_decay}_batch_size-{batch_size}_max_epochs-{epochs}"

# Initial weights: one small random float32 value in [0, 0.05) per feature.
# Drawn from a locally seeded generator so that a fresh model starts from the
# same point on every run, without touching NumPy's global random state.
w0 = np.random.RandomState(seed).rand(len(feature_vector)).astype(np.float32)/20

if load:
    # Pass the configured seed rather than a second hard-coded 42, so the two
    # cannot drift apart when the configuration above is edited.
    # NOTE(review): epoch=-1 presumably selects the most recent checkpoint —
    # confirm in utils.classifier.load_model.
    model = classifier.load_model(version, models_path, epoch=-1, seed=seed, prints=True)
else:
    model = classifier.Model(version=version,
                             w0=w0,
                             tags=train_dataset.tags,
                             inference=classifier.viterbi,
                             feature_vector=feature_vector,
                             seed=seed,
                             score_func=classifier.accuracy,
                             models_path=models_path,
                             save=save)

if train:
    # v_min, f_min, d_min are only defined when training actually runs.
    v_min, f_min, d_min = model.train(epochs=epochs,
                                      train_dataset=train_dataset,
                                      val_dataset=val_dataset,
                                      batch_size=batch_size,
                                      weight_decay=weight_decay,
                                      iprint=-1,
                                      save=save,
                                      tqdm_bar=tqdm_bar,
                                      beam=beam,
                                      train_aprox=train_aprox,
                                      val_aprox=val_aprox)

model version: beam-50_train_aprox-0_val_aprox-0_weight_decay-1_batch_size-5000_max_epochs-50
epochs: 23
train_time: 160.301

last train_loss: 54.828313
last val_loss: 56.020626
last train_score: nan
last val_score: nan
best val_score: nan at epoch 0


In [24]:
def accuracy(pred_tags, true_tags):
    """Return the token-level tagging accuracy.

    Args:
        pred_tags: iterable of per-sentence sequences of predicted tags.
        true_tags: iterable of per-sentence sequences of reference tags,
            aligned with ``pred_tags``.

    Returns:
        float: fraction of positions where the predicted tag equals the
        reference tag, pooled over all sentences. Returns 0.0 when there is
        nothing to compare (no sentences, or only empty sentences) instead of
        raising ZeroDivisionError.

    Note:
        Sentences and tags are paired with ``zip``, so if the two inputs differ
        in length the surplus items of the longer one are ignored.
    """
    correct = 0
    total = 0
    for preds, tags in zip(pred_tags, true_tags):
        for pred, tag in zip(preds, tags):
            total += 1
            if pred == tag:
                correct += 1
    if total == 0:
        # No tokens were compared; the ratio is undefined, report 0.0.
        return 0.0
    return float(correct)/total

In [None]:
# Predict on the first `i` validation sentences with beam = 50.
# NOTE(review): the 2nd and 3rd positional arguments of model.predict are
# presumably the beam size and a progress-bar flag — confirm in utils.classifier.
# NOTE(review): this cell has no execution count and its saved output stops at
# 9/50 sentences (~35 s per sentence), so pred_tags_50 / true_tags_50 may not
# exist in a given session.
# Rebinds the notebook-level names `i` and `beam`.
i = 50
beam = 50

pred_tags_50, true_tags_50 = model.predict(val_dataset.sentences[:i], beam, True)

 18%|█▊        | 9/50 [05:01<23:44, 34.74s/it]

In [None]:
# Score the beam-50 predictions with whatever function is currently bound to
# model.score_func. Requires the beam-50 prediction cell to have completed.
score = model.score_func(pred_tags_50, true_tags_50)
print(score)

In [11]:
# Same prediction as the beam-50 cell, but with beam = 10 (the saved output
# shows ~6.6 s per sentence for 50 sentences).
# Rebinds the notebook-level names `i` and `beam`; the single-sentence viterbi
# cell further down reads `beam`, so it uses whichever value was assigned last.
i = 50
beam = 10

pred_tags_10, true_tags_10 = model.predict(val_dataset.sentences[:i], beam, True)

100%|██████████| 50/50 [05:31<00:00,  6.63s/it]

0.0





In [28]:
# Score the beam-10 predictions with whatever function is currently bound to
# model.score_func.
# NOTE(review): execution counts show this cell ran as In [28], after the cell
# that rebinds model.score_func (In [25]) even though that cell appears below
# this one in the file — on a top-to-bottom run the score function used here
# would be the model's original one.
score = model.score_func(pred_tags_10, true_tags_10)
print(score)

0.0838272650296359


In [25]:
# Replace the model's scoring function with the notebook-level `accuracy`
# function. Every later model.score_func(...) call in this session uses it.
model.score_func = accuracy

In [16]:
# Print the most recently computed `score` as a plain float.
# NOTE(review): the saved output (0.0) comes from execution In [16], which
# predates the In [25] cell that rebinds model.score_func, so it reflects a
# different `score` value than the 0.0838... shown by the scoring cell above.
print(float(score))

0.0


In [8]:
# Index training sentences by the length of their first element (sentence[0]),
# giving quick access to one example sentence of any given length.
# Later sentences overwrite earlier ones with the same length, so exactly one
# sentence (the last one encountered) is kept per length.
sentences = {}
for sentence in train_dataset.sentences:
    sentences[len(sentence[0])] = sentence

In [31]:
# Sanity check on a single training sentence: run viterbi inference on the
# stored sentence whose first element has length `i`, then print its first
# element (the words, per the saved output), its second element (the reference
# tags, per the saved output) and the predicted tags for comparison.
# Uses the notebook-level `beam` from whichever prediction cell ran last.
# Raises KeyError if no training sentence of length `i` was indexed.
i = 10
tags = classifier.viterbi(model, sentences[i][0], beam=beam)
print(sentences[i][0])
print(sentences[i][1])
print(tags)

['``', 'It', 'does', "n't", 'appear', 'to', 'be', 'getting', 'worse', '.']
['``', 'PRP', 'VBZ', 'RB', 'VB', 'TO', 'VB', 'VBG', 'RBR', '.']
['NNP', 'FW', 'NN', 'VBZ', 'NNP', '$', '$', '$', '$', '$']


In [28]:
# Re-print the sentence, reference tags and predicted tags without re-running
# inference; relies on `i` and `tags` left in the kernel by an earlier cell.
# NOTE(review): the saved output shows a 3-token sentence, so it was produced
# with a different `i` than the value 10 assigned in the cell above.
print(sentences[i][0])
print(sentences[i][1])
print(tags)

['What', 'happened', '?']
['WP', 'VBD', '.']
['NNP', 'FW', 'NN']


In [5]:
# Report the size of every feature group, followed by the number of groups and
# the total length of the feature vector.
for feat in feature_vector.feats:
    print('feat_group:', feat, '| feats:', len(feat))
print('feat_groups:', len(feature_vector.feats), '| total_feats:', len(feature_vector))

feat_group: FeatureGroup(tuple([t2, t1, t])) | feats: 5192
feat_group: FeatureGroup(tuple([t1, t])) | feats: 908
feat_group: FeatureGroup(tuple([w[i].lower(), t])) | feats: 14719
feat_group: FeatureGroup(tuple([w[i][:3].lower(), t])) | feats: 2431
feat_group: FeatureGroup(tuple([w[i][:2].lower(), t])) | feats: 1432
feat_group: FeatureGroup(tuple([w[i][-3:].lower(), t])) | feats: 1757
feat_group: FeatureGroup(tuple([w[i][-2:].lower(), t])) | feats: 955
feat_group: FeatureGroup(tuple([w[i].isalnum(), t])) | feats: 60
feat_group: FeatureGroup(tuple([w[i].isalpha(), t])) | feats: 60
feat_group: FeatureGroup(tuple([w[i].isascii(), t])) | feats: 44
feat_group: FeatureGroup(tuple([w[i].isdecimal(), t])) | feats: 45
feat_group: FeatureGroup(tuple([w[i].isdigit(), t])) | feats: 45
feat_group: FeatureGroup(tuple([w[i].islower(), t])) | feats: 75
feat_group: FeatureGroup(tuple([w[i].isnumeric(), t])) | feats: 45
feat_group: FeatureGroup(tuple([w[i].istitle(), t])) | feats: 76
feat_group: FeatureG

In [6]:
# # test run train_dataset
# tic = time.time()
# for t2, t1, w, i, t in train_dataset:
#     feat_vec_t = feature_vector(t2, t1, w, i, t, fmt='vec')
# print('fmt=vec: {:.3f} sec'.format(time.time() - tic))

# tic = time.time()
# for t2, t1, w, i, t in train_dataset:
#     feat_list_t = feature_vector(t2, t1, w, i, t, fmt='list')
# print('fmt=list: {:.3f} sec'.format(time.time() - tic))

# tic = time.time()
# for t2, t1, w, i, t in train_dataset:
#     feat_vec_t, feat_list_t = feature_vector(t2, t1, w, i, t, fmt='both')
# print('fmt=vec+list: {:.3f} sec'.format(time.time() - tic))

In [None]:
# for tag in train1_statistics.words_per_tag:
#     if len(train1_statistics.words_per_tag[tag]) < 10:
#         print('{:5} tf: {:5d} unique_count: {:4d} words: {}'.format(tag, train1_statistics.tags_count[tag], len(train1_statistics.words_per_tag[tag]),
#                                                                     train1_statistics.words_per_tag[tag]))
#     else:
#         print('{:5} tf: {:5d} unique_count: {:4d}'.format(tag, train1_statistics.tags_count[tag], len(train1_statistics.words_per_tag[tag])))

In [None]:
# features = []

# # one-to-one features
# for word in strange_words:
#     features.append(Feature(f'w[i] == "{word}"', t=train1_model.tags_per_word[word][0]))
#     print(word, train1_model.WordCount[word], train1_model.TagsPerWord[word])