In [1]:
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
from utils import preprocess, features, classifier, metrics

In [2]:
# Corpus file locations, relative to the notebook's working directory.
train1_path, train2_path = 'data/train1.wtag', 'data/train2.wtag'
test1_path = 'data/test1.wtag'
comp1_path, comp2_path = 'data/comp1.words', 'data/comp2.words'

# train1 is the training split; test1 is used as the validation split.
train_dataset = preprocess.Dataset(train1_path)
val_dataset = preprocess.Dataset(test1_path)

### Features (only used when `load = False`)

In [3]:
# Feature-group table handed to features.create_feature_vector below.
# Each key is a function of (t2, t1, w, i, t) returning a tuple that ends with t;
# each value is that group's integer threshold.
# NOTE(review): t2/t1/t look like the two previous tags and the candidate tag,
# w the sentence's words and i the current position - confirm against utils.features.
# NOTE(review): thresholds are presumably minimum-count cutoffs applied when
# pruning=True - confirm in utils.features.
# NOTE(review): the FeatureGroup names printed by this cell repeat each entry's
# expression text, so the helper presumably reads the entry's source line;
# keep one entry per line and keep trailing comments plain - TODO confirm.
# NOTE(review): w[i-1] at i == 0 reads w[-1] (Python negative indexing) and w[i+1]
# at the last position raises IndexError unless w carries padding - confirm that
# the dataset pads sentences.
group_thresholds = {
    # ---------- feature -------------------------------------------- | -- Threshold --
    # Current word (lowercased), then its 4..1 character suffixes and prefixes.
    lambda t2, t1, w, i, t: tuple([w[i].lower(), t]):                         0,  # mandatory feature f100
    lambda t2, t1, w, i, t: tuple([w[i][-4:].lower(), t]):                    5,     # mandatory feature f101
    lambda t2, t1, w, i, t: tuple([w[i][-3:].lower(), t]):                    5,     # mandatory feature f101
    lambda t2, t1, w, i, t: tuple([w[i][-2:].lower(), t]):                    5,     # mandatory feature f101
    lambda t2, t1, w, i, t: tuple([w[i][-1:].lower(), t]):                    5,     # mandatory feature f101
    lambda t2, t1, w, i, t: tuple([w[i][:4].lower(), t]):                     5,     # mandatory feature f102
    lambda t2, t1, w, i, t: tuple([w[i][:3].lower(), t]):                     5,     # mandatory feature f102
    lambda t2, t1, w, i, t: tuple([w[i][:2].lower(), t]):                     5,     # mandatory feature f102
    lambda t2, t1, w, i, t: tuple([w[i][:1].lower(), t]):                     5,     # mandatory feature f102
    # Tag trigram, bigram and unigram.
    lambda t2, t1, w, i, t: tuple([t2, t1, t]):                               1,     # mandatory feature f103
    lambda t2, t1, w, i, t: tuple([t1, t]):                                   1,     # mandatory feature f104
    lambda t2, t1, w, i, t: tuple([t]):                                       1,     # mandatory feature f105
    # str.islower() is True only when the word has at least one cased character
    # and none of them is uppercase, so the flag is not simply "has uppercase".
    lambda t2, t1, w, i, t: tuple([w[i].islower(), t]):                       1,     # mandatory capitalization feature - flag is True for lowercase words
    lambda t2, t1, w, i, t: tuple([any(char.isdigit() for char in w[i]), t]): 1,     # mandatory feature has digits
    # Disabled: full previous / next word paired with the tag.
#     lambda t2, t1, w, i, t: tuple([w[i-1].lower(), t]):                       5,
#     lambda t2, t1, w, i, t: tuple([w[i+1].lower(), t]):                       5,
    # Neighbouring words: 3- and 2-character prefixes, then suffixes, of w[i+1] and w[i-1].
    lambda t2, t1, w, i, t: tuple([w[i+1][:3].lower(), t]):                   20,
    lambda t2, t1, w, i, t: tuple([w[i-1][:3].lower(), t]):                   20,
    lambda t2, t1, w, i, t: tuple([w[i+1][:2].lower(), t]):                   20,
    lambda t2, t1, w, i, t: tuple([w[i-1][:2].lower(), t]):                   20,
    lambda t2, t1, w, i, t: tuple([w[i+1][-3:].lower(), t]):                  20,
    lambda t2, t1, w, i, t: tuple([w[i-1][-3:].lower(), t]):                  20,
    lambda t2, t1, w, i, t: tuple([w[i+1][-2:].lower(), t]):                  20,
    lambda t2, t1, w, i, t: tuple([w[i-1][-2:].lower(), t]):                  20,
    # Built-in str predicates evaluated on the current word.
    lambda t2, t1, w, i, t: tuple([w[i].isalnum(), t]):                       1,
    lambda t2, t1, w, i, t: tuple([w[i].isalpha(), t]):                       1,
    lambda t2, t1, w, i, t: tuple([w[i].isascii(), t]):                       1,
    lambda t2, t1, w, i, t: tuple([w[i].isdecimal(), t]):                     1,
    lambda t2, t1, w, i, t: tuple([w[i].isdigit(), t]):                       1,
    lambda t2, t1, w, i, t: tuple([w[i].isnumeric(), t]):                     1,
    lambda t2, t1, w, i, t: tuple([w[i].istitle(), t]):                       1,
    lambda t2, t1, w, i, t: tuple([w[i].isupper(), t]):                       1,
    # Exact length of the current word (1, 2 or 3 characters).
    lambda t2, t1, w, i, t: tuple([len(w[i]) == 1, t]):                       1,
    lambda t2, t1, w, i, t: tuple([len(w[i]) == 2, t]):                       1,
    lambda t2, t1, w, i, t: tuple([len(w[i]) == 3, t]):                       1,
}

# Build the feature vector from the training set and time the construction.
start = time.time()
feature_vector = features.create_feature_vector(
    train_dataset,
    group_thresholds=group_thresholds,
    pruning=True,
    get_stats=False,
    assertions=True,
    calls_counter=False,
)
elapsed = time.time() - start
print(f'feature_vector creation time: {elapsed} sec\n')

# One line per feature group with its size, followed by the overall totals.
for group in feature_vector.feats:
    print(f'feat_group: {group!s} | feats: {len(group)}')
n_groups = len(feature_vector.feats)
n_feats = len(feature_vector)
print(f'feat_groups: {n_groups} | total_feats: {n_feats}')

feature_vector creation time: 2.8578293323516846 sec

feat_group: FeatureGroup(tuple([w[i].lower(), t])) | feats: 14719
feat_group: FeatureGroup(tuple([w[i][-4:].lower(), t])) | feats: 2289
feat_group: FeatureGroup(tuple([w[i][-3:].lower(), t])) | feats: 1757
feat_group: FeatureGroup(tuple([w[i][-2:].lower(), t])) | feats: 955
feat_group: FeatureGroup(tuple([w[i][-1:].lower(), t])) | feats: 248
feat_group: FeatureGroup(tuple([w[i][:4].lower(), t])) | feats: 2540
feat_group: FeatureGroup(tuple([w[i][:3].lower(), t])) | feats: 2431
feat_group: FeatureGroup(tuple([w[i][:2].lower(), t])) | feats: 1432
feat_group: FeatureGroup(tuple([w[i][:1].lower(), t])) | feats: 392
feat_group: FeatureGroup(tuple([t2, t1, t])) | feats: 5192
feat_group: FeatureGroup(tuple([t1, t])) | feats: 908
feat_group: FeatureGroup(tuple([t])) | feats: 44
feat_group: FeatureGroup(tuple([w[i].islower(), t])) | feats: 75
feat_group: FeatureGroup(tuple([any(char.isdigit() for char in w[i]), t])) | feats: 49
feat_group: F

In [5]:
# ---- run configuration ----
seed = 42  # seed for batch shuffle for loader; also seeds the initial weights below
# Initial weights are drawn from a seeded generator so that Restart & Run All
# starts training from the same point every time (uniform in [0, 1), float32).
w0 = np.random.default_rng(seed).random(len(feature_vector), dtype=np.float32)  # init weights
models_path = 'models'  # folder of models
load = False  # load last weights, log and feature_vector into model

train = True  # perform a training session
train_save = True  # save model after each training epoch, if False model will need to be saved manually
beam = 1  # viterbi beam size for model evaluation during training
train_aprox = 0  # approximate train_score using only the first train_aprox train samples
val_aprox = 50  # approximate val_score using only the first val_aprox samples
weight_decay = 0.0  # lambda, the regularization parameter
batch_size = 1024  # batch_size for loader
epochs = 20  # training epochs
tqdm_bar = False  # display tqdm progress bars
# generated model version ID
version = f"weight_decay-{weight_decay}_batch_size-{batch_size}_max_epochs-{epochs}_feats-{len(feature_vector.feats)}"

# ---- model construction ----
model = classifier.Model(version=version,
                         w0=w0,
                         tags=train_dataset.tags,
                         inference=classifier.viterbi,
                         feature_vector=feature_vector,
                         seed=seed,
                         score_func=metrics.accuracy,
                         models_path=models_path,
                         save=False)

# Optionally restore the latest saved state (epoch=-1) instead of starting from w0.
if load:
    model.load(weights=True, feature_vector=True, log=True, epoch=-1, prints=True)

# Optionally run a training session; the three returned values are kept for inspection.
if train:
    v_min, f_min, d_min = model.train(epochs=epochs,
                                      train_dataset=train_dataset,
                                      val_dataset=val_dataset,
                                      batch_size=batch_size,
                                      weight_decay=weight_decay,
                                      save=train_save,
                                      tqdm_bar=tqdm_bar,
                                      beam=beam,
                                      train_aprox=train_aprox,
                                      val_aprox=val_aprox)

model version: beam-50_train_aprox-0_val_aprox-0_weight_decay-1_batch_size-5000_max_epochs-50
epochs: 23
train_time: 160.301

last train_loss: 54.828313
last val_loss: 56.020626
last train_score: nan
last val_score: nan
best val_score: nan at epoch 0


### Manual model evaluation on a dataset

In [None]:
# ---- evaluation settings ----
aprox_num = 100  # max samples to aproximate score
predict_beam = 2  # viterbi beam size
dataset = val_dataset  # dataset to evaluate
display_all = True  # show a words / tags / preds table for every sentence

# Predictions and gold tags accumulate per sentence so the running score can
# be recomputed after each one.
pred_tags, true_tags = [], []
pbar = tqdm(dataset.sentences[:aprox_num])
for sentence in pbar:
    words, gold_tags = sentence[0], sentence[1]
    preds = model(words, predict_beam)
    pred_tags.append(preds)
    true_tags.append(gold_tags)

    if display_all:
        comparison = pd.DataFrame((words, gold_tags, preds), index=('words', 'tags', 'preds'))
        display(comparison)
    running_score = model.score_func(pred_tags, true_tags)
    pbar.set_postfix(acc=running_score, refresh=False)

# Confusion matrix over the evaluated sentences; only `worst` is displayed.
matrix, worst = metrics.confusion_matrix(train_dataset.tags, pred_tags, true_tags)
display(worst)

### Manual model evaluation on a single sample sentence

In [72]:
%%time
sample_sentence = (['Terms', 'were', "n't", 'disclosed', '.'],  # sentence words
                   ['NNS',   'VBD',  'RB',  'VBN',       '.'])  # sentence true tags
predict_beam = 100  # viterbi beam size

tags, bp_pi = viterbi(model, sample_sentence[0], beam=predict_beam)
print('sentence ', sample_sentence[0])
print('true tags', sample_sentence[1])
print('pred tags', tags)
print()

sentence  ['Terms', 'were', "n't", 'disclosed', '.']
true tags ['NNS', 'VBD', 'RB', 'VBN', '.']
pred tags ['NNP', 'NNP', 'IN', 'DT', 'NN']
CPU times: user 10.2 s, sys: 0 ns, total: 10.2 s
Wall time: 10.2 s


### Code that may be useful

In [None]:
# sentences = {}
# for sentence in train_dataset.sentences:
#     sentences[len(sentence[0])] = sentence

In [None]:
# # test run train_dataset
# tic = time.time()
# for t2, t1, w, i, t in train_dataset:
#     feat_vec_t = feature_vector(t2, t1, w, i, t, fmt='vec')
# print('fmt=vec: {:.3f} sec'.format(time.time() - tic))

# tic = time.time()
# for t2, t1, w, i, t in train_dataset:
#     feat_list_t = feature_vector(t2, t1, w, i, t, fmt='list')
# print('fmt=list: {:.3f} sec'.format(time.time() - tic))

# tic = time.time()
# for t2, t1, w, i, t in train_dataset:
#     feat_vec_t, feat_list_t = feature_vector(t2, t1, w, i, t, fmt='both')
# print('fmt=vec+list: {:.3f} sec'.format(time.time() - tic))

In [None]:
# for tag in train1_statistics.words_per_tag:
#     if len(train1_statistics.words_per_tag[tag]) < 10:
#         print('{:5} tf: {:5d} unique_count: {:4d} words: {}'.format(tag, train1_statistics.tags_count[tag], len(train1_statistics.words_per_tag[tag]),
#                                                                     train1_statistics.words_per_tag[tag]))
#     else:
#         print('{:5} tf: {:5d} unique_count: {:4d}'.format(tag, train1_statistics.tags_count[tag], len(train1_statistics.words_per_tag[tag])))

In [None]:
# features = []

# # one-to-one features
# for word in strange_words:
#     features.append(Feature(f'w[i] == "{word}"', t=train1_model.tags_per_word[word][0]))
#     print(word, train1_model.WordCount[word], train1_model.TagsPerWord[word])