In [1]:
import time
import inspect
import numpy as np
import pandas as pd
from tqdm import tqdm
from utils import preprocess, features, classifier, metrics, logging

In [2]:
# Seed NumPy's global RNG; the random w0 initialisers in this notebook draw from it.
seed = 42
np.random.seed(seed)

# NOTE(review): `logging` is the project module imported from `utils`, not the standard-library `logging`.
models_path = 'models'  # folder of models
log_path = 'log'  # folder of log.csv
logger = logging.Logger(log_path)  # init logger to log.csv
# logger.init_log()

In [3]:
def load_datasets(train_path='data/train1.wtag', val_path='data/test1.wtag'):
    """Load the training and validation datasets.

    The defaults reproduce the previous hard-coded choice (train1 / test1).
    Alternatives that used to be selected by (un)commenting lines:
      * training:   'data/train2.wtag'
      * validation: 'data/comp1_tagged.words', 'data/comp2_tagged.words'
        (these two need the competition files to be tagged first)

    Args:
        train_path: path handed to `preprocess.Dataset` for the training set.
        val_path: path handed to `preprocess.Dataset` for the validation set.

    Returns:
        Tuple `(train_dataset, val_dataset)` of `preprocess.Dataset` objects.
    """
    train_dataset = preprocess.Dataset(train_path)
    val_dataset = preprocess.Dataset(val_path)
    return train_dataset, val_dataset

def comp_dataset(tags, path='data/comp1.words'):
    """Load an unlabeled competition dataset.

    Args:
        tags: tag collection forwarded to `preprocess.Dataset` as `tags=`.
        path: file to load; defaults to 'data/comp1.words'. Pass
            'data/comp2.words' for the second competition file.

    Returns:
        The `preprocess.Dataset` built with `labeled=False`.
    """
    # No local is named `comp_dataset`, so the function name is not shadowed inside its own body.
    return preprocess.Dataset(path, labeled=False, tags=tags)

# Loaded once when the cell runs; later cells read the module-level `train_dataset` / `val_dataset`.
train_dataset, val_dataset = load_datasets()
# NOTE: uncommenting the next line rebinds the name `comp_dataset` from the function to its result,
# so the function cannot be called again until its defining cell is re-run.
# comp_dataset = comp_dataset(train_dataset.tags)

In [4]:
def create_feats(prints=True):
    """Build the feature vector for the module-level `train_dataset`.

    Each key of `group_thresholds` is a lambda taking `(t2, t1, w, i, t)` and
    returning a tuple; each value is an integer threshold. The mapping is passed
    unchanged to `features.create_feature_vector` together with `pruning=True`.

    Args:
        prints: when True, print the creation time and one line per feature group.
            The final summary line (`feat_groups: ... | total_feats: ...`) is
            printed regardless of this flag, because it sits outside the `if`.

    Returns:
        The object returned by `features.create_feature_vector`.
    """
    # Argument meaning, as far as the lambdas show: `w` is indexable by `i` and yields strings,
    # `t`/`t1`/`t2` are tag values (presumably current / previous / two-back tag — TODO confirm).
    # NOTE(review): the threshold is presumably a minimum count used for pruning — confirm in `features`.
    # NOTE(review): `w[i-1]` at i == 0 and `w[i+1]` at the last index rely on how the dataset pads
    # sentences; with a plain list, `w[-1]` wraps to the last word and `w[len(w)]` raises IndexError.
    # NOTE(review): the feature commented "has_uppercase" actually keys on `w[i].islower()`.
    group_thresholds = {
        # -------------------------------- feature --------------------- | -- Threshold --
        lambda t2, t1, w, i, t: tuple([w[i].lower(), t]):                         0,     # mandatory feature f100
        lambda t2, t1, w, i, t: tuple([w[i][-4:].lower(), t]):                    5,     # mandatory feature f101
        lambda t2, t1, w, i, t: tuple([w[i][-3:].lower(), t]):                    5,     # mandatory feature f101
        lambda t2, t1, w, i, t: tuple([w[i][-2:].lower(), t]):                    5,     # mandatory feature f101
        lambda t2, t1, w, i, t: tuple([w[i][-1:].lower(), t]):                    5,     # mandatory feature f101
        lambda t2, t1, w, i, t: tuple([w[i][:4].lower(), t]):                     5,     # mandatory feature f102
        lambda t2, t1, w, i, t: tuple([w[i][:3].lower(), t]):                     5,     # mandatory feature f102
        lambda t2, t1, w, i, t: tuple([w[i][:2].lower(), t]):                     5,     # mandatory feature f102
        lambda t2, t1, w, i, t: tuple([w[i][:1].lower(), t]):                     5,     # mandatory feature f102
        lambda t2, t1, w, i, t: tuple([t2, t1, t]):                               1,     # mandatory feature f103
        lambda t2, t1, w, i, t: tuple([t1, t]):                                   1,     # mandatory feature f104
        lambda t2, t1, w, i, t: tuple([t]):                                       1,     # mandatory feature f105
        lambda t2, t1, w, i, t: tuple([w[i].islower(), t]):                       1,     # mandatory feature has_uppercase
        lambda t2, t1, w, i, t: tuple([any(char.isdigit() for char in w[i]), t]): 1,     # mandatory feature has_digits
        lambda t2, t1, w, i, t: tuple([w[i-1].lower(), t]):                       20,
        lambda t2, t1, w, i, t: tuple([w[i+1].lower(), t]):                       20,
        lambda t2, t1, w, i, t: tuple([w[i+1][:3].lower(), t]):                   20,
        lambda t2, t1, w, i, t: tuple([w[i-1][:3].lower(), t]):                   20,
        lambda t2, t1, w, i, t: tuple([w[i+1][:2].lower(), t]):                   20,
        lambda t2, t1, w, i, t: tuple([w[i-1][:2].lower(), t]):                   20,
        lambda t2, t1, w, i, t: tuple([w[i+1][-3:].lower(), t]):                  20,
        lambda t2, t1, w, i, t: tuple([w[i-1][-3:].lower(), t]):                  20,
        lambda t2, t1, w, i, t: tuple([w[i+1][-2:].lower(), t]):                  20,
        lambda t2, t1, w, i, t: tuple([w[i-1][-2:].lower(), t]):                  20,
        lambda t2, t1, w, i, t: tuple([w[i].isalnum(), t]):                       1,
        lambda t2, t1, w, i, t: tuple([w[i].isalpha(), t]):                       1,
        lambda t2, t1, w, i, t: tuple([w[i].isascii(), t]):                       1,
        lambda t2, t1, w, i, t: tuple([w[i].isdecimal(), t]):                     1,
        lambda t2, t1, w, i, t: tuple([w[i].isdigit(), t]):                       1,
        lambda t2, t1, w, i, t: tuple([w[i].isnumeric(), t]):                     1,
        lambda t2, t1, w, i, t: tuple([w[i].istitle(), t]):                       1,
        lambda t2, t1, w, i, t: tuple([w[i].isupper(), t]):                       1,
        lambda t2, t1, w, i, t: tuple([len(w[i]) == 1, t]):                       1,
        lambda t2, t1, w, i, t: tuple([len(w[i]) == 2, t]):                       1,
        lambda t2, t1, w, i, t: tuple([len(w[i]) == 3, t]):                       1,
    }

    # Time only the feature-vector construction.
    tic = time.time()
    feature_vector = features.create_feature_vector(dataset=train_dataset,
                                                    group_thresholds=group_thresholds,
                                                    pruning=True,
                                                    get_stats=False,
                                                    assertions=True,
                                                    calls_counter=False)

    if prints:
        print('feature_vector creation time:', time.time() - tic, 'sec\n')
        for feat in feature_vector.feats:
            print('feat_group:', feat, '| feats:', len(feat))
    # Summary is printed unconditionally (not gated by `prints`).
    print('feat_groups:', len(feature_vector.feats), '| total_feats:', len(feature_vector))
    return feature_vector

# Built once when the cell runs; the w0 initialisers and later cells read this module-level name.
feature_vector = create_feats(prints=False)

feat_groups: 35 | total_feats: 42783


In [5]:
def w0_uniform_0_1_centered_normalized():
    """Initial weights: uniform [0, 1) samples, shifted to zero mean, then divided by their std.

    One weight is drawn per entry of the module-level `feature_vector`.
    """
    samples = np.random.rand(len(feature_vector))
    centered = samples - samples.mean()
    return centered / centered.std()

def w0_uniform_0_1_normalized():
    """Initial weights: uniform [0, 1) samples divided by their std (mean is not removed).

    One weight is drawn per entry of the module-level `feature_vector`.
    """
    samples = np.random.rand(len(feature_vector))
    return samples / samples.std()

def w0_uniform_0_1_centered():
    """Initial weights: uniform [0, 1) samples with their mean subtracted (no rescaling).

    One weight is drawn per entry of the module-level `feature_vector`.
    """
    samples = np.random.rand(len(feature_vector))
    return samples - samples.mean()

def w0_uniform_0_1():
    """Initial weights: raw uniform [0, 1) samples, one per entry of the module-level `feature_vector`."""
    n_feats = len(feature_vector)
    return np.random.rand(n_feats)

def w0_xavier():
    """Initial weights: standard-normal samples scaled by sqrt(1 / n), n = len(feature_vector)."""
    n_feats = len(feature_vector)
    samples = np.random.randn(n_feats)
    return samples * np.sqrt(1 / n_feats)

def w0_zero():
    """Initial weights: all zeros as float32, one per entry of the module-level `feature_vector`."""
    return np.zeros(len(feature_vector), dtype=np.float32)

In [9]:
# Restore a saved model from a checkpoint file and bind it to the module-level `model`.
# NOTE(review): this cell's execution count (9) is out of sequence with its neighbours, and the
# training cell also assigns `model`; which object later cells see depends on run order.
model = classifier.load_model(from_file='checkpoint_V11_E054_SEED42.pth')

model version: 11
epochs: 54
train_time: 319.899

last train_loss: 4.146261
last val_loss: 4.894692
last train_score: 0.000000
last val_score: 0.000000
best val_score: 0.0000 at epoch 54


In [None]:
# Train one fresh model per entry of `experiments` and log each run.
init_w0 = w0_uniform_0_1  # initial-weights factory, called once per experiment
versions = 3  # experiment number i gets model version `versions + i + 1`
load = False  # load last weights, log and feature_vector into model (NOTE: not read anywhere in this cell)

train = True  # perform a training session (NOTE: not read anywhere in this cell; training always runs)
train_save = True  # save model after each training epoch, if False model will need to be saved manually
beam = 1  # viterbi beam size for model evaluation during training
train_aprox = 0  # approximate train_score with first train_aprox train samples
val_aprox = 50  # approximate val_score with first val_aprox samples (presumably validation samples — confirm in model.train)
# weight_decay (lambda regularization parameter), batch_size and epochs are set per experiment below
tqdm_bar = False  # display tqdm progress bars

# Sweep over the regularization strength only; batch size and epoch count are held fixed.
experiments = [
    {'weight_decay': 0.0, 'batch_size': 256, 'epochs': 15},
    {'weight_decay': 1e-4, 'batch_size': 256, 'epochs': 15},
    {'weight_decay': 1e-2, 'batch_size': 256, 'epochs': 15},
    {'weight_decay': 1e-1, 'batch_size': 256, 'epochs': 15},
    {'weight_decay': 1, 'batch_size': 256, 'epochs': 15},
    {'weight_decay': 10, 'batch_size': 256, 'epochs': 15},
    {'weight_decay': 100, 'batch_size': 256, 'epochs': 15},
]

for i, experiment in enumerate(experiments):
    # A new model with freshly drawn initial weights for every experiment.
    model = classifier.Model(version=versions + i + 1,
                             w0=init_w0(),
                             tags=train_dataset.tags,
                             inference=classifier.viterbi,
                             feature_vector=feature_vector,
                             score_func=metrics.accuracy,
                             models_path=models_path,
                             save=False)

    v_min, f_min, d_min = model.train(epochs=experiment['epochs'],
                                      train_dataset=train_dataset,
                                      val_dataset=val_dataset,
                                      batch_size=experiment['batch_size'],
                                      weight_decay=experiment['weight_decay'],
                                      save=train_save,
                                      tqdm_bar=tqdm_bar,
                                      beam=beam,
                                      train_aprox=train_aprox,
                                      val_aprox=val_aprox)

    # Use the function's own name rather than string-splitting its source text: same result for a
    # plain `def`, but it does not break if the definition gains a decorator or leading comment.
    description = (f"{init_w0.__name__}, "
                   f"{len(feature_vector.feats)} feat_groups, "
                   f"{len(feature_vector)} total_feats")

    logger.log(model, init_w0, create_feats, load_datasets, description)

In [None]:
# Fetch the leaderboard table from the logger and render it.
# NOTE(review): `col=''` is passed as-is; its meaning is defined by `logger.leadboard` — confirm there.
leadboard = logger.leadboard(col='')
display(leadboard)
# print(leadboard['init'][0])

In [None]:
# Evaluate `model` on the first `aprox_num` validation sentences per beam size, then save the model.
aprox_num = 1000  # max samples to aproximate score
display_all = False  # when True, render a words/tags/preds table for every sentence
beam_stats = {}

for predict_beam in [1]:  # viterbi beam size
    # Per-beam record; `stats` is the same dict object that is stored in `beam_stats`.
    stats = {'pred_tags': [], 'true_tags': []}
    beam_stats[predict_beam] = stats

    pbar = tqdm(val_dataset.sentences[:aprox_num])
    for sentence in pbar:
        preds = model(sentence[0], predict_beam)
        stats['pred_tags'].append(preds)
        stats['true_tags'].append(sentence[1])

        if display_all:
            display(pd.DataFrame((sentence[0], sentence[1], preds), index=('words', 'tags', 'preds')))

        # Score over everything predicted so far, shown on the progress bar.
        pbar.set_postfix(acc=model.score_func(stats['pred_tags'], stats['true_tags']),
                         refresh=False)

    stats['matrix'], stats['worst'] = metrics.confusion_matrix(train_dataset.tags,
                                                               stats['pred_tags'],
                                                               stats['true_tags'])
#     display(worst)

# Attach the collected statistics to the model before saving it.
model.beam_stats = beam_stats
model.save(epoch=True, best=True)
# model.val_predictions = pred_tags
# model.val_sentences = val_dataset.sentences
# model.save()

 14%|█▍        | 145/1000 [03:26<19:33,  1.37s/it, acc=0.93] 

In [23]:
# Evaluate `model` on the first `aprox_num` validation sentences for beam sizes 1, 2 and 3.
aprox_num = 1000  # max samples to aproximate score
display_all = False  # when True, render a words/tags/preds table for every sentence
beam_stats = {}

for predict_beam in [1, 2, 3]:
    # Per-beam record; `stats` is the same dict object that is stored in `beam_stats`.
    stats = {'pred_tags': [], 'true_tags': []}
    beam_stats[predict_beam] = stats

    pbar = tqdm(val_dataset.sentences[:aprox_num])
    for sentence in pbar:
        preds = model(sentence[0], predict_beam)
        stats['pred_tags'].append(preds)
        stats['true_tags'].append(sentence[1])

        if display_all:
            display(pd.DataFrame((sentence[0], sentence[1], preds), index=('words', 'tags', 'preds')))

        # Score over everything predicted so far, shown on the progress bar.
        pbar.set_postfix(acc=model.score_func(stats['pred_tags'], stats['true_tags']),
                         refresh=False)

    stats['matrix'], stats['worst'] = metrics.confusion_matrix(train_dataset.tags,
                                                               stats['pred_tags'],
                                                               stats['true_tags'])
#     display(worst)

# model.val_predictions = pred_tags
# model.val_sentences = val_dataset.sentences
# model.save()

100%|██████████| 1000/1000 [29:43<00:00,  1.78s/it, acc=0.906]
100%|██████████| 1000/1000 [57:03<00:00,  3.42s/it, acc=0.907] 
100%|██████████| 1000/1000 [1:25:07<00:00,  5.11s/it, acc=0.907]


In [26]:
# Attach whatever `beam_stats` currently holds to the model, then save it with the default arguments.
model.beam_stats = beam_stats
model.save()

In [None]:
# `matrix1` / `matrix2` are not defined anywhere in this notebook, so this cell raised NameError on a
# fresh kernel. Show the confusion matrices stored in `beam_stats` for beam sizes 1 and 2 instead.
# NOTE(review): assumes the numeric suffix referred to the beam size — confirm.
display(beam_stats[1]['matrix'])
display(beam_stats[2]['matrix'])

In [None]:
# `worst1` / `worst2` are not defined anywhere in this notebook, so this cell raised NameError on a
# fresh kernel. Show the 'worst' results stored in `beam_stats` for beam sizes 1 and 2 instead.
# NOTE(review): assumes the numeric suffix referred to the beam size — confirm.
display(beam_stats[1]['worst'])
display(beam_stats[2]['worst'])

### Code that may be useful

In [None]:
# sentences = {}
# for sentence in train_dataset.sentences:
#     sentences[len(sentence[0])] = sentence

In [None]:
# %%time
# sample_sentence = (['Terms', 'were', "n't", 'disclosed', '.'],  # sentence words
#                    ['NNS',   'VBD',  'RB',  'VBN',       '.'])  # sentence true tags
# predict_beam = 100  # viterbi beam size

# tags, bp_pi = viterbi(model, sample_sentence[0], beam=predict_beam)
# print('sentence ', sample_sentence[0])
# print('true tags', sample_sentence[1])
# print('pred tags', tags)
# print()

In [7]:
%%time
sum_vec = np.zeros(len(feature_vector)).astype(np.float32)
sum_inds = 0
for t2, t1, w, i, t in train_dataset:
    vec = feature_vector(t2, t1, w, i, t, fmt='vec')
    sum_vec += vec
    sum_inds += (vec).sum()

df = pd.DataFrame(sum_vec, columns=['feat']).astype({'feat': int}).sort_values('feat', ascending=False)
print(sum_vec.sum())
print(sum_inds/len(train_dataset.sentences))
display(df.head(50))

3802186.0
760.4372


Unnamed: 0,feat
32871,16939
42333,16939
42378,16939
42289,16939
42423,16939
...,...
7471,1
7472,1
13388,1
7476,1


CPU times: user 28.2 s, sys: 15.6 ms, total: 28.3 s
Wall time: 28.8 s


In [13]:
# NOTE(review): `df` was sorted by 'feat', so its index is no longer monotonic. `.loc[0:100]` is a label
# slice running from the row labelled 0 to the row labelled 100 in that sorted order — it is not
# "feature ids 0..100". Confirm this is the intended selection.
display(df.loc[0:100].head(50))

Unnamed: 0,feat
0,6199
17008,6199
19968,6199
42480,6044
24990,6044
19758,6044
20012,6044
56,6044
26421,6044
42299,6044


In [15]:
# Map a feature index back to the feature group and key it belongs to, then print both.
# The trailing numbers are presumably other indices that were tried — confirm or drop.
feat, key = feature_vector.invert_feat(4914)  # 41453 22811
print(feat)
print(key)


FeatureGroup(tuple([w[i].lower(), t]))
('low-density', 'NN')


In [None]:
# # test run train_dataset
# tic = time.time()
# for t2, t1, w, i, t in train_dataset:
#     feat_vec_t = feature_vector(t2, t1, w, i, t, fmt='vec')
# print('fmt=vec: {:.3f} sec'.format(time.time() - tic))

# tic = time.time()
# for t2, t1, w, i, t in train_dataset:
#     feat_list_t = feature_vector(t2, t1, w, i, t, fmt='list')
# print('fmt=list: {:.3f} sec'.format(time.time() - tic))

# tic = time.time()
# for t2, t1, w, i, t in train_dataset:
#     feat_vec_t, feat_list_t = feature_vector(t2, t1, w, i, t, fmt='both')
# print('fmt=vec+list: {:.3f} sec'.format(time.time() - tic))

In [None]:
# for tag in train1_statistics.words_per_tag:
#     if len(train1_statistics.words_per_tag[tag]) < 10:
#         print('{:5} tf: {:5d} unique_count: {:4d} words: {}'.format(tag, train1_statistics.tags_count[tag], len(train1_statistics.words_per_tag[tag]),
#                                                                     train1_statistics.words_per_tag[tag]))
#     else:
#         print('{:5} tf: {:5d} unique_count: {:4d}'.format(tag, train1_statistics.tags_count[tag], len(train1_statistics.words_per_tag[tag])))

In [None]:
# features = []

# # one-to-one features
# for word in strange_words:
#     features.append(Feature(f'w[i] == "{word}"', t=train1_model.tags_per_word[word][0]))
#     print(word, train1_model.WordCount[word], train1_model.TagsPerWord[word])