In [1]:
import numpy as np
import pandas as pd
from utils import preprocess, score, features, classifier

In [2]:
# Corpus file locations, relative to the notebook's working directory,
# grouped by file extension / role.
comp1_path, comp2_path = 'data/comp1.words', 'data/comp2.words'
train1_path, train2_path = 'data/train1.wtag', 'data/train2.wtag'
test1_path = 'data/test1.wtag'

## Create FeatureGroup params

In [3]:
# Feature-group specification passed to features.create_feature_vector.
# Each key is a lambda over (t2, t1, w, i, t) returning the tuple that identifies
# one feature of the group; each value is that group's threshold (an int or None).
# NOTE(review): presumably t2/t1 are the two preceding tags, w the sentence's words,
#   i the current position and t the candidate tag - confirm against utils.features.
# NOTE(review): the threshold looks like a minimum count used for pruning, with None
#   meaning "no pruning" - confirm against utils.features.
# NOTE(review): the group names printed further down echo these lambda bodies verbatim,
#   so the library may read the lambda source text; keep the
#   `lambda ...: tuple([...]):   threshold,` layout intact when editing.
group_thresholds = {
    # tag trigram / bigram
    lambda t2, t1, w, i, t: tuple([t2, t1, t]):                               1,
    lambda t2, t1, w, i, t: tuple([t1, t]):                                   1,
    # lower-cased current word (neighbouring-word variants are disabled)
    lambda t2, t1, w, i, t: tuple([w[i].lower(), t]):                         None,
#     lambda t2, t1, w, i, t: tuple([w[i-1].lower(), t]):                       5,
#     lambda t2, t1, w, i, t: tuple([w[i+1].lower(), t]):                       5,
    # lower-cased prefixes of length 3 and 2
    lambda t2, t1, w, i, t: tuple([w[i][:3].lower(), t]):                     5,
    lambda t2, t1, w, i, t: tuple([w[i][:2].lower(), t]):                     5,
#     lambda t2, t1, w, i, t: tuple([w[i+1][:3].lower(), t]):                   5,
#     lambda t2, t1, w, i, t: tuple([w[i-1][:3].lower(), t]):                   5,
    # lower-cased suffixes of length 3 and 2
    lambda t2, t1, w, i, t: tuple([w[i][-3:].lower(), t]):                    5,
    lambda t2, t1, w, i, t: tuple([w[i][-2:].lower(), t]):                    5,
#     lambda t2, t1, w, i, t: tuple([w[i+1][-3:].lower(), t]):                  5,
#     lambda t2, t1, w, i, t: tuple([w[i-1][-3:].lower(), t]):                  5,
    # str character-class predicates on the current word (boolean paired with the tag)
    lambda t2, t1, w, i, t: tuple([w[i].isalnum(), t]):                       1,
    lambda t2, t1, w, i, t: tuple([w[i].isalpha(), t]):                       1,
    lambda t2, t1, w, i, t: tuple([w[i].isascii(), t]):                       1,
    lambda t2, t1, w, i, t: tuple([w[i].isdecimal(), t]):                     1,
    lambda t2, t1, w, i, t: tuple([w[i].isdigit(), t]):                       1,
    lambda t2, t1, w, i, t: tuple([w[i].islower(), t]):                       1,
    lambda t2, t1, w, i, t: tuple([w[i].isnumeric(), t]):                     1,
    lambda t2, t1, w, i, t: tuple([w[i].istitle(), t]):                       1,
    lambda t2, t1, w, i, t: tuple([w[i].isupper(), t]):                       1,
    # word length is exactly 1, 2 or 3 characters
    lambda t2, t1, w, i, t: tuple([len(w[i]) == 1, t]):                       1,
    lambda t2, t1, w, i, t: tuple([len(w[i]) == 2, t]):                       1,
    lambda t2, t1, w, i, t: tuple([len(w[i]) == 3, t]):                       1,
    # first character is lower-case; word contains any digit / any upper-case character
    lambda t2, t1, w, i, t: tuple([w[i][0].islower(), t]):                    1,
    lambda t2, t1, w, i, t: tuple([any(char.isdigit() for char in w[i]), t]): 1,
    lambda t2, t1, w, i, t: tuple([any(char.isupper() for char in w[i]), t]): 1,
}

In [4]:
%%time
train_dataset = preprocess.Dataset(train1_path)
val_dataset = preprocess.Dataset(test1_path)
feature_vector = features.create_feature_vector(train_dataset,
                                                group_thresholds=group_thresholds,
                                                pruning=True,
                                                get_stats=False,
                                                assertions=True,
                                                calls_counter=False)

CPU times: user 1.67 s, sys: 46.9 ms, total: 1.72 s
Wall time: 1.71 s


In [5]:
# Run configuration for the model built below.
seed = 42
models_path = 'models'
version = 1
save = True

# Initial weights: one per feature, uniform in [0, 0.05), stored as float32.
# They are drawn from a generator seeded with `seed` (instead of the unseeded
# global NumPy state) so a fresh Restart & Run All starts from the same point.
rng = np.random.default_rng(seed)
w0 = rng.random(len(feature_vector)).astype(np.float32) / 20

model = classifier.Model(version=version,
                         w0=w0,
                         tags=train_dataset.tags,
                         feature_vector=feature_vector,
                         seed=seed,
                         score_func=score.accuracy,
                         models_path=models_path,
                         save=save)

In [6]:
# Optimisation settings for this training run.
epochs = 20
batch_size = 512
# NOTE(review): the training log recorded for this cell stays at ~0.02 accuracy -
# check whether a weight decay this large is intended.
weight_decay = 1e7
tqdm_bar = False

# Keyword arguments forwarded unchanged to model.train.
train_options = dict(
    epochs=epochs,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    batch_size=batch_size,
    weight_decay=weight_decay,
    iprint=-1,
    save=True,
    tqdm_bar=tqdm_bar,
)

# The returned triple is kept so it can be inspected in later cells.
v_min, f_min, d_min = model.train(**train_options)

epoch   1/20 | train_loss 232022.011453 | val_loss 121288.541844 | train_score 0.021763 | val_score 0.0229 | train_time   0.73 min
epoch   2/20 | train_loss 146758.931201 | val_loss 77023.877246 | train_score 0.022887 | val_score 0.0210 | train_time   1.51 min
epoch   3/20 | train_loss 90.147963 | val_loss 87.094723 | train_score 0.022296 | val_score 0.0228 | train_time   2.30 min
epoch   4/20 | train_loss 91.262248 | val_loss 87.713709 | train_score 0.023060 | val_score 0.0210 | train_time   3.07 min
epoch   5/20 | train_loss 89.227516 | val_loss 87.094878 | train_score 0.022690 | val_score 0.0216 | train_time   3.81 min
epoch   6/20 | train_loss 90.369326 | val_loss 87.094878 | train_score 0.023207 | val_score 0.0217 | train_time   4.57 min
epoch   7/20 | train_loss 89.565198 | val_loss 87.094878 | train_score 0.022559 | val_score 0.0233 | train_time   5.34 min
epoch   8/20 | train_loss 90.881539 | val_loss 87.173560 | train_score 0.021976 | val_score 0.0218 | train_time   6.08 min
e

In [8]:
# Reload a stored model for `version` from `models_path` (rebinds `model`).
# The notebook-wide `seed` is reused instead of repeating the literal 42, so the
# training and reload seeds cannot drift apart.
# NOTE(review): epoch=-1 presumably selects the most recent checkpoint - confirm in utils.classifier.
model = classifier.load_model(version, models_path, epoch=-1, seed=seed, prints=True)

model version: 2
epochs: 2
train_time: 1.522

last train_loss: 149423.090295
last val_loss: 78273.272377
last train_score: 0.022485
last val_score: 0.021627
best val_accuracy: 0.0216 at epoch 1


In [9]:
# First value returned by model.train; a bare expression, so it is shown as the cell output.
v_min

array([ 1.00650830e-05,  5.20087830e-07,  5.03616629e-08, ...,
       -1.35678368e-10, -2.23178660e-11, -1.39021980e-10])

In [9]:
# Same variable rendered through display().
# NOTE(review): duplicates the preceding `v_min` cell, and the two recorded outputs
# disagree (array vs. scalar) - re-run on a fresh kernel to confirm which is current.
display(v_min)

0.00295728559167523

In [10]:
# Second value returned by model.train.
# NOTE(review): presumably the final objective value, by analogy with
# scipy.optimize.fmin_l_bfgs_b's (x, f, d) return - confirm in utils.classifier.
display(f_min)

36385.19124304004

In [11]:
# Third value returned by model.train.
# NOTE(review): the recorded keys (grad, task, funcalls, nit, warnflag) match the info
# dict of scipy.optimize.fmin_l_bfgs_b - confirm in utils.classifier.
display(d_min)

{'grad': array([ 36.0791321 , 189.732937  , 181.03639973, ..., 191.54076316,
         62.58931548,  56.02766903]),
 'task': b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT',
 'funcalls': 6,
 'nit': 1,
 'warnflag': 1}

In [5]:
# Summarise the feature vector: one line per feature group, then the overall totals.
feat_groups = feature_vector.feats
for feat in feat_groups:
    n_feats = len(feat)
    print('feat_group:', feat, '| feats:', n_feats)
n_groups, n_total = len(feat_groups), len(feature_vector)
print('feat_groups:', n_groups, '| total_feats:', n_total)

feat_group: FeatureGroup(tuple([t2, t1, t])) | feats: 5192
feat_group: FeatureGroup(tuple([t1, t])) | feats: 908
feat_group: FeatureGroup(tuple([w[i].lower(), t])) | feats: 14719
feat_group: FeatureGroup(tuple([w[i][:3].lower(), t])) | feats: 2431
feat_group: FeatureGroup(tuple([w[i][:2].lower(), t])) | feats: 1432
feat_group: FeatureGroup(tuple([w[i][-3:].lower(), t])) | feats: 1757
feat_group: FeatureGroup(tuple([w[i][-2:].lower(), t])) | feats: 955
feat_group: FeatureGroup(tuple([w[i].isalnum(), t])) | feats: 60
feat_group: FeatureGroup(tuple([w[i].isalpha(), t])) | feats: 60
feat_group: FeatureGroup(tuple([w[i].isascii(), t])) | feats: 44
feat_group: FeatureGroup(tuple([w[i].isdecimal(), t])) | feats: 45
feat_group: FeatureGroup(tuple([w[i].isdigit(), t])) | feats: 45
feat_group: FeatureGroup(tuple([w[i].islower(), t])) | feats: 75
feat_group: FeatureGroup(tuple([w[i].isnumeric(), t])) | feats: 45
feat_group: FeatureGroup(tuple([w[i].istitle(), t])) | feats: 76
feat_group: FeatureG

In [6]:
# # test run train_dataset
# tic = time.time()
# for t2, t1, w, i, t in train_dataset:
#     feat_vec_t = feature_vector(t2, t1, w, i, t, fmt='vec')
# print('fmt=vec: {:.3f} sec'.format(time.time() - tic))

# tic = time.time()
# for t2, t1, w, i, t in train_dataset:
#     feat_list_t = feature_vector(t2, t1, w, i, t, fmt='list')
# print('fmt=list: {:.3f} sec'.format(time.time() - tic))

# tic = time.time()
# for t2, t1, w, i, t in train_dataset:
#     feat_vec_t, feat_list_t = feature_vector(t2, t1, w, i, t, fmt='both')
# print('fmt=vec+list: {:.3f} sec'.format(time.time() - tic))

In [None]:
# for tag in train1_statistics.words_per_tag:
#     if len(train1_statistics.words_per_tag[tag]) < 10:
#         print('{:5} tf: {:5d} unique_count: {:4d} words: {}'.format(tag, train1_statistics.tags_count[tag], len(train1_statistics.words_per_tag[tag]),
#                                                                     train1_statistics.words_per_tag[tag]))
#     else:
#         print('{:5} tf: {:5d} unique_count: {:4d}'.format(tag, train1_statistics.tags_count[tag], len(train1_statistics.words_per_tag[tag])))

In [None]:
# features = []

# # one-to-one features
# for word in strange_words:
#     features.append(Feature(f'w[i] == "{word}"', t=train1_model.tags_per_word[word][0]))
#     print(word, train1_model.WordCount[word], train1_model.TagsPerWord[word])