In [1]:
import numpy as np
import pandas as pd
import math
import time
import random
from scipy import optimize
from utils import preprocess, score, features, classifier

In [2]:
# Persistence parameters (possibly obsolete - candidates for removal)
model_name, model_matrices, model_preprocess = (
    'model.pickle',
    'model_matrices.pickle',
    'model_preprocess.pickle',
)
load_model = load_matrices = load_preprocess = False
verbose = 1

# Tunable variables
limit_common_words, threshold = 5, 0
args = None
lamda = 0.1  # presumably spelled without the "b" because `lambda` is a Python keyword

# Data files (paths relative to the notebook's working directory)
comp1_path, comp2_path = 'data/comp1.words', 'data/comp2.words'
test1_path = 'data/test1.wtag'
train1_path, train2_path = 'data/train1.wtag', 'data/train2.wtag'

## Create FeatureGroup params

In [4]:
# Mapping: feature-group key function -> threshold.
# Every key is a lambda taking (t2, t1, w, i, t) and returning a tuple; the
# mapping is handed to features.create_feature_vector(group_thresholds=...).
# Presumably t2/t1 are the two previous tags, w the sentence's word list,
# i the current position and t the current tag - TODO confirm against utils.
# The value is an int or None; presumably a minimum-count pruning threshold,
# with None meaning "no pruning" - TODO confirm in utils.features.
# NOTE(review): the printed FeatureGroup repr echoes the lambda body text, so
# the lambdas' source text may be significant - do not reformat them lightly.
group_thresholds = {
    # --- tag context ---
    lambda t2, t1, w, i, t: tuple([t2, t1, t]):                               1,
    lambda t2, t1, w, i, t: tuple([t1, t]):                                   1,
    # --- lower-cased word identity ---
    lambda t2, t1, w, i, t: tuple([w[i].lower(), t]):                         None,
#     lambda t2, t1, w, i, t: tuple([w[i-1].lower(), t]):                       5,
#     lambda t2, t1, w, i, t: tuple([w[i+1].lower(), t]):                       5,
    # --- lower-cased prefixes (first 3 / first 2 characters) ---
    lambda t2, t1, w, i, t: tuple([w[i][:3].lower(), t]):                     5,
    lambda t2, t1, w, i, t: tuple([w[i][:2].lower(), t]):                     5,
#     lambda t2, t1, w, i, t: tuple([w[i+1][:3].lower(), t]):                   5,
#     lambda t2, t1, w, i, t: tuple([w[i-1][:3].lower(), t]):                   5,
    # --- lower-cased suffixes (last 3 / last 2 characters) ---
    lambda t2, t1, w, i, t: tuple([w[i][-3:].lower(), t]):                    5,
    lambda t2, t1, w, i, t: tuple([w[i][-2:].lower(), t]):                    5,
#     lambda t2, t1, w, i, t: tuple([w[i+1][-3:].lower(), t]):                  5,
#     lambda t2, t1, w, i, t: tuple([w[i-1][-3:].lower(), t]):                  5,
    # --- boolean str predicates on the current word ---
    lambda t2, t1, w, i, t: tuple([w[i].isalnum(), t]):                       1,
    lambda t2, t1, w, i, t: tuple([w[i].isalpha(), t]):                       1,
    # str.isascii() exists only on Python 3.7+
    lambda t2, t1, w, i, t: tuple([w[i].isascii(), t]):                       1,
    lambda t2, t1, w, i, t: tuple([w[i].isdecimal(), t]):                     1,
    lambda t2, t1, w, i, t: tuple([w[i].isdigit(), t]):                       1,
    lambda t2, t1, w, i, t: tuple([w[i].islower(), t]):                       1,
    lambda t2, t1, w, i, t: tuple([w[i].isnumeric(), t]):                     1,
    lambda t2, t1, w, i, t: tuple([w[i].istitle(), t]):                       1,
    lambda t2, t1, w, i, t: tuple([w[i].isupper(), t]):                       1,
    # --- word length exactly 1, 2 or 3 ---
    lambda t2, t1, w, i, t: tuple([len(w[i]) == 1, t]):                       1,
    lambda t2, t1, w, i, t: tuple([len(w[i]) == 2, t]):                       1,
    lambda t2, t1, w, i, t: tuple([len(w[i]) == 3, t]):                       1,
    # --- character-level checks ---
    # w[i][0] raises IndexError if the word is an empty string
    lambda t2, t1, w, i, t: tuple([w[i][0].islower(), t]):                    1,
    lambda t2, t1, w, i, t: tuple([any(char.isdigit() for char in w[i]), t]): 1,
    lambda t2, t1, w, i, t: tuple([any(char.isupper() for char in w[i]), t]): 1,
}

In [5]:
%%time
dataset = preprocess.Dataset(train1_path)
feature_vector = features.create_feature_vector(dataset, group_thresholds=group_thresholds, pruning=True, get_stats=False, assertions=True)

CPU times: user 3.77 s, sys: 31.2 ms, total: 3.8 s
Wall time: 3.8 s


In [6]:
# Report the size of every feature group, then the overall totals.
# The loop variable keeps the name `feat`: it remains bound in the kernel after
# the loop and a later cell calls `feat(...)`.
feat_groups = feature_vector.feats
for feat in feat_groups:
    group_size = len(feat)
    print('feat_group:', feat, '| feats:', group_size)
print('feat_groups:', len(feat_groups), '| total_feats:', len(feature_vector))

feat_group: FeatureGroup(tuple([t2, t1, t])) | feats: 5192
feat_group: FeatureGroup(tuple([t1, t])) | feats: 908
feat_group: FeatureGroup(tuple([w[i].lower(), t])) | feats: 14719
feat_group: FeatureGroup(tuple([w[i][:3].lower(), t])) | feats: 2431
feat_group: FeatureGroup(tuple([w[i][:2].lower(), t])) | feats: 1432
feat_group: FeatureGroup(tuple([w[i][-3:].lower(), t])) | feats: 1757
feat_group: FeatureGroup(tuple([w[i][-2:].lower(), t])) | feats: 955
feat_group: FeatureGroup(tuple([w[i].isalnum(), t])) | feats: 60
feat_group: FeatureGroup(tuple([w[i].isalpha(), t])) | feats: 60
feat_group: FeatureGroup(tuple([w[i].isascii(), t])) | feats: 44
feat_group: FeatureGroup(tuple([w[i].isdecimal(), t])) | feats: 45
feat_group: FeatureGroup(tuple([w[i].isdigit(), t])) | feats: 45
feat_group: FeatureGroup(tuple([w[i].islower(), t])) | feats: 75
feat_group: FeatureGroup(tuple([w[i].isnumeric(), t])) | feats: 45
feat_group: FeatureGroup(tuple([w[i].istitle(), t])) | feats: 76
feat_group: FeatureG

In [7]:
# test run dataset: time one full pass over the dataset for each output
# format of feature_vector ('vec', 'list', 'both').
# time.perf_counter() replaces time.time(): it is monotonic and uses the
# highest-resolution clock available, so the measured interval cannot be
# distorted by system wall-clock adjustments.
tic = time.perf_counter()
for t2, t1, w, i, t in dataset:
    feat_vec_t = feature_vector(t2, t1, w, i, t, fmt='vec')
print('fmt=vec: {:.3f} sec'.format(time.perf_counter() - tic))

tic = time.perf_counter()
for t2, t1, w, i, t in dataset:
    feat_list_t = feature_vector(t2, t1, w, i, t, fmt='list')
print('fmt=list: {:.3f} sec'.format(time.perf_counter() - tic))

tic = time.perf_counter()
for t2, t1, w, i, t in dataset:
    # fmt='both' yields a pair that is unpacked into the two names used above
    feat_vec_t, feat_list_t = feature_vector(t2, t1, w, i, t, fmt='both')
print('fmt=vec+list: {:.3f} sec'.format(time.perf_counter() - tic))

fmt=vec: 6.546 sec
fmt=list: 4.867 sec
fmt=vec+list: 7.496 sec


In [7]:
# Run settings for the model.
seed = 42
models_path = None
save = False

# Initial weights: uniform in [0, 1/30), one per feature, stored as float32.
# They are drawn from a generator seeded with `seed` (rather than the unseeded
# global NumPy RNG) so that Restart & Run All reproduces the same start point.
w0 = np.random.RandomState(seed).rand(len(feature_vector)).astype(np.float32)/30

model = classifier.Model(w0=w0,
                         tags=dataset.tags,
                         feature_vector=feature_vector,
                         seed=seed,
                         score_func=score.accuracy,
                         models_path=models_path,
                         save=save)

In [8]:
# Train for a single epoch on the training dataset, without a validation set.
# NOTE(review): the output saved with this cell ends in
# "OverflowError: math range error" - the run did not complete; the cause lies
# inside model.train and is not visible from this notebook.
model.train(
    epochs=1,
    train_dataset=dataset,
    val_dataset=None,
    batch_size=256,
    weight_decay=0.0,
    iprint=-1,
    save=False,
    tqdm_bar=True,
)

 96%|█████████▌| 5967/6236 [00:13<00:00, 447.87it/s]
  0%|          | 1/6236 [00:00<00:21, 284.28it/s]


OverflowError: math range error

In [19]:
# Random float32 parameter vector (one entry per feature), scaled down by 30,
# plus the regularisation strength used by the loss_and_grad experiments.
v = np.random.rand(len(feature_vector)).astype(np.float32)
v = v / 30
weight_decay = 100.0

In [8]:
# def foo(x):
#     return x**2, 2*x

# x0 = 100
# v_min, f_min, d_min = optimize.fmin_l_bfgs_b(func=foo, x0=x0, maxiter=10, iprint=100)
# print(v_min, f_min, d_min)

In [11]:
# test run loss_and_grad (batch_size=None)
# tic = time.time()
# With loss_only=False the call yields a (loss, grad) pair, so unpack both
# instead of binding the whole pair to `loss`.
loss, grad = classifier.loss_and_grad(v, dataset, feature_vector, weight_decay, loss_only=False, batch_size=None, shuffle=True, seed=42, tqdm_bar=True)
# print('loss={:.5f}, grad={:.5f}: {:.3f} sec'.format(loss, grad, time.time() - tic))

100%|██████████| 121815/121815 [05:07<00:00, 396.46it/s]


In [16]:
# Same evaluation as the full-batch test, but with mini-batches of 128.
loss, grad = classifier.loss_and_grad(
    v,
    dataset,
    feature_vector,
    weight_decay,
    loss_only=False,
    batch_size=128,
    shuffle=True,
    seed=42,
    tqdm_bar=True,
)

3288it [00:08, 395.85it/s]                          


In [17]:
# Inspect the current value of `loss` (bare expression -> shown as cell output).
loss

462420.57053219556

In [18]:
# Inspect the current value of `grad` (bare expression -> shown as cell output).
grad

array([-3.4995125e+01, -5.9916358e+00,  1.0795711e-03, ...,
        1.8605659e-02,  6.9600726e-03,  3.2265317e-02], dtype=float32)

In [25]:
%%time
v = np.random.rand(len(feature_vector)).astype(np.float32)/30
weight_decay = 1e6
v_min, f_min, d_min = optimize.fmin_l_bfgs_b(func=classifier.loss_and_grad,
                                             x0=v,
                                             args=(dataset,
                                                   feature_vector,
                                                   weight_decay,
                                                   False, # loss_only
                                                   None,  # batch_size
                                                   True, # shuffle
                                                   42, # seed
                                                   True), # tqdm_bar
                                             maxiter=5,
                                             iprint=-1)

100%|██████████| 121815/121815 [04:48<00:00, 422.06it/s]
100%|██████████| 121815/121815 [04:48<00:00, 422.59it/s]
100%|██████████| 121815/121815 [07:52<00:00, 258.06it/s]
100%|██████████| 121815/121815 [10:24<00:00, 194.93it/s]
100%|██████████| 121815/121815 [10:16<00:00, 197.57it/s]
100%|██████████| 121815/121815 [11:32<00:00, 175.96it/s]
100%|██████████| 121815/121815 [12:26<00:00, 163.20it/s]
100%|██████████| 121815/121815 [16:12<00:00, 125.31it/s]
100%|██████████| 121815/121815 [06:22<00:00, 318.06it/s]

CPU times: user 1h 24min 14s, sys: 52.4 s, total: 1h 25min 6s
Wall time: 1h 24min 44s





In [22]:
# Inspect `v_min`, the first value returned by optimize.fmin_l_bfgs_b.
v_min

array([0.01296869, 0.0020813 , 0.00721218, ..., 0.00198896, 0.01266094,
       0.01284344])

In [23]:
# Inspect `f_min`, the second value returned by optimize.fmin_l_bfgs_b.
f_min

393381.3427747049

In [None]:
            # NOTE(review): this cell has never been executed (In [None]). It
            # references `self` and a bare `loss_and_grad`, neither of which is
            # defined by any visible cell - presumably a snippet pasted from a
            # class method; confirm, then complete or remove it.
            loss_and_grad(v=self.weights,
                          dataset=dataset,
                          feature_vector=self.feature_vector,
                          weight_decay=weight_decay,
                          loss_only=True,
                          batch_size=batch_size,
                          shuffle=False,
                          seed=self.seed,
                          tqdm_bar=tqdm_bar)


In [7]:
# Inspect `loss` again.
# NOTE(review): the saved output differs in sign and magnitude from the earlier
# display of `loss`; presumably it comes from a different run - verify.
loss

-23659955.01495881

In [8]:
# Inspect `grad` again.
# NOTE(review): the saved output differs from the earlier display of `grad`;
# presumably it comes from a different run - verify.
grad

array([1134.,  137.,   37., ...,    0.,    0.,    0.], dtype=float32)

In [3]:
# Scratch check: pair up two independent 1..9 lists element by element.
list1 = list(range(1, 10))
list2 = list(list1)
list3 = [pair for pair in zip(list1, list2)]
list3

[(1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (8, 8), (9, 9)]

In [4]:
import random

In [12]:
# Scratch check: draw one element of list1 at random (unseeded, so the value
# shown changes from run to run).
random.choice(list1)

5

In [60]:
# Scratch check: rebuild the 1..9 list, record its length, shuffle it in place
# (unseeded) and display the resulting order.
list1 = list(range(1, 10))
n = len(list1)
random.shuffle(list1)
list1

[1, 7, 6, 2, 5, 9, 4, 8, 3]

In [61]:
# Scratch check: copy the first n items of list1 into `pop`, then remove that
# prefix from list1 in place and print both lists.
pop = list1[:n]
list1[:n] = []
print('list1', list1)
print('pop', pop)

list1 []
pop [1, 7, 6, 2, 5, 9, 4, 8, 3]


In [22]:
# Inspect `pop`.
# NOTE(review): the saved output has 2 items whereas the preceding cell printed
# 9; presumably this output is from an earlier run - verify.
pop

[9, 1]

In [23]:
# Inspect `list1`.
# NOTE(review): the saved output is non-empty whereas the preceding cell
# printed an empty list1; presumably this output is from an earlier run - verify.
list1

[5, 3, 6, 4, 7, 8, 2]

In [5]:
# %%time
# for ix in dataset:
#     for h in dataset.sentence(ix):
#         _ = feature_vector(*h)

In [9]:
# for tag in train1_statistics.words_per_tag:
#     if len(train1_statistics.words_per_tag[tag]) < 10:
#         print('{:5} tf: {:5d} unique_count: {:4d} words: {}'.format(tag, train1_statistics.tags_count[tag], len(train1_statistics.words_per_tag[tag]),
#                                                                     train1_statistics.words_per_tag[tag]))
#     else:
#         print('{:5} tf: {:5d} unique_count: {:4d}'.format(tag, train1_statistics.tags_count[tag], len(train1_statistics.words_per_tag[tag])))

In [10]:
# features = []

# # one-to-one features
# for word in strange_words:
#     features.append(Feature(f'w[i] == "{word}"', t=train1_model.tags_per_word[word][0]))
#     print(word, train1_model.WordCount[word], train1_model.TagsPerWord[word])

In [7]:
# Probe a single feature callable on a synthetic history: a 200-word sentence
# made of the same token, evaluated at position 100.
# NOTE(review): `feat` is not defined in this cell; presumably it is the value
# left over from the feature-group loop earlier in the notebook - confirm.
t2, t1, t = 'NN', 'VB', 'NN'
w = ['preprocessing'] * 200
i = 100
print(feat(t2, t1, w, i, t))

None
