In [1]:
%cd supervised
%load_ext autoreload
%autoreload 2

import data
import train
import evaluate
import models
import torch

/home/maarten/Documents/masterthesis/supervised


# Training and evaluation

In [3]:
def load_dataset(num_clusters=15, window_size=5, old_test=False):
    """Build the ``data.GermanDataset`` splits used by the experiments below.

    Args:
        num_clusters: Selects the ``../clustered_data/<num_clusters>/`` and
            ``../clustered_vgmm/<num_clusters>/`` input directories and is
            forwarded to ``data.GermanDataset``.
        window_size: Forwarded to ``data.GermanDataset``; also determines the
            ``window_label`` value computed at the top of the function.
        old_test: If true, the ``*162.xml`` files become a separate test set.
            Otherwise they are merged into the training files.

    Returns:
        ``(dataset, validset, testset)`` when ``old_test`` is true, otherwise
        ``(dataset, validset)``.
    """
    # window_label: 0 for a window of one, half the window for even sizes and
    # half + 1 for odd sizes (presumably the labelled position — TODO confirm).
    half_window = window_size // 2
    if window_size == 1:
        window_label = 0
    elif window_size % 2:
        window_label = half_window + 1
    else:
        window_label = half_window

    train_ids = [1, 2, 3, 4, 5, 6, 210, 211]
    valid_ids = [7, 209]
    test_ids = [14, 15, 16]

    train_data = [f'../clustered_data/{num_clusters}/18{i:03d}.xml' for i in train_ids]
    valid_data = [f'../clustered_data/{num_clusters}/18{i:03d}.xml' for i in valid_ids]
    test_data = [f'../clustered_data/{num_clusters}/{i}162.xml' for i in test_ids]

    train_vgmm = [f'../clustered_vgmm/{num_clusters}/18{i:03d}.xml' for i in train_ids]
    valid_vgmm = [f'../clustered_vgmm/{num_clusters}/18{i:03d}.xml' for i in valid_ids]
    test_vgmm = [f'../clustered_vgmm/{num_clusters}/{i}162.xml' for i in test_ids]

    def make_split(data_paths, vgmm_paths, amount, **extra):
        # All splits share every argument except the file lists, the fourth
        # positional argument and the optional vocab.
        return data.GermanDataset(data_paths, vgmm_paths, num_clusters, amount,
                                  window_size, window_label,
                                  char_tokens=True, **extra)

    # The vocabulary is taken from a dataset built over every file
    # (train + validation + test) so that all splits share it.
    vocab = make_split(train_data + valid_data + test_data,
                       train_vgmm + valid_vgmm + test_vgmm,
                       -1).vocab

    if old_test:
        dataset = make_split(train_data, train_vgmm, 1.0, vocab=vocab)
        testset = make_split(test_data, test_vgmm, 1.0, vocab=vocab)
    else:
        dataset = make_split(train_data + test_data, train_vgmm + test_vgmm, 1.0, vocab=vocab)
    validset = make_split(valid_data, valid_vgmm, 1.0, vocab=vocab)

    return (dataset, validset, testset) if old_test else (dataset, validset)

In [None]:
# Main test: cross-validate the baseline and the k-means cluster-label model
# for a range of training-set sizes. No separate test set is passed.

num_clusters = 9
window_size = 2
dataset, validset = load_dataset(num_clusters=num_clusters, window_size=window_size)

params = train.CNNParams(embed_size=100, dropout=0.5, epochs=100,
                         filters=[(33, 3), (33, 5), (33, 7)],
                         num_layers=1, max_norm=3)


def optim_fn(p):
    """Build an Adadelta optimizer (default settings) for the parameters ``p``."""
    return torch.optim.Adadelta(p)


# The factories look up params, num_clusters and window_size in the notebook
# namespace at call time, not when this list is created.
# A third factory (same CategoricalClusterLabels call, for the gmm results) is
# currently disabled; re-enabling it also needs a third flag and the gmm[n]
# assignment in the loop below.
model_fns = [
    lambda r: models.NoClusterLabels(r, params.dropout),
    lambda r: models.CategoricalClusterLabels(r, num_clusters, window_size, params.dropout),
]

baseline, kmeans, gmm = {}, {}, {}

for n in (50, 100, 400, 800, 1200, 1600, 2000, 2400):
    values = evaluate.cross_val(10, n, model_fns, [False, False], optim_fn, dataset, params,
                                early_stopping=10, validation_set=validset, testset=None)
    # Each entry of `values` is indexed by position in model_fns.
    baseline[n] = [entry[0] for entry in values]
    kmeans[n] = [entry[1] for entry in values]
    # gmm[n] = [entry[2] for entry in values]  (disabled together with the third model)

In [None]:
# Regroup the results as {training size: {model name: results}} and hand them
# to the analysis helpers (the 'Mixture model' entry is currently disabled).
# NOTE(review): the output stem says "window4_oldtest", while the "main test"
# cell above sets window_size = 2 and passes no test set. Confirm which run
# filled `baseline`/`kmeans` before regenerating these files.
d = {
    size: {'Baseline': baseline_results, 'K-Means': kmeans[size]}
    for size, baseline_results in baseline.items()
}
evaluate.analyze_tseries(d, 'number of training samples', 'model',
                         '../report/figures/results/main_window4_oldtest')
evaluate.analyze_size(d, 'number of samples', 'model',
                      '../report/figures/results/main_window4_oldtest')

In [None]:
# Analyse the baseline and k-means results; None is passed in the slot that
# the disabled call below fills with the gmm results.
evaluate.analyze_wrapper(baseline, kmeans, None, 'model', '../report/figures/results/main_window4_data_oldtest')
# Disabled variant that includes the gmm results and passes no path argument:
#evaluate.analyze_wrapper(baseline, kmeans, gmm, 'model')

In [None]:
# Test on old data: same sweep over training-set sizes as the main test, but
# load_dataset is asked for a separate test set, which is passed to cross_val.

num_clusters = 9
window_size = 4
dataset, validset, testset = load_dataset(num_clusters, window_size, old_test=True)

params = train.CNNParams(embed_size=100, dropout=0.5, epochs=100,
                         filters=[(33, 3), (33, 5), (33, 7)],
                         num_layers=1, max_norm=3)


def optim_fn(p):
    """Build an Adadelta optimizer (default settings) for the parameters ``p``."""
    return torch.optim.Adadelta(p)


# The factories look up params, num_clusters and window_size in the notebook
# namespace at call time. A third factory for the gmm results is disabled.
model_fns = [
    lambda r: models.NoClusterLabels(r, params.dropout),
    lambda r: models.CategoricalClusterLabels(r, num_clusters, window_size, params.dropout),
]

baseline, kmeans, gmm = {}, {}, {}

for n in (50, 100, 400, 800, 1200, 1600, 2000, 2400):
    values = evaluate.cross_val(10, n, model_fns, [False, False], optim_fn, dataset, params,
                                early_stopping=10, validation_set=validset, testset=testset)
    # Each entry of `values` is indexed by position in model_fns.
    baseline[n] = [entry[0] for entry in values]
    kmeans[n] = [entry[1] for entry in values]
    # gmm[n] = [entry[2] for entry in values]  (disabled together with the third model)

In [None]:
# Window-size sweep: compare the model without cluster labels against the
# categorical cluster-label model at a fixed 1200 training samples.
params = train.CNNParams(embed_size=100, dropout=0.5, epochs=100,
                         filters=[(33, 3), (33, 5), (33, 7)],
                         num_layers=1, max_norm=3)


def optim_fn(p):
    """Build an Adadelta optimizer (default settings) for the parameters ``p``."""
    return torch.optim.Adadelta(p)


# num_clusters and window_size are resolved when a factory is called, so the
# values assigned inside the loop below are the ones the models receive.
model_fns = [
    lambda r: models.NoClusterLabels(r, params.dropout),
    lambda r: models.CategoricalClusterLabels(r, num_clusters, window_size, params.dropout),
]

no_labels, with_labels = {}, {}

for window_size in (3, 5, 9, 7, 11, 15):
    num_clusters = 9
    dataset, validset = load_dataset(num_clusters, window_size)

    values = evaluate.cross_val(10, 1200, model_fns, [False, False], optim_fn, dataset, params,
                                early_stopping=10, validation_set=validset, testset=None)
    no_labels[window_size] = [entry[0] for entry in values]
    with_labels[window_size] = [entry[1] for entry in values]

In [12]:
# Character-CNN variant of the window-size sweep, evaluated on the separate
# (old) test set with 1600 training samples.
# NOTE(review): the output recorded under this cell ends in
# "RuntimeError: ... expected input[50, 138, 83] to have 83 channels, but got
# 138 channels instead". The failing code is not in this notebook; confirm the
# model/training code before relying on this cell.
params = train.CharCNNParams(dropout=0.5, epochs=100, max_norm=3)


def optim_fn(p):
    """Build an Adadelta optimizer (default settings) for the parameters ``p``."""
    return torch.optim.Adadelta(p)


# num_clusters and window_size are resolved when a factory is called, so the
# values assigned inside the loop below are the ones the models receive.
model_fns = [
    lambda r: models.NoClusterLabels(r, params.dropout),
    lambda r: models.ClusterLabelsChar(r, num_clusters, window_size, params.dropout),
]

no_labels, with_labels = {}, {}

for window_size in (3, 4, 5):
    num_clusters = 9
    dataset, validset, testset = load_dataset(num_clusters, window_size, old_test=True)

    values = evaluate.cross_val(10, 1600, model_fns, [False, False], optim_fn, dataset, params,
                                early_stopping=10, validation_set=validset, testset=testset)
    no_labels[window_size] = [entry[0] for entry in values]
    with_labels[window_size] = [entry[1] for entry in values]


Creating dictionary:   0%|          | 0/13 [00:00<?, ?it/s][A
Creating dictionary:  15%|█▌        | 2/13 [00:00<00:00, 13.81it/s][A
Creating dictionary:  23%|██▎       | 3/13 [00:00<00:01,  6.96it/s][A
Creating dictionary:  31%|███       | 4/13 [00:00<00:01,  7.44it/s][A
Creating dictionary:  46%|████▌     | 6/13 [00:00<00:01,  6.98it/s][A
Creating dictionary:  54%|█████▍    | 7/13 [00:01<00:00,  6.59it/s]Exception in thread Thread-15:
Traceback (most recent call last):
  File "/usr/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.6/site-packages/tqdm/_tqdm.py", line 148, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration

Creating dictionary: 100%|██████████| 13/13 [00:02<00:00,  5.45it/s]


Retrieved 1254 positive samples, 58709 negative samples.
Retrieved 562 positive samples, 21031 negative samples.


  0%|          | 0/10 [00:00<?, ?it/s]

Retrieved 691 positive samples, 37708 negative samples.
1600 training samples, 1124 testing samples
[138]





RuntimeError: Given groups=1, weight[256, 83, 7], so expected input[50, 138, 83] to have 83 channels, but got 138 channels instead

In [13]:
# Echo the dataset object currently in the notebook namespace (debugging aid).
dataset

<data.GermanDataset at 0x7fb6983bf8d0>

In [None]:
# Single re-run of the cross-validation call from the sweep cell above, using
# whatever model_fns, dataset, params, validset and testset are currently in
# the notebook namespace. The result is only stored in `values`.
values = evaluate.cross_val(10, 1600, model_fns, [False, False], optim_fn, dataset, params,
                            early_stopping=10,
                            validation_set=validset,
                            testset=testset)

  0%|          | 0/10 [00:00<?, ?it/s]

1600 training samples, 1124 testing samples
[138]
[138]


In [None]:
# Regroup the window-size sweep as {window size: {model name: results}}.
d = {
    size: {'Baseline': baseline_results, 'K-Means': with_labels[size]}
    for size, baseline_results in no_labels.items()
}
# The fourth argument is the output stem under ../report/figures/results/.
evaluate.analyze_tseries(d, 'window size', 'model',
                         '../report/figures/results/800-windowsize_oldtest')

In [None]:
# Cluster-count sweep: evaluate the categorical cluster-label model for
# several values of k at a fixed window size and 1200 training samples.
params = train.CNNParams(
    embed_size=100,
    dropout=0.5,
    epochs=100,
    filters=[(33, 3), (33, 5), (33, 7)],
    num_layers=1,
    max_norm=3,
)

optim_fn = lambda p: torch.optim.Adadelta(p)

# Only one model is evaluated here (the second, gmm-flavoured entry is
# disabled), so cross_val is given a single factory and a single flag.
# num_clusters and window_size are resolved when the factory is called, i.e.
# with the values assigned by the loop below.
model_fns = [lambda r: models.CategoricalClusterLabels(r, num_clusters, window_size, params.dropout)]

kmeans = {}
# Left empty: no gmm model is run in this cell. The name stays defined so that
# later references do not silently pick up results from an earlier cell.
gmm = {}

for num_clusters in [2, 3, 5, 7, 9, 15, 30]:
    window_size = 5
    dataset, validset = load_dataset(num_clusters, window_size)

    values = evaluate.cross_val(10, 1200, model_fns, [False], optim_fn, dataset, params,
                                early_stopping=10,
                                validation_set=validset,
                                testset=None)
    # Elsewhere in this notebook each entry of `values` is indexed by position
    # in model_fns. With a single model only index 0 should exist, so the
    # former `gmm[num_clusters] = [v[1] ...]` read (presumably an IndexError)
    # has been removed.
    kmeans[num_clusters] = [v[0] for v in values]

In [None]:
# Regroup the cluster-count sweep as {k: {model name: results}}; only the
# k-means results are included (the 'Mixture model' entry is disabled).
d = {k: {'K-Means': kmeans_results} for k, kmeans_results in kmeans.items()}

# Both helpers receive the same output stem.
evaluate.analyze_size(d, 'k', 'model',
                      '../report/figures/results/800-numcluster')
evaluate.analyze_tseries(d, 'k', 'model',
                         '../report/figures/results/800-numcluster')

In [None]:
# Cluster-count sweep on the old test set: the same model factory is run
# twice, with per-model flags [False, True]. The first result is stored as
# `kmeans` and the second as `gmm`.
# NOTE(review): presumably the flag switches the model to the gmm cluster
# labels — confirm against evaluate.cross_val.

params = train.CNNParams(embed_size=100, dropout=0.5, epochs=100,
                         filters=[(33, 3), (33, 5), (33, 7)],
                         num_layers=1, max_norm=3)


def optim_fn(p):
    """Build an Adadelta optimizer (default settings) for the parameters ``p``."""
    return torch.optim.Adadelta(p)


# num_clusters and window_size are resolved when a factory is called, so the
# values assigned inside the loop below are the ones the models receive.
model_fns = [
    lambda r: models.CategoricalClusterLabels(r, num_clusters, window_size, params.dropout),
    lambda r: models.CategoricalClusterLabels(r, num_clusters, window_size, params.dropout),
]

kmeans, gmm = {}, {}

for num_clusters in (2, 3, 5, 7, 9, 15, 30):
    window_size = 4
    dataset, validset, testset = load_dataset(num_clusters, window_size, old_test=True)

    values = evaluate.cross_val(10, 1600, model_fns, [False, True], optim_fn, dataset, params,
                                early_stopping=10, validation_set=validset, testset=testset)
    kmeans[num_clusters] = [entry[0] for entry in values]
    gmm[num_clusters] = [entry[1] for entry in values]

In [None]:
# Regroup the old-set cluster-count sweep as {k: {model name: results}}.
d = {
    k: {'K-Means': kmeans_results, 'Mixture model': gmm[k]}
    for k, kmeans_results in kmeans.items()
}
# No output path is passed here. The disabled variants of this call used the
# stem '../report/figures/results/800-numcluster_olddata' for both
# analyze_size and analyze_tseries.
evaluate.analyze_tseries(d, 'k', 'model')

# Test regularization methods

In [None]:
# Dropout sweep for the model without cluster labels (max_norm disabled).
# NOTE(review): `only_idx_dataset` is not defined in any visible cell, and
# this cross_val call passes a single factory with no flag list, unlike the
# calls in the "Training and evaluation" section. Confirm this cell still
# matches the current evaluate.cross_val signature.
params = train.CNNParams(embed_size=300, dropout=0.0, epochs=100,
                         filters=[(33, 3), (34, 5), (33, 7)],
                         num_layers=1, max_norm=0)


def optim_no_decay(p):
    """Build an Adadelta optimizer (default settings) for the parameters ``p``."""
    return torch.optim.Adadelta(p)


dropout_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
dropout_tests = []
for value in dropout_values:
    # The factory reads params.dropout when it is called, so updating the
    # shared params object is what changes the rate for each run.
    params.dropout = value
    dropout_tests.append(
        evaluate.cross_val(10, -1,
                           lambda r: models.NoClusterLabels(r, params.dropout),
                           optim_no_decay, only_idx_dataset, params)
    )

In [None]:
# Pair each dropout rate with the results of its run, in sweep order.
evaluate.analyze(
    {rate: result for rate, result in zip(dropout_values, dropout_tests)},
    'dropout rate',
    '../report/figures/results/dropout',
)

In [None]:
# Max-norm sweep for the model without cluster labels at dropout 0.5.
# NOTE(review): `only_idx_dataset` is not defined in any visible cell, and
# this cross_val call passes a single factory with no flag list, unlike the
# calls in the "Training and evaluation" section. Confirm this cell still
# matches the current evaluate.cross_val signature.
params = train.CNNParams(embed_size=300, dropout=0.5, epochs=100,
                         filters=[(33, 3), (34, 5), (33, 7)],
                         num_layers=1, max_norm=3)


def optim(p):
    """Build an Adadelta optimizer (default settings) for the parameters ``p``."""
    return torch.optim.Adadelta(p)


l2norm_values = [1, 3, 4, 5, 10, 0]
l2norm_tests = []
for value in l2norm_values:
    # params is shared with the factory below, so both fields are (re)set on
    # the same object before every run.
    params.dropout = 0.5
    params.max_norm = value
    l2norm_tests.append(
        evaluate.cross_val(10, -1,
                           lambda r: models.NoClusterLabels(r, params.dropout),
                           optim, only_idx_dataset, params)
    )

In [None]:
# Pair each max-norm value with the results of its run; the value 0 is
# shown under the key 'None'.
evaluate.analyze(
    {('None' if norm == 0 else norm): result
     for norm, result in zip(l2norm_values, l2norm_tests)},
    'maximum L2 norm of weight vectors',
    '../report/figures/results/decay',
)

In [None]:
# Batch-norm comparison: run the model without cluster labels once with
# batch_norm=False and once with batch_norm=True, using SGD with Nesterov
# momentum and 400 training samples.
# NOTE(review): `only_idx_dataset` is not defined in any visible cell, and
# this cross_val call passes a single factory with no flag list, unlike the
# calls in the "Training and evaluation" section. Confirm this cell still
# matches the current evaluate.cross_val signature.
params = train.CNNParams(embed_size=300, dropout=0.5, epochs=100,
                         filters=[(33, 3), (34, 5), (33, 7)],
                         num_layers=1, max_norm=2)


def optim_fn(p):
    """Build the SGD optimizer (lr 0.01, Nesterov momentum 0.9) for ``p``."""
    return torch.optim.SGD(p, lr=0.01, momentum=0.9, nesterov=True)


bn_tests = []
# Order matters: index 0 is the run without batch norm, index 1 the run with.
for use_batch_norm in (False, True):
    bn_tests.append(
        evaluate.cross_val(10, 400,
                           lambda r: models.NoClusterLabels(r, params.dropout, batch_norm=use_batch_norm),
                           optim_fn, only_idx_dataset, params)
    )

In [None]:
# Compare the two batch-norm runs: bn_tests[0] is the run created with
# batch_norm=False and bn_tests[1] the run created with batch_norm=True.
# No axis label or output path is passed to analyze here.
evaluate.analyze({'no batchnorm': bn_tests[0],
                  'batchnorm': bn_tests[1]})