Skip to content

Commit

Permalink
Add checking of the vector file format version number.
Browse files Browse the repository at this point in the history
Also factor up the (sometimes re-)opening of the vector file.
  • Loading branch information
erikrose committed May 28, 2020
1 parent ddf2b77 commit 9032713
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 43 deletions.
17 changes: 8 additions & 9 deletions cli/fathom_web/commands/test.py
@@ -1,4 +1,4 @@
from json import JSONDecodeError, load, loads
from json import JSONDecodeError, loads
from pathlib import Path

import click
Expand Down Expand Up @@ -106,14 +106,13 @@ def main(testing_set, weights, confidence_threshold, ruleset, trainee, testing_c
if not trainee:
raise BadOptionUsage('trainee', 'A --trainee ID must be specified when TESTING_SET_FOLDER is passed a directory.')

with make_or_find_vectors(ruleset,
trainee,
testing_set,
testing_cache,
show_browser,
'testing',
delay).open(encoding='utf-8') as testing_file:
testing_data = load(testing_file)
testing_data = make_or_find_vectors(ruleset,
trainee,
testing_set,
testing_cache,
show_browser,
'testing',
delay)
testing_pages = testing_data['pages']
x, y, num_yes, num_prunes = tensors_from(testing_pages)
model = model_from_json(weights, len(y[0]), testing_data['header']['featureNames'])
Expand Down
37 changes: 18 additions & 19 deletions cli/fathom_web/commands/train.py
@@ -1,4 +1,3 @@
from json import load
from pathlib import Path
from pprint import pformat

Expand Down Expand Up @@ -210,29 +209,29 @@ def main(training_set, validation_set, ruleset, trainee, training_cache, validat
if not trainee:
raise BadOptionUsage('trainee', 'A --trainee ID must be specified when TRAINING_SET_FOLDER or --validation-set are passed a directory.')

with open(make_or_find_vectors(ruleset,
trainee,
training_set,
training_cache,
show_browser,
'training',
delay),
encoding='utf-8') as training_file:
training_data = exclude_features(exclude, load(training_file))
training_data = exclude_features(
exclude,
make_or_find_vectors(ruleset,
trainee,
training_set,
training_cache,
show_browser,
'training',
delay))
training_pages = training_data['pages']
x, y, num_yes, num_prunes = tensors_from(training_pages, shuffle=True)
num_samples = len(x) + num_prunes

if validation_set:
with open(make_or_find_vectors(ruleset,
trainee,
validation_set,
validation_cache,
show_browser,
'validation',
delay),
encoding='utf-8') as validation_file:
validation_pages = exclude_features(exclude, load(validation_file))['pages']
validation_pages = exclude_features(
exclude,
make_or_find_vectors(ruleset,
trainee,
validation_set,
validation_cache,
show_browser,
'validation',
delay))['pages']
validation_ins, validation_outs, validation_yes, validation_prunes = tensors_from(validation_pages)
validation_arg = validation_ins, validation_outs
else:
Expand Down
37 changes: 22 additions & 15 deletions cli/fathom_web/vectorizer.py
Expand Up @@ -45,7 +45,7 @@ class Timeout(Exception):


def make_or_find_vectors(ruleset, trainee, sample_set, sample_cache, show_browser, kind_of_set, delay):
    """Return the contents of a vector file, building it first if necessary.

    If ``sample_set`` is itself a vector file, read and return its contents
    verbatim. If it is a folder of samples, use the cached vector file when
    it's fresh, rebuilding it (via ``vectorize``) when the ruleset or the
    samples have changed.

    :arg ruleset: Path to the ruleset file; also anchors the default cache dir
    :arg trainee: Trainee ID, used to name the default cache file
    :arg sample_set: Path to either a vector file or a folder of samples
    :arg sample_cache: Path to the cache file, or a falsy value to use the
        default location next to the ruleset
    :arg kind_of_set: A label like 'training', 'validation', or 'testing',
        used in the default cache filename
    :raises GracefulError: if the vector file declares a format version newer
        than these tools can handle
    """
    if not sample_set.is_dir():
        final_path = sample_set  # It's just a vector file.
    else:
        if not sample_cache:
            # Default cache location: a 'vectors' dir beside the ruleset.
            sample_cache = ruleset.parent / 'vectors' / f'{kind_of_set}_{trainee}.json'
        updated_hashes = out_of_date(sample_cache, ruleset, sample_set)
        if updated_hashes:
            # Make a vectors file, replacing it if already present:
            vectorize(ruleset, trainee, sample_set, sample_cache, show_browser, kind_of_set, delay)
            # Stick the new hashes in it:
            with sample_cache.open(encoding='utf-8') as file:
                data = load(file)
            data['header'].update(updated_hashes)
            with sample_cache.open('w', encoding='utf-8') as file:
                dump(data, file, separators=(',', ':'))
            # We just wrote this file with the current tools, so its format
            # version is necessarily one we understand; skip the check below.
            return data
        final_path = sample_cache
    with open(final_path, encoding='utf-8') as file:
        data = load(file)
    if data['header']['version'] > 2:
        raise GracefulError(f'The vector file {final_path} has a newer format than these tools can handle. Please run `pip install -U fathom-web` to upgrade your tools.')
    return data


def out_of_date(sample_cache, ruleset, sample_set):
Expand Down

0 comments on commit 9032713

Please sign in to comment.