forked from epfl-dlab/quootstrap
-
Notifications
You must be signed in to change notification settings - Fork 0
/
config.properties
47 lines (37 loc) · 1.47 KB
/
config.properties
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# Basic settings (news dataset path, people list dataset, etc.)
# Input news dataset (a .tsv file here).
# NOTE(review): the relative path is presumably resolved against the process working directory,
# and the file name ("QuootrapInputbak") looks like a local/backup copy -- confirm both.
NEWS_DATASET_PATH=../data/QuootrapInputbak.tsv
# People list dataset (a .tsv file here).
PEOPLE_DATASET_PATH=../data/names.ALL.UNAMBIGUOUS.tsv
# NOTE(review): presumably the number of iterations the algorithm runs -- confirm against the code that reads it.
NUM_ITERATIONS=5
# NOTE(review): presumably toggles case-sensitive matching; exact scope is not visible here -- confirm.
CASE_SENSITIVE=true
# Multiple languages are separated by |
# NOTE(review): presumably only input in the listed languages is processed -- confirm.
LANGUAGE_FILTER=en|uk
# Provide the concrete implementation class name of DatasetLoader
# (given here as a fully qualified Java class name).
NEWS_DATASET_LOADER=ch.epfl.dlab.quootstrap.WswDatasetLoader
# Settings for exporting results
# EXPORT_RESULTS switches the export on or off; EXPORT_PATH is the export location.
# NOTE(review): EXPORT_PATH is presumably an output directory relative to the working directory -- confirm.
EXPORT_RESULTS=true
EXPORT_PATH=result
# Set to true if you want to use Spark in local mode
# NOTE(review): presumably set to false when the job is submitted to a cluster -- confirm.
LOCAL_MODE=true
# Hyperparameters
# NOTE(review): presumably the minimum confidence required for a pattern to be retained -- confirm.
PATTERN_CONFIDENCE_THRESHOLD=0.7
# Several threshold values separated by |
# NOTE(review): how the individual values are applied is not visible from this file -- confirm.
PATTERN_CLUSTERING_THRESHOLDS=0|0.0002|0.001|0.005
# Quotation merging
# ENABLE_QUOTATION_MERGING switches the merging step on or off.
# NOTE(review): MERGING_SHINGLE_SIZE is presumably the shingle length used when comparing
# quotations for merging; the unit (tokens vs. characters) is not visible here -- confirm.
ENABLE_QUOTATION_MERGING=true
MERGING_SHINGLE_SIZE=8
# Cache settings: some frequently used (and immutable) RDDs can be cached on disk
# in order to speed up the execution of the algorithm after the first run.
# Note that the cache must be invalidated manually (by deleting the cached files)
# whenever the code or the internal parameters are changed.
# NOTE(review): CACHE_PATH is presumably the directory holding those cached files -- confirm.
ENABLE_CACHE=true
CACHE_PATH=cache
# Evaluation settings
# Ground truth file (a .json file here).
# NOTE(review): presumably only read when one of the evaluation flags below is true -- confirm.
GROUND_TRUTH_PATH=ground_truth.json
# Enable the evaluation on the last iteration
ENABLE_FINAL_EVALUATION=false
# Enable the evaluation on intermediate iterations (slower)
ENABLE_INTERMEDIATE_EVALUATION=false
# Debug settings
# Set to true if you want to dump all newly discovered patterns at each iteration
DEBUG_DUMP_PATTERNS=true
# Set to true if you want to convert the entire input data to lower case (not recommended)
DEBUG_CASE_FOLDING=false