# config.properties

# Basic settings (news dataset path, people list dataset, etc.)
# Both dataset paths are relative (they start with ../).
# NOTE(review): presumably resolved against the working directory of the job; confirm.
# NOTE(review): the news dataset file name ends in "bak" and looks like a local backup copy;
# confirm that this is the intended input file.
NEWS_DATASET_PATH=../data/QuootrapInputbak.tsv
PEOPLE_DATASET_PATH=../data/names.ALL.UNAMBIGUOUS.tsv
# Number of iterations of the algorithm to run.
NUM_ITERATIONS=5
# NOTE(review): presumably enables case-sensitive matching; confirm against the code that reads this key.
CASE_SENSITIVE=true

# Languages to keep. Multiple languages are separated by |
# NOTE(review): "uk" is the ISO 639-1 code for Ukrainian, not for British English;
# confirm that this matches the language labels used in the dataset.
LANGUAGE_FILTER=en|uk

# Fully qualified name of the concrete class that implements DatasetLoader
# and is used to load the news dataset.
NEWS_DATASET_LOADER=ch.epfl.dlab.quootstrap.WswDatasetLoader

# Settings for exporting results
# Set to true if you want the results to be exported.
EXPORT_RESULTS=true
# Location where the results are exported (relative path).
# NOTE(review): confirm which base directory / file system this path is resolved against.
EXPORT_PATH=result

# Set to true if you want to run Spark in local mode.
# NOTE(review): presumably must be false when the job is submitted to a cluster; confirm.
LOCAL_MODE=true

# Hyperparameters
# NOTE(review): presumably the minimum confidence a pattern must reach to be kept; confirm.
PATTERN_CONFIDENCE_THRESHOLD=0.7
# List of thresholds separated by |
# NOTE(review): how the individual values are used is not visible from this file
# (possibly one per iteration); confirm.
PATTERN_CLUSTERING_THRESHOLDS=0|0.0002|0.001|0.005

# Quotation merging
# Set to true to enable the quotation merging step.
ENABLE_QUOTATION_MERGING=true
# Shingle size used for quotation merging.
# NOTE(review): the unit (tokens vs. characters) is not visible from this file; confirm.
MERGING_SHINGLE_SIZE=8

# Cache settings: some frequently used (and immutable) RDDs can be cached on disk
# in order to speed up subsequent runs of the algorithm (the first run populates the cache).
# Note that the cache is NOT invalidated automatically: delete the cached files manually
# whenever the code or the internal parameters are changed.
ENABLE_CACHE=true
# Location of the cached files (relative path).
CACHE_PATH=cache

# Evaluation settings
# Path to the ground truth file used by the evaluation.
GROUND_TRUTH_PATH=ground_truth.json
# Enable the evaluation on the last iteration
ENABLE_FINAL_EVALUATION=false
# Enable the evaluation on intermediate iterations (slower)
ENABLE_INTERMEDIATE_EVALUATION=false

# Debug settings
# Set to true if you want to dump all newly discovered patterns at each iteration
DEBUG_DUMP_PATTERNS=true

# Set to true if you want to convert the entire input data to lower case (not recommended)
DEBUG_CASE_FOLDING=false