Commit
Quality improvements (#29)
eu9ene committed Dec 6, 2021
1 parent a09b0ac commit 3b3f33b
Showing 67 changed files with 912 additions and 871 deletions.
Binary file added DAG.pdf
91 changes: 35 additions & 56 deletions Makefile
@@ -11,9 +11,12 @@ WORKSPACE=12000
CLUSTER_CORES=16
CONFIG=configs/config.prod.yml
CONDA_PATH=$(SHARED_ROOT)/mambaforge
SNAKEMAKE_OUTPUT_CACHE=$(SHARED_ROOT)/cache
TARGET=
###

CONDA_ACTIVATE=source $(CONDA_PATH)/etc/profile.d/conda.sh ; conda activate ; conda activate
SNAKEMAKE=export SNAKEMAKE_OUTPUT_CACHE=$(SNAKEMAKE_OUTPUT_CACHE); snakemake

### 2. setup

@@ -26,7 +29,8 @@ conda:

snakemake:
$(CONDA_ACTIVATE) base
mamba create -c conda-forge -c bioconda -n snakemake snakemake==6.9.1 --yes
mamba create -c conda-forge -c bioconda -n snakemake snakemake==6.10.0 --yes
mkdir -p "$(SNAKEMAKE_OUTPUT_CACHE)"

# build container image for cluster and run-local modes (preferred)
build:
@@ -44,64 +48,78 @@ pull:

dry-run:
$(CONDA_ACTIVATE) snakemake
snakemake \
$(SNAKEMAKE) \
--use-conda \
--cores all \
--cache \
--reason \
--configfile $(CONFIG) \
--config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) deps=true \
-n
-n \
$(TARGET)

run-local:
echo "Running with config $(CONFIG)"
$(CONDA_ACTIVATE) snakemake
snakemake \
$(SNAKEMAKE) \
--use-conda \
--reason \
--cores all \
--cache \
--resources gpu=$(GPUS) \
--configfile $(CONFIG) \
--config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) deps=true
--config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) deps=true \
$(TARGET)

test: CONFIG=configs/config.test.yml
test: run-local

run-local-container:
$(CONDA_ACTIVATE) snakemake
module load singularity
snakemake \
$(SNAKEMAKE) \
--use-conda \
--use-singularity \
--reason \
--cores all \
--cache \
--resources gpu=$(GPUS) \
--configfile $(CONFIG) \
--config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) \
--singularity-args="--bind $(SHARED_ROOT),$(CUDA_DIR) --nv"
--singularity-args="--bind $(SHARED_ROOT),$(CUDA_DIR) --nv" \
$(TARGET)

run-slurm:
$(CONDA_ACTIVATE) snakemake
chmod +x profiles/slurm/*
snakemake \
$(SNAKEMAKE) \
--use-conda \
--reason \
--cores $(CLUSTER_CORES) \
--cache \
--configfile $(CONFIG) \
--config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) \
--profile=profiles/slurm
--profile=profiles/slurm \
$(TARGET)

run-slurm-container:
$(CONDA_ACTIVATE) snakemake
chmod +x profiles/slurm/*
module load singularity
snakemake \
$(SNAKEMAKE) \
--use-conda \
--use-singularity \
--reason \
--verbose \
--cores $(CLUSTER_CORES) \
--cache \
--configfile $(CONFIG) \
--config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) \
--profile=profiles/slurm \
--singularity-args="--bind $(SHARED_ROOT),$(CUDA_DIR),/tmp --nv --containall"
--singularity-args="--bind $(SHARED_ROOT),$(CUDA_DIR),/tmp --nv --containall" \
$(TARGET)
# if CPU nodes don't have access to cuda dirs, use
# export CUDA_DIR=$(CUDA_DIR)
# export CUDA_DIR=$(CUDA_DIR); $(SNAKEMAKE) \
# --singularity-args="--bind $(SHARED_ROOT),/tmp --nv --containall"


@@ -123,25 +141,11 @@ run-file-server:
### extra

dag:
snakemake --dag | dot -Tpdf > DAG.pdf

lint:
snakemake --lint

install-monitor:
$(CONDA_ACTIVATE) base
conda create --name panoptes
conda install -c panoptes-organization panoptes-ui

run-monitor:
$(CONDA_ACTIVATE) panoptes
panoptes

run-with-monitor:
snakemake \
--use-conda \
--cores all \
--wms-monitor http://127.0.0.1:5000
--dag \
--configfile $(CONFIG) \
--config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) \
| dot -Tpdf > DAG.pdf

install-tensorboard:
$(CONDA_ACTIVATE) base
@@ -151,29 +155,4 @@ tensorboard:
$(CONDA_ACTIVATE) tensorboard
ls -d $(SHARED_ROOT)/models/*/*/* > tb-monitored-jobs; \
tensorboard --logdir=$$MODELS --host=0.0.0.0 &; \
python utils/tb_log_parser.py --prefix=

install-snakepit-scheduler:
mkdir -p $(SHARED_ROOT)/snakepit
cd $(SHARED_ROOT)/snakepit

curl -sL https://deb.nodesource.com/setup_12.x | sudo -E bash -
sudo apt install nodejs

if [ ! -e snakepit-client ]; then
git clone https://github.com/mozilla/snakepit-client.git
fi
cd snakepit-client
npm install
sudo npm link

echo "http://10.2.224.243" > /root/.pitconnect.txt

pit status

run-snakepit:
chmod +x profiles/snakepit/*
snakemake \
--use-conda \
--cores all \
--profile=profiles/snakepit
python utils/tb_log_parser.py --prefix=
156 changes: 145 additions & 11 deletions README.md
@@ -128,15 +128,24 @@ make dry-run

### Local mode

Without containerization:
#### Without containerization

```
make run-local
```
With containerization:
To test the whole pipeline end to end (it is supposed to run quickly and does not train anything useful):

```
make test
```
#### With containerization
```
make run-local-container
```



### Cluster mode

To run on Slurm
@@ -149,6 +158,18 @@ with containerization (recommended):
```
make run-slurm-container
```
### Specific target

By default, all Snakemake rules are executed. To run the pipeline up to a specific rule, use:
```
make <run-command> TARGET=<non-wildcard-rule>
```

For example, to collect the corpus first:
```
make run-local TARGET=merge_corpus
```


### Using Snakepit

@@ -209,20 +230,23 @@ Step | Description | Bottleneck | Comments
--- | --- | --- | ---
Installation | Installing dependencies and compiling | CPU | Takes ~1 hour
Data downloading | Downloads datasets, samples sentences | Network, Disk | Time depends on dataset size; sampling of huge mono datasets (100M+ sentences) is the most intensive operation.
Data cleaning | Basic preprocessing, language specific, rule based, deduplication, and other attempts to clean noisy data in parallel and mono datasets | CPU | Good parallelization across CPU cores. To make cleaning of a new language more efficient add it to [clean_parallel.py](/pipeline/clean/clean_parallel.py).
Bicleaner | Filters noisy sentence pairs in a parallel corpus using [bicleaner](https://github.com/bitextor/bicleaner) or [bicleaner-ai](https://github.com/bitextor/bicleaner-ai) depending on available language packs. | CPU, GPU | If there are no pretrained language packs for bicleaner-ai, it uses bicleaner. If there are no ones for bicleaner either, this step is skipped. Cleaning threshold is controlled by `BICLEANER_THRESHOLD` config setting.
Data cleaning | Basic preprocessing, dataset-specific, language-specific, rule-based and other attempts to clean noisy data in parallel and mono datasets | CPU | Good parallelization across CPU cores. To make cleaning of a new language more efficient, add it to [clean_parallel.py](/pipeline/clean/tools/clean_parallel.py).
Bicleaner | Filters noisy sentence pairs in a parallel corpus using [bicleaner](https://github.com/bitextor/bicleaner) or [bicleaner-ai](https://github.com/bitextor/bicleaner-ai) depending on available language packs. | CPU, GPU | If there are no pretrained language packs for bicleaner-ai, it uses bicleaner. If there are none for bicleaner either, this step is skipped. Cleaning thresholds are configurable per dataset; see [Dataset cleaning](#dataset-cleaning).
Merge and dedupe | Merges clean datasets and applies deduplication | CPU, Disk |
Training s2s | Trains a backward shallow s2s model, which is useful for back-translations and ce-filtering | GPU | Inspired by a [marian example](https://github.com/marian-nmt/marian-examples/tree/master/training-basics-sentencepiece).
Augmentation with back-translations | Translates mono corpus combined from `MONO_DATASETS_TRG` using shallow s2s model. | GPU | It is more useful for low-resource languages and can be skipped for others.
Training teacher | Trains one or multiple big transformer models | GPU | You might want to adjust [early stopping](pipeline/train/configs/training/teacher.transformer.train.yml) parameters depending on datasets size. Inspired by [transformer](https://github.com/marian-nmt/marian-examples/tree/master/transformer) and [wmt2017-uedin](https://github.com/marian-nmt/marian-examples/tree/master/wmt2017-uedin) marian examples and extended with [SentencePiece](https://github.com/google/sentencepiece).
Augmentation with back-translations | Translates the mono corpus combined from monolingual datasets in the target language using the shallow s2s model. | GPU | It is more useful for low-resource languages and can be skipped for others.
Training teacher | Trains an ensemble of big transformer models on the augmented dataset | GPU | You might want to adjust [early stopping](pipeline/train/configs/training/teacher.transformer.train.yml) or `after-epochs` parameters depending on dataset size.
Continue training teacher | Continues training the ensemble of teachers on parallel data only | GPU | You might want to adjust [early stopping](pipeline/train/configs/training/teacher.transformer.train.yml) parameters depending on dataset size.
Translation by teacher | Translates a corpus and monolingual data combined from `MONO_DATASETS_SRC` using the teacher model (ensemble is not supported yet) | GPU | The slowest part of the pipeline. Can take days. It is possible to speed it up launching the same scripts ([corpus](pipeline/translate/translate-corpus.sh), [mono](pipeline/translate/translate-mono.sh)) in parallel from another machine with access to the same network directory.
Cross-entropy filtering | Scores translated corpus with backward s2s model and removes a part of the corpus with the lowest scores to reduce noise | GPU, CPU, Disk | At this point we work with huge datasets, so it utilizes copying to a local disk to make things faster.
Training alignments and shortlist | Trains alignments using [fast_align](https://github.com/clab/fast_align) and extracts a lexical shortlist using the [extract_lex](https://github.com/marian-nmt/extract-lex) tool | CPU, Disk | Some tools require uncompressed datasets on disk, and they are huge at this point. Data is copied to a local disk to make things faster. Might take 100+GB of local disk depending on dataset size. Good CPU parallelization.
Training student | Trains a small transformer student model on filtered data and using alignments | GPU | Run [Tensorboard](utils/tensorboard/tensorboard.sh) manually to see training visualization.
Training student | Trains a small transformer student model on filtered data and using alignments | GPU |
Fine-tuning student | Fine-tunes the student model by emulating 8-bit GEMM during training | GPU | Converges very quickly and then degrades. It's quick, but you might want to reduce the early-stopping threshold.
Quantization | Applies 8-bit quantization to the fine-tuned student model and evaluates it on CPU | CPU | CPU threads must be set to 1 for this step.
Evaluation | Calculates metrics for all models (BLEU, chrf) using [SacreBLEU](https://github.com/mjpost/sacrebleu) | GPU | Uses `datasets.test` configuration section.
Export | Exports trained model and shortlist to [bergamot-translator](https://github.com/mozilla/bergamot-translator) format | |

## Datasets importers
## Dataset importers

Dataset importers can be used in `datasets` sections of experiment config.

@@ -256,6 +280,119 @@ Example:
Just add a shell script named `<prefix>.sh` to [corpus](pipeline/data/importers/corpus) or [mono]()
that accepts the same parameters as the other scripts in the same folder.
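
A minimal sketch of what such an importer could look like is below. The argument order, variable names and download URL are illustrative assumptions, not taken from the repository — mirror an existing script in the same folder for the real interface.

```bash
#!/bin/bash
# Hypothetical importer: pipeline/data/importers/corpus/mydata.sh
# Argument interface and URL are assumptions for illustration only.
set -euo pipefail

src=$1      # source language code, e.g. "ru"
trg=$2      # target language code, e.g. "en"
output=$3   # output path prefix, e.g. ".../original/corpus/mydata_example"
dataset=$4  # dataset identifier (the part after the "mydata_" prefix)

# Download gzip-compressed, sentence-aligned files for both languages
wget -qO "${output}.${src}.gz" "https://example.com/${dataset}.${src}.gz"
wget -qO "${output}.${trg}.gz" "https://example.com/${dataset}.${trg}.gz"
```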

## Dataset fixing

Some datasets require fixes like detokenization. Dataset- and language-specific fixes are implemented in [pipeline/clean/fixes](pipeline/clean/fixes).
Naming convention (see the examples after this list):
- `<dataset_name>.sh` for parallel dataset cleaning
- `<dataset_name>.<lang>.sh` for language-specific cleaning of a parallel or monolingual dataset
- `/` in the dataset name should be replaced with `_`
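
For example, hypothetical fix scripts following this convention might be named as follows (the file names are illustrative, not an inventory of the actual folder):

```bash
# Hypothetical contents of pipeline/clean/fixes/ (names for illustration only):
#   mtdata_neulab_tedtalksv1_train.sh   # fix applied to both sides of a parallel dataset ("/" replaced with "_")
#   opus_ParaCrawl_v8.ru.sh             # Russian-side fix for a parallel dataset
#   news-crawl_news.2020.en.sh          # fix for an English monolingual dataset
```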

## Dataset cleaning
Some parallel datasets require more aggressive filtering.
Dataset-specific Bicleaner thresholds can be set in the config. Example:

```yml
experiment:
  ...
  bicleaner:
    default-threshold: 0.5
    dataset-thresholds:
      mtdata_neulab_tedtalksv1_train: 0.6
```

## Utilities

### Tensorboard

To see training graphs, run Tensorboard:

```
make install-tensorboard
make tensorboard
```

Then forward port 6006.
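
If Tensorboard runs on a remote machine, one way to forward the port is over SSH (user and host below are placeholders):

```bash
# Forward remote port 6006 to localhost:6006, then open http://localhost:6006
ssh -N -L 6006:localhost:6006 user@remote-host
```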

## Directory structure

├ data
│ └ ru-en
│ └ test
│ ├ original
│ │ ├ corpus
│ │ │ ├ mtdata_JW300.en.gz
│ │ │ └ mtdata_JW300.ru.gz
│ │ ├ devset
│ │ │ ├ flores_dev.en.gz
│ │ │ └ flores_dev.ru.gz
│ │ ├ eval
│ │ │ ├ sacrebleu_wmt20.en.gz
│ │ │ └ sacrebleu_wmt20.ru.gz
│ │ ├ mono
│ │ │ ├ news-crawl_news.2020.ru.gz
│ │ │ └ news-crawl_news.2020.en.gz
│ │ ├ devset.ru.gz
│ │ └ devset.en.gz
│ ├ clean
│ │ ├ corpus
│ │ │ ├ mtdata_JW300.en.gz
│ │ │ └ mtdata_JW300.ru.gz
│ │ ├ mono
│ │ │ ├ news-crawl_news.2020.ru.gz
│ │ │ └ news-crawl_news.2020.en.gz
│ │ ├ mono.ru.gz
│ │ └ mono.en.gz
│ ├ biclean
│ │ ├ corpus
│ │ │ ├ mtdata_JW300.en.gz
│ │ │ └ mtdata_JW300.ru.gz
│ │ ├ corpus.ru.gz
│ │ ├ corpus.en.gz
│ ├ translated
│ │ ├ mono.ru.gz
│ │ └ mono.en.gz
│ ├ augmented
│ │ ├ corpus.ru.gz
│ │ └ corpus.en.gz
│ ├ alignment
│ │ ├ corpus.aln.gz
│ │ └ lex.s2t.pruned.gz
│ ├ merged
│ │ ├ corpus.ru.gz
│ │ └ corpus.en.gz
│ └ filtered
│ ├ corpus.ru.gz
│ └ corpus.en.gz
├ models
│ ├ ru-en
│ │ └ test
│ │ ├ teacher
│ │ ├ student
│ │ ├ student-finetuned
│ │ ├ speed
│ │ ├ evaluation
│ │ │ ├ backward
│ │ │ ├ teacher0
│ │ │ ├ teacher1
│ │ │ ├ teacher-ensemble
│ │ │ ├ student
│ │ │ ├ student-finetuned
│ │ │ └ speed
│ │ └ exported
│ ├ en-ru
│ └ test
│ └ backward
├ experiments
│ └ ru-en
│ └ test
│ └ config.sh
├ logs
│ └ ru-en
│ └ test
│ └ clean_corpus.log

## Development

### Architecture
@@ -271,9 +408,6 @@ Snakemake parallelizes steps that can be executed simultaneously. It is especiall
The main Snakemake process (scheduler) should be launched interactively. It runs job processes on the worker nodes in cluster mode or on a local machine in local mode.

### Conventions

- All scripts work relative to the repo root directory.
This avoids having to think about relative paths and execution folders.

- Scripts inside the `pipeline` directory are independent and operate only using input arguments, input files
and global envs.