In [1]:
%cd /home/dvc-2-iris-demo

/home/dvc-2-iris-demo


# Remove old dvc files to avoid dependencies duplication error

In [2]:
!rm $(find . -name "*?.dvc")

rm: missing operand
Try 'rm --help' for more information.


# Overview main config

In [3]:
import yaml

config = yaml.load(open('config/pipeline_config.yml'), Loader=yaml.FullLoader)
config

{'project': '7labs/dvc-2-iris-demo',
 'name': 'vision',
 'tags': ['solution-0-prototype', 'dev'],
 'dataset': {'random_state': 42,
  'dataset_csv': 'data/raw/iris.csv',
  'featured_dataset_csv': 'data/interim/featured_iris.csv',
  'train_csv': 'data/processed/train_iris.csv',
  'test_csv': 'data/processed/test_iris.csv',
  'test_size': 0.2,
  'features_columns_range': ['sepal_length', 'petal_length_to_petal_width'],
  'target_column': 'species'},
 'train': {'estimator_name': 'knn',
  'grid_search_cv_config': {'param_grid': {'n_neighbors': [5, 10, 15],
    'leaf_size': [30, 60, 90],
    'p': [1, 2]},
   'cv': 10}},
 'evaluate': {'metrics_file': 'eval.txt'},
 'model': {'model_name': 'model.joblib', 'models_folder': 'models'},
 'report': {'reports_folder': 'experiments'},
 'split_config': {'folder': 'experiments'}}

# Create and run pipelines

## 1. Setup / config

In [4]:
!dvc run -f pipeline_prepare_configs.dvc \
        -d src/pipelines/prepare_configs.py \
        -d config/pipeline_config.yml \
        -o experiments/split_train_test_config.yml \
        -o experiments/featurize_config.yml \
        -o experiments/train_clf_config.yml \
        -o experiments/evaluate_model_config.yml \
        python src/pipelines/prepare_configs.py --config=config/pipeline_config.yml

[KRunning command:
	python src/pipelines/prepare_configs.py --config=config/pipeline_config.yml
[KSaving 'experiments/split_train_test_config.yml' to '.dvc/cache/44/481168099ddd8ff93db809f1379ba1'.
[KSaving 'experiments/featurize_config.yml' to '.dvc/cache/b3/4d1a901c0b24019042c0a7ac206f64'.
[KSaving 'experiments/train_clf_config.yml' to '.dvc/cache/87/60f7b6a461fcde2d8ca675a734ddab'.
[KSaving 'experiments/evaluate_model_config.yml' to '.dvc/cache/7e/fb2e1c27a3219e9b30a5fb7c1b7f1f'.
[KSaving information to 'pipeline_prepare_configs.dvc'.
[K
To track the changes with git run:

	git add pipeline_prepare_configs.dvc
[0m

## 2. Featurization

In [5]:
!dvc run -f pipeline_featurize.dvc \
         -d src/pipelines/featurize.py \
         -d experiments/featurize_config.yml \
         -d data/raw/iris.csv \
         -o data/interim/featured_iris.csv \
         python src/pipelines/featurize.py --config=experiments/featurize_config.yml

[KRunning command:
	python src/pipelines/featurize.py --config=experiments/featurize_config.yml
[KSaving 'data/interim/featured_iris.csv' to '.dvc/cache/e5/26aa4f251414ac605b5623eb76ecd8'.
[KSaving information to 'pipeline_featurize.dvc'.
[K
To track the changes with git run:

	git add pipeline_featurize.dvc
[0m

## 3. Split dataset into train/test

In [6]:
!dvc run -f pipeline_split_train_test.dvc \
         -d src/pipelines/split_train_test.py \
         -d experiments/split_train_test_config.yml \
         -d data/interim/featured_iris.csv \
         -o data/processed/train_iris.csv \
         -o data/processed/test_iris.csv \
         python src/pipelines/split_train_test.py --config=experiments/split_train_test_config.yml

[KRunning command:
	python src/pipelines/split_train_test.py --config=experiments/split_train_test_config.yml
[KSaving 'data/processed/train_iris.csv' to '.dvc/cache/b3/08921440d0e4eb08328f33c7b9f353'.
[KSaving 'data/processed/test_iris.csv' to '.dvc/cache/7c/3831b23101eadee4208ae805e82786'.
[KSaving information to 'pipeline_split_train_test.dvc'.
[K
To track the changes with git run:

	git add pipeline_split_train_test.dvc
[0m

## 4. Train

In [7]:
!dvc run -f pipeline_train.dvc \
         -d src/pipelines/train.py \
         -d experiments/train_clf_config.yml \
         -d data/processed/train_iris.csv \
         -o models/model.joblib \
         python src/pipelines/train.py --config=experiments/train_clf_config.yml

[KRunning command:
	python src/pipelines/train.py --config=experiments/train_clf_config.yml
[KSaving 'models/model.joblib' to '.dvc/cache/10/1bfadcb61f9b59fe9c92d7d7681617'.
[KSaving information to 'pipeline_train.dvc'.
[K
To track the changes with git run:

	git add pipeline_train.dvc
[0m

## 5. Evaluate 

In [8]:
!dvc run -f pipeline_evaluate.dvc \
         -d src/pipelines/evaluate.py \
         -d experiments/evaluate_model_config.yml \
         -d models/model.joblib \
         -m experiments/eval.txt \
         python src/pipelines/evaluate.py --config=experiments/evaluate_model_config.yml

[KRunning command:
	python src/pipelines/evaluate.py --config=experiments/evaluate_model_config.yml
[KSaving 'experiments/eval.txt' to '.dvc/cache/00/3cf81be2345d8e2f42c9985f5265cf'.
[KSaving information to 'pipeline_evaluate.dvc'.
[K
To track the changes with git run:

	git add pipeline_evaluate.dvc
[0m

# Reproduce pipelines

## Evaluate (final pipeline)

### ordinary repro

In [9]:
!dvc repro pipeline_evaluate.dvc

[KStage 'pipeline_prepare_configs.dvc' didn't change.
[KStage 'pipeline_featurize.dvc' didn't change.
[KStage 'pipeline_split_train_test.dvc' didn't change.
[KStage 'pipeline_train.dvc' didn't change.
[KStage 'pipeline_evaluate.dvc' didn't change.
[KPipeline is up to date. Nothing to reproduce.
[0m

### forcible repro

In [10]:
!dvc repro -f pipeline_evaluate.dvc

[KStage 'pipeline_prepare_configs.dvc' didn't change.
[KReproducing 'pipeline_prepare_configs.dvc'
[KRunning command:
	python src/pipelines/prepare_configs.py --config=config/pipeline_config.yml
[KOutput 'experiments/split_train_test_config.yml' didn't change. Skipping saving.
[KOutput 'experiments/featurize_config.yml' didn't change. Skipping saving.
[KOutput 'experiments/train_clf_config.yml' didn't change. Skipping saving.
[KOutput 'experiments/evaluate_model_config.yml' didn't change. Skipping saving.
[KSaving information to 'pipeline_prepare_configs.dvc'.
[KStage 'pipeline_featurize.dvc' didn't change.
[KReproducing 'pipeline_featurize.dvc'
[KRunning command:
	python src/pipelines/featurize.py --config=experiments/featurize_config.yml
[KOutput 'data/interim/featured_iris.csv' didn't change. Skipping saving.
[KSaving information to 'pipeline_featurize.dvc'.
[KStage 'pipeline_split_train_test.dvc' didn't change.
[KReproducing 'pipeline_split_train_test.dvc'
[KRunning