In [1]:
# Switch the kernel's working directory to the project root so that the relative
# paths used in the cells below (config/, src/, data/, experiments/, models/) resolve.
# NOTE(review): this is an absolute, machine-specific path - adjust it to the local checkout.
%cd /home/dvc-2-iris-demo

/home/dvc-2-iris-demo


In [12]:
# Remove old .dvc files to avoid a dependency-duplication error

# !rm $(find . -name "*?.dvc")

# Overview of the main config

In [3]:
import yaml

# Read the main pipeline configuration. The context manager closes the file
# handle deterministically instead of leaving it open until garbage collection.
# NOTE(review): yaml.safe_load would be sufficient for a plain-data config and is
# the safer choice if this file could ever come from an untrusted source.
with open('config/pipeline_config.yml') as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

# Display the parsed configuration as the cell output.
config

{'base': {'project': '7labs/dvc-2-iris-demo',
  'name': 'iris',
  'tags': ['solution-0-prototype', 'dev'],
  'model': {'model_name': 'model.joblib', 'models_folder': 'models'},
  'experiments': {'experiments_folder': 'experiments'},
  'random_state': 42},
 'split_train_test': {'folder': 'experiments',
  'train_csv': 'data/processed/train_iris.csv',
  'test_csv': 'data/processed/test_iris.csv',
  'test_size': 0.2},
 'featurize': {'dataset_csv': 'data/raw/iris.csv',
  'featured_dataset_csv': 'data/interim/featured_iris.csv',
  'features_columns_range': ['sepal_length', 'petal_length_to_petal_width'],
  'target_column': 'species'},
 'train': {'cv': 5,
  'estimator_name': 'logreg',
  'estimators': {'logreg': {'param_grid': {'C': [0.001, 0.01],
     'max_iter': [100],
     'solver': ['lbfgs'],
     'multi_class': ['multinomial']}},
   'knn': {'param_grid': {'n_neighbors': [5, 15], 'p': [1, 2]}},
   'svm': {'param_grid': {'C': [0.1, 1.0],
     'kernel': ['rbf', 'linear'],
     'gamma': ['sca

# Create and run pipelines

## 1. Setup / config

In [3]:
# Stage 1: run prepare_configs.py on config/pipeline_config.yml to produce the
# per-stage config files under experiments/.
# experiments/base_config.yml is declared as an output too, because the script
# writes it (see the "Save config:" lines in the run log); without the
# declaration DVC neither caches nor tracks that file.
!dvc run -f stage_prepare_configs.dvc \
    -d src/pipelines/prepare_configs.py \
    -d config/pipeline_config.yml \
    -o experiments/base_config.yml \
    -o experiments/split_train_test_config.yml \
    -o experiments/featurize_config.yml \
    -o experiments/train_config.yml \
    -o experiments/evaluate_config.yml \
    python src/pipelines/prepare_configs.py \
        --config=config/pipeline_config.yml

[KRunning command:
	python src/pipelines/prepare_configs.py --config=config/pipeline_config.yml
Save config: experiments/base_config.yml
Save config: experiments/split_train_test_config.yml
Save config: experiments/featurize_config.yml
Save config: experiments/train_config.yml
Save config: experiments/evaluate_config.yml
[KSaving 'experiments/split_train_test_config.yml' to '.dvc/cache/7f/cf7d4c56cd36d485c9776c8dfbafa7'.
[KSaving 'experiments/featurize_config.yml' to '.dvc/cache/3d/9c113b409731fba887d5a9cd43b486'.
[KSaving 'experiments/train_config.yml' to '.dvc/cache/09/62f3873bcc28078851867c92d22fc3'.
[KSaving 'experiments/evaluate_config.yml' to '.dvc/cache/91/c85b95fcb52f7b1957c02d11143a2d'.
[KSaving information to 'stage_prepare_configs.dvc'.
[K
To track the changes with git run:

	git add stage_prepare_configs.dvc
[0m

## 2. Featurization

In [4]:
# Stage 2: run featurize.py with its stage config on the raw iris CSV; the
# featured dataset is registered as the stage output.
!dvc run --file stage_featurize.dvc \
    --deps src/pipelines/featurize.py \
    --deps experiments/featurize_config.yml \
    --deps data/raw/iris.csv \
    --outs data/interim/featured_iris.csv \
    python src/pipelines/featurize.py \
        --config=experiments/featurize_config.yml

[KRunning command:
	python src/pipelines/featurize.py --config=experiments/featurize_config.yml
[KSaving information to 'stage_featurize.dvc'.
[K
To track the changes with git run:

	git add stage_featurize.dvc
[0m

## 3. Split dataset into train/test

In [5]:
# Stage 3: run split_train_test.py on the featured dataset; the train and test
# CSVs are registered as stage outputs.
# config/pipeline_config.yml is passed to the script as --base_config, so it is
# declared as a dependency: otherwise a change to the base settings would not
# invalidate this stage on `dvc repro`.
!dvc run -f stage_split_train_test.dvc \
    -d src/pipelines/split_train_test.py \
    -d experiments/split_train_test_config.yml \
    -d config/pipeline_config.yml \
    -d data/interim/featured_iris.csv \
    -o data/processed/train_iris.csv \
    -o data/processed/test_iris.csv \
    python src/pipelines/split_train_test.py \
        --config=experiments/split_train_test_config.yml \
        --base_config=config/pipeline_config.yml

[KRunning command:
	python src/pipelines/split_train_test.py --config=experiments/split_train_test_config.yml --base_config=config/pipeline_config.yml
[KSaving 'data/processed/train_iris.csv' to '.dvc/cache/36/6af2683b508936c8c83e963e88cd40'.
[KSaving 'data/processed/test_iris.csv' to '.dvc/cache/76/51c34292c7d846debd07ed83897606'.
[KSaving information to 'stage_split_train_test.dvc'.
[K
To track the changes with git run:

	git add stage_split_train_test.dvc
[0m

## 4. Train

In [6]:
# Stage 4: run train.py on the train split; models/model.joblib is registered
# as the stage output.
# config/pipeline_config.yml is passed to the script as --base_config, so it is
# declared as a dependency: otherwise a change to the base settings would not
# invalidate this stage on `dvc repro`.
!dvc run -f stage_train.dvc \
    -d src/pipelines/train.py \
    -d experiments/train_config.yml \
    -d config/pipeline_config.yml \
    -d data/processed/train_iris.csv \
    -o models/model.joblib \
    python src/pipelines/train.py \
        --config=experiments/train_config.yml \
        --base_config=config/pipeline_config.yml

[KRunning command:
	python src/pipelines/train.py --config=experiments/train_config.yml --base_config=config/pipeline_config.yml
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished
0.8529086479396076
[KSaving 'models/model.joblib' to '.dvc/cache/d8/6199364bf884ace51244a2c5bdaa14'.
[KSaving information to 'stage_train.dvc'.
[K
To track the changes with git run:

	git add stage_train.dvc
[0m

## 5. Evaluate 

In [7]:
# Stage 5: run evaluate.py on the trained model; experiments/eval.txt is
# registered as a DVC metrics file.
# config/pipeline_config.yml is passed to the script as --base_config, so it is
# declared as a dependency: otherwise a change to the base settings would not
# invalidate this stage on `dvc repro`.
# NOTE(review): the reported confusion matrix covers 30 samples, which matches
# the held-out split, so evaluate.py presumably reads data/processed/test_iris.csv;
# it is declared as a dependency on that assumption - confirm against evaluate.py.
!dvc run -f stage_evaluate.dvc \
    -d src/pipelines/evaluate.py \
    -d experiments/evaluate_config.yml \
    -d config/pipeline_config.yml \
    -d data/processed/test_iris.csv \
    -d models/model.joblib \
    -m experiments/eval.txt \
    python src/pipelines/evaluate.py \
        --config=experiments/evaluate_config.yml \
        --base_config=config/pipeline_config.yml

[KRunning command:
	python src/pipelines/evaluate.py --config=experiments/evaluate_config.yml --base_config=config/pipeline_config.yml
{'f1_score': 0.9305555555555555, 'confusion_matrix': [[10, 0, 0], [0, 7, 0], [0, 2, 11]]}
[KSaving 'experiments/eval.txt' to '.dvc/cache/d9/e6179f82ccc574b27763a38177f3d7'.
[KSaving information to 'stage_evaluate.dvc'.
[K
To track the changes with git run:

	git add stage_evaluate.dvc
[0m

# Reproduce pipeline

### Evaluate (final stage)

In [9]:
# Reproduce the pipeline up to the final stage (stage_evaluate.dvc).
# Expected result for this run: "Pipeline is up to date. Nothing to reproduce."

!dvc repro stage_evaluate.dvc

[KStage 'stage_prepare_configs.dvc' didn't change.
[KStage 'stage_featurize.dvc' didn't change.
[KStage 'stage_split_train_test.dvc' didn't change.
[KStage 'stage_train.dvc' didn't change.
[KStage 'stage_evaluate.dvc' didn't change.
[KPipeline is up to date. Nothing to reproduce.
[0m

In [13]:
# Force DVC to re-run the stages even though none of them changed.

!dvc repro --force stage_evaluate.dvc

[KStage 'stage_prepare_configs.dvc' didn't change.
[KReproducing 'stage_prepare_configs.dvc'
[KRunning command:
	python src/pipelines/prepare_configs.py --config=config/pipeline_config.yml
Save config: experiments/base_config.yml
Save config: experiments/split_train_test_config.yml
Save config: experiments/featurize_config.yml
Save config: experiments/train_config.yml
Save config: experiments/evaluate_config.yml
[KOutput 'experiments/split_train_test_config.yml' didn't change. Skipping saving.
[KOutput 'experiments/featurize_config.yml' didn't change. Skipping saving.
[KOutput 'experiments/train_config.yml' didn't change. Skipping saving.
[KOutput 'experiments/evaluate_config.yml' didn't change. Skipping saving.
[KSaving information to 'stage_prepare_configs.dvc'.
[KStage 'stage_featurize.dvc' didn't change.
[KReproducing 'stage_featurize.dvc'
[KRunning command:
	python src/pipelines/featurize.py --config=experiments/featurize_config.yml
[KOutput 'data/interim/featured_iris