In [1]:
# Switch the kernel's working directory to the project root; every later cell
# uses paths relative to it (config/, data/, experiments/, models/, src/).
# NOTE(review): this absolute path is machine-specific — adjust it to your checkout.
%cd /home/dvc-2-iris-demo/

/home/tutorial-dvc-2-demo-project-iris


In [2]:
# Ask DVC whether anything it tracks is out of date before starting the walkthrough.
!dvc status

[KPipeline is up to date. Nothing to reproduce.
[0m

# Remove old dvc files to avoid dependencies duplication error

In [3]:
!rm $(find . -name "*?.dvc")

rm: missing operand
Try 'rm --help' for more information.


# 1. Setup / configs

In [4]:
import yaml

# Load the master pipeline config. The `with` block closes the file handle
# deterministically instead of leaving it to the garbage collector.
# NOTE(review): FullLoader can build more than plain scalars/lists/dicts;
# if this file only ever holds plain data, yaml.safe_load would be sufficient.
with open('config/pipeline_config.yml') as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

# Display the parsed config as the cell output.
config

{'project': '7labs.ru/vision',
 'name': 'vision',
 'tags': ['solution-0-prototype', 'dev'],
 'dataset': {'random_state': 42,
  'dataset_csv': 'data/raw/iris.csv',
  'featured_dataset_csv': 'data/interim/featured_iris.csv',
  'train_csv': 'data/processed/train_iris.csv',
  'test_csv': 'data/processed/test_iris.csv',
  'test_size': 0.2,
  'features_columns_range': ['sepal_length', 'petal_length_to_petal_width'],
  'target_column': 'species'},
 'train': {'estimator_name': 'logreg',
  'grid_search_cv_config': {'param_grid': {'C': [0.1, 1.0],
    'max_iter': [5000, 6000],
    'solver': ['lbfgs', 'sag'],
    'multi_class': ['multinomial']},
   'cv': 10}},
 'evaluate': {'metrics_file': 'eval.txt'},
 'model': {'model_name': 'model.joblib', 'models_folder': 'models'},
 'report': {'reports_folder': 'experiments'},
 'split_config': {'folder': 'experiments'}}

## unprotect pipeline configs (if they exist)

In [5]:
!dvc unprotect experiments/*.dvc

[K[31mERROR[39m: failed to unprotect 'experiments/*.dvc' - can't unprotect non-existing data 'experiments/*.dvc'

[33mHaving any troubles?[39m. Hit us up at [34mhttps://dvc.org/support[39m, we are always happy to help!
[0m

## prepare configs

In [6]:
# Run the config-preparation script against the master pipeline config.
# Presumably this writes the per-stage YAML files into experiments/ —
# the script source is not shown here; verify against src/pipelines/prepare_configs.py.
!python3 src/pipelines/prepare_configs.py --config=config/pipeline_config.yml

## browse folder with configs

In [7]:
# List the experiments/ folder to confirm the per-stage config files are present.
!ls experiments/

evaluate_model_config.yml  split_train_test_config.yml
featurize_config.yml	   train_clf_config.yml


## add configs under DVC control

In [8]:
# Put every stage config under DVC control; DVC writes one
# <name>.yml.dvc metafile next to each tracked file.
!dvc add experiments/*.yml

[KSaving information to 'experiments/evaluate_model_config.yml.dvc'.
[K
To track the changes with git run:

	git add experiments/evaluate_model_config.yml.dvc
[KSaving information to 'experiments/featurize_config.yml.dvc'.
[K
To track the changes with git run:

	git add experiments/evaluate_model_config.yml.dvc experiments/featurize_config.yml.dvc
[KSaving information to 'experiments/split_train_test_config.yml.dvc'.
[K
To track the changes with git run:

	git add experiments/evaluate_model_config.yml.dvc experiments/featurize_config.yml.dvc experiments/split_train_test_config.yml.dvc
[KSaving 'experiments/train_clf_config.yml' to '.dvc/cache/ba/9d65ebb70207408c4e38dff508fd6c'.
[KSaving information to 'experiments/train_clf_config.yml.dvc'.
[K
To track the changes with git run:

	git add experiments/evaluate_model_config.yml.dvc experiments/featurize_config.yml.dvc experiments/split_train_test_config.yml.dvc experiments/train_clf_config.yml.dvc
[0m

In [9]:
# Short-format git status limited to experiments/; the new .dvc metafiles
# should show up as untracked ("??") until they are committed.
!git status -s experiments/

[31m??[m experiments/evaluate_model_config.yml.dvc
[31m??[m experiments/featurize_config.yml.dvc
[31m??[m experiments/split_train_test_config.yml.dvc
[31m??[m experiments/train_clf_config.yml.dvc


## commit pipelines configs dvc files

In [None]:
%%bash
# Stage whatever git does not ignore under experiments/ and commit it.
# `set -e` aborts the cell as soon as a command fails, so a failed `git add`
# can no longer be followed by a commit of unrelated, previously staged changes.
set -e

git add experiments/
git commit -m "create pipeline configs and put it under DVC control"

# 2. Featurization (features engineering)

## unprotect data/interim/featured_iris.csv (if it exists)

In [10]:
!dvc unprotect data/interim/featured_iris.csv.dvc

[K[31mERROR[39m: failed to unprotect 'data/interim/featured_iris.csv.dvc' - can't unprotect non-existing data 'data/interim/featured_iris.csv.dvc'

[33mHaving any troubles?[39m. Hit us up at [34mhttps://dvc.org/support[39m, we are always happy to help!
[0m

## featurization config

In [11]:
# Print the featurization stage config (input and output CSV paths).
!cat experiments/featurize_config.yml

dataset_csv: data/raw/iris.csv
featured_dataset_csv: data/interim/featured_iris.csv


## create new features

In [12]:
# Run the featurization script with its stage config. Per that config the input is
# data/raw/iris.csv and the output is data/interim/featured_iris.csv
# (script source not shown here — confirm in src/pipelines/featurize.py).
!python3 src/pipelines/featurize.py --config=experiments/featurize_config.yml

In [13]:
# Confirm that the iris dataset with the new features was written to data/interim.
!ls data/interim

featured_iris.csv


## add data/interim/featured_iris.csv under DVC control

In [14]:
# Put the featured dataset under DVC control; DVC writes
# data/interim/featured_iris.csv.dvc as its metafile.
!dvc add data/interim/featured_iris.csv

[KSaving information to 'data/interim/featured_iris.csv.dvc'.
[K
To track the changes with git run:

	git add data/interim/featured_iris.csv.dvc
[0m

In [15]:
# Short-format git status limited to data/; the new .dvc metafile should be
# listed as untracked ("??").
!git status -s data/

[31m??[m data/interim/featured_iris.csv.dvc


## commit data/interim/featured_iris.csv.dvc file

In [None]:
%%bash
# Stage the featured dataset's .dvc metafile together with the folder's
# .gitignore, then commit them.
# `set -e` aborts the cell as soon as a command fails, so a failed `git add`
# (e.g. a missing path) is never followed by a commit of unrelated staged changes.
set -e

git add data/interim/featured_iris.csv.dvc data/interim/.gitignore
git commit -m "create featured_iris.csv and put it under DVC control"

# 3. Split dataset into train/test

## unprotect train and test datasets (if they exist)

In [16]:
!dvc unprotect data/processed/train_iris.csv.dvc data/processed/test_iris.csv.dvc

[K[31mERROR[39m: failed to unprotect 'data/processed/train_iris.csv.dvc' - can't unprotect non-existing data 'data/processed/train_iris.csv.dvc'

[33mHaving any troubles?[39m. Hit us up at [34mhttps://dvc.org/support[39m, we are always happy to help!
[0m

## split config

In [17]:
# Print the train/test split stage config (input dataset, output paths,
# test size, random state and target column).
!cat experiments/split_train_test_config.yml

dataset_csv: data/interim/featured_iris.csv
random_state: 42
target_column: species
test_csv: data/processed/test_iris.csv
test_size: 0.2
train_csv: data/processed/train_iris.csv


## split featured dataset into train/test

In [18]:
# Run the split script with its stage config. Per that config the input is
# data/interim/featured_iris.csv and the outputs are the train/test CSVs under
# data/processed/ (script source not shown here — confirm in src/pipelines/split_train_test.py).
!python3 src/pipelines/split_train_test.py --config=experiments/split_train_test_config.yml

In [19]:
# Confirm that the train and test datasets were written to data/processed.
!ls data/processed/

test_iris.csv  train_iris.csv


## add train and test datasets under DVC control

In [20]:
# Put both split datasets under DVC control; DVC writes one .dvc metafile per file.
!dvc add data/processed/train_iris.csv data/processed/test_iris.csv

[KSaving information to 'data/processed/train_iris.csv.dvc'.
[K
To track the changes with git run:

	git add data/processed/train_iris.csv.dvc
[KSaving information to 'data/processed/test_iris.csv.dvc'.
[K
To track the changes with git run:

	git add data/processed/train_iris.csv.dvc data/processed/test_iris.csv.dvc
[0m

In [21]:
# Short-format git status limited to data/processed/; both new .dvc metafiles
# should be listed as untracked ("??").
!git status -s data/processed/

[31m??[m data/processed/test_iris.csv.dvc
[31m??[m data/processed/train_iris.csv.dvc


## commit train and test datasets dvc files

In [None]:
%%bash
# Stage the train/test .dvc metafiles together with the folder's .gitignore,
# then commit them.
# `set -e` aborts the cell as soon as a command fails, so a failed `git add`
# (e.g. a missing path) is never followed by a commit of unrelated staged changes.
set -e

git add data/processed/*.dvc data/processed/.gitignore
git commit -m "split source dataset into train/test and add them under DVC control"

# 4. Train

## unprotect model dvc file (if it exists)

In [22]:
!dvc unprotect models/model.joblib.dvc

[K[31mERROR[39m: failed to unprotect 'models/model.joblib.dvc' - can't unprotect non-existing data 'models/model.joblib.dvc'

[33mHaving any troubles?[39m. Hit us up at [34mhttps://dvc.org/support[39m, we are always happy to help!
[0m

## train config

In [23]:
# Print the training stage config (estimator name, grid-search parameters,
# feature column range, target column, model output location, training CSV).
!cat experiments/train_clf_config.yml

estimator_name: logreg
features_columns_range:
- sepal_length
- petal_length_to_petal_width
grid_search_cv_config:
  cv: 10
  param_grid:
    C:
    - 0.1
    - 1.0
    max_iter:
    - 5000
    - 6000
    multi_class:
    - multinomial
    solver:
    - lbfgs
    - sag
model_name: model.joblib
models_folder: models
target_column: species
train_csv: data/processed/train_iris.csv


## train classifier

In [24]:
# Run the training script with its stage config. Per that config it uses
# data/processed/train_iris.csv and the model is expected at models/model.joblib
# (script source not shown here — confirm in src/pipelines/train.py).
!python3 src/pipelines/train.py --config=experiments/train_clf_config.yml



In [25]:
# Confirm that the trained model file was written to models/.
!ls models/

model.joblib


## add model under DVC control

In [26]:
# Put the trained model under DVC control; DVC copies it into its cache and
# writes models/model.joblib.dvc as the metafile.
!dvc add models/model.joblib

[KSaving 'models/model.joblib' to '.dvc/cache/4d/d78bd9c37ae2d018abb787c7d6d03d'.
[KSaving information to 'models/model.joblib.dvc'.
[K
To track the changes with git run:

	git add models/model.joblib.dvc
[0m

## commit model dvc file

In [None]:
%%bash
# Stage the model's .dvc metafile together with the folder's .gitignore,
# then commit them.
# `set -e` aborts the cell as soon as a command fails, so a failed `git add`
# (e.g. a missing path) is never followed by a commit of unrelated staged changes.
set -e

git add models/model.joblib.dvc models/.gitignore
git commit -m "train classifier and put model under DVC control"

# 5. Evaluate model

## unprotect metrics file (if it exists)

In [27]:
!dvc unprotect experiments/eval.txt

[K[31mERROR[39m: failed to unprotect 'experiments/eval.txt' - can't unprotect non-existing data 'experiments/eval.txt'

[33mHaving any troubles?[39m. Hit us up at [34mhttps://dvc.org/support[39m, we are always happy to help!
[0m

## evaluate config

In [28]:
# Print the evaluation stage config (test CSV, feature column range, target column,
# model location, and where the metrics file goes).
!cat experiments/evaluate_model_config.yml

dataset_csv: data/processed/test_iris.csv
features_columns_range:
- sepal_length
- petal_length_to_petal_width
metrics_file: eval.txt
model_name: model.joblib
models_folder: models
reports_folder: experiments
target_column: species


## evaluate

In [29]:
# Run the evaluation script with its stage config. Per that config it uses
# data/processed/test_iris.csv and models/model.joblib, and the metrics are expected
# in experiments/eval.txt (script source not shown here — confirm in src/pipelines/evaluate.py).
!python3 src/pipelines/evaluate.py --config=experiments/evaluate_model_config.yml

In [30]:
# Confirm that the metrics file eval.txt now sits next to the stage configs.
!ls experiments

eval.txt		       split_train_test_config.yml
evaluate_model_config.yml      split_train_test_config.yml.dvc
evaluate_model_config.yml.dvc  train_clf_config.yml
featurize_config.yml	       train_clf_config.yml.dvc
featurize_config.yml.dvc


In [31]:
# Print the metrics file; it holds a JSON object with "f1_score" and "confusion_matrix".
!cat experiments/eval.txt

{
    "f1_score": 1.0,
    "confusion_matrix": [
        [
            10,
            0,
            0
        ],
        [
            0,
            9,
            0
        ],
        [
            0,
            0,
            11
        ]
    ]
}

## add metrics file under DVC control

In [32]:
# Put the metrics file under DVC control; DVC copies it into its cache and
# writes experiments/eval.txt.dvc as the metafile.
!dvc add experiments/eval.txt

[KSaving 'experiments/eval.txt' to '.dvc/cache/5a/0cf88dc56448899be985d779ca7068'.
[KSaving information to 'experiments/eval.txt.dvc'.
[K
To track the changes with git run:

	git add experiments/eval.txt.dvc
[0m

## commit dvc file of metrics file 

In [None]:
%%bash
# Stage the metrics file's .dvc metafile together with the folder's .gitignore,
# then commit them.
# `set -e` aborts the cell as soon as a command fails, so a failed `git add`
# (e.g. a missing path) is never followed by a commit of unrelated staged changes.
set -e

git add experiments/eval.txt.dvc experiments/.gitignore
git commit -m "evaluate model and put metrics file under DVC control"