In [1]:
import os
from pathlib import Path
import sys
import yaml

In [2]:
# Resolve the project root (the parent of the directory the kernel started in)
# only once per kernel session. This cell changes the working directory below,
# so recomputing the path on a re-run would climb one directory higher each time.
if "project_root" not in globals():
    project_root = Path().absolute().parent.as_posix()
# Make project modules importable without stacking duplicate entries on re-runs
if project_root not in sys.path:
    sys.path.append(project_root)
# Change working directory so relative paths resolve against the project root
os.chdir(project_root)
# Set env var PYTHONPATH
os.environ["PYTHONPATH"] = project_root
# Echo the same information the %cd / %env magics used to print
print(project_root)
print(f"env: PYTHONPATH={project_root}")

/home/alex/Dev/Projects/tutorials/tutorials-dvc/dvc-5-demo-project-iris
env: PYTHONPATH=/home/alex/Dev/Projects/tutorials/tutorials-dvc/dvc-5-demo-project-iris


# Init DVC repository


## Check out a new branch

```bash
git checkout -b experiments
```

## Init DVC repository and set up DVC remote storage


```bash
dvc init
dvc remote add -d myremote /tmp
```

## Put the DVC repository under Git control

```bash
git add .
git commit -m "Init and configure DVC"
```

# View config

In [3]:
# Show the raw text of the pipeline config (params.yaml, relative to the working directory)
config = Path('params.yaml').read_text()
print(config)

base:
  project: 7labs/dvc-5-demo-project-iris
  name: iris
  tags: [solution-0-prototype, dev]

  model:
    model_name: model.joblib
    models_folder: models

  experiments:
    experiments_folder: experiments

  random_state: 42 # random state for train/test split


data_load:
  dataset_csv: data/raw/iris.csv


featurize:
  features_path: data/interim/featured_iris.csv
  target_column: target


data_split:
  folder: experiments
  train_path: data/processed/train_iris.csv
  test_path: data/processed/test_iris.csv
  test_size: 0.2


train:
  cv: 5
  estimator_name: svm
  estimators:

    logreg: # sklearn.linear_model.LogisticRegression
      param_grid: # params of GridSearchCV constructor
        C: [0.001, 0.01]
        max_iter: [100]
        solver: ['lbfgs']
        multi_class: ['multinomial']

    svm: # sklearn.svm.SVC
      param_grid:
        C: [0.1, 1.0]
        kernel: ["rbf", "linear"]
        gamma: ["scale"]
        degree: [3, 5]
#
#    knn: # sklearn.neighbors.KNei

# Set PYTHONPATH

Run this from inside the project root:

```bash
export PYTHONPATH=.
```

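# Create and run pipelines

> **Note:** `dvc run` was removed in DVC 3.0. With newer DVC versions, define each stage with `dvc stage add` (it accepts the same `-n`, `-d`, `-o`, `-p`, `-m` and `--plots` options) and then execute it with `dvc repro`.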

## Get data

```bash
dvc run -n data_load \
    -d src/pipelines/data_load.py \
    -o data/raw/iris.csv \
    -p base,data_load \
    python src/pipelines/data_load.py \
        --config=params.yaml
```

## Featurization

```bash
dvc run -n featurize \
    -d src/pipelines/featurize.py \
    -d data/raw/iris.csv \
    -o data/interim/featured_iris.csv \
    -p base,data_load,featurize \
    python src/pipelines/featurize.py \
        --config=params.yaml
```

## Split dataset into train/test

```bash
dvc run -n data_split \
    -d src/pipelines/data_split.py \
    -d data/interim/featured_iris.csv \
    -o data/processed/train_iris.csv \
    -o data/processed/test_iris.csv \
    -p base,featurize,data_split \
    python src/pipelines/data_split.py \
        --config=params.yaml
```


## Train

```bash
dvc run -n train \
    -d src/pipelines/train.py \
    -d data/processed/train_iris.csv \
    -o models/model.joblib \
    -p base,featurize.target_column,data_split.train_path,train \
    python src/pipelines/train.py \
        --config=params.yaml

```

## Evaluate 

```bash
dvc run -n evaluate \
    -d src/pipelines/evaluate.py \
    -d data/processed/test_iris.csv \
    -d models/model.joblib \
    -o experiments/confusion_matrix.png \
    -m experiments/metrics.json \
    --plots experiments/classess.csv \
    -p base,featurize.target_column,data_split.test_path,evaluate \
    python src/pipelines/evaluate.py \
        --config=params.yaml
```

# Reproduce pipeline

## Reproduce an up-to-date pipeline (nothing to reproduce)

```bash
dvc repro
```

## Force reproduction of the whole pipeline with `-f`

```bash
dvc repro -f
```

# Make commit

```bash
git add .
git commit -m "Create DVC pipeline"
```