In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import yaml

In [2]:
# Set the repository root as a working directory 

%cd ..

/home/alex/Dev/Projects/mlrepa/alex_kolosov/dvc-5-demo-project-iris


# Init DVC repository


1. Check out a new branch

```bash
git checkout -b experiments
```

2. Initialize the DVC repository (the DVC remote storage is set up in step 4)


```bash
dvc init
```

3. Put the DVC repository files under Git control

```bash
git add .
git commit -m "Init and configure DVC"
```

4. Add `local` as the default DVC remote storage

```bash
dvc remote add -d local /tmp/dvc/dvc-5-demo-project-iris
git add .
git commit -m "add dvc remote - local"
```

# View config

In [3]:
# Look at the pipeline config: print the raw text of params.yaml.
# The file is read as UTF-8 explicitly so that decoding does not depend on the
# platform's default locale encoding.
with open('params.yaml', encoding='utf-8') as conf_file:
    config = conf_file.read()

print(config)

base:
  project: dvc-5-demo-project-iris

  model:
    model_name: model.joblib
    models_folder: models

  reports:
    reports_folder: reports

  random_state: 42 # random state for train/test split


data_load:
  dataset_csv: data/raw/iris.csv


featurize:
  features_path: data/processed/featured_iris.csv
  target_column: target


data_split:
  folder: experiments
  train_path: data/processed/train_iris.csv
  test_path: data/processed/test_iris.csv
  test_size: 0.3


train:
  cv: 3
  estimator_name: svm
  estimators:

    logreg: # sklearn.linear_model.LogisticRegression
      param_grid: # params of GridSearchCV constructor
        C: [0.001, 0.01]
        max_iter: [100]
        solver: ['lbfgs']
        multi_class: ['multinomial']

    svm: # sklearn.svm.SVC
      param_grid:
        C: [0.1, 1.0]
        kernel: ["rbf", "linear"]
        gamma: ["scale"]
        degree: [3, 5]

evaluate:
  metrics_file: metrics.json
  confusion_matrix_png: confusion_matrix.png
  classes_path: 

# Create and run stages for a DVC pipeline

## Get data

```bash
dvc run -n data_load \
    -d src/pipelines/data_load.py \
    -o data/raw/iris.csv \
    -p base,data_load \
    python src/pipelines/data_load.py \
        --config=params.yaml
```

## Featurization

```bash
dvc run -n featurize \
    -d src/pipelines/featurize.py \
    -d data/raw/iris.csv \
    -o data/processed/featured_iris.csv \
    -p base,data_load,featurize \
    python src/pipelines/featurize.py \
        --config=params.yaml
```

## Split dataset into train/test

```bash
dvc run -n data_split \
    -d src/pipelines/data_split.py \
    -d data/processed/featured_iris.csv \
    -o data/processed/train_iris.csv \
    -o data/processed/test_iris.csv \
    -p base,featurize,data_split \
    python src/pipelines/data_split.py \
        --config=params.yaml
```


## Train

```bash
dvc run -n train \
    -d src/pipelines/train.py \
    -d data/processed/train_iris.csv \
    -o models/model.joblib \
    -p base,featurize.target_column,data_split.train_path,train \
    python src/pipelines/train.py \
        --config=params.yaml

```

## Evaluate 

```bash
dvc run -n evaluate \
    -d src/pipelines/evaluate.py \
    -d data/processed/test_iris.csv \
    -d models/model.joblib \
    -o reports/confusion_matrix.png \
    -m reports/metrics.json \
    --plots reports/classess.csv \
    -p base,featurize.target_column,data_split.test_path,evaluate \
    python src/pipelines/evaluate.py \
        --config=params.yaml
```

# Reproduce pipeline

a) If the pipeline is up to date, there is nothing to reproduce.

```bash
dvc repro
```

b) Use `-f` to force the pipeline to be reproduced even if nothing has changed

```bash
dvc repro -f
```


# Commit code changes & store artifacts

```bash
git add .
git commit -m "Create DVC pipeline"
```


# Push data to a remote DVC storage

Push data to the remote storage

```bash
dvc push
```


# Push `experiments` branch to GitLab remote repository 


```bash 

git push origin experiments
```