# Preparation

In [1]:
# Switch the working directory to the project root so the relative paths used
# in later cells (config/, src/, experiments/, data/, models/) resolve.
# NOTE(review): this absolute path is machine-specific — adjust for your checkout.
%cd /home/dvc-2-iris-demo

/home/dvc-2-iris-demo


In [2]:
import yaml

# Load the master pipeline config and display it as the cell output.
# The context manager closes the file handle as soon as parsing finishes
# (a bare open() passed to yaml.load leaves the handle open).
# NOTE(review): FullLoader is kept to preserve behavior; if this config only
# ever holds plain scalars/lists/dicts, yaml.safe_load would be sufficient.
with open('config/pipeline_config.yml') as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

config

{'project': '7labs/dvc-2-iris-demo',
 'name': 'vision',
 'tags': ['solution-0-prototype', 'dev'],
 'dataset': {'random_state': 42,
  'dataset_csv': 'data/raw/iris.csv',
  'featured_dataset_csv': 'data/interim/featured_iris.csv',
  'train_csv': 'data/processed/train_iris.csv',
  'test_csv': 'data/processed/test_iris.csv',
  'test_size': 0.2,
  'features_columns_range': ['sepal_length', 'petal_length_to_petal_width'],
  'target_column': 'species'},
 'train': {'estimator_name': 'knn',
  'grid_search_cv_config': {'param_grid': {'n_neighbors': [5, 10, 15],
    'leaf_size': [30, 60, 90],
    'p': [1, 2]},
   'cv': 10}},
 'evaluate': {'metrics_file': 'eval.txt'},
 'model': {'model_name': 'model.joblib', 'models_folder': 'models'},
 'report': {'reports_folder': 'experiments'},
 'split_config': {'folder': 'experiments'}}

# Prepare configs

In [3]:
# Run the config-preparation script against the master pipeline config.
# Presumably this writes the per-stage YAML files into experiments/ — that
# folder is listed in the following cell.
!python src/pipelines/prepare_configs.py --config=config/pipeline_config.yml

## Browse the folder with configs

In [4]:
# List the contents of experiments/ to see which config files are present
!ls experiments/

evaluate_model_config.yml  split_train_test_config.yml
featurize_config.yml	   train_clf_config.yml


# Extract features

In [5]:
# Print the featurization stage config (input and output CSV paths)
!cat experiments/featurize_config.yml

dataset_csv: data/raw/iris.csv
featured_dataset_csv: data/interim/featured_iris.csv


In [6]:
# Run the featurization script with its stage config
!python src/pipelines/featurize.py --config=experiments/featurize_config.yml

In [7]:
# Confirm the iris dataset with the new features was written to data/interim
!ls data/interim

featured_iris.csv


# Split train/test dataset

In [8]:
# Print the train/test split stage config
!cat experiments/split_train_test_config.yml

dataset_csv: data/interim/featured_iris.csv
random_state: 42
target_column: species
test_csv: data/processed/test_iris.csv
test_size: 0.2
train_csv: data/processed/train_iris.csv


In [9]:
# Run the train/test split script with its stage config
!python src/pipelines/split_train_test.py --config=experiments/split_train_test_config.yml

In [10]:
# Confirm the train and test datasets were written to data/processed/
!ls data/processed/

test_iris.csv  train_iris.csv


# Train model

In [11]:
# Print the training stage config (estimator and grid-search settings)
!cat experiments/train_clf_config.yml

estimator_name: knn
features_columns_range:
- sepal_length
- petal_length_to_petal_width
grid_search_cv_config:
  cv: 10
  param_grid:
    leaf_size:
    - 30
    - 60
    - 90
    n_neighbors:
    - 5
    - 10
    - 15
    p:
    - 1
    - 2
model_name: model.joblib
models_folder: models
target_column: species
train_csv: data/processed/train_iris.csv


In [12]:
# Run the training script with its stage config
!python src/pipelines/train.py --config=experiments/train_clf_config.yml



In [13]:
# Confirm the trained model file was written to models/
!ls models/

model.joblib


# Evaluate model 

In [14]:
# Print the evaluation stage config
!cat experiments/evaluate_model_config.yml

dataset_csv: data/processed/test_iris.csv
features_columns_range:
- sepal_length
- petal_length_to_petal_width
metrics_file: eval.txt
model_name: model.joblib
models_folder: models
reports_folder: experiments
target_column: species


In [19]:
# Run the evaluation script with its stage config
!python src/pipelines/evaluate.py --config=experiments/evaluate_model_config.yml

In [20]:
# Confirm the metrics file eval.txt was written to experiments/
!ls experiments

eval.txt		   featurize_config.yml		train_clf_config.yml
evaluate_model_config.yml  split_train_test_config.yml


In [21]:
# Print the raw evaluation metrics report
!cat experiments/eval.txt

{
    "f1_score": 0.9333333333333332,
    "confusion_matrix": [
        [
            10,
            0,
            0
        ],
        [
            0,
            9,
            2
        ],
        [
            0,
            0,
            9
        ]
    ]
}

In [18]:
# Load the evaluation metrics into a Python dict and display it.
# The file content printed by the preceding `cat` cell is JSON; it is parsed
# here with the YAML loader, relying on `yaml` imported in the config cell.
# The context manager closes the file handle as soon as parsing finishes
# (a bare open() passed to yaml.load leaves the handle open).
with open('experiments/eval.txt') as report_file:
    evaluate_report = yaml.load(report_file, Loader=yaml.FullLoader)
evaluate_report

{'f1_score': 0.9333333333333332,
 'confusion_matrix': [[10, 0, 0], [0, 9, 2], [0, 0, 9]]}