# Preparation

In [1]:
%cd /home/dvc-2-iris-demo

/home/dvc-2-iris-demo


In [2]:
import yaml

# Look on pipelines config 
config = yaml.load(open('config/pipeline_config.yml'), Loader=yaml.FullLoader)

config

{'base': {'project': '7labs/dvc-2-iris-demo',
  'name': 'iris',
  'tags': ['solution-0-prototype', 'dev'],
  'model': {'model_name': 'model.joblib', 'models_folder': 'models'},
  'experiments': {'experiments_folder': 'experiments'},
  'random_state': 42},
 'split_train_test': {'folder': 'experiments',
  'train_csv': 'data/processed/train_iris.csv',
  'test_csv': 'data/processed/test_iris.csv',
  'test_size': 0.2},
 'featurize': {'dataset_csv': 'data/raw/iris.csv',
  'featured_dataset_csv': 'data/interim/featured_iris.csv',
  'features_columns_range': ['sepal_length', 'petal_length_to_petal_width'],
  'target_column': 'species'},
 'train': {'cv': 5,
  'estimator_name': 'logreg',
  'estimators': {'logreg': {'param_grid': {'C': [0.001, 0.01],
     'max_iter': [100],
     'solver': ['lbfgs'],
     'multi_class': ['multinomial']}},
   'knn': {'param_grid': {'n_neighbors': [5, 15], 'p': [1, 2]}},
   'svm': {'param_grid': {'C': [0.1, 1.0],
     'kernel': ['rbf', 'linear'],
     'gamma': ['sca

# Prepare configs

In [3]:
!python src/pipelines/prepare_configs.py \
    --config=config/pipeline_config.yml

Save config: experiments/base_config.yml
Save config: experiments/split_train_test_config.yml
Save config: experiments/featurize_config.yml
Save config: experiments/train_config.yml
Save config: experiments/evaluate_config.yml


## Browse folder with configs

In [4]:
!ls experiments/

base_config.yml      featurize_config.yml	  train_config.yml
evaluate_config.yml  split_train_test_config.yml


# Extract features

In [5]:
# featurization config
!cat experiments/featurize_config.yml

dataset_csv: data/raw/iris.csv
featured_dataset_csv: data/interim/featured_iris.csv
features_columns_range:
- sepal_length
- petal_length_to_petal_width
target_column: species


In [6]:
!python src/pipelines/featurize.py \
    --config=experiments/featurize_config.yml

In [7]:
# iris dataset with new features is created
!ls data/interim

featured_iris.csv


# Split train/test dataset

In [8]:
# split config
!cat experiments/split_train_test_config.yml

folder: experiments
test_csv: data/processed/test_iris.csv
test_size: 0.2
train_csv: data/processed/train_iris.csv


In [11]:
!python src/pipelines/split_train_test.py \
    --config=experiments/split_train_test_config.yml \
    --base_config=config/pipeline_config.yml
    

In [12]:
# train and test datsets are created
!ls data/processed/

test_iris.csv  train_iris.csv


# Train model

In [13]:
# train config
!cat experiments/train_clf_config.yml

cat: experiments/train_clf_config.yml: No such file or directory


In [17]:
!python src/pipelines/train.py \
    --config=experiments/train_config.yml \
    --base_config=config/pipeline_config.yml

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished
0.8529086479396076


In [18]:
# model is created
!ls models/

model.joblib


# Evaluate model 

In [19]:
# evaluate config
!cat experiments/evaluate_config.yml

metrics_file: eval.txt


In [21]:
!python src/pipelines/evaluate.py \
    --config=experiments/evaluate_config.yml \
    --base_config=config/pipeline_config.yml

{'f1_score': 0.9305555555555555, 'confusion_matrix': [[10, 0, 0], [0, 7, 0], [0, 2, 11]]}


In [23]:
# metrics file eval.txt is created
!ls experiments

base_config.yml  evaluate_config.yml   split_train_test_config.yml
eval.txt	 featurize_config.yml  train_config.yml


In [21]:
!cat experiments/eval.txt

{
    "f1_score": 0.9333333333333332,
    "confusion_matrix": [
        [
            10,
            0,
            0
        ],
        [
            0,
            9,
            2
        ],
        [
            0,
            0,
            9
        ]
    ]
}

In [24]:
evaluate_report = yaml.load(open('experiments/eval.txt'), Loader=yaml.FullLoader)
evaluate_report

{'f1_score': 0.9305555555555555,
 'confusion_matrix': [[10, 0, 0], [0, 7, 0], [0, 2, 11]]}