# Preparation

In [None]:
# Change the working directory so the `src.*` imports and the relative
# `data/...` path used below resolve (presumably this is the project root).
%cd /home/dvc-2-iris-demo

In [None]:
from sklearn.model_selection import train_test_split

from src.data.dataset import get_dataset
from src.evaluate.evaluate import evaluate
from src.features.features import extract_features
from src.report.visualize import plot_confusion_matrix
from src.train.train import get_supported_estimator, train
from src.transforms.trainsforms import transform_target_values_to_labels

# Load dataset

In [None]:
# Load the raw iris CSV through the project's dataset helper.
dataset = get_dataset('data/raw/iris.csv')
# Cell output: the dimensions of the loaded data.
dataset.shape

In [None]:
# Preview the first rows of the loaded data.
dataset.head()

In [None]:
# Names of the first four columns, kept as the feature names.
feature_names = dataset.columns[:4].tolist()
feature_names

In [None]:
# Distinct values of the 'species' column, as a plain list.
species = dataset['species'].drop_duplicates().tolist()
species

# Feature engineering

In [None]:
# Replace `dataset` with the output of the project's feature-extraction step.
dataset = extract_features(dataset)
# Preview the result.
dataset.head()

# Split dataset

In [None]:
# Split configuration: the fraction of rows held out for testing, and the
# random seed.
test_size, random_state = 0.2, 42

## Transform targets (species) to numeric labels

In [None]:
# Encode the 'species' target column with the project helper. Per the section
# heading this turns species into numeric values (implementation not shown here).
dataset = transform_target_values_to_labels(dataset, 'species')
# Preview the transformed data.
dataset.head()

## Split into train/test sets

In [None]:
# Hold out `test_size` of the rows for evaluation. The seed comes from the
# `random_state` variable set in the split-configuration cell instead of a
# second hard-coded literal, so changing it there actually takes effect.
train_dataset, test_dataset = train_test_split(
    dataset,
    test_size=test_size,
    random_state=random_state,
)
# Cell output: the dimensions of the two splits.
train_dataset.shape, test_dataset.shape

# Train

In [None]:
# Show the classifiers supported by the project's `src.train.train` module
# (displayed as the cell output).
get_supported_estimator()

In [None]:
# Training configuration. `estimator_name`, `param_grid` and `cv` are passed
# to `train()` below; `scoring` is defined here alongside them.
estimator_name = 'logreg'
cv = 3
scoring = 'f1_macro'

# Each key lists the candidate values to try for that hyper-parameter.
param_grid = {
    'C': [0.001, 0.01],
    'max_iter': [5000, 6000],
    'solver': ['lbfgs', 'sag'],
    'multi_class': ['multinomial'],
}

In [None]:
# Fit on the training split only, so the rows held out in `test_dataset`
# are not seen during training or hyper-parameter search.
# NOTE(review): `scoring` is defined in the configuration cell but is not
# passed here — confirm whether `train` accepts a `scoring` argument.
model = train(
    df=train_dataset,
    target_column='species',
    estimator_name=estimator_name,
    param_grid=param_grid,
    cv=cv,
)

In [None]:
# Inspect the best estimator selected during training.
# NOTE(review): assumes `train` returns a fitted scikit-learn search object
# (e.g. GridSearchCV) that exposes `best_estimator_` — confirm.
model.best_estimator_


# Evaluate

In [None]:
# Score the fitted model on the held-out test split produced by
# `train_test_split`, so the reported metrics come from held-out rows.
# `evaluate` returns two values, bound here to `f1` and `cm`.
f1, cm = evaluate(
    df=test_dataset,
    target_column='species',
    clf=model,
)

In [None]:
# Show the score returned by evaluate() (the value bound to `f1`).
f1

In [None]:
# Plot the confusion matrix returned by evaluate(), passing the species names
# collected earlier as class labels; normalize=False presumably plots raw counts.
plot_confusion_matrix(cm, species, normalize=False)