# 02-iterations-with-datasets

In [1]:
!pip show mlops-ai

Name: mlops-ai
Version: 1.1.0
Summary: Mlops-ai library for managing machine learning projects, experiments, iterations and datasets.
Home-page: 
Author: Kacper Pękalski, Kajetan Szal, Jędrzej Rybczyński
Author-email: kac.pekalski1@gmail.com
License: Apache License 2.0
Location: c:\users\jedryb\anaconda3\lib\site-packages
Requires: requests
Required-by: 


## Setting active project & experiment

In [2]:
from mlops.tracking import get_project_by_name, set_active_project

# Look the project up by its display name and make it the active project.
project = get_project_by_name('Iris classification')
project_id = project['_id']
set_active_project(project_id)

'Active project set to: 6489ec527f8983d10183cc1b'

In [3]:
from mlops.tracking import get_experiment_by_name, set_active_experiment

# Look the experiment up by its display name and make it the active one.
# NOTE(review): the identifier is read from the 'id' key here, while the
# project cell reads '_id' — confirm both keys are intended.
experiment = get_experiment_by_name('Dataset models')
experiment_id = experiment['id']
set_active_experiment(experiment_id=experiment_id)

'Active experiment set to: 6489ec657f8983d10183cc1e'

## Creating additional datasets

In [4]:
from mlops.tracking import create_dataset

# Register version 2.0 of the Iris dataset, pointing at the uciml Kaggle page.
dataset_v2_fields = {
    "dataset_name": "Iris dataset",
    "path_to_dataset": "https://www.kaggle.com/datasets/uciml/iris",
    "dataset_description": "Famous Iris species dataset",
    "tags": "iris,kaggle,classification,multiclass",
    "version": "2.0",
}
dataset_v2 = create_dataset(**dataset_v2_fields)

In [5]:
# Register version 3.0 of the Iris dataset, pointing at the arshid Kaggle page.
dataset_v3_fields = {
    "dataset_name": "Iris dataset",
    "path_to_dataset": "https://www.kaggle.com/datasets/arshid/iris-flower-dataset",
    "dataset_description": "Famous Iris species dataset",
    "tags": "iris,kaggle,classification,multiclass",
    "version": "3.0",
}
dataset_v3 = create_dataset(**dataset_v3_fields)

## Creating some iterations with datasets

In [6]:
import pandas as pd 
from sklearn.model_selection import train_test_split

# Download the Iris CSV, separate the 'class' target from the features, and
# make a stratified 50/50 train/test split with a fixed seed.
url = 'https://raw.githubusercontent.com/TripathiAshutosh/dataset/main/iris.csv'
target_column = 'class'

df = pd.read_csv(filepath_or_buffer=url, sep=',')
y = df[target_column]
X = df.drop(columns=[target_column])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    stratify=y,
    random_state=42,
)

In [7]:
from mlops.tracking import start_iteration


def log_single_iteration(iteration_name: str,
                         model_params: dict = None,
                         metrics: dict = None,
                         model_path: str = None,
                         dataset_id: str = None,
                         interactive_charts: list = None,
                         model_name: str = None):
    """
    Util function for creating single mlops iteration.

    Each optional value is logged only when it is truthy, so arguments that
    are omitted (or passed as None / empty) are skipped.

    Args:
        iteration_name (str): name of the whole iteration
        model_params (dict): parameters of model
        metrics (dict): model metrics
        model_path (str): path to saved model file
        dataset_id (str): id to dataset from datasets tab
        interactive_charts (list): accepted for call compatibility; this
            helper does not log it
        model_name (str): name of the model, e.g. 'Random Forest'
    """
    with start_iteration(iteration_name=iteration_name) as iteration:
        # Callers in this notebook pass `model_name`; without this parameter
        # those calls fail with TypeError (unexpected keyword argument).
        if model_name:
            iteration.log_model_name(model_name)

        if model_params:
            iteration.log_parameters(parameters=model_params)

        if metrics:
            iteration.log_metrics(metrics=metrics)

        if model_path:
            iteration.log_path_to_model(path_to_model=model_path)

        if dataset_id:
            iteration.log_dataset(dataset_id=dataset_id)

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

parameters = {'n_estimators': 100, 'max_depth': 5}
# Seed the forest so a fresh kernel run reproduces the logged metrics. The
# seed is passed separately so the logged `parameters` dict stays unchanged.
model = RandomForestClassifier(**parameters, random_state=42)
model.fit(X_train, y_train)
prediction = model.predict(X_test)
# Precision, recall and F1 are macro-averaged over the classes; every value
# is rounded to three decimal places before logging.
metrics = {
    'accuracy': round(accuracy_score(y_test, prediction), 3),
    'precision': round(precision_score(y_test, prediction, average='macro'), 3),
    'recall': round(recall_score(y_test, prediction, average='macro'), 3),
    'f1': round(f1_score(y_test, prediction, average='macro'), 3)
}

log_single_iteration(iteration_name='RF with dataset v2',
                     model_name='Random Forest',
                     model_params=parameters,
                     metrics=metrics,
                     dataset_id=dataset_v2['_id'])

In [9]:
parameters = {'n_estimators': 500, 'max_depth': 10}
# Seed the forest so a fresh kernel run reproduces the logged metrics. The
# seed is passed separately so the logged `parameters` dict stays unchanged.
model = RandomForestClassifier(**parameters, random_state=42)
model.fit(X_train, y_train)
prediction = model.predict(X_test)
# Precision, recall and F1 are macro-averaged over the classes; every value
# is rounded to three decimal places before logging.
metrics = {
    'accuracy': round(accuracy_score(y_test, prediction), 3),
    'precision': round(precision_score(y_test, prediction, average='macro'), 3),
    'recall': round(recall_score(y_test, prediction, average='macro'), 3),
    'f1': round(f1_score(y_test, prediction, average='macro'), 3)
}

log_single_iteration(iteration_name='RF with dataset v3',
                     model_name='Random Forest',
                     model_params=parameters,
                     metrics=metrics,
                     dataset_id=dataset_v3['_id'])