In [1]:
import mlflow, prefect
import warnings, git, hashlib, os
from prefect import flow, task, get_run_logger, unmapped
from prefect.task_runners import SequentialTaskRunner

In [2]:
#Github config

# Capture the current branch name and commit hash; both are concatenated into
# the flow version string further down, so both must always be plain strings.
try:
    repo = git.Repo(search_parent_directories=True)
    branch = repo.active_branch.name
    sha = repo.head.commit.hexsha
except Exception:
    # Best-effort fallback for any failure to read the repository (e.g. the
    # notebook is not inside a git checkout). `Exception` rather than a bare
    # `except` so KeyboardInterrupt / SystemExit still propagate.
    warnings.warn('No github repository! Generating new SHA.')
    # Placeholder branch so later string concatenation does not hit a NameError.
    branch = 'no-branch'
    # Random 40-hex-character identifier, the same length as a git SHA-1.
    sha = hashlib.sha1(os.urandom(20)).hexdigest()

In [3]:
#Mlflow config

name = 'Best Classifier'

# Look the experiment up by name and create it when the lookup finds nothing.
experiment = mlflow.get_experiment_by_name(name)
if experiment is None:
    experiment_id = mlflow.create_experiment(name)

# Make the experiment the active one and keep the returned handle.
experiment = mlflow.set_experiment(name)

# Metadata bundle describing this series of runs.
run_params = dict(
    experiment_id=experiment.experiment_id,
    description="Testing different classifiers for articial data with prefect and mlflow.",
    tags={'release.version': '0.0.2'},
)

In [4]:
# Summarise the active MLflow experiment.
print(f"Experiment ID: {experiment.experiment_id}")
print(f"Artifact Location: {experiment.artifact_location}")
print(f"Tags: {experiment.tags}")
print(f"Lifecycle stage: {experiment.lifecycle_stage}")

Experiment ID: 1
Artifact Location: file:///home/lpfgarcia/Desktop/practical-prefect/mlruns/1
Tags: {}
Lifecycle stage: active


In [5]:
from sklearn.datasets import make_classification

@task
def generate_data(n_samples, n_features):
    """Generate a synthetic classification dataset.

    Args:
        n_samples: number of rows to generate.
        n_features: total number of feature columns.

    Returns:
        The result of ``make_classification`` (unpacked by callers as
        ``X, y``), built with 2 informative and 2 redundant features and a
        fixed ``random_state`` of 42.
    """
    dataset = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=2,
        n_redundant=2,
        random_state=42,
    )
    return dataset

In [6]:
from sklearn.model_selection import train_test_split

@task
def split_data(X, y):
    """Split the data into train and test parts without shuffling.

    The test part holds ``int(len(X) * 0.1)`` rows.

    Args:
        X: feature matrix.
        y: labels aligned with ``X``.

    Returns:
        The result of ``train_test_split`` (unpacked by callers as
        ``X_train, X_test, y_train, y_test``).
    """
    holdout_size = int(len(X) * 0.1)
    return train_test_split(X, y, test_size=holdout_size, shuffle=False)

In [7]:
@flow
def generate_split(n_samples, n_features):
    """Generate a synthetic dataset and split it into train and test parts.

    Args:
        n_samples: number of rows, forwarded to ``generate_data``.
        n_features: number of feature columns, forwarded to ``generate_data``.

    Returns:
        Whatever ``split_data`` returns for the generated ``X`` and ``y``.
    """
    data, labels = generate_data(n_samples, n_features)
    parts = split_data(data, labels)
    return parts

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

def classifiers():
    """Build the candidate models.

    Returns:
        A list of ``(estimator, display_name)`` tuples, one per classifier,
        with freshly constructed (unfitted) estimators.
    """
    nearest_neighbors = KNeighborsClassifier(3)
    linear_svm = SVC(kernel="linear", C=0.025)
    decision_tree = DecisionTreeClassifier(max_depth=5)
    random_forest = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
    ada_boost = AdaBoostClassifier()
    naive_bayes = GaussianNB()

    return [
        (nearest_neighbors, "Nearest Neighbors"),
        (linear_svm, "Linear SVM"),
        (decision_tree, "Decision Tree"),
        (random_forest, "Random Forest"),
        (ada_boost, "AdaBoost"),
        (naive_bayes, "Naive Bayes"),
    ]

In [9]:
from sklearn.metrics import accuracy_score

@flow
def evaluate_classifier(clf, samples, features):
    """Train and score one classifier on a generated dataset, tracked in MLflow.

    A dataset of the requested size is generated and split, the classifier is
    fitted on the training part, and its accuracy on the test part is measured.
    The classifier name, its hyper-parameters, the dataset dimensions, the
    fitted model and the accuracy are all recorded in a new MLflow run whose
    description carries the Prefect flow run name and id.

    Args:
        clf: ``(estimator, display_name)`` pair; index 0 is the estimator and
            index 1 is its name.
        samples: number of samples, forwarded to ``generate_split``.
        features: number of features, forwarded to ``generate_split``.

    Returns:
        Tuple ``(model, y_pred, acc)``: the fitted model, its predictions on
        the test part, and the accuracy score.
    """
    estimator, clf_name = clf[0], clf[1]

    X_train, X_test, y_train, y_test = generate_split(samples, features)

    logger = get_run_logger()
    # Named `run_context` rather than `flow` so the imported `flow` decorator
    # is not shadowed inside this function.
    run_context = prefect.context.get_run_context()
    description = ' '.join([
        'prefect_flow_name', run_context.flow_run.name,
        'prefect_flow_id', run_context.flow_run.id.urn,
    ])

    with mlflow.start_run(description=description):

        mlflow.log_param('name', clf_name)
        # Record the dataset dimensions: without them, runs produced for
        # different (samples, features) settings cannot be told apart.
        mlflow.log_params({'dataset_samples': samples, 'dataset_features': features})
        mlflow.log_params(estimator.get_params())

        logger.info(f'name {clf_name}')
        logger.info(f'params {estimator.get_params()}')
        logger.info(f'dataset samples {samples} features {features}')

        model = estimator.fit(X_train, y_train)

        mlflow.sklearn.log_model(
            sk_model = model,
            artifact_path = clf_name
        )

        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)

        mlflow.log_metric('accuracy', acc)
        logger.info(f'accuracy {acc}')

    return model, y_pred, acc

In [10]:
@flow(version=' '.join([branch, sha]),
      description='Evaluating different classifier for each dataset.')
def evaluate(clf, samples, features):
    """Run ``evaluate_classifier`` for one configuration and log its accuracy.

    Args:
        clf: classifier entry forwarded to ``evaluate_classifier``.
        samples: number of samples forwarded to ``evaluate_classifier``.
        features: number of features forwarded to ``evaluate_classifier``.
    """
    logger = get_run_logger()
    # Only the accuracy (third element of the result) is reported here.
    _model, _predictions, acc = evaluate_classifier(clf, samples, features)
    logger.info(f'Accuracy of {acc}')

In [11]:
clf_list = classifiers()
samples = range(10000, 80000, 10000)
features = range(20, 30, 2)

# Every (sample count, feature count) pair from the two grids above.
combination = [(x, y) for x in samples for y in features]

# Loop variables use their own names so the `samples` / `features` grids are
# not overwritten with single integers once the loop has run.
# Only the first entry of `clf_list` is evaluated here.
for n_samples, n_features in combination:
    evaluate(clf_list[0], n_samples, n_features)

10:06:12.053 | INFO    | prefect.engine - Created flow run 'silver-sambar' for flow 'evaluate'
10:06:15.887 | INFO    | Flow run 'silver-sambar' - Created subflow run 'heavenly-quetzal' for flow 'evaluate-classifier'
10:06:18.432 | INFO    | Flow run 'heavenly-quetzal' - Created subflow run 'hissing-agama' for flow 'generate-split'
10:06:18.546 | INFO    | Flow run 'hissing-agama' - Created task run 'generate_data-eeb4694a-0' for task 'generate_data'
10:06:18.547 | INFO    | Flow run 'hissing-agama' - Executing 'generate_data-eeb4694a-0' immediately...
10:06:19.822 | INFO    | Task run 'generate_data-eeb4694a-0' - Finished in state Completed()
10:06:20.478 | INFO    | Flow run 'hissing-agama' - Created task run 'split_data-b2f518fa-0' for task 'split_data'
10:06:20.482 | INFO    | Flow run 'hissing-agama' - Executing 'split_data-b2f518fa-0' immediately...
10:06:21.768 | INFO    | Task run 'split_data-b2f518fa-0' - Finished in state Completed()
10:06:22.467 | INFO    | Flow run 'hissing

10:08:53.512 | INFO    | prefect.engine - Created flow run 'dainty-marmoset' for flow 'evaluate'
10:08:56.042 | INFO    | Flow run 'dainty-marmoset' - Created subflow run 'cherubic-beagle' for flow 'evaluate-classifier'
10:08:57.895 | INFO    | Flow run 'cherubic-beagle' - Created subflow run 'positive-muskox' for flow 'generate-split'
10:08:59.109 | INFO    | Flow run 'positive-muskox' - Created task run 'generate_data-eeb4694a-0' for task 'generate_data'
10:08:59.111 | INFO    | Flow run 'positive-muskox' - Executing 'generate_data-eeb4694a-0' immediately...
10:09:00.412 | INFO    | Task run 'generate_data-eeb4694a-0' - Finished in state Completed()
10:09:01.005 | INFO    | Flow run 'positive-muskox' - Created task run 'split_data-b2f518fa-0' for task 'split_data'
10:09:01.006 | INFO    | Flow run 'positive-muskox' - Executing 'split_data-b2f518fa-0' immediately...
10:09:02.278 | INFO    | Task run 'split_data-b2f518fa-0' - Finished in state Completed()
10:09:02.942 | INFO    | Flow 

10:11:48.524 | INFO    | prefect.engine - Created flow run 'manipulative-coyote' for flow 'evaluate'
10:11:51.771 | INFO    | Flow run 'manipulative-coyote' - Created subflow run 'hasty-hound' for flow 'evaluate-classifier'
10:11:53.709 | INFO    | Flow run 'hasty-hound' - Created subflow run 'big-panda' for flow 'generate-split'
10:11:54.983 | INFO    | Flow run 'big-panda' - Created task run 'generate_data-eeb4694a-0' for task 'generate_data'
10:11:54.984 | INFO    | Flow run 'big-panda' - Executing 'generate_data-eeb4694a-0' immediately...
10:11:56.949 | INFO    | Task run 'generate_data-eeb4694a-0' - Finished in state Completed()
10:11:57.570 | INFO    | Flow run 'big-panda' - Created task run 'split_data-b2f518fa-0' for task 'split_data'
10:11:57.572 | INFO    | Flow run 'big-panda' - Executing 'split_data-b2f518fa-0' immediately...
10:11:58.900 | INFO    | Task run 'split_data-b2f518fa-0' - Finished in state Completed()
10:11:59.652 | INFO    | Flow run 'big-panda' - Finished in 

10:14:18.666 | INFO    | Flow run 'masked-stork' - Finished in state Completed('All states completed.')
10:14:35.026 | INFO    | prefect.engine - Created flow run 'fervent-lyrebird' for flow 'evaluate'
10:14:38.161 | INFO    | Flow run 'fervent-lyrebird' - Created subflow run 'powerful-tarsier' for flow 'evaluate-classifier'
10:14:40.001 | INFO    | Flow run 'powerful-tarsier' - Created subflow run 'opalescent-wren' for flow 'generate-split'
10:14:41.794 | INFO    | Flow run 'opalescent-wren' - Created task run 'generate_data-eeb4694a-0' for task 'generate_data'
10:14:41.795 | INFO    | Flow run 'opalescent-wren' - Executing 'generate_data-eeb4694a-0' immediately...
10:14:43.236 | INFO    | Task run 'generate_data-eeb4694a-0' - Finished in state Completed()
10:14:43.857 | INFO    | Flow run 'opalescent-wren' - Created task run 'split_data-b2f518fa-0' for task 'split_data'
10:14:43.859 | INFO    | Flow run 'opalescent-wren' - Executing 'split_data-b2f518fa-0' immediately...
10:14:45.167

10:17:19.021 | INFO    | Flow run 'dark-seagull' - Accuracy of 0.9142
10:17:19.844 | INFO    | Flow run 'dark-seagull' - Finished in state Completed('All states completed.')
10:17:32.900 | INFO    | prefect.engine - Created flow run 'adamant-bug' for flow 'evaluate'
10:17:36.090 | INFO    | Flow run 'adamant-bug' - Created subflow run 'rainbow-spoonbill' for flow 'evaluate-classifier'
10:17:37.941 | INFO    | Flow run 'rainbow-spoonbill' - Created subflow run 'xanthic-cheetah' for flow 'generate-split'
10:17:39.175 | INFO    | Flow run 'xanthic-cheetah' - Created task run 'generate_data-eeb4694a-0' for task 'generate_data'
10:17:39.175 | INFO    | Flow run 'xanthic-cheetah' - Executing 'generate_data-eeb4694a-0' immediately...
10:17:40.643 | INFO    | Task run 'generate_data-eeb4694a-0' - Finished in state Completed()
10:17:41.257 | INFO    | Flow run 'xanthic-cheetah' - Created task run 'split_data-b2f518fa-0' for task 'split_data'
10:17:41.258 | INFO    | Flow run 'xanthic-cheetah' -

10:20:23.070 | INFO    | Flow run 'invaluable-whale' - Finished in state Completed()
10:20:23.071 | INFO    | Flow run 'calm-locust' - Accuracy of 0.8596666666666667
10:20:24.516 | INFO    | Flow run 'calm-locust' - Finished in state Completed('All states completed.')
10:20:36.338 | INFO    | prefect.engine - Created flow run 'bold-goldfish' for flow 'evaluate'
10:20:39.613 | INFO    | Flow run 'bold-goldfish' - Created subflow run 'polite-rottweiler' for flow 'evaluate-classifier'
10:20:41.836 | INFO    | Flow run 'polite-rottweiler' - Created subflow run 'fluorescent-chameleon' for flow 'generate-split'
10:20:43.279 | INFO    | Flow run 'fluorescent-chameleon' - Created task run 'generate_data-eeb4694a-0' for task 'generate_data'
10:20:43.282 | INFO    | Flow run 'fluorescent-chameleon' - Executing 'generate_data-eeb4694a-0' immediately...
10:20:45.937 | INFO    | Task run 'generate_data-eeb4694a-0' - Finished in state Completed()
10:20:46.568 | INFO    | Flow run 'fluorescent-chamel