In [1]:
import mlflow, prefect
import warnings, git, hashlib, os
from prefect import flow, task, get_run_logger, unmapped
from prefect.task_runners import SequentialTaskRunner

In [2]:
# Git config: capture the branch name and commit SHA used to version the flows.

try:
    repo = git.Repo(search_parent_directories=True)
    sha = repo.head.commit.hexsha
    try:
        branch = repo.active_branch.name
    except TypeError:
        # GitPython raises TypeError when HEAD is detached; the commit SHA is
        # still valid, so only the branch label needs a placeholder.
        branch = 'detached'
except (git.exc.InvalidGitRepositoryError, git.exc.NoSuchPathError, ValueError):
    # Not inside a repository (or the repository has no commits yet).
    warnings.warn('No github repository! Generating new SHA.')
    # Both names must be strings: they are concatenated into the flow version.
    branch = 'unknown'
    sha = hashlib.sha1(os.urandom(20)).hexdigest()

In [3]:
# MLflow config

name = 'Best Classifier'
# set_experiment activates the named experiment, creating it first when it
# does not exist yet, and returns the Experiment object.
experiment = mlflow.set_experiment(name)
run_params = {"experiment_id" : experiment.experiment_id,
              "description" : "Testing different classifiers for artificial data with prefect and mlflow.",
              "tags" : {'release.version':'0.0.1'}}

In [4]:
# Show the metadata of the active MLflow experiment, one field per line.
print(
    f"Experiment ID: {experiment.experiment_id}",
    f"Artifact Location: {experiment.artifact_location}",
    f"Tags: {experiment.tags}",
    f"Lifecycle stage: {experiment.lifecycle_stage}",
    sep="\n",
)

Experiment ID: 1
Artifact Location: file:///home/lpfgarcia/Projects/MLOps/practical-prefect/mlruns/1
Tags: {}
Lifecycle stage: active


In [5]:
from sklearn.datasets import make_classification

@task
def generate_data(n_samples, n_features, random_state=42):
    """Generate a synthetic classification dataset.

    Args:
        n_samples: Number of rows to generate.
        n_features: Total number of feature columns; 2 are informative and
            2 are redundant.
        random_state: Seed forwarded to ``make_classification``. Defaults to
            42, the value that was previously hard-coded.

    Returns:
        Whatever ``make_classification`` returns; callers unpack it as
        ``X, y``.
    """
    return make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=2,
        n_redundant=2,
        random_state=random_state,
    )

In [6]:
from sklearn.model_selection import train_test_split

@task
def split_data(X, y):
    """Split ``X`` and ``y`` into train and test parts without shuffling.

    The test part holds ``int(len(X) * 0.1)`` samples.

    Returns:
        The result of ``train_test_split``; callers consume it as
        ``X_train, X_test, y_train, y_test``.
    """
    holdout_size = int(len(X)*0.1)
    return train_test_split(X, y, test_size=holdout_size, shuffle=False)

In [7]:
@flow
def generate_split(n_samples, n_features):
    """Generate a synthetic dataset and return its train/test split."""
    inputs, targets = generate_data(n_samples, n_features)
    partitions = split_data(inputs, targets)
    return partitions

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

def classifiers(random_state=None):
    """Build the candidate classifiers to compare.

    Args:
        random_state: Optional seed passed to the estimators that use
            randomness (decision tree, random forest, AdaBoost). The default
            ``None`` leaves them unseeded, exactly as before.

    Returns:
        A list of ``(estimator, display_name)`` tuples.
    """
    return [
        (KNeighborsClassifier(3), "Nearest Neighbors"),
        (SVC(kernel="linear", C=0.025), "Linear SVM"),
        (DecisionTreeClassifier(max_depth=5, random_state=random_state), "Decision Tree"),
        (RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1,
                                random_state=random_state), "Random Forest"),
        (AdaBoostClassifier(random_state=random_state), "AdaBoost"),
        (GaussianNB(), "Naive Bayes"),
    ]

In [9]:
from sklearn.metrics import accuracy_score

@task
def evaluate_classifier(clf, X_train, X_test, y_train, y_test):
    """Fit one classifier, score it on the test split and log the run to MLflow.

    Args:
        clf: An ``(estimator, display_name)`` pair as produced by
            ``classifiers()``.
        X_train, X_test, y_train, y_test: The train/test split to fit and
            score on.

    Returns:
        A ``(model, y_pred, acc)`` tuple: the fitted estimator, its
        predictions for ``X_test`` and the resulting accuracy score.
    """
    estimator, label = clf[0], clf[1]

    # Deliberately not called `task`: that name is the prefect decorator
    # imported at the top of the notebook and should not be shadowed.
    run_context = prefect.context.get_run_context()
    task_run = run_context.task_run

    # Tie the MLflow run back to the Prefect task run that produced it.
    description = ' '.join(['prefect_task_name', task_run.name,
                            'prefect_task_id', task_run.id.urn])

    with mlflow.start_run(description=description):

        mlflow.log_param('name', label)
        mlflow.log_params(estimator.get_params())

        # NOTE: fit() trains the estimator object that was passed in.
        model = estimator.fit(X_train, y_train)

        mlflow.sklearn.log_model(
            sk_model = model,
            artifact_path = label
        )

        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)

        mlflow.log_metric('accuracy', acc)

    return model, y_pred, acc

In [10]:
@flow(description='Evaluating different classifier for each dataset.',
      version=branch + ' ' + sha)
def evaluate(clf, samples, features):
    """Generate a dataset of the given size, evaluate ``clf`` on it and log the accuracy.

    Args:
        clf: An ``(estimator, display_name)`` pair.
        samples: Number of samples passed to ``generate_split``.
        features: Number of features passed to ``generate_split``.
    """
    run_logger = get_run_logger()
    splits = generate_split(samples, features)
    # Only the accuracy is needed here; the model and predictions are dropped.
    _model, _predictions, acc = evaluate_classifier(clf, *splits)
    run_logger.info(f'Accuracy of {acc}')

In [11]:
clf_list = classifiers()
samples = range(10000, 50000, 10000)
features = range(20, 30, 2)

# Every (n_samples, n_features) pairing from the two grids above.
combination = [(x,y) for x in samples for y in features]
# The loop targets use their own names so the `samples`/`features` grids are
# not overwritten with the last pair once the loop finishes.
# NOTE(review): only clf_list[0] is evaluated; the other classifiers are
# never run. Confirm this is intended.
for n_samples, n_features in combination:
    evaluate(clf_list[0], n_samples, n_features)

17:34:28.558 | INFO    | prefect.engine - Created flow run 'curly-walrus' for flow 'evaluate'
17:34:28.722 | INFO    | Flow run 'curly-walrus' - Created subflow run 'incredible-kakapo' for flow 'generate-split'
17:34:28.783 | INFO    | Flow run 'incredible-kakapo' - Created task run 'generate_data-eeb4694a-0' for task 'generate_data'
17:34:28.784 | INFO    | Flow run 'incredible-kakapo' - Executing 'generate_data-eeb4694a-0' immediately...
17:34:28.864 | INFO    | Task run 'generate_data-eeb4694a-0' - Finished in state Completed()
17:34:28.893 | INFO    | Flow run 'incredible-kakapo' - Created task run 'split_data-b2f518fa-0' for task 'split_data'
17:34:28.894 | INFO    | Flow run 'incredible-kakapo' - Executing 'split_data-b2f518fa-0' immediately...
17:34:28.961 | INFO    | Task run 'split_data-b2f518fa-0' - Finished in state Completed()
17:34:29.005 | INFO    | Flow run 'incredible-kakapo' - Finished in state Completed()
17:34:29.029 | INFO    | Flow run 'curly-walrus' - Created task

17:34:36.984 | INFO    | Flow run 'cryptic-termite' - Executing 'evaluate_classifier-b2b7f5e1-0' immediately...
17:34:37.962 | INFO    | Task run 'evaluate_classifier-b2b7f5e1-0' - Finished in state Completed()
17:34:37.963 | INFO    | Flow run 'cryptic-termite' - Accuracy of 0.8385
17:34:38.057 | INFO    | Flow run 'cryptic-termite' - Finished in state Completed('All states completed.')
17:34:38.134 | INFO    | prefect.engine - Created flow run 'delectable-chimpanzee' for flow 'evaluate'
17:34:38.311 | INFO    | Flow run 'delectable-chimpanzee' - Created subflow run 'hallowed-woodlouse' for flow 'generate-split'
17:34:38.368 | INFO    | Flow run 'hallowed-woodlouse' - Created task run 'generate_data-eeb4694a-0' for task 'generate_data'
17:34:38.369 | INFO    | Flow run 'hallowed-woodlouse' - Executing 'generate_data-eeb4694a-0' immediately...
17:34:38.474 | INFO    | Task run 'generate_data-eeb4694a-0' - Finished in state Completed()
17:34:38.498 | INFO    | Flow run 'hallowed-woodlou

17:34:47.333 | INFO    | Flow run 'uncovered-trout' - Created task run 'split_data-b2f518fa-0' for task 'split_data'
17:34:47.334 | INFO    | Flow run 'uncovered-trout' - Executing 'split_data-b2f518fa-0' immediately...
17:34:47.437 | INFO    | Task run 'split_data-b2f518fa-0' - Finished in state Completed()
17:34:47.512 | INFO    | Flow run 'uncovered-trout' - Finished in state Completed()
17:34:47.536 | INFO    | Flow run 'radical-harrier' - Created task run 'evaluate_classifier-b2b7f5e1-0' for task 'evaluate_classifier'
17:34:47.537 | INFO    | Flow run 'radical-harrier' - Executing 'evaluate_classifier-b2b7f5e1-0' immediately...
17:34:48.514 | INFO    | Task run 'evaluate_classifier-b2b7f5e1-0' - Finished in state Completed()
17:34:48.514 | INFO    | Flow run 'radical-harrier' - Accuracy of 0.817
17:34:48.627 | INFO    | Flow run 'radical-harrier' - Finished in state Completed('All states completed.')
17:34:48.705 | INFO    | prefect.engine - Created flow run 'arboreal-wildcat' for

17:34:57.940 | INFO    | prefect.engine - Created flow run 'laughing-asp' for flow 'evaluate'
17:34:58.117 | INFO    | Flow run 'laughing-asp' - Created subflow run 'brilliant-bonobo' for flow 'generate-split'
17:34:58.175 | INFO    | Flow run 'brilliant-bonobo' - Created task run 'generate_data-eeb4694a-0' for task 'generate_data'
17:34:58.176 | INFO    | Flow run 'brilliant-bonobo' - Executing 'generate_data-eeb4694a-0' immediately...
17:34:58.342 | INFO    | Task run 'generate_data-eeb4694a-0' - Finished in state Completed()
17:34:58.371 | INFO    | Flow run 'brilliant-bonobo' - Created task run 'split_data-b2f518fa-0' for task 'split_data'
17:34:58.371 | INFO    | Flow run 'brilliant-bonobo' - Executing 'split_data-b2f518fa-0' immediately...
17:34:58.490 | INFO    | Task run 'split_data-b2f518fa-0' - Finished in state Completed()
17:34:58.562 | INFO    | Flow run 'brilliant-bonobo' - Finished in state Completed()
17:34:58.583 | INFO    | Flow run 'laughing-asp' - Created task run '