In [1]:
# %load snippet/default_notebook_setup.py
%reload_ext autoreload
%autoreload 2
%load_ext dotenv

import sys
sys.path.append('/home/jovyan')

%dotenv

In [2]:
# %load snippet/default_spark.py
import pyspark
from pyspark.sql import SparkSession, Window, DataFrame
import pyspark.sql.functions as F
import pyspark.sql.types as T

spark = (
    SparkSession.builder
    .config('spark.jars.packages', 'ml.combust.mleap:mleap-spark-base_2.11:0.14.0,ml.combust.mleap:mleap-spark_2.11:0.14.0')
    .getOrCreate()
)

# Load Training Data

we load the training data using our project code base

In [3]:
from project.data import features

In [4]:
data = features.load_iris_features(3, '../').toPandas()
data['features'] = data['features'].apply(lambda x: x.toArray())
data.head()

Unnamed: 0,class,class_index,features
0,Iris-setosa,0.0,"[-0.8976738791967643, 0.8058183933921671, -0.7..."
1,Iris-setosa,0.0,"[-1.1392004834649512, 1.2977777415267786, -1.4..."
2,Iris-setosa,0.0,"[-1.3807270877331392, 1.906407290800036, -2.63..."
3,Iris-setosa,0.0,"[-1.5014903898672336, 2.254473390863657, -3.38..."
4,Iris-setosa,0.0,"[-1.0184371813308577, 1.0372142923171424, -1.0..."


# Train and log a model to Mlflow

There is no difference between a spark pipline model and a sklearn model when it comes to Mlflow.
Simply log the trained model.

In [32]:
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import numpy as np
import mlflow
import mlflow.sklearn

In [33]:
experiment = 'iris_classification'
mlflow.set_tracking_uri(os.environ['MLFLOW_TRACKING_URI'])
mlflow.set_experiment(experiment)

In [34]:
X_train, X_test, y_train, y_test = train_test_split(
    np.stack(data['features'].values), data['class'].values, 
    test_size=0.25, random_state=0, stratify=data['class'].values
)

In [35]:
with mlflow.start_run() as run:
    model = LogisticRegression(
        C=0.1, solver='saga', multi_class='multinomial', 
        penalty='l1', max_iter=2000
    )
    mlflow.log_param('C', 0.1)
    mlflow.log_param('penalty', 'l1')
    mlflow.log_param('max_iter', 2000)
    mlflow.log_param('return_type', 'string')
    model.fit(X_train, y_train)
    f1score = f1_score(y_test, model.predict(X_test), average='macro')
    mlflow.log_metric("f1_score", f1score)
    mlflow.log_metric("n_iter_", model.n_iter_[0])
    mlflow.sklearn.log_model(model, 'iris_classification')
    run_id = run.info.run_id
    print(run.info)

<RunInfo: artifact_uri='s3://artifacts/1/6424c6b1d9d44f419edba6ae764fab5c/artifacts', end_time=None, experiment_id='1', lifecycle_stage='active', run_id='6424c6b1d9d44f419edba6ae764fab5c', run_uuid='6424c6b1d9d44f419edba6ae764fab5c', start_time=1572806695864, status='RUNNING', user_id='jovyan'>


In [36]:
c = mlflow.tracking.MlflowClient()
print(c.get_run(run_id))

<Run: data=<RunData: metrics={'f1_score': 0.9458874458874459, 'n_iter_': 1401.0}, params={'C': '0.1', 'max_iter': '2000', 'penalty': 'l1'}, tags={'mlflow.source.name': '/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py',
 'mlflow.source.type': 'LOCAL',
 'mlflow.user': 'jovyan'}>, info=<RunInfo: artifact_uri='s3://artifacts/1/6424c6b1d9d44f419edba6ae764fab5c/artifacts', end_time=1572806696366, experiment_id='1', lifecycle_stage='active', run_id='6424c6b1d9d44f419edba6ae764fab5c', run_uuid='6424c6b1d9d44f419edba6ae764fab5c', start_time=1572806695864, status='FINISHED', user_id='jovyan'>>
