In [1]:
import findspark
import pandas as pd

In [2]:
findspark.init("/opt/manual/spark/")

In [3]:
from pyspark.sql import SparkSession, functions as F

# Create SparkSession

In [4]:
spark = (SparkSession.builder
        .appName("Mlflow Example")
        .master("yarn")
        .enableHiveSupport()
        .getOrCreate())

In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [6]:
#! wget https://github.com/databricks/LearningSparkV2/raw/master/mlflow-project-example/data/sf-airbnb-clean.parquet/part-00000-tid-4320459746949313749-5c3d407c-c844-4016-97ad-2edec446aa62-6688-1-c000.snappy.parquet

In [7]:
filePath = "file:///home/train/datasets/sf-airbnb-clean.parquet"

In [8]:
airbnbDF = spark.read.parquet(filePath)
(trainDF, testDF) = airbnbDF.randomSplit([.8, .2], seed=42)

In [9]:
categoricalCols = [field for (field, dataType) in trainDF.dtypes if dataType == "string"]

In [10]:
indexOutputCols = [x + "Index" for x in categoricalCols]

In [11]:
stringIndexer = StringIndexer(inputCols=categoricalCols,
                            outputCols=indexOutputCols,
                            handleInvalid="skip")

In [12]:
numericCols = [field for (field, dataType) in trainDF.dtypes if ((dataType == "double") & (field != "price"))]

In [13]:
assemblerInputs = indexOutputCols + numericCols

In [14]:
vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

In [15]:
rf = RandomForestRegressor(labelCol="price", maxBins=40, maxDepth=5, numTrees=100, seed=42)

In [16]:
pipeline = Pipeline(stages=[stringIndexer, vecAssembler, rf])

# mlflow

In [23]:
# In Python
import mlflow
import mlflow.spark
import pandas as pd
import os

os.environ['MLFLOW_TRACKING_URI'] = 'http://localhost:5000/'
with mlflow.start_run(run_name="random-forest") as run:
    
    
    # Log params: num_trees and max_depth
    mlflow.log_param("num_trees", rf.getNumTrees())
    mlflow.log_param("max_depth", rf.getMaxDepth())
    
    # Log model
    pipelineModel = pipeline.fit(trainDF)
    mlflow.spark.log_model(pipelineModel, "model")
    
    # Log metrics: RMSE and R2
    predDF = pipelineModel.transform(testDF)
    regressionEvaluator = RegressionEvaluator(predictionCol="prediction", labelCol="price")
    rmse = regressionEvaluator.setMetricName("rmse").evaluate(predDF)
    r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
    mlflow.log_metrics({"rmse": rmse, "r2": r2})
    
    # Log artifact: feature importance scores
    rfModel = pipelineModel.stages[-1]
    pandasDF = (pd.DataFrame(list(zip(vecAssembler.getInputCols(), rfModel.featureImportances)),
                columns=["feature", "importance"]) .sort_values(by="importance", ascending=False))
    
    # First write to local filesystem, then tell MLflow where to find that file
    pandasDF.to_csv("feature-importance.csv", index=False)
    mlflow.log_artifact("hdfs://localhost:9000/user/train/mlflow")

FileNotFoundError: [Errno 2] No such file or directory: 'hdfs://localhost:9000/user/train/mlflow'