In [1]:
import findspark
import pandas as pd

In [2]:
findspark.init("/opt/manual/spark/")

In [3]:
from pyspark.sql import SparkSession, functions as F

# Create SparkSession

In [4]:
spark = (SparkSession.builder
        .appName("Mlflow Advertsing Regression")
        .master("yarn")
        .enableHiveSupport()
        .getOrCreate())

In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [6]:
filePath = "file:///home/train/datasets/Advertising.csv"

In [8]:
df = spark.read.option("header",True).option("inferSchema",True).csv(filePath)
(train_df, test_df) = df.randomSplit([.8, .2], seed=142)

In [9]:
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- TV: double (nullable = true)
 |-- Radio: double (nullable = true)
 |-- Newspaper: double (nullable = true)
 |-- Sales: double (nullable = true)



In [11]:
numericCols = [field for (field, dataType) in train_df.dtypes 
               if ((dataType == "double") & (field not in ("Sales","ID")))]

In [12]:
print(numericCols)

['TV', 'Radio', 'Newspaper']


In [13]:
vecAssembler = VectorAssembler(inputCols=numericCols, outputCol="features")

In [14]:
rf = RandomForestRegressor(labelCol="Sales", maxBins=40, maxDepth=5, numTrees=100, seed=142)

In [16]:
pipeline = Pipeline(stages=[vecAssembler, rf])

# mlflow

In [18]:
# In Python
import mlflow
import mlflow.spark
import pandas as pd
import os

os.environ['MLFLOW_TRACKING_URI'] = 'http://localhost:5000/'
with mlflow.start_run(run_name="advertising-random-forest") as run:
    
    
    # Log params: num_trees and max_depth
    mlflow.log_param("num_trees", rf.getNumTrees())
    mlflow.log_param("max_depth", rf.getMaxDepth())
    
    # Log model
    pipelineModel = pipeline.fit(train_df)
    mlflow.spark.log_model(pipelineModel, "model")
    
    # Log metrics: RMSE and R2
    predDF = pipelineModel.transform(test_df)
    regressionEvaluator = RegressionEvaluator(predictionCol="prediction", labelCol="Sales")
    rmse = regressionEvaluator.setMetricName("rmse").evaluate(predDF)
    r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
    mlflow.log_metrics({"rmse": rmse, "r2": r2})
    
    # Log artifact: feature importance scores
    rfModel = pipelineModel.stages[-1]
    pandasDF = (pd.DataFrame(list(zip(vecAssembler.getInputCols(), rfModel.featureImportances)),
                columns=["feature", "importance"]) .sort_values(by="importance", ascending=False))
    
    # First write to local filesystem, then tell MLflow where to find that file
    pandasDF.to_csv("advertising-feature-importance.csv", index=False)
    mlflow.log_artifact("advertising-feature-importance.csv")
    mlflow.log_artifact(df)

TypeError: stat: path should be string, bytes, os.PathLike or integer, not DataFrame