# Spark Job with MLRun
Using MLRun to run Spark job.
The Spark job will run a describe function, which generates profile report<br>
from an Apache Spark DataFrame (Based on pandas_profiling).<br>

For each column the following statistics - if relevant for the column type - are presented:

**Essentials:** `type`, `unique values`, `missing values`,

**Quantile statistics:** `minimum value`, `Q1`, `median`, `Q3`, `maximum`, `range`, `interquartile range`.

**Descriptive statistics:** `mean`, `mode`, `standard deviation`, `sum`, `median absolute deviation`,<br> 
                            `coefficient of variation`, `kurtosis`, `skewness`.<br>
                        
**Most frequent values:** for categorical data 

## Build Function

In [1]:
# install prerequisites
# prerequisites for the notebook is installing 2 packages yfinance yahoo_fin for uploading stocks data 
import importlib.util
import IPython

def install_missing_packages(packages):
    install_flag = False
    for package in packages:
        spec = importlib.util.find_spec(package)
        if spec is None:
            %pip install {package}
            install_flag = True
        else:     
            print("package {} installed".format(package))
        if install_flag:            
            print ("restarting kernerl due to package install")
            IPython.Application.instance().kernel.do_shutdown(True)
# For illustrative purposes.
packages  = ['mlrun', 'matplotlib']
install_missing_packages(packages)

package mlrun installed
package matplotlib installed


In [2]:
import mlrun
from mlrun.platforms.iguazio import mount_v3io, mount_v3iod
from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx

import os
from subprocess import run
import pandas as pd
import numpy as np

from pyspark.sql.types import LongType
from pyspark.sql import SparkSession

## Build Spark Describe Helper Functions

In [3]:
import sys
import base64 as b64
import warnings
warnings.filterwarnings("ignore")

from itertools import product
import matplotlib

import json
from matplotlib import pyplot as plt
from pkg_resources import resource_filename
import six
from pyspark.sql import DataFrame as SparkDataFrame
from pyspark.sql.functions import (abs as df_abs, col, count, countDistinct,
                                   max as df_max, mean, min as df_min,
                                   sum as df_sum, when
                                   )
from pyspark.sql.functions import variance, stddev, kurtosis, skewness


def describe(df, bins, corr_reject, config, **kwargs):
    if not isinstance(df, SparkDataFrame):
        raise TypeError("df must be of type pyspark.sql.DataFrame")

    # Number of rows:
    table_stats = {"n": df.count()}
    if table_stats["n"] == 0:
        raise ValueError("df cannot be empty")

    try:
        # reset matplotlib style before use
        matplotlib.style.use("default")
    except:
        pass

    # Function to "pretty name" floats:
    def pretty_name(x):
        x *= 100
        if x == int(x):
            return '%.0f%%' % x
        else:
            return '%.1f%%' % x

    # Function to compute the correlation matrix:
    def corr_matrix(df, columns=None):
        if columns is None:
            columns = df.columns
        combinations = list(product(columns,columns))

        def separate(l, n):
            for i in range(0, len(l), n):
                yield l[i:i+n]

        grouped = list(separate(combinations,len(columns)))
        df_cleaned = df.select(*columns).na.drop(how="any")

        for i in grouped:
            for j in enumerate(i):
                i[j[0]] = i[j[0]] + (df_cleaned.corr(str(j[1][0]), str(j[1][1])),)

        df_pandas = pd.DataFrame(grouped).applymap(lambda x: x[2])
        df_pandas.columns = columns
        df_pandas.index = columns
        
        return df_pandas

    # Compute histogram 
    def create_hist_data(df, column, minim, maxim, bins=10):

        def create_all_conditions(current_col, column, left_edges, count=1):
            """
            Recursive function that exploits the
            ability to call the Spark SQL Column method
            .when() in a recursive way.
            """
            left_edges = left_edges[:]
            if len(left_edges) == 0:
                return current_col
            if len(left_edges) == 1:
                next_col = current_col.when(col(column) >= float(left_edges[0]), count)
                left_edges.pop(0)
                return create_all_conditions(next_col, column, left_edges[:], count+1)
            next_col = current_col.when((float(left_edges[0]) <= col(column))
                                        & (col(column) < float(left_edges[1])), count)
            left_edges.pop(0)
            return create_all_conditions(next_col, column, left_edges[:], count+1)

        num_range = maxim - minim
        bin_width = num_range / float(bins)
        left_edges = [minim]
        for _bin in range(bins):
            left_edges = left_edges + [left_edges[-1] + bin_width]
        left_edges.pop()
        expression_col = when((float(left_edges[0]) <= col(column))
                              & (col(column) < float(left_edges[1])), 0)
        left_edges_copy = left_edges[:]
        left_edges_copy.pop(0)
        bin_data = (df.select(col(column))
                    .na.drop()
                    .select(col(column),
                            create_all_conditions(expression_col,
                                                  column,
                                                  left_edges_copy
                                                 ).alias("bin_id")
                           )
                    .groupBy("bin_id").count()
                   ).toPandas()

        # If no data goes into one bin, it won't 
        # appear in bin_data; so we should fill
        # in the blanks:
        bin_data.index = bin_data["bin_id"]
        new_index = list(range(bins))
        bin_data = bin_data.reindex(new_index)
        bin_data["bin_id"] = bin_data.index
        bin_data = bin_data.fillna(0)

        bin_data["left_edge"] = left_edges
        bin_data["width"] = bin_width
        

        return bin_data


    def describe_integer_1d(df, column, current_result, nrows):
        
        stats_df = df.select(column).na.drop().agg(mean(col(column)).alias("mean"),
                                                       df_min(col(column)).alias("min"),
                                                       df_max(col(column)).alias("max"),
                                                       variance(col(column)).alias("variance"),
                                                       kurtosis(col(column)).alias("kurtosis"),
                                                       stddev(col(column)).alias("std"),
                                                       skewness(col(column)).alias("skewness"),
                                                       df_sum(col(column)).alias("sum")
                                                       ).toPandas()


        for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
            stats_df[pretty_name(x)] = (df.select(column)
                                        .na.drop()
                                        .selectExpr("percentile(`{col}`,CAST({n} AS DOUBLE))"
                                                    .format(col=column, n=x)).toPandas().iloc[:,0]
                                        )
        stats = stats_df.iloc[0].copy()
        stats.name = column
        stats["range"] = stats["max"] - stats["min"]
        stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
        stats["cv"] = stats["std"] / float(stats["mean"])
        stats["mad"] = (df.select(column)
                        .na.drop()
                        .select(df_abs(col(column)-stats["mean"]).alias("delta"))
                        .agg(df_sum(col("delta"))).toPandas().iloc[0,0] / float(current_result["count"]))
        stats["type"] = "NUM"
        stats['n_zeros'] = df.select(column).where(col(column)==0.0).count()
        stats['p_zeros'] = stats['n_zeros'] / float(nrows)

        hist_data = create_hist_data(df, column, stats["min"], stats["max"], bins)

        return stats

    def describe_float_1d(df, column, current_result, nrows):
        stats_df = df.select(column).na.drop().agg(mean(col(column)).alias("mean"),
                                                       df_min(col(column)).alias("min"),
                                                       df_max(col(column)).alias("max"),
                                                       variance(col(column)).alias("variance"),
                                                       kurtosis(col(column)).alias("kurtosis"),
                                                       stddev(col(column)).alias("std"),
                                                       skewness(col(column)).alias("skewness"),
                                                       df_sum(col(column)).alias("sum")
                                                       ).toPandas()

        for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
            stats_df[pretty_name(x)] = (df.select(column)
                                        .na.drop()
                                        .selectExpr("percentile_approx(`{col}`,CAST({n} AS DOUBLE))"
                                                    .format(col=column, n=x)).toPandas().iloc[:,0]
                                        )
        stats = stats_df.iloc[0].copy()
        stats.name = column
        stats["range"] = stats["max"] - stats["min"]
        stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
        stats["cv"] = stats["std"] / float(stats["mean"])
        stats["mad"] = (df.select(column)
                        .na.drop()
                        .select(df_abs(col(column)-stats["mean"]).alias("delta"))
                        .agg(df_sum(col("delta"))).toPandas().iloc[0,0] / float(current_result["count"]))
        stats["type"] = "NUM"
        stats['n_zeros'] = df.select(column).where(col(column)==0.0).count()
        stats['p_zeros'] = stats['n_zeros'] / float(nrows)

        hist_data = create_hist_data(df, column, stats["min"], stats["max"], bins)

        return stats

    def describe_date_1d(df, column):
        stats_df = df.select(column).na.drop().agg(df_min(col(column)).alias("min"),
                                                   df_max(col(column)).alias("max")
                                                  ).toPandas()
        stats = stats_df.iloc[0].copy()
        stats.name = column

        if isinstance(stats["max"], pd.Timestamp):
            stats = stats.astype(object)
            stats["max"] = str(stats["max"].to_pydatetime())
            stats["min"] = str(stats["min"].to_pydatetime())

        else:
            stats["range"] = stats["max"] - stats["min"]
        stats["type"] = "DATE"
        return stats

    def guess_json_type(string_value):
        try:
            obj = json.loads(string_value)
        except:
            return None

        return type(obj)

    def describe_categorical_1d(df, column):
        value_counts = (df.select(column).na.drop()
                        .groupBy(column)
                        .agg(count(col(column)))
                        .orderBy("count({c})".format(c=column),ascending=False)
                       ).cache()

        # Get the most frequent class:
        stats = (value_counts
                 .limit(1)
                 .withColumnRenamed(column, "top")
                 .withColumnRenamed("count({c})".format(c=column), "freq")
                ).toPandas().iloc[0]

        # Get the top 50 classes by value count,
        # and put the rest of them grouped at the
        # end of the Series:
        top_50 = value_counts.limit(50).toPandas().sort_values("count({c})".format(c=column),
                                                               ascending=False)
        top_50_categories = top_50[column].values.tolist()

        others_count = pd.Series([df.select(column).na.drop()
                        .where(~(col(column).isin(*top_50_categories)))
                        .count()
                        ], index=["***Other Values***"])
        others_distinct_count = pd.Series([value_counts
                                .where(~(col(column).isin(*top_50_categories)))
                                .count()
                                ], index=["***Other Values Distinct Count***"])

        top = top_50.set_index(column)["count({c})".format(c=column)]
        top = top.append(others_count)
        top = top.append(others_distinct_count)
        stats["value_counts"] = top
        stats["type"] = "CAT"
        value_counts.unpersist()
        unparsed_valid_jsons = df.select(column).na.drop().rdd.map(
            lambda x: guess_json_type(x[column])).filter(
            lambda x: x).distinct().collect()
        stats["unparsed_json_types"] = unparsed_valid_jsons
        return stats

    def describe_constant_1d(df, column):
        stats = pd.Series(['CONST'], index=['type'], name=column)
        stats["value_counts"] = (df.select(column)
                                 .na.drop()
                                 .limit(1)).toPandas().iloc[:,0].value_counts()
        return stats

    def describe_unique_1d(df, column):
        stats = pd.Series(['UNIQUE'], index=['type'], name=column)
        stats["value_counts"] = (df.select(column)
                                 .na.drop()
                                 .limit(50)).toPandas().iloc[:,0].value_counts()
        return stats

    def describe_1d(df, column, nrows, lookup_config=None):
        column_type = df.select(column).dtypes[0][1]
        if ("array" in column_type) or ("stuct" in column_type) or ("map" in column_type):
            raise NotImplementedError("Column {c} is of type {t} and cannot be analyzed".format(c=column, t=column_type))

        distinct_count = df.select(column).agg(countDistinct(col(column)).alias("distinct_count")).toPandas()
        non_nan_count = df.select(column).na.drop().select(count(col(column)).alias("count")).toPandas()
        results_data = pd.concat([distinct_count, non_nan_count],axis=1)
        results_data["p_unique"] = results_data["distinct_count"] / float(results_data["count"])
        results_data["is_unique"] = results_data["distinct_count"] == nrows
        results_data["n_missing"] = nrows - results_data["count"]
        results_data["p_missing"] = results_data["n_missing"] / float(nrows)
        results_data["p_infinite"] = 0
        results_data["n_infinite"] = 0
        result = results_data.iloc[0].copy()
        result["memorysize"] = 0
        result.name = column

        if result["distinct_count"] <= 1:
            result =  pd.concat([result,describe_constant_1d(df, column)])
        elif column_type in {"tinyint", "smallint", "int", "bigint"}:
            result =  pd.concat([result,describe_integer_1d(df, column, result, nrows)])                                
        elif column_type in {"float", "double", "decimal"}:
            result =  pd.concat([result,describe_float_1d(df, column, result, nrows)])                                                                            
        elif column_type in {"date", "timestamp"}:
            result =  pd.concat([result,describe_date_1d(df, column)])                                                                                                            
        elif result["is_unique"] == True:
            result =  pd.concat([result,describe_unique_1d(df, column)])                                
        else:
            result =  pd.concat([result,describe_categorical_1d(df, column)])                    
            result = result.append(describe_categorical_1d(df, column))
            # Fix to also count MISSING value in the distict_count field:
            if result["n_missing"] > 0:
                result["distinct_count"] = result["distinct_count"] + 1

        if (result["count"] > result["distinct_count"] > 1):
            try:
                result["mode"] = result["top"]
            except KeyError:
                result["mode"] = 0
        else:
            try:
                result["mode"] = result["value_counts"].index[0]
            except KeyError:
                result["mode"] = 0
            # If and IndexError happens,
            # it is because all column are NULLs:
            except IndexError:
                result["mode"] = "MISSING"

        if lookup_config:
            lookup_object = lookup_config['object']
            col_name_in_db = lookup_config['col_name_in_db'] if 'col_name_in_db' in lookup_config else None
            try:
                matched, unmatched = lookup_object.lookup(df.select(column), col_name_in_db)
                result['lookedup_values'] = str(matched.count()) + "/" + str(df.select(column).count())
            except:
                result['lookedup_values'] = 'FAILED'
        else:
            result['lookedup_values'] = ''

        return result


    # build final report:
    ldesc = {}
    for colum in df.columns:
        if colum in config:
            if 'lookup' in config[colum]:
                lookup_config = config[colum]['lookup']
                desc = describe_1d(df, colum, table_stats["n"], lookup_config=lookup_config)
            else:
                desc = describe_1d(df, colum, table_stats["n"])
        else:
            desc = describe_1d(df, colum, table_stats["n"])
        ldesc.update({colum: desc})

    # Compute correlation matrix
    if corr_reject is not None:
        computable_corrs = [colum for colum in ldesc if ldesc[colum]["type"] in {"NUM"}]

        if len(computable_corrs) > 0:
            corr = corr_matrix(df, columns=computable_corrs)
            for x, corr_x in corr.iterrows():
                for y, corr in corr_x.items():
                    if x == y:
                        break

    # Convert ldesc (final report) to a DataFrame
    variable_stats = pd.DataFrame(ldesc)

    # General statistics
    table_stats["nvar"] = len(df.columns)
    table_stats["total_missing"] = float(variable_stats.loc["n_missing"].sum()) / (table_stats["n"] * table_stats["nvar"])
    memsize = 0
    table_stats['memsize'] = fmt_bytesize(memsize)
    table_stats['recordsize'] = fmt_bytesize(memsize / table_stats['n'])
    table_stats.update({k: 0 for k in ("NUM", "DATE", "CONST", "CAT", "UNIQUE", "CORR")})
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))
    table_stats['REJECTED'] = table_stats['CONST'] + table_stats['CORR']

    freq_dict = {}
    for var in variable_stats:
        if "value_counts" not in variable_stats[var]:
            pass
        elif not(variable_stats[var]["value_counts"] is np.nan):
            freq_dict[var] = variable_stats[var]["value_counts"]
        else:
            pass
    try:
        variable_stats = variable_stats.drop("value_counts")
    except (ValueError, KeyError):
        pass

    return table_stats, variable_stats.T, freq_dict

import numpy as np
from pyspark.sql.functions import abs as absou

SKEWNESS_CUTOFF = 20
DEFAULT_FLOAT_FORMATTER = u'spark_df_profiling.__default_float_formatter'

# formmating functions
def gradient_format(value, limit1, limit2, c1, c2):
    def LerpColour(c1,c2,t):
        return (int(c1[0]+(c2[0]-c1[0])*t),int(c1[1]+(c2[1]-c1[1])*t),int(c1[2]+(c2[2]-c1[2])*t))
    c = LerpColour(c1, c2, (value-limit1)/(limit2-limit1))
    return fmt_color(value,"rgb{}".format(str(c)))


def fmt_color(text, color):
    return(u'<span style="color:{color}">{text}</span>'.format(color=color,text=str(text)))


def fmt_class(text, cls):
    return(u'<span class="{cls}">{text}</span>'.format(cls=cls,text=str(text)))


def fmt_bytesize(num, suffix='B'):
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if num < 0:
            num = num*-1
            if num < 1024.0:
                return "%3.1f %s%s" % (num, unit, suffix)
            num /= 1024.0
    return "%.1f %s%s" % (num, 'Yi', suffix)


def fmt_percent(v):
    return  "{:2.1f}%".format(v*100)

def fmt_varname(v):
    return u'<code>{0}</code>'.format(v)


value_formatters={
        u'freq': (lambda v: gradient_format(v, 0, 62000, (30, 198, 244), (99, 200, 72))),
        u'p_missing': fmt_percent,
        u'p_infinite': fmt_percent,
        u'p_unique': fmt_percent,
        u'p_zeros': fmt_percent,
        u'memorysize': fmt_bytesize,
        u'total_missing': fmt_percent,
        DEFAULT_FLOAT_FORMATTER: lambda v: str(float('{:.5g}'.format(v))).rstrip('0').rstrip('.'),
        u'correlation_var': lambda v: fmt_varname(v),
        u'unparsed_json_types': lambda v: ', '.join([s.__name__ for s in v])
        }

def fmt_row_severity(v):
    if np.isnan(v) or v<= 0.01:
        return "ignore"
    else:
        return "alert"

def fmt_skewness(v):
    if not np.isnan(v) and (v<-SKEWNESS_CUTOFF or v> SKEWNESS_CUTOFF):
        return "alert"
    else:
        return ""

row_formatters={
    u'p_zeros': fmt_row_severity,
    u'p_missing': fmt_row_severity,
    u'p_infinite': fmt_row_severity,
    u'n_duplicates': fmt_row_severity,
    u'skewness': fmt_skewness,
}

## Set Spark Describe Function

In [4]:
def describe_spark(context: MLClientCtx, 
                   dataset: DataItem,
                   bins: int=30,
                   describe_extended: bool=True)-> None:
    """
    Generates profile reports from an Apache Spark DataFrame. 
    Based on pandas_profiling, but for Spark's DataFrames instead of pandas.
    For each column the following statistics - if relevant for the column type - are presented:
    
    Essentials: type, unique values, missing values
    
    Quantile statistics: minimum value, Q1, median, Q3, maximum, range, interquartile range
    
    Descriptive statistics: mean, mode, standard deviation, sum, median absolute deviation, 
                            coefficient of variation, kurtosis, skewness
                            
    Most frequent values: for categorical data 
    --------------------------------------------------------------------------------------------
    Parameters:
                context : MLClientCtx
                          MLRun introduces a concept of a runtime "context", 
                          the code can be set up to get parameters and inputs from the context, 
                          as well as log run outputs, artifacts, tags, and time-series metrics in the context.
                                      
                dataset : csv_file
                          csv file which needs to be local (on our machine)
                          the default location will be "/v3io/projects/<file_name> 
                          which can be change by using mlrun.mount_v3io later in the function specs
                          
                bins :    Integer
                          Number of bin in histograms
                          
                describe_extended : Bool 
                         (True) set to False if the aim is to get a simple 
                         pandas.DataFrame.describe() like infomration
    ---------------------------------------------------------------------------------------------
    Examples: 
               run mlrun function example, inputs will be part of the function inputs.
               artifact_path is part of mlrun function parameters which set the path 
               for logging artifacts, results, dataset, etc.
               
               function.run(inputs={"dataset": "iris.csv",
                                    "bins": 30,
                                    "describe_extended": True},
                                     artifact_path=artifact_path)
    """
    
    # get file location
    location = dataset.local()
    
    # build spark session
    spark = SparkSession.builder.appName("Spark job").config("spark.executor.memory","6g").getOrCreate()
    
    # read csv
    df = spark.read.csv(location, header=True, inferSchema= True)

    # No use for now
    kwargs = []
    
    # take only numric column
    float_cols = [item[0] for item in df.dtypes if item[1].startswith('float') or item[1].startswith('double')]
    
    if describe_extended == True:
        
        # run describe function
        table, variables, freq = describe(df, bins, float_cols, kwargs)

        # get summary table
        tbl_1 = variables.reset_index()

        # prep report 
        if len(freq) != 0:
            tbl_2 = pd.DataFrame.from_dict(freq, orient = "index").sort_index().stack().reset_index()
            tbl_2.columns = ['col', 'key', 'val']
            tbl_2['Merged'] = [{key: val} for key, val in zip(tbl_2.key, tbl_2.val)]
            tbl_2 = tbl_2.groupby('col', as_index=False).agg(lambda x: tuple(x))[['col','Merged']]

            # get summary
            summary = pd.merge(tbl_1, tbl_2, how='left', left_on='index', right_on='col')

        else:
            summary = tbl_1

        # log final report
        context.log_dataset("summary_stats", 
                            df=summary,
                            format="csv", index=False,
                            artifact_path=context.artifact_subpath('data'))

        # log overview
        context.log_results(table)
    
    else:
        # run simple describe and save to pandas
        tbl_1 = df.describe().toPandas()
        
        # save final report and transpose 
        summary = tbl_1.T
        
        # log final report
        context.log_dataset("summary_stats", 
                            df=summary,
                            format="csv", index=False,
                            artifact_path=context.artifact_subpath('data'))
    
    # stop spark session
    spark.stop()


In [5]:
# nuclio: end-code

### Download iris dataset

In [6]:
import requests
import shutil

def download_file(url,path):
    local_filename = url.split('/')[-1]
    
    #file_path = path+"/"+local_filename
    with requests.get(url, stream=True) as r:
        with open("/v3io/projects/"+local_filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)

    return local_filename

url = "https://s3.wasabisys.com/iguazio/data/iris/iris_dataset.csv"

download_file(url,'/v3io/projects')

'iris_dataset.csv'

Please don't remove the # nuclio: end-code cell above
### Set MLRun Function Specs

In [7]:
#get spark service name
from configparser import ConfigParser
from itertools import chain

parser = ConfigParser()
configFilePath = os.environ['SPARK_HOME']+'/conf/spark-defaults.conf'
print(configFilePath)
with open(configFilePath) as lines:
    lines = chain(("[top]",), lines)  # This line does the trick.
    parser.read_file(lines)
    spark_service_name = parser["top"]["spark.master"].split("://")[1].split("-master")[0]   
print(spark_service_name) 

/spark/conf/spark-defaults.conf
spark


In [8]:
# mlrun will transform the code above (up to nuclio: end-code cell) into serverless function 
# which will run in k8s pods
fn = mlrun.code_to_function(handler="describe_spark", kind="remote-spark")



In [9]:
fn.with_spark_service(spark_service=spark_service_name)

fn.spec.build.commands = ['pip install matplotlib pyspark']
fn.deploy()

> 2023-10-25 08:06:22,787 [info] Started building image: .mlrun/func-default-spark-mlrun-describe:latest
[36mINFO[0m[0000] Retrieving image manifest gcr.io/iguazio/shell:3.5.4-b688.20230907171855 
[36mINFO[0m[0000] Retrieving image gcr.io/iguazio/shell:3.5.4-b688.20230907171855 from registry gcr.io 
[36mINFO[0m[0000] Built cross stage deps: map[]                
[36mINFO[0m[0000] Retrieving image manifest gcr.io/iguazio/shell:3.5.4-b688.20230907171855 
[36mINFO[0m[0000] Returning cached image manifest              
[36mINFO[0m[0000] Executing 0 build triggers                   
[36mINFO[0m[0000] Building stage 'gcr.io/iguazio/shell:3.5.4-b688.20230907171855' [idx: '0', base-idx: '-1'] 
[36mINFO[0m[0000] Unpacking rootfs as cmd RUN pip install matplotlib pyspark requires it. 
[36mINFO[0m[0051] RUN pip install matplotlib pyspark           
[36mINFO[0m[0051] Initializing snapshotter ...                 
[36mINFO[0m[0051] Taking snapshot of full filesystem...        


True

### Set MLRun and Run Function
Once running the function get be monitored here and our projects dashbaord<br>

In [10]:
# set mlrun api path and arrtifact path for logging
artifact_path = mlrun.set_environment(api_path = 'http://mlrun-api:8080',
                                      artifact_path = os.path.abspath('./'))

In [11]:
# run our functions with the relevant params
run_res = fn.run(inputs={"dataset": "iris_dataset.csv"},
                 artifact_path=artifact_path[1], watch=True)

> 2023-10-25 08:12:08,217 [info] Storing function: {'name': 'spark-mlrun-describe-describe-spark', 'uid': '731d733b58e149fcaeb99af47b4ce372', 'db': 'http://mlrun-api:8080'}
> 2023-10-25 08:12:08,528 [info] Job is running in the background, pod: spark-mlrun-describe-describe-spark-kbr4x
23/10/25 08:13:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
package mlrun installed
package matplotlib installed
> 2023-10-25 08:14:08,965 [info] Run execution finished: {'status': 'completed', 'name': 'spark-mlrun-describe-describe-spark'}
                                                                                

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...7b4ce372,0,Oct 25 08:13:15,completed,spark-mlrun-describe-describe-spark,v3io_user=aviakind=remote-sparkowner=aviamlrun/client_version=1.6.0-rc2mlrun/client_python_version=3.9.16host=spark-mlrun-describe-describe-spark-kbr4x,dataset,,n=150nvar=5total_missing=0.0memsize=0.0 YiBrecordsize=0.0 YiBNUM=5DATE=0CONST=0CAT=0UNIQUE=0CORR=0REJECTED=0,summary_stats





> 2023-10-25 08:14:17,144 [info] Run execution finished: {'status': 'completed', 'name': 'spark-mlrun-describe-describe-spark'}


In [12]:
run_res.show()

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...7b4ce372,0,Oct 25 08:13:15,completed,spark-mlrun-describe-describe-spark,v3io_user=aviakind=remote-sparkowner=aviamlrun/client_version=1.6.0-rc2mlrun/client_python_version=3.9.16host=spark-mlrun-describe-describe-spark-kbr4x,dataset,,n=150nvar=5total_missing=0.0memsize=0.0 YiBrecordsize=0.0 YiBNUM=5DATE=0CONST=0CAT=0UNIQUE=0CORR=0REJECTED=0,summary_stats
