In [1]:
#****************************************************************************
# (C) Cloudera, Inc. 2020-2023
#  All rights reserved.
#
#  Applicable Open Source License: GNU Affero General Public License v3.0
#
#  NOTE: Cloudera open source products are modular software products
#  made up of hundreds of individual components, each of which was
#  individually copyrighted.  Each Cloudera open source product is a
#  collective work under U.S. Copyright Law. Your license to use the
#  collective work is as provided in your written agreement with
#  Cloudera.  Used apart from the collective work, this file is
#  licensed for your use pursuant to the open source license
#  identified above.
#
#  This code is provided to you pursuant a written agreement with
#  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
#  this code. If you do not have a written agreement with Cloudera nor
#  with an authorized and properly licensed third party, you do not
#  have any rights to access nor to use this code.
#
#  Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
#  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
#  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
#  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
#  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
#  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
#  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
#  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
#  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
#  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
#  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
#  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
#  DATA.
#
# #  Author(s): Paul de Fusco
#***************************************************************************/

In [2]:
import mlflow.spark

In [3]:
import os
import warnings
import sys
import mlflow
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet

In [4]:
import logging
import json
import shutil
import datetime
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
import cml.data_v1 as cmldata

In [5]:
import cml.data_v1 as cmldata

# Sample in-code customization of spark configurations
#from pyspark import SparkContext
SparkContext.setSystemProperty('spark.executor.cores', '2')
SparkContext.setSystemProperty('spark.executor.memory', '2g')

CONNECTION_NAME = "go01-aw-dl"
conn = cmldata.get_connection(CONNECTION_NAME)
spark = conn.get_spark_session()

# Sample usage to run query through spark
EXAMPLE_SQL_QUERY = "show databases"
spark.sql(EXAMPLE_SQL_QUERY).show()

23/11/06 22:31:56 WARN SparkConf: The configuration key 'spark.yarn.access.hadoopFileSystems' has been deprecated as of Spark 3.0 and may be removed in the future. Please use the new key 'spark.kerberos.access.hadoopFileSystems' instead.
23/11/06 22:31:56 WARN SparkConf: The configuration key 'spark.yarn.access.hadoopFileSystems' has been deprecated as of Spark 3.0 and may be removed in the future. Please use the new key 'spark.kerberos.access.hadoopFileSystems' instead.
Setting spark.hadoop.yarn.resourcemanager.principal to pauldefusco
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/06 22:31:57 WARN SparkConf: The configuration key 'spark.yarn.access.hadoopFileSystems' has been deprecated as of Spark 3.0 and may be removed in the future. Please use the new key 'spark.kerberos.access.hadoopFileSystems' instead.
23/11/06 

+--------------------+
|           namespace|
+--------------------+
|         01_car_data|
|           01_car_dw|
|              adb101|
|            airlines|
|        airlines_csv|
|    airlines_iceberg|
|airlines_iceberg_...|
|      airlines_mjain|
|          airquality|
|                ajvp|
|          atlas_demo|
|            bankdemo|
|          bca_jps_l0|
|     bri_ranger_demo|
|cde_demo_pauldefusco|
|   cde_demo_pdefusco|
|        cde_workshop|
|cde_workshop_pdef...|
| cde_workshop_smohan|
|             cdedemo|
+--------------------+
only showing top 20 rows



In [22]:
import os
import numpy as np
import pandas as pd
from datetime import datetime
from pyspark.sql.types import LongType, IntegerType, StringType, FloatType
from pyspark.sql import functions as F
import dbldatagen as dg
import dbldatagen.distributions as dist
from dbldatagen import FakerTextFactory, DataGenerator, fakerText

class LabeledTextGen:

    '''Class to Generate Text Data'''

    def __init__(self, spark):
        self.spark = spark

    def dataGen(self, shuffle_partitions_requested = 8, partitions_requested = 8, data_rows = 10000):

        # setup use of Faker
        FakerTextUS = FakerTextFactory(locale=['en_US'])

        # partition parameters etc.
        self.spark.conf.set("spark.sql.shuffle.partitions", shuffle_partitions_requested)

        fakerDataspec = (DataGenerator(self.spark, rows=data_rows, partitions=partitions_requested)
                    .withColumnSpec("id", minValue=1, maxValue=data_rows, step=1)
                    .withColumn("text", text=FakerTextUS("address"))
                    .withColumn("label", "string", values=["0", "1"],random=True)
                    )
        df = fakerDataspec.build()
                
        df = df.withColumn("idStr", F.col("id").cast(StringType()))\
            .drop("id")\
            .withColumnRenamed("idStr", "id")
     
        df = df.withColumn("labelStr", F.col("label").cast(FloatType()))\
            .drop("label")\
            .withColumnRenamed("labelStr", "label")
        
        return df

In [29]:
dg = LabeledTextGen(spark)

training_df = dg.dataGen()



In [30]:
training_df.show()

[Stage 1:>                                                          (0 + 1) / 1]

+--------------------+---+-----+
|                text| id|label|
+--------------------+---+-----+
|894 Tiffany Court...|  0|  1.0|
|2085 Knapp Traffi...|  1|  1.0|
|3202 Aimee Circle...|  2|  1.0|
|857 Kenneth Lodge...|  3|  1.0|
|40115 Arnold Keys...|  4|  1.0|
|03216 Lisa Street...|  5|  0.0|
|14358 Phillips Po...|  6|  0.0|
|7131 Carr Street
...|  7|  1.0|
|622 Ashley Point
...|  8|  1.0|
|63004 Brittany Gr...|  9|  0.0|
|1122 Morrow Tunne...| 10|  1.0|
|USNV Moore
FPO AP...| 11|  1.0|
|507 Becky Extensi...| 12|  1.0|
|662 Lee Key
North...| 13|  0.0|
|38622 Barr Spring...| 14|  0.0|
|9674 Peter Dale
W...| 15|  1.0|
|179 Maddox Path S...| 16|  1.0|
|4971 Gilbert Inle...| 17|  1.0|
|7981 Valdez Pine ...| 18|  1.0|
|7566 Brenda Ridge...| 19|  0.0|
+--------------------+---+-----+
only showing top 20 rows



                                                                                

In [32]:
def exp1(df):

    mlflow.set_experiment("sparkml-experiment")

    ##EXPERIMENT 1

    df.writeTo("spark_catalog.default.training").using("iceberg").createOrReplace()
    spark.sql("SELECT * FROM spark_catalog.default.training").show()

    ### SHOW TABLE HISTORY AND SNAPSHOTS
    spark.read.format("iceberg").load("spark_catalog.default.training.history").show(20, False)
    spark.read.format("iceberg").load("spark_catalog.default.training.snapshots").show(20, False)

    snapshot_id = spark.read.format("iceberg").load("spark_catalog.default.training.snapshots").select("snapshot_id").tail(1)[0][0]
    committed_at = spark.read.format("iceberg").load("spark_catalog.default.training.snapshots").select("committed_at").tail(1)[0][0].strftime('%m/%d/%Y')
    parent_id = spark.read.format("iceberg").load("spark_catalog.default.training.snapshots").select("parent_id").tail(1)[0][0]
    
    tags = {
      "iceberg_snapshot_id": snapshot_id,
      "iceberg_snapshot_committed_at": committed_at,
      "iceberg_parent_id": parent_id,
      "row_count": training_df.count()
    }
    
    ### MLFLOW EXPERIMENT RUN
    with mlflow.start_run() as run:

        maxIter=8
        regParam=0.01

        tokenizer = Tokenizer(inputCol="text", outputCol="words")
        hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
        lr = LogisticRegression(maxIter=maxIter, regParam=regParam)
        pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
        model = pipeline.fit(training_df)

        mlflow.log_param("maxIter", maxIter)
        mlflow.log_param("regParam", regParam)

        #prediction = model.transform(test)
        mlflow.set_tags(tags)
        mlflow.spark.log_model(model, artifact_path="artifacts")

    mlflow.end_run()
    
    experiment_id = mlflow.get_experiment_by_name("sparkml-experiment").experiment_id
    runs_df = mlflow.search_runs(experiment_id, run_view_type=1)
    
    return runs_df

In [33]:
def exp2(df):
    
    mlflow.set_experiment("sparkml-experiment")
    
    ##EXPERIMENT 2

    ### ICEBERG INSERT DATA - APPEND FROM DATAFRAME

    # PRE-INSERT
    spark.sql("SELECT * FROM spark_catalog.default.training").show()

    temp_df = spark.sql("SELECT * FROM spark_catalog.default.training")
    temp_df.writeTo("spark_catalog.default.training").append()
    df = spark.sql("SELECT * FROM spark_catalog.default.training")

    # PROST-INSERT
    spark.sql("SELECT * FROM spark_catalog.default.training").show()

    spark.read.format("iceberg").load("spark_catalog.default.training.history").show(20, False)
    spark.read.format("iceberg").load("spark_catalog.default.training.snapshots").show(20, False)

    snapshot_id = spark.read.format("iceberg").load("spark_catalog.default.training.snapshots").select("snapshot_id").tail(1)[0][0]
    committed_at = spark.read.format("iceberg").load("spark_catalog.default.training.snapshots").select("committed_at").tail(1)[0][0].strftime('%m/%d/%Y')
    parent_id = spark.read.format("iceberg").load("spark_catalog.default.training.snapshots").select("parent_id").tail(1)[0][0]
    
    tags = {
      "iceberg_snapshot_id": snapshot_id,
      "iceberg_snapshot_committed_at": committed_at,
      "iceberg_parent_id": parent_id,
      "row_count": df.count()
    }
    
    ### MLFLOW EXPERIMENT RUN
    with mlflow.start_run() as run:

        maxIter=10
        regParam=0.002

        tokenizer = Tokenizer(inputCol="text", outputCol="words")
        hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
        lr = LogisticRegression(maxIter=maxIter, regParam=regParam)
        pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
        model = pipeline.fit(training_df)

        mlflow.log_param("maxIter", maxIter)
        mlflow.log_param("regParam", regParam)

        #prediction = model.transform(test)
        mlflow.set_tags(tags)
        mlflow.spark.log_model(model, artifact_path="artifacts")

    mlflow.end_run()
    
    experiment_id = mlflow.get_experiment_by_name("sparkml-experiment").experiment_id
    runs_df = mlflow.search_runs(experiment_id, run_view_type=1)
    
    return runs_df

In [40]:
def exp3(df, snapshot_id):
    ##EXPERIMENT 3

    df = spark.read.option("snapshot-id", snapshot_id).table("spark_catalog.default.training")

    committed_at = spark.sql("SELECT committed_at FROM spark_catalog.default.training.snapshots WHERE snapshot_id = {};".format(snapshot_id)).collect()[0][0].strftime('%m/%d/%Y')
    parent_id = str(spark.sql("SELECT parent_id FROM spark_catalog.default.training.snapshots WHERE snapshot_id = {};".format(snapshot_id)).tail(1)[0][0])

    tags = {
      "iceberg_snapshot_id": snapshot_id,
      "iceberg_snapshot_committed_at": committed_at,
      "iceberg_parent_id": parent_id,
      "row_count": training_df.count()
    }

    ### MLFLOW EXPERIMENT RUN
    with mlflow.start_run() as run:

        maxIter=7
        regParam=0.005

        tokenizer = Tokenizer(inputCol="text", outputCol="words")
        hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
        lr = LogisticRegression(maxIter=maxIter, regParam=regParam)
        pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
        model = pipeline.fit(training_df)

        mlflow.log_param("maxIter", maxIter)
        mlflow.log_param("regParam", regParam)

        #prediction = model.transform(test)
        mlflow.set_tags(tags)
        mlflow.spark.log_model(model, artifact_path="artifacts")#registered_model_name="spark-iceberg-"+username
        
    mlflow.end_run()
    
    experiment_id = mlflow.get_experiment_by_name("sparkml-experiment").experiment_id
    runs_df = mlflow.search_runs(experiment_id, run_view_type=1)

    #spark.stop()
    
    return runs_df

In [35]:
exp1(training_df)

                                                                                

+--------------------+---+-----+
|                text| id|label|
+--------------------+---+-----+
|1545 Amanda Overp...|  0|  1.0|
|31857 Linda Union...|  1|  1.0|
|USNS Payne
FPO AE...|  2|  1.0|
|554 Lopez Via Sui...|  3|  1.0|
|232 Patricia Dale...|  4|  1.0|
|30483 Nathan Port...|  5|  0.0|
|84192 Eddie Missi...|  6|  0.0|
|72145 James Wall
...|  7|  1.0|
|454 Powell Statio...|  8|  1.0|
|41317 Melissa Loc...|  9|  0.0|
|2968 Salazar Park...| 10|  1.0|
|155 Reid Fort Sui...| 11|  1.0|
|949 Gregory Sprin...| 12|  1.0|
|2698 Sylvia Plain...| 13|  0.0|
|56736 Gregory Fie...| 14|  0.0|
|0240 Freeman Hill...| 15|  1.0|
|5343 Jackson Squa...| 16|  1.0|
|86137 Ross Drives...| 17|  1.0|
|35262 Gomez Lodge...| 18|  1.0|
|82798 Orozco Turn...| 19|  0.0|
+--------------------+---+-----+
only showing top 20 rows

+-----------------------+-------------------+---------+-------------------+
|made_current_at        |snapshot_id        |parent_id|is_current_ancestor|
+-----------------------+-----

23/11/06 22:41:48 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
23/11/06 22:41:48 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
23/11/06 22:41:51 WARN SparkConf: The configuration key 'spark.yarn.access.hadoopFileSystems' has been deprecated as of Spark 3.0 and may be removed in the future. Please use the new key 'spark.kerberos.access.hadoopFileSystems' instead.
23/11/06 22:41:51 WARN SparkConf: The configuration key 'spark.yarn.access.hadoopFileSystems' has been deprecated as of Spark 3.0 and may be removed in the future. Please use the new key 'spark.kerberos.access.hadoopFileSystems' instead.
23/11/06 22:41:52 WARN SparkConf: The configuration key 'spark.yarn.access.hadoopFileSystems' has been deprecated as of Spark 3.0 and may be removed in the future. Please use the new key 'spark.kerberos.access.hadoopFileSystems' instead.
23/11/06 22:41:52 WARN SparkConf: The configuration key 'spark.yarn.access.

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,params.regParam,params.maxIter,tags.mlflow.source.name,tags.engineID,tags.mlflow.user,tags.row_count,tags.mlflow.source.type,tags.iceberg_snapshot_id,tags.iceberg_parent_id,tags.mlflow.source.git.commit,tags.iceberg_snapshot_committed_at,tags.mlflow.log-model.history
0,illa-fz4d-d00d-dpn9,8aul-mgvw-shwq-6a2k,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/8aul-mgvw-shwq-6a2k/il...,2023-11-06 17:08:30.259664128+00:00,2023-11-06 17:08:36.633999872+00:00,0.01,8,/usr/local/lib/python3.9/site-packages/ipykern...,utp7rbgxetohb8t3,pauldefusco,4,LOCAL,1283984402985061615,,06b7bed0031ad636f6b3ade4189ea15b164906f3,11/06/2023,
1,ahd5-4200-pzi1-rodx,8aul-mgvw-shwq-6a2k,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/8aul-mgvw-shwq-6a2k/ah...,2023-11-06 17:08:43.845902848+00:00,2023-11-06 17:08:45.760000+00:00,0.002,10,/usr/local/lib/python3.9/site-packages/ipykern...,utp7rbgxetohb8t3,pauldefusco,8,LOCAL,985373206992651716,1.2839844029850616e+18,06b7bed0031ad636f6b3ade4189ea15b164906f3,11/06/2023,
2,otx4-plub-mk9d-rykp,8aul-mgvw-shwq-6a2k,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/8aul-mgvw-shwq-6a2k/ot...,2023-11-06 17:08:52.620167936+00:00,2023-11-06 17:08:53.932000+00:00,0.005,7,/usr/local/lib/python3.9/site-packages/ipykern...,utp7rbgxetohb8t3,pauldefusco,4,LOCAL,1283984402985061615,,06b7bed0031ad636f6b3ade4189ea15b164906f3,11/06/2023,
3,kydf-fcki-cs41-5u9m,8aul-mgvw-shwq-6a2k,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/8aul-mgvw-shwq-6a2k/ky...,2023-11-06 17:15:16.696163840+00:00,2023-11-06 17:15:35.303000064+00:00,0.01,8,/usr/local/lib/python3.9/site-packages/ipykern...,utp7rbgxetohb8t3,pauldefusco,4,LOCAL,4748800489323277205,,06b7bed0031ad636f6b3ade4189ea15b164906f3,11/06/2023,"[{""run_id"": ""kydf-fcki-cs41-5u9m"", ""artifact_p..."
4,8dcm-rk2o-qd1w-li1s,8aul-mgvw-shwq-6a2k,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/8aul-mgvw-shwq-6a2k/8d...,2023-11-06 17:17:25.318180096+00:00,2023-11-06 17:17:45.268999936+00:00,0.002,10,/usr/local/lib/python3.9/site-packages/ipykern...,utp7rbgxetohb8t3,pauldefusco,8,LOCAL,7803587387117571530,4.748800489323276e+18,06b7bed0031ad636f6b3ade4189ea15b164906f3,11/06/2023,"[{""run_id"": ""8dcm-rk2o-qd1w-li1s"", ""artifact_p..."
5,aw2q-9mcc-1v6r-uh9r,8aul-mgvw-shwq-6a2k,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/8aul-mgvw-shwq-6a2k/aw...,2023-11-06 17:19:39.189926144+00:00,2023-11-06 17:19:56.536000+00:00,0.005,7,/usr/local/lib/python3.9/site-packages/ipykern...,utp7rbgxetohb8t3,pauldefusco,4,LOCAL,4748800489323277205,,06b7bed0031ad636f6b3ade4189ea15b164906f3,11/06/2023,"[{""run_id"": ""aw2q-9mcc-1v6r-uh9r"", ""artifact_p..."
6,qko2-4065-n95i-6mry,8aul-mgvw-shwq-6a2k,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/8aul-mgvw-shwq-6a2k/qk...,2023-11-06 17:38:08.899602688+00:00,2023-11-06 17:38:27.249999872+00:00,0.01,8,/usr/local/lib/python3.9/site-packages/ipykern...,utp7rbgxetohb8t3,pauldefusco,4,LOCAL,7226728052987263076,,06b7bed0031ad636f6b3ade4189ea15b164906f3,11/06/2023,"[{""run_id"": ""qko2-4065-n95i-6mry"", ""artifact_p..."
7,owsh-99w0-hj4c-g29y,8aul-mgvw-shwq-6a2k,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/8aul-mgvw-shwq-6a2k/ow...,2023-11-06 17:38:47.414222848+00:00,2023-11-06 17:39:00.593999872+00:00,0.002,10,/usr/local/lib/python3.9/site-packages/ipykern...,utp7rbgxetohb8t3,pauldefusco,8,LOCAL,4665871762421835805,7.226728052987262e+18,06b7bed0031ad636f6b3ade4189ea15b164906f3,11/06/2023,"[{""run_id"": ""owsh-99w0-hj4c-g29y"", ""artifact_p..."
8,dh37-xfvn-dvca-jo9s,8aul-mgvw-shwq-6a2k,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/8aul-mgvw-shwq-6a2k/dh...,2023-11-06 17:39:01.099144960+00:00,2023-11-06 17:39:12.844999936+00:00,0.005,7,/usr/local/lib/python3.9/site-packages/ipykern...,utp7rbgxetohb8t3,pauldefusco,4,LOCAL,7226728052987263076,,06b7bed0031ad636f6b3ade4189ea15b164906f3,11/06/2023,"[{""run_id"": ""dh37-xfvn-dvca-jo9s"", ""artifact_p..."
9,4frl-sp0o-covm-4u8b,8aul-mgvw-shwq-6a2k,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/8aul-mgvw-shwq-6a2k/4f...,2023-11-06 22:41:20.998869760+00:00,2023-11-06 22:42:06.180999936+00:00,0.01,8,/usr/local/lib/python3.9/site-packages/ipykern...,bte84r0ixoc09uga,pauldefusco,10000,LOCAL,1869813012622947967,,06b7bed0031ad636f6b3ade4189ea15b164906f3,11/06/2023,"[{""run_id"": ""4frl-sp0o-covm-4u8b"", ""artifact_p..."


In [36]:
exp2(training_df)

+--------------------+---+-----+
|                text| id|label|
+--------------------+---+-----+
|1545 Amanda Overp...|  0|  1.0|
|31857 Linda Union...|  1|  1.0|
|USNS Payne
FPO AE...|  2|  1.0|
|554 Lopez Via Sui...|  3|  1.0|
|232 Patricia Dale...|  4|  1.0|
|30483 Nathan Port...|  5|  0.0|
|84192 Eddie Missi...|  6|  0.0|
|72145 James Wall
...|  7|  1.0|
|454 Powell Statio...|  8|  1.0|
|41317 Melissa Loc...|  9|  0.0|
|2968 Salazar Park...| 10|  1.0|
|155 Reid Fort Sui...| 11|  1.0|
|949 Gregory Sprin...| 12|  1.0|
|2698 Sylvia Plain...| 13|  0.0|
|56736 Gregory Fie...| 14|  0.0|
|0240 Freeman Hill...| 15|  1.0|
|5343 Jackson Squa...| 16|  1.0|
|86137 Ross Drives...| 17|  1.0|
|35262 Gomez Lodge...| 18|  1.0|
|82798 Orozco Turn...| 19|  0.0|
+--------------------+---+-----+
only showing top 20 rows



                                                                                

+--------------------+---+-----+
|                text| id|label|
+--------------------+---+-----+
|1545 Amanda Overp...|  0|  1.0|
|31857 Linda Union...|  1|  1.0|
|USNS Payne
FPO AE...|  2|  1.0|
|554 Lopez Via Sui...|  3|  1.0|
|232 Patricia Dale...|  4|  1.0|
|30483 Nathan Port...|  5|  0.0|
|84192 Eddie Missi...|  6|  0.0|
|72145 James Wall
...|  7|  1.0|
|454 Powell Statio...|  8|  1.0|
|41317 Melissa Loc...|  9|  0.0|
|2968 Salazar Park...| 10|  1.0|
|155 Reid Fort Sui...| 11|  1.0|
|949 Gregory Sprin...| 12|  1.0|
|2698 Sylvia Plain...| 13|  0.0|
|56736 Gregory Fie...| 14|  0.0|
|0240 Freeman Hill...| 15|  1.0|
|5343 Jackson Squa...| 16|  1.0|
|86137 Ross Drives...| 17|  1.0|
|35262 Gomez Lodge...| 18|  1.0|
|82798 Orozco Turn...| 19|  0.0|
+--------------------+---+-----+
only showing top 20 rows

+-----------------------+-------------------+-------------------+-------------------+
|made_current_at        |snapshot_id        |parent_id          |is_current_ancestor|
+---------

                                                                                

+-----------------------+-------------------+-------------------+---------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|committed_at           |snapshot_id        |parent_id          |operation|manifest_list                                                                                                                                                         |summary                                                                                                                                                                                                   

23/11/06 22:42:44 WARN SparkConf: The configuration key 'spark.yarn.access.hadoopFileSystems' has been deprecated as of Spark 3.0 and may be removed in the future. Please use the new key 'spark.kerberos.access.hadoopFileSystems' instead.
23/11/06 22:42:44 WARN SparkConf: The configuration key 'spark.yarn.access.hadoopFileSystems' has been deprecated as of Spark 3.0 and may be removed in the future. Please use the new key 'spark.kerberos.access.hadoopFileSystems' instead.
23/11/06 22:42:44 WARN SparkConf: The configuration key 'spark.yarn.access.hadoopFileSystems' has been deprecated as of Spark 3.0 and may be removed in the future. Please use the new key 'spark.kerberos.access.hadoopFileSystems' instead.
23/11/06 22:42:45 WARN SparkConf: The configuration key 'spark.yarn.access.hadoopFileSystems' has been deprecated as of Spark 3.0 and may be removed in the future. Please use the new key 'spark.kerberos.access.hadoopFileSystems' instead.


Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,params.regParam,params.maxIter,tags.mlflow.source.name,tags.engineID,tags.mlflow.user,tags.row_count,tags.mlflow.source.type,tags.iceberg_snapshot_id,tags.iceberg_parent_id,tags.mlflow.source.git.commit,tags.iceberg_snapshot_committed_at,tags.mlflow.log-model.history
0,illa-fz4d-d00d-dpn9,8aul-mgvw-shwq-6a2k,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/8aul-mgvw-shwq-6a2k/il...,2023-11-06 17:08:30.259664128+00:00,2023-11-06 17:08:36.633999872+00:00,0.01,8,/usr/local/lib/python3.9/site-packages/ipykern...,utp7rbgxetohb8t3,pauldefusco,4,LOCAL,1283984402985061615,,06b7bed0031ad636f6b3ade4189ea15b164906f3,11/06/2023,
1,ahd5-4200-pzi1-rodx,8aul-mgvw-shwq-6a2k,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/8aul-mgvw-shwq-6a2k/ah...,2023-11-06 17:08:43.845902848+00:00,2023-11-06 17:08:45.760000+00:00,0.002,10,/usr/local/lib/python3.9/site-packages/ipykern...,utp7rbgxetohb8t3,pauldefusco,8,LOCAL,985373206992651716,1.2839844029850616e+18,06b7bed0031ad636f6b3ade4189ea15b164906f3,11/06/2023,
2,otx4-plub-mk9d-rykp,8aul-mgvw-shwq-6a2k,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/8aul-mgvw-shwq-6a2k/ot...,2023-11-06 17:08:52.620167936+00:00,2023-11-06 17:08:53.932000+00:00,0.005,7,/usr/local/lib/python3.9/site-packages/ipykern...,utp7rbgxetohb8t3,pauldefusco,4,LOCAL,1283984402985061615,,06b7bed0031ad636f6b3ade4189ea15b164906f3,11/06/2023,
3,kydf-fcki-cs41-5u9m,8aul-mgvw-shwq-6a2k,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/8aul-mgvw-shwq-6a2k/ky...,2023-11-06 17:15:16.696163840+00:00,2023-11-06 17:15:35.303000064+00:00,0.01,8,/usr/local/lib/python3.9/site-packages/ipykern...,utp7rbgxetohb8t3,pauldefusco,4,LOCAL,4748800489323277205,,06b7bed0031ad636f6b3ade4189ea15b164906f3,11/06/2023,"[{""run_id"": ""kydf-fcki-cs41-5u9m"", ""artifact_p..."
4,8dcm-rk2o-qd1w-li1s,8aul-mgvw-shwq-6a2k,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/8aul-mgvw-shwq-6a2k/8d...,2023-11-06 17:17:25.318180096+00:00,2023-11-06 17:17:45.268999936+00:00,0.002,10,/usr/local/lib/python3.9/site-packages/ipykern...,utp7rbgxetohb8t3,pauldefusco,8,LOCAL,7803587387117571530,4.748800489323276e+18,06b7bed0031ad636f6b3ade4189ea15b164906f3,11/06/2023,"[{""run_id"": ""8dcm-rk2o-qd1w-li1s"", ""artifact_p..."
5,aw2q-9mcc-1v6r-uh9r,8aul-mgvw-shwq-6a2k,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/8aul-mgvw-shwq-6a2k/aw...,2023-11-06 17:19:39.189926144+00:00,2023-11-06 17:19:56.536000+00:00,0.005,7,/usr/local/lib/python3.9/site-packages/ipykern...,utp7rbgxetohb8t3,pauldefusco,4,LOCAL,4748800489323277205,,06b7bed0031ad636f6b3ade4189ea15b164906f3,11/06/2023,"[{""run_id"": ""aw2q-9mcc-1v6r-uh9r"", ""artifact_p..."
6,qko2-4065-n95i-6mry,8aul-mgvw-shwq-6a2k,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/8aul-mgvw-shwq-6a2k/qk...,2023-11-06 17:38:08.899602688+00:00,2023-11-06 17:38:27.249999872+00:00,0.01,8,/usr/local/lib/python3.9/site-packages/ipykern...,utp7rbgxetohb8t3,pauldefusco,4,LOCAL,7226728052987263076,,06b7bed0031ad636f6b3ade4189ea15b164906f3,11/06/2023,"[{""run_id"": ""qko2-4065-n95i-6mry"", ""artifact_p..."
7,owsh-99w0-hj4c-g29y,8aul-mgvw-shwq-6a2k,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/8aul-mgvw-shwq-6a2k/ow...,2023-11-06 17:38:47.414222848+00:00,2023-11-06 17:39:00.593999872+00:00,0.002,10,/usr/local/lib/python3.9/site-packages/ipykern...,utp7rbgxetohb8t3,pauldefusco,8,LOCAL,4665871762421835805,7.226728052987262e+18,06b7bed0031ad636f6b3ade4189ea15b164906f3,11/06/2023,"[{""run_id"": ""owsh-99w0-hj4c-g29y"", ""artifact_p..."
8,dh37-xfvn-dvca-jo9s,8aul-mgvw-shwq-6a2k,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/8aul-mgvw-shwq-6a2k/dh...,2023-11-06 17:39:01.099144960+00:00,2023-11-06 17:39:12.844999936+00:00,0.005,7,/usr/local/lib/python3.9/site-packages/ipykern...,utp7rbgxetohb8t3,pauldefusco,4,LOCAL,7226728052987263076,,06b7bed0031ad636f6b3ade4189ea15b164906f3,11/06/2023,"[{""run_id"": ""dh37-xfvn-dvca-jo9s"", ""artifact_p..."
9,4frl-sp0o-covm-4u8b,8aul-mgvw-shwq-6a2k,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/8aul-mgvw-shwq-6a2k/4f...,2023-11-06 22:41:20.998869760+00:00,2023-11-06 22:42:06.180999936+00:00,0.01,8,/usr/local/lib/python3.9/site-packages/ipykern...,bte84r0ixoc09uga,pauldefusco,10000,LOCAL,1869813012622947967,,06b7bed0031ad636f6b3ade4189ea15b164906f3,11/06/2023,"[{""run_id"": ""4frl-sp0o-covm-4u8b"", ""artifact_p..."


In [41]:
#Retrieve snapshot_id from Experiments page or above dataframe. Use the Snapshot ID from the first experiment.
snapshot_id = "1869813012622947967"
exp3(training_df, snapshot_id)

AnalysisException: The namespace in session catalog must have exactly one name part: spark_catalog.default.training.snapshots