In [None]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

from xgboost.spark import SparkXGBClassifier

spark = SparkSession.builder.getOrCreate()

## WANDB Exploration using Pandas

In [7]:
import wandb
import random

wandb.init(
    project="wandb-minimal-example",
    
    config={
    "learning_rate": 0.02,
    "architecture": "CNN",
    "dataset": "CIFAR-100",
    "epochs": 10,
    }
)

# training with mock data
epochs = 10
offset = random.random() / 5
for epoch in range(2, epochs):
    acc = 1 - 2 ** -epoch - random.random() / epoch - offset
    loss = 2 ** -epoch + random.random() / epoch + offset
    
    # log metrics to wandb
    wandb.log({"acc": acc, "loss": loss})
    
# this needs to be done in notebooks
wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mireneisdoomed[0m ([33mopen-targets[0m). Use [1m`wandb login --relogin`[0m to force relogin


0,1
acc,▄▁█▆▇▇▇▇
loss,█▄▃▁▁▃▁▁

0,1
acc,0.72942
loss,0.22413


## WANDB Exploration Using Spark

In [None]:
indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
xgb_classifier = SparkXGBClassifier(
  label_col="indexedLabel",
  missing=0.0,
  n_estimators = 20
)
pipeline = Pipeline(stages=[indexer, xgb_classifier])
model = pipeline.fit(training)


In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import wandb

scoreAndLabels = map(
	lambda x: (
		Vectors.dense([1.0 - x[0], x[0]]), x[1], x[2]),
    			      [(0.1, 0.0, 0.0), (0.1, 1.0, 0.0), 
			       (0.4, 0.0, 0.0), (0.6, 0.0, 0.1), 
			       (0.6, 1.0, 1.0), (0.6, 1.0, 0.0), 
                               (0.8, 1.0, 1.0)])
dataset = spark.createDataFrame(scoreAndLabels, ["probability", "label", "prediction"])
evaluator = MulticlassClassificationEvaluator()
evaluator.setPredictionCol("prediction")
run = wandb.init(project = "sparkml-example", 
		 job_type = "multiclass-classification-eval")
wandb_evaluator = WandbEvaluator(sparkMlEvaluator = evaluator)
wandb_evaluator.setWandbRun(run)
wandb_evaluator.setMetricPrefix("test/")
wandb_evaluator.evaluate(dataset)
run.finish()


In [None]:
indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
xgb_classifier = SparkXGBClassifier(
  label_col="indexedLabel",
  missing=0.0,
  n_estimators = 20
)
pipeline = Pipeline(stages=[indexer, xgb_classifier])
# Create an evaluator.  In this case, use "weightedPrecision".
evaluator = MulticlassClassificationEvaluator(
	labelCol="indexedLabel", 
	metricName="weightedPrecision"
)
wandb_evaluator = WandbEvaluator(sparkMlEvaluator = evaluator, metricPrefix = "cv/")


grid = ParamGridBuilder() \
  .addGrid(xgb_classifier.learning_rate, [0.0, 0.01, 0.1]) \
  .addGrid(xgb_classifier.max_depth, [2, 3, 5]) \
  .build()


cv = WandbCrossValidator(estimator=pipeline, 
                         evaluator=wandb_evaluator, 
                         estimatorParamMaps=grid, 
                         numFolds=3)
cv.fit(training)


In [4]:
import pandas as pd
import numpy as np

# create mock data
num_rows = 1000
num_cols = 10
data = np.random.rand(num_rows, num_cols + 1)
columns = [f'feature_{str(i)}' for i in range(num_cols)] + ['label']
df = pd.DataFrame(data=data, columns=columns)



In [5]:
import pyspark.ml.feature as ft
import pyspark.ml.regression as reg
import wandb

wandb.init(project='xgboost-spark-example', )

# Load data
data = spark.createDataFrame(df)

# Prepare data for model training
feature_cols = data.columns[:-1]
label_col = data.columns[-1]
assembler = ft.VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(data).select("features", label_col)

# Split data into training and test sets
train_data, test_data = data.randomSplit([0.8, 0.2], seed=123)

# Train XGBoost model
xgb = reg.XGBoostRegressor()
xgb_params = {"eta": 0.1, "maxDepth": 3, "objective": "reg:squarederror", "numRound": 10}
xgb.setParams(**xgb_params)
model = xgb.fit(train_data)

# Evaluate model
predictions = model.transform(test_data)
rmse_evaluator = reg.RegressionEvaluator(labelCol=label_col, metricName="rmse")
rmse = rmse_evaluator.evaluate(predictions)

# Log results to Weights and Biases
wandb.log({"rmse": rmse})

# Save model to Weights and Biases
model_path = "xgboost_pyspark_ml_model"
model.save(model_path)
wandb.save(model_path)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: [32m[41mERROR[0m API key must be 40 characters long, yours was 45
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit e

AttributeError: module 'pyspark.ml.regression' has no attribute 'XGBoostRegressor'

In [9]:
import wandb
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors


wandb.init(project="wandb-spark-example",)

data = spark.createDataFrame(df)

# Prepare the data
assembler = VectorAssembler(inputCols=['feature1', 'feature2', 'feature3'], outputCol='features')
data = assembler.transform(data).select(['features', 'label'])
train_data, test_data = data.randomSplit([0.8, 0.2])


lr = LogisticRegression(featuresCol='features', labelCol='label')
model = lr.fit(train_data)

# Evaluate the model
predictions = model.transform(test_data)
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')
auc = evaluator.evaluate(predictions)
wandb.log({'auc': auc})


wandb.finish()


Problem at: /var/folders/54/2j7x_lqn343_d6hjm7mcv9rc0000gn/T/ipykernel_80237/3518243712.py 10 <module>


Traceback (most recent call last):
  File "/Users/irenelopez/MEGAsync/EBI/repos/genetics_etl_python/.venv/lib/python3.8/site-packages/wandb/sdk/wandb_init.py", line 1108, in init
    run = wi.init()
  File "/Users/irenelopez/MEGAsync/EBI/repos/genetics_etl_python/.venv/lib/python3.8/site-packages/wandb/sdk/wandb_init.py", line 576, in init
    manager._inform_init(settings=self.settings, run_id=self.settings.run_id)
  File "/Users/irenelopez/MEGAsync/EBI/repos/genetics_etl_python/.venv/lib/python3.8/site-packages/wandb/sdk/wandb_manager.py", line 174, in _inform_init
    svc_iface._svc_inform_init(settings=settings, run_id=run_id)
  File "/Users/irenelopez/MEGAsync/EBI/repos/genetics_etl_python/.venv/lib/python3.8/site-packages/wandb/sdk/service/service_sock.py", line 38, in _svc_inform_init
    self._sock_client.send(inform_init=inform_init)
  File "/Users/irenelopez/MEGAsync/EBI/repos/genetics_etl_python/.venv/lib/python3.8/site-packages/wandb/sdk/lib/sock_client.py", line 211, in se

Exception: problem