
# Examples of using Spark 

In [None]:
# NOTE: Spark currently works only with ray-on-aml version 0.1.8 or lower.
# Run `pip install ray-on-aml==0.1.8` before executing the examples below.

In [None]:
from azureml.core import Workspace, Experiment, Environment,ScriptRunConfig
# from azureml.widgets import RunDetails
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.environment import Environment
import time

In [None]:
from ray_on_aml.core import Ray_On_AML

# Workspace settings are read from the local Azure ML config file.
ws = Workspace.from_config()

# Bind ray-on-aml to the "d12-v2-ssh" compute cluster, allowing up to 5 nodes.
ray_on_aml = Ray_On_AML(
    ws=ws,
    compute_cluster="d12-v2-ssh",
    maxnode=5,
)

# Obtain the ray handle and display the resources the cluster reports.
ray = ray_on_aml.getRay()
ray.cluster_resources()

In [None]:
# Shut down the Ray cluster when you are done.
# NOTE(review): this cell appears before the Spark examples, so executing the
# notebook top to bottom shuts the cluster down before they run. Run this cell
# only after you have finished with the cells below.
ray_on_aml.shutdown()

## Reading Delta Lake data from ADLS Gen2 

In [None]:
# The storage account key is kept in the workspace's default Key Vault under a
# secret that carries the same name as the storage account.
storage_account_name = "adlsdatalakegen6"
storage_account_key = ws.get_default_keyvault().get_secret(storage_account_name)

# Extra Spark setting that passes the account key for the ADLS Gen2 (dfs) endpoint.
additional_spark_configs = {
    f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net": str(storage_account_key),
}

# The number of executors can be set to the number of nodes (head node included).
spark = ray_on_aml.getSpark(
    executor_cores=3,
    num_executors=3,
    executor_memory='10GB',
    additional_spark_configs=additional_spark_configs,
)

In [None]:
# Read the ISDWeatherDelta table (Delta format) from ADLS Gen2 into a Spark DataFrame.
df = (
    spark.read
    .format("delta")
    .load("abfss://mltraining@adlsdatalakegen6.dfs.core.windows.net/ISDWeatherDelta")
)

In [None]:
# Count the rows of the Delta table; a previous run reported 73,696,631 rows.
# (That number used to sit on its own line after the call, where Python parsed
# it as the tuple (73, 696, 631) and displayed it instead of the real count.)
df.count()

## Using the pandas API on Spark (`pyspark.pandas`, formerly Koalas)

In [None]:
from pyspark.pandas import read_delta

In [None]:
# Load the same ISDWeatherDelta table through the pyspark.pandas reader.
test_delta = read_delta(
    "abfss://mltraining@adlsdatalakegen6.dfs.core.windows.net/ISDWeatherDelta"
)

In [None]:
# Display summary statistics for the table loaded via pyspark.pandas.
test_delta.describe()

In [None]:
# Area plot of the "snowDepth" column.
test_delta["snowDepth"].plot.area()

In [None]:
# Per-station record counts; show the first 20 stations.
(
    test_delta
    .groupby("stationName")
    .count()
    .head(20)
)

In [None]:
# Same aggregation as above, this time with the plain Spark DataFrame API.
adls_data = (
    spark.read
    .format("delta")
    .load("abfss://mltraining@adlsdatalakegen6.dfs.core.windows.net/ISDWeatherDelta")
)

# Per-station row counts; head(20) returns the first 20 result rows.
(
    adls_data
    .groupby("stationName")
    .count()
    .head(20)
)


## Synapse SQL Pool Data Access (Dedicated or Serverless)

In [None]:
import os
from getpass import getpass

# JDBC endpoint of the Synapse SQL pool and the database to query.
server_name = "jdbc:sqlserver://sy2qwhqqkv7eacsws1-ondemand.sql.azuresynapse.net:1433"
database_name = "mydbname"
url = f"{server_name};databaseName={database_name};"

# Despite its name, `table_name` holds a full T-SQL query (passed to the JDBC
# reader through the "query" option below), not a table identifier.
table_name = """
SELECT TOP 10 *
FROM OPENROWSET
  (
      BULK 'csv/population/*.csv',
      DATA_SOURCE = 'SqlOnDemandDemo',
      FORMAT = 'CSV', PARSER_VERSION = '2.0'
  )
WITH
  (
      country_code VARCHAR (5)
    , country_name VARCHAR (100)
    , year smallint
    , population bigint
  ) AS r
WHERE
  country_name = 'Luxembourg' AND year = 2017

"""
username = "azureuser"
# Never store the password in the notebook: read it from the
# SYNAPSE_SQL_PASSWORD environment variable, or prompt for it interactively
# when the variable is unset or empty.
password = os.environ.get("SYNAPSE_SQL_PASSWORD") or getpass("Synapse SQL password: ")

# Run the query over JDBC with the Microsoft SQL Server driver.
jdbcDF = (
    spark.read
    .format("jdbc")
    .option("url", url)
    .option("query", table_name)
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    .option("user", username)
    .option("password", password)
    .load()
)

jdbcDF.show()

## Spark ML with Collaborative Filtering 
This example follows the Spark documentation on collaborative filtering: https://spark.apache.org/docs/3.2.1/ml-collaborative-filtering.html

In [None]:

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

# Parse the sample ratings file: each text line is split on "::" into
# userId, movieId, rating and timestamp fields.
lines = spark.read.text("abfss://mltraining@adlsdatalakegen6.dfs.core.windows.net/mllib/sample_movielens_ratings.txt").rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)

# Fixed seed so the 80/20 train/test split is the same on every run.
(training, test) = ratings.randomSplit([0.8, 0.2], seed=42)

# Build the recommendation model using ALS on the training data.
# The cold start strategy is set to 'drop' so that the evaluation metric is not NaN.
# ALS is seeded as well so that repeated runs start from the same factors.
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop", seed=42)
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")

# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)

# Generate top 10 movie recommendations for a specified set of users
users = ratings.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)
# Generate top 10 user recommendations for a specified set of movies
movies = ratings.select(als.getItemCol()).distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(movies, 10)



## ML with Spark & XGBoost 
(transformation script and example are copied from https://github.com/oap-project/raydp/tree/master/python/raydp)

In [None]:
# Optional (left disabled): load the NYC yellow-taxi data in Parquet format
# from the public azureopendatastorage blob account.
# nyc_taxi_df = spark.read.format("parquet").load("wasbs://nyctlc@azureopendatastorage.blob.core.windows.net/yellow")

In [None]:
import sys
# Add the parent directory to the module search path; presumably this is what
# lets the later `from spark.data_process import ...` resolve — confirm the
# repository layout.
sys.path.append("../") # go to parent dir

In [None]:
import ray
import numpy as np
from pyspark.sql.functions import *
# XGBoost on ray is needed to run this example.
# Please refer to https://docs.ray.io/en/latest/xgboost-ray.html to install it.
from xgboost_ray import RayDMatrix, train, RayParams
import raydp
from raydp.utils import random_split
from raydp.spark import RayMLDataset


from spark.data_process import nyc_taxi_preprocess, NYC_TRAIN_CSV

# connect to ray cluster
# ray.init(address='auto')
# # After ray.init, you can use the raydp api to get a spark session

# Read the NYC taxi training CSV, letting Spark infer the column types.
data = spark.read.format("csv").option("header", "true") \
        .option("inferSchema", "true") \
        .load(NYC_TRAIN_CSV)
# Set spark timezone for processing datetime
spark.conf.set("spark.sql.session.timeZone", "UTC")
# Transform the dataset
data = nyc_taxi_preprocess(data)
# Split data into train_dataset and test_dataset (90% / 10%, seed 0)
train_df, test_df = random_split(data, [0.9, 0.1], 0)
# Convert spark dataframe into ray dataset
train_dataset = ray.data.from_spark(train_df)
test_dataset = ray.data.from_spark(test_df)
# Then convert them into DMatrix used by xgboost
dtrain = RayDMatrix(train_dataset, label='fare_amount')
dtest = RayDMatrix(test_dataset, label='fare_amount')
# Configure the XGBoost model.
# `fare_amount` is a continuous target and no objective is set, so XGBoost's
# default squared-error regression objective applies. Evaluate with regression
# metrics: the previous "logloss"/"error" metrics are classification metrics
# and yield meaningless numbers for this task.
config = {
    "tree_method": "hist",
    "eval_metric": ["rmse", "mae"],
}
evals_result = {}
# Train the model; per-round metrics on `dtest` are recorded under "eval".
bst = train(
        config,
        dtrain,
        evals=[(dtest, "eval")],
        evals_result=evals_result,
        ray_params=RayParams(max_actor_restarts=2, num_actors=2, cpus_per_actor=2),
        num_boost_round=10)
# Print the RMSE on the evaluation set after the last boosting round
print("Final validation RMSE: {:.4f}".format(
        evals_result["eval"]["rmse"][-1]))
