## Assumptions

* N Synapse workspaces can be linked

## Get Azure ML Workspace

In [None]:
from azureml.core import Workspace

In [None]:
# Load the Azure ML workspace described by the local config file.
# from_config is called on the Workspace class: no `ws` instance exists yet.
ws = Workspace.from_config()

## Link Synapse workspace

Should we allow linking to a Synapse workspace from Azure ML, or enforce one-way linking from Synapse only?

In [None]:
# Keyword arguments identifying the Synapse workspace to link.
# NOTE: AUTH_OPTIONS is a placeholder mapping of authentication keyword
# arguments; its contents are still pending design.
synapse_params = dict(
    workspace_name  = 'my_synapse_ws',      # required input
    resource_group  = ws.resource_group,    # default to same as AML Workspace
    subscription_id = ws.subscription_id,   # default to same as AML Workspace
    **AUTH_OPTIONS                          # details pending design
)

# minimal API
ws.link_synapse_workspace(**synapse_params) # all Datastores and Compute Targets imported

# full API
ws.link_synapse_workspace(**synapse_params,          # required inputs
                          linked_services    = True, # optional - default to True
                          spark_pools        = True, # optional - default to True
                         )

###############################################
######## OUT OF SCOPE FOR MANGANESE ###########
###############################################

# Selection of specific datastores to import, keyed by Synapse linked
# service name and then by datastore kind.
linked_services_map = {
    'my_linked_service': {
        'datastores': {
            'BlobContainers': ['container1', 'container2'],
            'FileSystems': ['fs1', 'fs2'],
        },
    },
    'my_other_linked_service': {
        'datastores': {
            'SQLDBs': ['MySQLDB1', 'MySQLDB2'],
        },
    },
}

# Names of the specific Synapse Spark pools to use as compute targets.
spark_pools = ['pool1', 'pool2', 'pool3']

# full API
ws.link_synapse_workspace(**synapse_params,                         # required inputs
                          linked_services    = linked_services_map, # select specific datastores
                          spark_pools        = spark_pools          # select specific compute targets
                         )

# register datastores from Synapse into Azure ML
ws.register_synapse_datastores(linked_services_map)

# unregister Synapse datastores from Azure ML
ws.unregister_synapse_datastores(linked_services_map)

# attach spark pools from Synapse into Azure ML
ws.attach_synapse_spark_pools(spark_pools)

# detach Synapse spark pools from Azure ML
# (method name spelled to mirror attach_synapse_spark_pools)
ws.detach_synapse_spark_pools(spark_pools)

###############################################
###### END OUT OF SCOPE FOR MANGANESE #########
###############################################

## Use the Compute and Datastore

In [None]:
from azureml.core import Dataset

In [None]:
# The glob matches Parquet files, so use the Parquet reader rather than the
# delimited-text reader.
# NOTE(review): blob_datastore is not defined in the visible cells - presumably
# one of the datastores imported by the Synapse link; confirm.
dataset = Dataset.Tabular.from_parquet_files(path = [(blob_datastore, 'data/my_data/*.parquet')])

In [None]:
%%writefile scripts/dataprep.py

from azureml.core import Dataset, Run

# Fetch the current run and load its 'my_data' input as a Spark DataFrame.
run     = Run.get_context()
dataset = run.input_datasets['my_data']
df      = dataset.to_spark_dataframe()

### data preparation and/or training code
# Spark DataFrames do not support item assignment, so the converted column
# (x*(9/5) + 32, presumably Celsius to Fahrenheit) is produced with withColumn,
# which returns a new DataFrame.
df = df.withColumn('temperature', df['temperature']*(9/5) + 32)


###############################################
################## NEW API ####################
###############################################

# Proposed call (marked as new API): build a tabular dataset from the Spark
# DataFrame prepared in this script.
# NOTE(review): GUID, version, compression (as used inside the f-string), ws
# and name are not defined anywhere in this script as written - presumably
# placeholders for values supplied by the SDK or the caller; confirm before
# treating this cell as runnable.

# minimal API
new_dataset = Dataset.Tabular.from_spark_dataframe(df) # new API

# full API
new_dataset = Dataset.Tabular.from_spark_dataframe(df,  # required inputs
                    # optional parameters w/ sensible defaults 
                    datastore   = ws.get_default_datastore(),
                    target_path = f'from_spark_df/{GUID}/{version}/data/part-*.{compression}.parquet',
                    compression = 'lz4',
                    overwrite   = False,
                    new_version = True
                    )

# Register the resulting dataset with the workspace under `name`.
new_dataset = new_dataset.register(ws, name) # existing API

## Specifics for runs on Synapse (TBD)

Pending LT/Python SDK feedback on the best way to expose Synapse compute targets, and on whether there are technical considerations for adding new concepts such as a `SynapseEstimator` or `SynapseStep`. Regardless, Synapse Spark compute targets should be usable in:

* regular Python script runs
* Pipelines as a step (including Designer)
* notebooks for interactive PySpark on datasets 

In [None]:
from azureml.core import ScriptRunConfig, RunConfiguration, Experiment, CondaDependencies

In [None]:
# use pyspark framework
synapse_config = RunConfiguration(framework="pyspark")

# Set compute target to the attached Synapse Spark cluster
synapse_config.target = ws.compute_targets['SynapsePool']

# get the environment - use Environments to tailor as needed
synapse_config.environment = ws.environments['Synapse-AzureML-Spark']  # new curated environment w/ default packages

# create ScriptRunConfig object
# dataprep.py is written to scripts/ by the %%writefile cell, so the source
# directory points there; run_config is the configuration built above.
config = ScriptRunConfig(source_directory='scripts', script='dataprep.py', run_config=synapse_config)

In [None]:
# Create (or look up) the 'synapse_dprep' experiment in the workspace, then
# submit the prepared script run configuration to it.
exp = Experiment(workspace=ws, name='synapse_dprep')
run = exp.submit(config=config)

## Monitor the run

In [None]:
from azureml.widgets import RunDetails

In [None]:
# RunDetails is a widget class: build it from the run, then display it.
RunDetails(run).show()

## Pipeline step

In [None]:
# SynapseStep is a proposed step type, not an existing SDK class; this call
# sketches its intended signature.
step1   = SynapseStep(name             = 'synapseStep',
                      source_directory = 'scripts',
                      entry_script     = 'dataprep.py',
                      inputs           = [dataset.as_named_input('my_data')],
                      # same attached pool the script run configuration targets
                      compute_target   = ws.compute_targets['SynapsePool'],
                      workers          = 100,
                      allow_reuse      = True
                     )

step2   = ...  # placeholder for a follow-on step

# Pipeline takes the workspace first and the list of steps second.
pipeline = Pipeline(workspace=ws, steps=[step1, step2]).publish()