Load the sparkmagic extension before starting a Spark session.


In [None]:
# Load the sparkmagic IPython extension; this has to happen before `%spark start` is used.
%load_ext sparkmagic.magics

### Start Spark Session

In [None]:
# Replace the workspace and Spark pool names below with any Synapse workspace you can access.
# Starting the session can take a long time, especially on the first run.

%spark start --workspace hochsynapse --sparkpool hochpool

### Working with Dataset

In [None]:
%%spark

from azureml.core import Dataset, Workspace
from pyspark.sql.functions import desc

# Identify the Azure ML workspace that holds the registered dataset.
# Replace these values with a subscription / resource group / workspace you can access.
subscription_id = '35f16a99-532a-4a47-9e93-00305f6c40f2'
resource_group = 'arthfa_test'
workspace_name = 'arthfa_test_azureml_ws'
workspace = Workspace(subscription_id, resource_group, workspace_name)

# Fetch the registered dataset by name and materialise it as a pandas DataFrame.
# Dataset.get is deprecated in azureml-core; Dataset.get_by_name is its replacement.
ds = Dataset.get_by_name(workspace, name='movies-yang')
df = ds.to_pandas_dataframe()

# Convert to a Spark DataFrame and show the 10 distributors with the most rows.
spark_df = sqlContext.createDataFrame(df)
spark_df.groupBy('Distributor').count().orderBy(desc('count')).show(10)

### Show Session Details

In [None]:
# Show details of the current Spark session.
%spark info

### Stop Session

In [None]:
# Stop the Spark session that was started with `%spark start`.
%spark stop

### Override Session Config

In [None]:
%%spark config
{
    "driverMemory": "8g",
    "driverCores": 2,
    "executorMemory": "8g",
    "executorCores": 2,
    "numExecutors": 2
}

### Example: RDD operation

In [None]:
%%spark

import time
# Broadcast variable made available on all executors.
b = sc.broadcast([3, 5])
# Accumulator used to add values across executors.
a = sc.accumulator(0)

# Source RDDs built from plain Python collections.
RDD0 = sc.parallelize(list(range(5)))
RDD2 = sc.parallelize(list(range(10, 15)))
RDD1 = RDD0.cartesian(RDD2)
cached = RDD2.cache()  # exercise a cached RDD

# Branch 1: sum each cartesian pair plus the first broadcast value, then
# repartition so that a new stage is triggered, and keep the even results.
RDD22 = RDD1.map(lambda pair: pair[0] + pair[1] + b.value[0])
RDD3 = RDD22.repartition(5)

# Branch 2: transform RDD2 using the first broadcast value; odd results are kept below.
RDD4 = RDD2.map(lambda value: 3 * value - b.value[0])

RDD5 = RDD3.filter(lambda value: value % 2 == 0)
RDD6 = RDD4.filter(lambda value: value % 2 != 0)

# Combine both branches, flatten every pair into its two members,
# and append the cached source RDD.
RDD7 = RDD5.cartesian(RDD6)
RDD8 = RDD7.flatMap(lambda pair: [pair[0], pair[1]])
RDD9 = RDD8.union(cached)

# Simple sum over every element of the combined RDD.
ans = RDD9.reduce(lambda left, right: left + right)
print(ans)
def f(x):
    """Add ``x`` to the accumulator ``a`` after a short pause.

    The pause only exists to make the job run a little longer.
    """
    time.sleep(0.7)
    # Accumulator.add is what ``a += x`` invokes under the hood.
    a.add(x)
# Run f on every element of RDD9; each call adds that element to the accumulator `a`.
RDD9.foreach(f)
# Print the total collected in the accumulator.
print(a.value)

### Example: Spark Pi

In [None]:
%%spark

from random import random
from operator import add

# Number of slices the sample range is split into when it is parallelized.
partitions = 10
# Total number of random samples: 100000 per partition.
n = partitions * 100000

def f(_):
    """Draw one random point in the square [-1, 1) x [-1, 1).

    The argument is ignored; it is only there so the function can be used
    with ``map``. Returns 1 when the point lies inside (or on) the unit
    circle, otherwise 0.
    """
    x = 2 * random() - 1
    y = 2 * random() - 1
    if x ** 2 + y ** 2 <= 1:
        return 1
    return 0

# Distribute the sample indices, score each sample with f (1 = inside the
# unit circle, 0 = outside) and add the scores together.
samples = sc.parallelize(range(1, n + 1), partitions)
count = samples.map(f).reduce(add)

# The fraction of hits approximates pi / 4.
pi_estimate = 4.0 * count / n
print("Pi is roughly %f" % pi_estimate)