In [None]:
from os import environ, getenv
from operator import add
from random import random
from socket import gethostname, gethostbyname

from IPython.display import Markdown
from pyspark import SparkConf
from pyspark.sql import SparkSession

Let's populate some variables that will be useful later on.

In [None]:
# Network identity of this notebook pod; IPAddr is later advertised to the
# executors as the driver address.
hostname = gethostname()
IPAddr = gethostbyname(hostname)

# URL prefix of this notebook, taken from NB_PREFIX. Fall back to an empty
# prefix so the later string concatenation and link formatting still work
# when the variable is not set (getenv would otherwise return None).
nb_prefix = getenv('NB_PREFIX', '')

# Namespace of the pod's service account. strip() removes any trailing
# newline or whitespace, which can never be part of a namespace name.
with open('/var/run/secrets/kubernetes.io/serviceaccount/namespace', 'r') as namespace_file:
    current_namespace = namespace_file.readline().strip()

Configure the Spark cluster

In [None]:
sparkConf = SparkConf()

# Point Spark at the in-cluster Kubernetes API server. The port is taken from
# KUBERNETES_SERVICE_PORT when that variable is present and otherwise stays
# at the previously hard-coded 443.
sparkConf.setMaster(
    f"k8s://https://{environ['KUBERNETES_SERVICE_HOST']}"
    f":{environ.get('KUBERNETES_SERVICE_PORT', '443')}"
)
# Client mode: the driver is this notebook process, only executors are spawned.
sparkConf.set("spark.submit.deployMode", "client")
sparkConf.set(
    "spark.kubernetes.container.image",
    "quay.io/opendatahub-contrib/pyspark:s3.3.1-h3.3.4_v0.1.1"
)
# NOTE(review): Spark documents these two settings as Python executable
# names/paths; "3" looks like a version number rather than a binary -- confirm
# that the image really expects this value.
sparkConf.set("spark.pyspark.python", "3")
sparkConf.set("spark.pyspark.driver.python", "3")
sparkConf.set("spark.kubernetes.namespace", current_namespace)

# Driver address and fixed ports. Presumably pinned so that the executors can
# connect back to the notebook pod -- confirm against the pod/service setup.
sparkConf.set("spark.driver.blockManager.port", "7777")
sparkConf.set("spark.driver.host", IPAddr)
sparkConf.set("spark.driver.port", "2222")
sparkConf.set("spark.driver.bindAddress", "0.0.0.0")

# Executor sizing for the demo: three small single-core executors.
sparkConf.set("spark.executor.instances", "3")
sparkConf.set("spark.executor.memory", "512m")
sparkConf.set("spark.executor.cores", "1")

# nb_prefix is None when NB_PREFIX is unset; treat that as an empty prefix
# instead of failing with a TypeError on the concatenation.
sparkConf.set("spark.ui.proxyBase", (nb_prefix or "") + "/proxy/4040/")

Initialize our Spark cluster. This will actually create the Spark executors (the worker pods).

In [None]:
# Build (or reuse) a session from sparkConf and keep a handle on its
# SparkContext for the RDD API.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
sc = spark.sparkContext

# Give information about the Spark UI access.
# The trailing space after "accessible" keeps the two f-string fragments from
# being glued together as "accessible[here]" in the rendered text.
Markdown(f'While this Spark context is running, the UI is accessible '
         f'[here]({nb_prefix}/proxy/4040/jobs/).')

Basic Spark job as a demo

In [None]:
# Demo size: the sample count scales with the number of RDD partitions.
partitions = 7
n = partitions * 10_000_000


def f(_):
    """Draw one random point in the square [-1, 1) x [-1, 1) and test it.

    The argument is ignored; it only exists so the function can be applied
    to every element of a collection. Returns 1 when the point lies inside
    or on the unit circle, and 0 otherwise.
    """
    point_x = 2 * random() - 1
    point_y = 2 * random() - 1
    return int(point_x ** 2 + point_y ** 2 <= 1)


# Monte Carlo estimate: map every index to a hit (1) or a miss (0), add the
# hits up across all partitions, then scale the hit ratio by 4.
count = (
    sc.parallelize(range(1, n + 1), partitions)
    .map(f)
    .reduce(add)
)
print("Pi is roughly %f" % (4.0 * count / n))

In [None]:
# Shut down the Spark context now that the demo job has finished.
sc.stop()