### Readme

Welcome to Jupyter Notebooks on [Neu.ro](https://neu.ro).

This guide contains a simple example of running a Jupyter instance with a PySpark application driver.

It assumes you have a kubectl [config file](https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/) and have mounted it into the job at `/home/jovyan/.kube/config`. If the file is mounted at a non-default location, set the `KUBECONFIG` environment variable to point to the file location.

In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from operator import add
from random import random
import os

#### Example context configuration

In [None]:
# Fill in the three values below before running this cell.
spark_app_namespace = ""  # use namespace provided by Neu.ro team
driver_service_account = ""  # use service account name provided by Neu.ro team
k8s_api_address = "" # kubernetes API endpoint provided by Neu.ro team, e.g. https://x.x.x.x:6443

# Spark properties that get_spark_session() copies into the SparkConf.
config = {
    # Kubernetes namespace taken from the placeholder filled in above.
    "spark.kubernetes.namespace": spark_app_namespace,
    # If changed, should contain the same version of PySpark, Hadoop and Hadoop-AWS libs
    "spark.kubernetes.container.image": "ghcr.io/neuro-inc/pyspark:pipelines",
    # Both service-account properties use the same account name.
    "spark.kubernetes.authenticate.driver.serviceAccountName": driver_service_account,
    "spark.kubernetes.authenticate.serviceAccountName": driver_service_account,
    # This might be handy for debugging
    # "spark.kubernetes.executor.deleteOnTermination": "false",
    "spark.executor.instances": "2",
    "spark.executor.memory": "1g",
    "spark.executor.cores": "1",
    # Fixed driver ports; presumably these must be reachable from the executor
    # pods -- TODO confirm they are exposed by the job.
    "spark.driver.blockManager.port": "7777",
    "spark.driver.port": "2222",
    # Driver job internal hostname.
    # NOTE: os.environ.get() returns None when NEURO_JOB_INTERNAL_HOSTNAME is
    # not set, so check this variable first if the driver turns out to be
    # unreachable.
    "spark.driver.host": os.environ.get("NEURO_JOB_INTERNAL_HOSTNAME"),
    # Listen on all network interfaces.
    "spark.driver.bindAddress": "0.0.0.0",
    # If neuro blob access is needed, uncomment and adjust the following configurations:
    # "spark.hadoop.fs.s3a.endpoint": endpoint_url from `neuro blob mkcredentials`,
    # "spark.hadoop.fs.s3a.connection.ssl.enabled": "true",
    # "spark.hadoop.fs.s3a.path.style.access": "true",
    # "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    # "spark.hadoop.com.amazonaws.services.s3.enableV4": "true",
    # "spark.hadoop.fs.s3a.access.key": access_key_id from `neuro blob mkcredentials`,
    # "spark.hadoop.fs.s3a.secret.key": secret_access_key from `neuro blob mkcredentials`,
    # NOTE(review): the anonymous provider below is meant for unauthenticated
    # access and presumably ignores the access/secret keys above; if the keys
    # should be used, "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider"
    # looks like the matching provider -- confirm against the Hadoop S3A docs.
    # "spark.hadoop.fs.s3a.aws.credentials.provider": "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider",
}

def get_spark_session(app_name: str, conf: SparkConf):
    """Create (or reuse) a SparkSession that targets the Kubernetes cluster.

    Sets the master of *conf* to ``k8s://`` followed by the notebook-level
    ``k8s_api_address``, copies every entry of the notebook-level ``config``
    dict into *conf*, and builds the session from it. *conf* is modified in
    place.

    Args:
        app_name: Application name handed to the session builder.
        conf: Spark configuration object to extend.

    Returns:
        The session returned by ``SparkSession.builder...getOrCreate()``.

    Raises:
        ValueError: If any value in ``config`` is ``None`` (for example
            ``spark.driver.host`` when ``NEURO_JOB_INTERNAL_HOSTNAME`` is not
            set in the environment).
    """
    # Validate before touching `conf`: a None value would otherwise be handed
    # to SparkConf.set() and end up as a bogus setting instead of a clear error.
    unset_keys = [key for key, value in config.items() if value is None]
    if unset_keys:
        raise ValueError(
            "Spark config values are not set: " + ", ".join(unset_keys)
        )

    conf.setMaster(f"k8s://{k8s_api_address}")
    for key, value in config.items():
        conf.set(key, value)
    return SparkSession.builder.appName(app_name).config(conf=conf).getOrCreate()
# Start from an empty SparkConf; get_spark_session() fills in the master URL
# and the entries of `config`.
spark = get_spark_session("MyTestApp", SparkConf())

In [None]:
# Number of slices the sample range is split into.
partitions = 2
# Total number of random samples: 100000 per partition.
n = partitions * 100_000

def f(_):
    """Draw one random point and report whether it lies in the unit circle.

    The argument is ignored; it exists only so the function can be used with
    ``map``. Both coordinates are drawn from [-1, 1).

    Returns:
        1 if the point is inside or on the unit circle, otherwise 0.
    """
    px, py = random() * 2 - 1, random() * 2 - 1
    inside_circle = px ** 2 + py ** 2 <= 1
    return int(inside_circle)

# Split the sample indices 1..n into `partitions` slices, score each sample
# with f, and add up the hits.
samples = spark.sparkContext.parallelize(range(1, n + 1), partitions)
count = samples.map(f).reduce(add)
# The fraction of hits approximates pi / 4, hence the factor of 4.
print("Pi is roughly %f" % (4.0 * count / n))

In [None]:
# Shut down the Spark session once the computation is finished.
spark.stop()