# Setup

In [55]:
SPARK_START_FROM_SCRATCH = True
DOCKER_INTERNAL_HOST = "host.docker.internal"
DOCKER_DNS = ["10.15.20.1"]

SPARK_DOCKER_BASE = "spark:3.5.7-scala2.12-java17-python3-ubuntu"
SPARK_JUPYTER_LAB_DOCKER_TAG = "spark-jupyter:3.5.7-scala2.12-java17-python3-ubuntu"
SPARK_JOB_VENV_DOCKER_TAG = "spark-job-venv:3.5.7-scala2.12-java17-python3-ubuntu"
SPARK_JOB_VENV_BUILD_DIR = "/opt/spark/venv-build"

SPARK_MASTER_NAME = "spark-master"
SPARK_MASTER_HOSTNAME = f"{SPARK_MASTER_NAME}.mavasbel.vpn.itam.mx"
SPARK_MASTER_IP = "10.15.20.2"
SPARK_MASTER_WUBUI_PORT = 6080
SPARK_MASTER_PORT = 6077

SPARK_TOTAL_WORKERS = 3
SPARK_WORKER_NAMES = [f"spark-worker-{i+1}" for i in range(SPARK_TOTAL_WORKERS)]
SPARK_WORKER_HOSTNAMES = [
    f"{SPARK_WORKER_NAMES[i]}.mavasbel.vpn.itam.mx" for i in range(SPARK_TOTAL_WORKERS)
]
SPARK_WORKER_IPS = ["10.15.20.2"] * SPARK_TOTAL_WORKERS
SPARK_WORKER_WEBUI_PORTS = [6080 + (i + 1) for i in range(SPARK_TOTAL_WORKERS)]

SPARK_WORKDIR = "/opt/spark/work-dir"

JUPYTER_LAB_NAME = "spark-jupyter"
JUPYTER_LAB_HOSTNAME = "spark-jupyter.mavasbel.vpn.itam.mx"
JUPYTER_LAB_IP = "10.15.20.2"
JUPYTER_LAB_PORT = 6888
JUPYTER_LAB_MONITOR_PORT = 4040
JUPYTER_LAB_TOKEN = ""

In [56]:
HADOOP_NAMENODE_HOSTNAME = "namenode.mavasbel.vpn.itam.mx"
HADOOP_NAMENODE_IP = "10.15.20.2"
HADOOP_NAMENODE_PORT = 8020

In [57]:
import os
from pathlib import Path

LOCALHOST_WORKDIR = f"{os.path.join(os.path.relpath(Path.cwd()))}"
DOCKER_MOUNTDIR = os.path.join(LOCALHOST_WORKDIR, "mount")

path = Path(LOCALHOST_WORKDIR)
path.mkdir(parents=True, exist_ok=True)

# Stop spark-cluster.docker-compose.yml

In [58]:
!docker compose -f spark-cluster.docker-compose.yml down -v

 Container spark-worker-3  Stopping
 Container spark-worker-3  Stopped
 Container spark-worker-3  Removing
 Container spark-worker-3  Removed
 Container spark-worker-2  Stopping
 Container spark-worker-2  Stopped
 Container spark-worker-2  Removing
 Container spark-worker-2  Removed
 Container spark-worker-1  Stopping
 Container spark-worker-1  Stopped
 Container spark-worker-1  Removing
 Container spark-worker-1  Removed
 Container spark-jupyter  Stopping
 Container spark-master  Stopping
 Container spark-master  Stopped
 Container spark-master  Removing
 Container spark-master  Removed
 Container spark-jupyter  Stopped
 Container spark-jupyter  Removing
 Container spark-jupyter  Removed
 Network spark-cluster_spark-cluster  Removing
 Network spark-cluster_spark-cluster  Removed


In [None]:
import shutil

if SPARK_START_FROM_SCRATCH or not os.path.exists(
    os.path.join(DOCKER_MOUNTDIR, "data")
):
    shutil.rmtree(os.path.join(DOCKER_MOUNTDIR, "data"), ignore_errors=True)
    shutil.rmtree(os.path.join(DOCKER_MOUNTDIR, "spark-warehouse"), ignore_errors=True)
    shutil.rmtree(
        os.path.join(DOCKER_MOUNTDIR, "iceberg-warehouse"), ignore_errors=True
    )
    shutil.rmtree(os.path.join(DOCKER_MOUNTDIR, SPARK_MASTER_NAME), ignore_errors=True)
    for spark_worker_name in SPARK_WORKER_NAMES:
        shutil.rmtree(
            os.path.join(DOCKER_MOUNTDIR, spark_worker_name), ignore_errors=True
        )

    Path(os.path.join(DOCKER_MOUNTDIR, "data")).mkdir(parents=True, exist_ok=True)

### Build spark-jupyter

In [60]:
import os
from IPython.display import Markdown, display

dockerfile_spark_jupyter_python_packages = (
    "pyspark==3.5.7 delta-spark==3.2.0 jupyterlab pandas pyarrow"
)

dockerfile_spark_jupyter_name = "dockerfile.spark-jupyter"

# language=dockerfile
dockerfile_spark_jupyter_contents = f""" 

# Use the official Spark image as the base
FROM apache/{SPARK_DOCKER_BASE}

# Switch to root to install software
USER root

# Set the working directory
WORKDIR {SPARK_WORKDIR}

# Expose the Jupyter port
EXPOSE 8888

# Install Python dependencies
RUN apt-get update && apt-get install -y python3-venv
RUN python3 -m pip install --no-cache-dir {dockerfile_spark_jupyter_python_packages}

# Set the default command to launch Jupyter Lab
CMD ["jupyter", "lab", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root", "--NotebookApp.token=$$JUPYTER_LAB_TOKEN"]
"""

with open(
    os.path.join(LOCALHOST_WORKDIR, dockerfile_spark_jupyter_name), "w"
) as spark_compose_yaml_file:
    spark_compose_yaml_file.write(dockerfile_spark_jupyter_contents.strip())

print(
    f"Successfully created: '{os.path.relpath(os.path.join(LOCALHOST_WORKDIR,dockerfile_spark_jupyter_name))}'"
)
display(Markdown(f"```dockerfile\n{dockerfile_spark_jupyter_contents}\n```"))

Successfully created: 'dockerfile.spark-jupyter'


```dockerfile
 

# Use the official Spark image as the base
FROM apache/spark:3.5.7-scala2.12-java17-python3-ubuntu

# Switch to root to install software
USER root

# Set the working directory
WORKDIR /opt/spark/work-dir

# Expose the Jupyter port
EXPOSE 8888

# Install Python dependencies
RUN apt-get update && apt-get install -y python3-venv
RUN python3 -m pip install --no-cache-dir pyspark==3.5.7 delta-spark==3.2.0 jupyterlab pandas pyarrow

# Set the default command to launch Jupyter Lab
CMD ["jupyter", "lab", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root", "--NotebookApp.token=$$JUPYTER_LAB_TOKEN"]

```

In [61]:
!docker build -t {SPARK_JUPYTER_LAB_DOCKER_TAG} -f dockerfile.spark-jupyter .

#0 building with "desktop-linux" instance using docker driver

#1 [internal] load build definition from dockerfile.spark-jupyter
#1 transferring dockerfile: 677B done
#1 DONE 0.0s

#2 [internal] load metadata for docker.io/apache/spark:3.5.7-scala2.12-java17-python3-ubuntu
#2 DONE 0.0s

#3 [internal] load .dockerignore
#3 transferring context: 2B done
#3 DONE 0.0s

#4 [1/4] FROM docker.io/apache/spark:3.5.7-scala2.12-java17-python3-ubuntu
#4 DONE 0.0s

#5 [2/4] WORKDIR /opt/spark/work-dir
#5 CACHED

#6 [3/4] RUN apt-get update && apt-get install -y python3-venv
#6 CACHED

#7 [4/4] RUN python3 -m pip install --no-cache-dir pyspark==3.5.7 delta-spark==3.2.0 jupyterlab pandas pyarrow
#7 CACHED

#8 exporting to image
#8 exporting layers done
#8 writing image sha256:46a33deeab39c7bf9e01dd919d60bb5a70e9180456c6426246260c043416d7c8 done
#8 naming to docker.io/library/spark-jupyter:3.5.7-scala2.12-java17-python3-ubuntu done
#8 DONE 0.0s

View build details: docker-desktop://dashboard/build/des

### Build spark-job-venv

In [62]:
import os
from IPython.display import Markdown, display

dockerfile_spark_job_venv_name = "dockerfile.spark-job-venv"
dockerfile_spark_job_venv_contents = f"""
# Use the previously generated spark-jupyter image as the base
FROM {SPARK_JUPYTER_LAB_DOCKER_TAG}

# Create virtual env for spark jobs
RUN mkdir -p {SPARK_JOB_VENV_BUILD_DIR} && \\
        cd {SPARK_JOB_VENV_BUILD_DIR} && \\
        python3 -m venv --copies spark_job_env && \\
        {SPARK_JOB_VENV_BUILD_DIR}/spark_job_env/bin/pip install venv-pack pandas pyarrow faker faker-commerce mimesis && \\
        {SPARK_JOB_VENV_BUILD_DIR}/spark_job_env/bin/venv-pack -p spark_job_env -o spark_job_env.tar.gz
"""

with open(os.path.join(LOCALHOST_WORKDIR, dockerfile_spark_job_venv_name), "w") as spark_compose_yaml_file:
    spark_compose_yaml_file.write(dockerfile_spark_job_venv_contents.strip())

print(f"Successfully created: '{os.path.relpath(os.path.join(LOCALHOST_WORKDIR,dockerfile_spark_jupyter_name))}'")
display(Markdown(f"```dockerfile\n{dockerfile_spark_job_venv_contents}\n```"))

Successfully created: 'dockerfile.spark-jupyter'


```dockerfile

# Use the previously generated spark-jupyter image as the base
FROM spark-jupyter:3.5.7-scala2.12-java17-python3-ubuntu

# Create virtual env for spark jobs
RUN mkdir -p /opt/spark/venv-build && \
        cd /opt/spark/venv-build && \
        python3 -m venv --copies spark_job_env && \
        /opt/spark/venv-build/spark_job_env/bin/pip install venv-pack pandas pyarrow faker faker-commerce mimesis && \
        /opt/spark/venv-build/spark_job_env/bin/venv-pack -p spark_job_env -o spark_job_env.tar.gz

```

In [63]:
from pathlib import Path
path = Path(DOCKER_MOUNTDIR)
path.mkdir(parents=True, exist_ok=True)

!docker build -t {SPARK_JOB_VENV_DOCKER_TAG} -f dockerfile.spark-job-venv .
!docker create --name spark-job-venv {SPARK_JOB_VENV_DOCKER_TAG}
!docker cp spark-job-venv:{SPARK_JOB_VENV_BUILD_DIR}/spark_job_env.tar.gz "{DOCKER_MOUNTDIR}/spark_job_env.tar.gz"
!docker rm spark-job-venv

#0 building with "desktop-linux" instance using docker driver

#1 [internal] load build definition from dockerfile.spark-job-venv
#1 transferring dockerfile: 566B 0.0s done
#1 DONE 0.0s

#2 [internal] load metadata for docker.io/library/spark-jupyter:3.5.7-scala2.12-java17-python3-ubuntu
#2 DONE 0.0s

#3 [internal] load .dockerignore
#3 transferring context: 2B done
#3 DONE 0.0s

#4 [1/2] FROM docker.io/library/spark-jupyter:3.5.7-scala2.12-java17-python3-ubuntu
#4 DONE 0.0s

#5 [2/2] RUN mkdir -p /opt/spark/venv-build &&         cd /opt/spark/venv-build &&         python3 -m venv --copies spark_job_env &&         /opt/spark/venv-build/spark_job_env/bin/pip install venv-pack pandas pyarrow faker faker-commerce mimesis &&         /opt/spark/venv-build/spark_job_env/bin/venv-pack -p spark_job_env -o spark_job_env.tar.gz
#5 CACHED

#6 exporting to image
#6 exporting layers done
#6 writing image sha256:7398bf2b436d655d12125d7b6a95669a618aa6aad2a3d809b8f2ce03ef92b65d done
#6 naming to docke

25cf9ddf89779e81cdcccdff0d287fcb613b8818d3bbc3af0208a6f38ad5b533
spark-job-venv


# Start spark.docker-compose.yml

In [64]:
import os
import yaml
from IPython.display import Markdown, display

SPARK_VSCODE_SERVER_DIR = os.path.join(LOCALHOST_WORKDIR, "vscode_server")
SPARK_MOUNT_JARS = [
    f"{os.path.join(LOCALHOST_WORKDIR,"jars",file)}:/opt/spark/jars/{file}"
    for file in os.listdir(os.path.join(LOCALHOST_WORKDIR, "jars"))
    if file.endswith("jar")
]

spark_compose_dict = {
    "name": "spark-cluster",
    "networks": {"spark-cluster": {"driver": "bridge"}},
    "services": {
        SPARK_MASTER_NAME: {
            "image": f"apache/{SPARK_DOCKER_BASE}",
            "container_name": SPARK_MASTER_NAME,
            "user": "root",
            "command": f'bash -c "/opt/spark/bin/spark-class org.apache.spark.deploy.$$SPARK_MODE.$${{SPARK_MODE^}} --host {SPARK_MASTER_HOSTNAME} --port $$SPARK_MASTER_PORT --webui-port $$SPARK_MASTER_WEBUI_PORT"',
            "environment": [
                "PYSPARK_PYTHON=python3",
                "SPARK_MODE=master",
                f"SPARK_MASTER_PORT={SPARK_MASTER_PORT}",
                f"SPARK_MASTER_WEBUI_PORT={SPARK_MASTER_WUBUI_PORT}",
                "SPARK_DAEMON_MEMORY=1G",
            ],
            "volumes": [
                f"{os.path.join(DOCKER_MOUNTDIR,SPARK_MASTER_NAME)}:{SPARK_WORKDIR}"
            ]
            + SPARK_MOUNT_JARS,
            "networks": ["spark-cluster"],
            "hostname": SPARK_MASTER_HOSTNAME,
            "ports": [
                f"{SPARK_MASTER_WUBUI_PORT}:{SPARK_MASTER_WUBUI_PORT}",
                f"{SPARK_MASTER_PORT}:{SPARK_MASTER_PORT}",
            ],
            "extra_hosts": [
                f"{DOCKER_INTERNAL_HOST}:host-gateway",
            ],
            "dns": DOCKER_DNS,
            "deploy": {"resources": {"limits": {"cpus": "2.0", "memory": "1024M"}}},
            "healthcheck": {
                "test": [
                    "CMD",
                    "curl",
                    "-f",
                    f"http://{SPARK_MASTER_HOSTNAME}:{SPARK_MASTER_WUBUI_PORT}",
                ],
                "interval": "10s",
                "timeout": "10s",
                "retries": 10,
                "start_period": "10s",
            },
        },
        "spark-jupyter": {
            "image": SPARK_JUPYTER_LAB_DOCKER_TAG,
            "container_name": "spark-jupyter",
            "user": "root",
            "command": [
                "bash",
                "-c",
                " ".join(
                    [
                        "jupyter lab",
                        "--ip=0.0.0.0",
                        f"--port={JUPYTER_LAB_PORT}",
                        "--no-browser",
                        "--allow-root",
                        f"--NotebookApp.token='{JUPYTER_LAB_TOKEN}'",
                        "--NotebookApp.password=''",
                        "--NotebookApp.allow_origin='*'",
                        "--ServerApp.disable_check_xsrf=True",
                        f"--ServerApp.root_dir={SPARK_WORKDIR}",
                    ]
                ),
            ],
            "environment": [
                "PYSPARK_PYTHON=python3",
                # f"JUPYTER_LAB_PORT={JUPYTER_LAB_PORT}",
                # f"JUPYTER_LAB_TOKEN={JUPYTER_LAB_TOKEN}",
                "SPARK_EXECUTOR_MEMORY=1536M",
            ],
            "volumes": [
                f"{DOCKER_MOUNTDIR}:{SPARK_WORKDIR}",
                f"{SPARK_VSCODE_SERVER_DIR}:/root/.vscode-server",
            ]
            + SPARK_MOUNT_JARS,
            "networks": ["spark-cluster"],
            "hostname": JUPYTER_LAB_HOSTNAME,
            "ports": [
                f"{JUPYTER_LAB_PORT}:{JUPYTER_LAB_PORT}",
                f"{JUPYTER_LAB_MONITOR_PORT}:{JUPYTER_LAB_MONITOR_PORT}",
            ],
            "extra_hosts": [
                f"{DOCKER_INTERNAL_HOST}:host-gateway",
            ],
            "dns": DOCKER_DNS,
            "deploy": {"resources": {"limits": {"cpus": "2.0", "memory": "2048M"}}},
            "healthcheck": {
                "test": [
                    "CMD",
                    "curl",
                    "-f",
                    f"http://{JUPYTER_LAB_HOSTNAME}:{JUPYTER_LAB_PORT}",
                ],
                "interval": "10s",
                "timeout": "10s",
                "retries": 10,
                "start_period": "10s",
            },
        },
    },
}

for i in range(SPARK_TOTAL_WORKERS):

    spark_compose_dict["services"][SPARK_WORKER_NAMES[i]] = {
        "image": f"apache/{SPARK_DOCKER_BASE}",
        "container_name": SPARK_WORKER_NAMES[i],
        "user": "root",
        "command": f'bash -c "/opt/spark/bin/spark-class org.apache.spark.deploy.$$SPARK_MODE.$${{SPARK_MODE^}} $$SPARK_MASTER_URL --host {SPARK_WORKER_HOSTNAMES[i]} --webui-port $$SPARK_WORKER_WEBUI_PORT"',
        "environment": [
            "PYSPARK_PYTHON=python3",
            "SPARK_MODE=worker",
            "SPARK_WORKER_CORES=2",
            "SPARK_DAEMON_MEMORY=512M",
            "SPARK_WORKER_MEMORY=2048M",
            f"SPARK_WORKER_WEBUI_PORT={SPARK_WORKER_WEBUI_PORTS[i]}",
            f"SPARK_MASTER_URL=spark://{SPARK_MASTER_HOSTNAME}:{SPARK_MASTER_PORT}",
        ],
        "volumes": [
            f"{os.path.join(DOCKER_MOUNTDIR,SPARK_WORKER_NAMES[i])}:{SPARK_WORKDIR}"
        ]
        + SPARK_MOUNT_JARS,
        "networks": ["spark-cluster"],
        "hostname": SPARK_WORKER_HOSTNAMES[i],
        "ports": [f"{SPARK_WORKER_WEBUI_PORTS[i]}:{SPARK_WORKER_WEBUI_PORTS[i]}"],
        "extra_hosts": [
            f"{DOCKER_INTERNAL_HOST}:host-gateway",
        ],
        "dns": DOCKER_DNS,
        "deploy": {"resources": {"limits": {"cpus": "2.0", "memory": "3G"}}},
        "depends_on": {
            "spark-master": {"condition": "service_healthy"},
            "spark-jupyter": {"condition": "service_healthy"},
        }
        | {
            SPARK_WORKER_NAMES[j]: {"condition": "service_started"} for j in range(0, i)
        },
        "healthcheck": {
            "test": [
                "CMD",
                "curl",
                "-f",
                f"http://{SPARK_WORKER_HOSTNAMES[i]}:{SPARK_WORKER_WEBUI_PORTS[i]}",
            ],
            "interval": "10s",
            "timeout": "10s",
            "retries": 10,
            "start_period": "10s",
        },
    }

# 3. Dump the dictionary to a YAML file
spark_compose_yaml_path = os.path.join(
    LOCALHOST_WORKDIR, "spark-cluster.docker-compose.yml"
)
spark_compose_yaml_contents = yaml.dump(
    spark_compose_dict, default_flow_style=False, sort_keys=False, indent=4
)
with open(spark_compose_yaml_path, "w") as spark_compose_yaml_file:
    spark_compose_yaml_file.write(spark_compose_yaml_contents)

print(f"Successfully created: '{os.path.relpath(spark_compose_yaml_path)}'")
display(Markdown(f"```yaml\n{spark_compose_yaml_contents}\n```"))

Successfully created: 'spark-cluster.docker-compose.yml'


```yaml
name: spark-cluster
networks:
    spark-cluster:
        driver: bridge
services:
    spark-master:
        image: apache/spark:3.5.7-scala2.12-java17-python3-ubuntu
        container_name: spark-master
        user: root
        command: bash -c "/opt/spark/bin/spark-class org.apache.spark.deploy.$$SPARK_MODE.$${SPARK_MODE^}
            --host spark-master.mavasbel.vpn.itam.mx --port $$SPARK_MASTER_PORT --webui-port
            $$SPARK_MASTER_WEBUI_PORT"
        environment:
        - PYSPARK_PYTHON=python3
        - SPARK_MODE=master
        - SPARK_MASTER_PORT=6077
        - SPARK_MASTER_WEBUI_PORT=6080
        - SPARK_DAEMON_MEMORY=1G
        volumes:
        - .\mount\spark-master:/opt/spark/work-dir
        - .\jars\iceberg-spark-runtime-3.5_2.12-1.6.1.jar:/opt/spark/jars/iceberg-spark-runtime-3.5_2.12-1.6.1.jar
        networks:
        - spark-cluster
        hostname: spark-master.mavasbel.vpn.itam.mx
        ports:
        - 6080:6080
        - 6077:6077
        extra_hosts:
        - host.docker.internal:host-gateway
        dns: &id001
        - 10.15.20.1
        deploy:
            resources:
                limits:
                    cpus: '2.0'
                    memory: 1024M
        healthcheck:
            test:
            - CMD
            - curl
            - -f
            - http://spark-master.mavasbel.vpn.itam.mx:6080
            interval: 10s
            timeout: 10s
            retries: 10
            start_period: 10s
    spark-jupyter:
        image: spark-jupyter:3.5.7-scala2.12-java17-python3-ubuntu
        container_name: spark-jupyter
        user: root
        command:
        - bash
        - -c
        - jupyter lab --ip=0.0.0.0 --port=6888 --no-browser --allow-root --NotebookApp.token=''
            --NotebookApp.password='' --NotebookApp.allow_origin='*' --ServerApp.disable_check_xsrf=True
            --ServerApp.root_dir=/opt/spark/work-dir
        environment:
        - PYSPARK_PYTHON=python3
        - SPARK_EXECUTOR_MEMORY=1536M
        volumes:
        - .\mount:/opt/spark/work-dir
        - .\vscode_server:/root/.vscode-server
        - .\jars\iceberg-spark-runtime-3.5_2.12-1.6.1.jar:/opt/spark/jars/iceberg-spark-runtime-3.5_2.12-1.6.1.jar
        networks:
        - spark-cluster
        hostname: spark-jupyter.mavasbel.vpn.itam.mx
        ports:
        - 6888:6888
        - 4040:4040
        extra_hosts:
        - host.docker.internal:host-gateway
        dns: *id001
        deploy:
            resources:
                limits:
                    cpus: '2.0'
                    memory: 2048M
        healthcheck:
            test:
            - CMD
            - curl
            - -f
            - http://spark-jupyter.mavasbel.vpn.itam.mx:6888
            interval: 10s
            timeout: 10s
            retries: 10
            start_period: 10s
    spark-worker-1:
        image: apache/spark:3.5.7-scala2.12-java17-python3-ubuntu
        container_name: spark-worker-1
        user: root
        command: bash -c "/opt/spark/bin/spark-class org.apache.spark.deploy.$$SPARK_MODE.$${SPARK_MODE^}
            $$SPARK_MASTER_URL --host spark-worker-1.mavasbel.vpn.itam.mx --webui-port
            $$SPARK_WORKER_WEBUI_PORT"
        environment:
        - PYSPARK_PYTHON=python3
        - SPARK_MODE=worker
        - SPARK_WORKER_CORES=2
        - SPARK_DAEMON_MEMORY=512M
        - SPARK_WORKER_MEMORY=2048M
        - SPARK_WORKER_WEBUI_PORT=6081
        - SPARK_MASTER_URL=spark://spark-master.mavasbel.vpn.itam.mx:6077
        volumes:
        - .\mount\spark-worker-1:/opt/spark/work-dir
        - .\jars\iceberg-spark-runtime-3.5_2.12-1.6.1.jar:/opt/spark/jars/iceberg-spark-runtime-3.5_2.12-1.6.1.jar
        networks:
        - spark-cluster
        hostname: spark-worker-1.mavasbel.vpn.itam.mx
        ports:
        - 6081:6081
        extra_hosts:
        - host.docker.internal:host-gateway
        dns: *id001
        deploy:
            resources:
                limits:
                    cpus: '2.0'
                    memory: 3G
        depends_on:
            spark-master:
                condition: service_healthy
            spark-jupyter:
                condition: service_healthy
        healthcheck:
            test:
            - CMD
            - curl
            - -f
            - http://spark-worker-1.mavasbel.vpn.itam.mx:6081
            interval: 10s
            timeout: 10s
            retries: 10
            start_period: 10s
    spark-worker-2:
        image: apache/spark:3.5.7-scala2.12-java17-python3-ubuntu
        container_name: spark-worker-2
        user: root
        command: bash -c "/opt/spark/bin/spark-class org.apache.spark.deploy.$$SPARK_MODE.$${SPARK_MODE^}
            $$SPARK_MASTER_URL --host spark-worker-2.mavasbel.vpn.itam.mx --webui-port
            $$SPARK_WORKER_WEBUI_PORT"
        environment:
        - PYSPARK_PYTHON=python3
        - SPARK_MODE=worker
        - SPARK_WORKER_CORES=2
        - SPARK_DAEMON_MEMORY=512M
        - SPARK_WORKER_MEMORY=2048M
        - SPARK_WORKER_WEBUI_PORT=6082
        - SPARK_MASTER_URL=spark://spark-master.mavasbel.vpn.itam.mx:6077
        volumes:
        - .\mount\spark-worker-2:/opt/spark/work-dir
        - .\jars\iceberg-spark-runtime-3.5_2.12-1.6.1.jar:/opt/spark/jars/iceberg-spark-runtime-3.5_2.12-1.6.1.jar
        networks:
        - spark-cluster
        hostname: spark-worker-2.mavasbel.vpn.itam.mx
        ports:
        - 6082:6082
        extra_hosts:
        - host.docker.internal:host-gateway
        dns: *id001
        deploy:
            resources:
                limits:
                    cpus: '2.0'
                    memory: 3G
        depends_on:
            spark-master:
                condition: service_healthy
            spark-jupyter:
                condition: service_healthy
            spark-worker-1:
                condition: service_started
        healthcheck:
            test:
            - CMD
            - curl
            - -f
            - http://spark-worker-2.mavasbel.vpn.itam.mx:6082
            interval: 10s
            timeout: 10s
            retries: 10
            start_period: 10s
    spark-worker-3:
        image: apache/spark:3.5.7-scala2.12-java17-python3-ubuntu
        container_name: spark-worker-3
        user: root
        command: bash -c "/opt/spark/bin/spark-class org.apache.spark.deploy.$$SPARK_MODE.$${SPARK_MODE^}
            $$SPARK_MASTER_URL --host spark-worker-3.mavasbel.vpn.itam.mx --webui-port
            $$SPARK_WORKER_WEBUI_PORT"
        environment:
        - PYSPARK_PYTHON=python3
        - SPARK_MODE=worker
        - SPARK_WORKER_CORES=2
        - SPARK_DAEMON_MEMORY=512M
        - SPARK_WORKER_MEMORY=2048M
        - SPARK_WORKER_WEBUI_PORT=6083
        - SPARK_MASTER_URL=spark://spark-master.mavasbel.vpn.itam.mx:6077
        volumes:
        - .\mount\spark-worker-3:/opt/spark/work-dir
        - .\jars\iceberg-spark-runtime-3.5_2.12-1.6.1.jar:/opt/spark/jars/iceberg-spark-runtime-3.5_2.12-1.6.1.jar
        networks:
        - spark-cluster
        hostname: spark-worker-3.mavasbel.vpn.itam.mx
        ports:
        - 6083:6083
        extra_hosts:
        - host.docker.internal:host-gateway
        dns: *id001
        deploy:
            resources:
                limits:
                    cpus: '2.0'
                    memory: 3G
        depends_on:
            spark-master:
                condition: service_healthy
            spark-jupyter:
                condition: service_healthy
            spark-worker-1:
                condition: service_started
            spark-worker-2:
                condition: service_started
        healthcheck:
            test:
            - CMD
            - curl
            - -f
            - http://spark-worker-3.mavasbel.vpn.itam.mx:6083
            interval: 10s
            timeout: 10s
            retries: 10
            start_period: 10s

```

In [65]:
!docker compose -f spark-cluster.docker-compose.yml up -d --wait

 Network spark-cluster_spark-cluster  Creating
 Network spark-cluster_spark-cluster  Created
 Container spark-jupyter  Creating
 Container spark-master  Creating
 Container spark-master  Created
 Container spark-jupyter  Created
 Container spark-worker-1  Creating
 Container spark-worker-1  Created
 Container spark-worker-2  Creating
 Container spark-worker-2  Created
 Container spark-worker-3  Creating
 Container spark-worker-3  Created
 Container spark-jupyter  Starting
 Container spark-master  Starting
 Container spark-master  Started
 Container spark-jupyter  Started
 Container spark-master  Waiting
 Container spark-jupyter  Waiting
 Container spark-jupyter  Healthy
 Container spark-master  Healthy
 Container spark-worker-1  Starting
 Container spark-worker-1  Started
 Container spark-jupyter  Waiting
 Container spark-master  Waiting
 Container spark-jupyter  Healthy
 Container spark-master  Healthy
 Container spark-worker-2  Starting
 Container spark-worker-2  Started
 Container s