# Provision Scripts

## References
- Using CLI: https://chameleoncloud.readthedocs.io/en/latest/technical/cli.html#the-openstack-rc-script
- Using Container API: https://python-chi.readthedocs.io/en/latest/modules/container.html
- Webinar: https://www.youtube.com/watch?v=1MPROv595LM

## Authentication
Before starting `jupyter-lab`, run `source openrc.sh` and enter your password when prompted. The `openrc.sh` file must be downloaded from the Chameleon GUI.

## Notes
- pegasus arm build container: `ryantanaka/debian10-arm64-pegasus-build-env`
  - architecture listed while building worker package: `aarch64_deb_10`
- to build arm container on x86_64 machine with qemu: `docker buildx build --platform linux/arm64 -t <tag> --progress=plain --push .`
- to run arm container on x86_64 machine with qemu: `docker container run -t -d -v /usr/bin/qemu-aarch64-static:/usr/bin/qemu-aarch64-static --name test-arm-pegasus --rm ryantanaka/condor8-arm64-worker`
- setting up qemu
  - https://medium.com/@artur.klauser/building-multi-architecture-docker-images-with-buildx-27d80f7e2408
  - https://github.com/multiarch/qemu-user-static/issues/100
- htcondor8 arm worker container: `ryantanaka/condor8-arm64-worker`
  - 411 MB image (only contains necessary dependencies to run condor8)
  - uses password authentication
- htcondor9 arm worker container: `ryantanaka/condor9-arm64-isi-worker`
  - 5.69 GB image (entire frozen build environment for htcondor) 
  - uses token authentication to connect to the central manager on `workflow.isi.edu`
  - must be run with the following environment variables set through docker run
      - `TOKEN`
      - `CONDOR_HOST`
- debian10 aarch64 worker packages located at:
    - https://download.pegasus.isi.edu/arm-worker-packages/

## Code

In [None]:
import logging
import pprint
from pathlib import Path
from typing import List

import chi
from chi import container
from chi import lease

# Configure the root logger so INFO-level (and more severe) records are shown.
logging.basicConfig(level=logging.INFO)
# Shared pretty-printer with a 2-space indent; not used by the cells visible here.
pp = pprint.PrettyPrinter(indent=2)

In [None]:
# initial setup: point python-chi at the CHI@Edge site and set the
# project name / project domain used for this notebook
chi.use_site("CHI@Edge")
chi.set("project_name", "CHI-210827")
chi.set("project_domain_name", "chameleon")

In [None]:
# util to reconfigure condor daemons once central manager ip is known
def configure_condor(container_id: str, configs: List[str]) -> None:
    """Upload condor config lines to a container and run ``condor_reconfig``.

    The entries of ``configs`` are written, one per line, to
    ``tmp/60-condor.conf`` (relative to the current working directory). The
    ``tmp`` directory is then passed to ``container.upload`` with
    ``/etc/condor/config.d`` as the destination, and ``condor_reconfig`` is
    executed inside the container.

    Args:
        container_id: ID of the container to reconfigure.
        configs: Condor configuration lines, one setting per entry.

    Raises:
        AssertionError: If the upload response status is not 200, or if
            ``condor_reconfig`` exits with a non-zero exit code.
    """
    print("the following configs will be sent to container: {} /etc/condor/config.d/60-condor.conf".format(container_id))
    print(*configs, sep="\n")

    # create a new config file and upload
    conf_dir = Path("tmp")
    # exist_ok=True: this helper is called once per container, so the staging
    # directory already exists on every call after the first. Without it the
    # second call raised FileExistsError.
    conf_dir.mkdir(exist_ok=True)
    conf_file = conf_dir / "60-condor.conf"

    # "w" truncates, so a config left over from a previous call is replaced
    with conf_file.open("w") as f:
        f.writelines(config + "\n" for config in configs)

    # NOTE(review): the whole staging directory is handed to upload, so any
    # other file sitting in ./tmp is presumably sent as well -- confirm.
    resp = container.upload(container_id, str(conf_dir), "/etc/condor/config.d")
    print(resp)
    assert resp[0].status_code == 200, "config upload did not return HTTP 200"

    print("config upload successful..")

    # invoke condor_reconfig
    resp = container.execute(container_id, "condor_reconfig")
    print(resp["output"])
    assert resp["exit_code"] == 0, "condor_reconfig exited with a non-zero code"

    print("condor_reconfig cmd successful..")
    

### Setup Condor8 Central Manager

In [None]:
# create a lease for a central manager
# start/end describe a one-day lease window
start, end = lease.lease_duration(days=1)
reservations = []
lease.add_device_reservation(reservations, count=1, device_model="4")
# Pass the computed window explicitly; previously start/end were computed but
# never used, so the lease length silently depended on the library default.
container_lease = lease.create_lease(
    "condor-central-manager",
    reservations,
    start_date=start,
    end_date=end,
)
cm_lease_id = container_lease["id"]

# block until the lease is active before any container is created on it
print("Waiting for lease to start ...")
lease.wait_for_active(cm_lease_id)
print("condor-central-manager lease_id: {}".format(cm_lease_id))
print("Done!")

In [None]:
# start condor central manager container
# The container is created against the device reservation that belongs to the
# central-manager lease (looked up from the lease id).
cm_cont = container.create_container(
                name="central-manager",
                image="ryantanaka/condor8-arm64-central-manager",
                # presumably the condor collector port; the collector address
                # queried further down in this notebook also uses 9618
                exposed_ports=["9618"],
                reservation_id=lease.get_device_reservation(cm_lease_id)
            )

# wait (up to 600, presumably seconds) for the container to become active
print("waiting for central-manager container to start")
container.wait_for_active(cm_cont.uuid, timeout=600)
print("central-manager container_id: {}".format(cm_cont.uuid))
print("done!")

In [None]:
# attach a floating ip to the central manager container; the returned value is
# reused below as CONDOR_HOST for both the central manager and the worker
central_manager_public_ip = container.associate_floating_ip(cm_cont.uuid)
print(central_manager_public_ip)

In [None]:
# reconfigure central manager with host ip
# (writes a single "CONDOR_HOST = <ip>" line and runs condor_reconfig in the container)
CONDOR_HOST_CONFIG = "CONDOR_HOST = {}".format(central_manager_public_ip)
configure_condor(cm_cont.uuid, [CONDOR_HOST_CONFIG])

In [None]:
# check to see if configs were updated: dump CONDOR_HOST as seen inside the
# central manager container and print the command output
resp = container.execute(cm_cont.uuid, "condor_config_val -dump CONDOR_HOST")
print(resp["output"])

In [None]:
# NOTE: this call is expected to fail with a timeout; simpler commands such as
# `condor_status -help` do work, however
resp = container.execute(cm_cont.uuid, "condor_status")
print(resp["output"])

### Setup Condor8 Worker

In [None]:
# create a lease for a worker
# start/end describe a one-day lease window
start, end = lease.lease_duration(days=1)
reservations = []
lease.add_device_reservation(reservations, count=1, device_model="4")
# Pass the computed window explicitly; previously start/end were computed but
# never used, so the lease length silently depended on the library default.
container_lease = lease.create_lease(
    "condor-worker",
    reservations,
    start_date=start,
    end_date=end,
)
worker_lease_id = container_lease["id"]

# block until the lease is active before any container is created on it
print("Waiting for lease to start ...")
lease.wait_for_active(worker_lease_id)
print("condor-worker lease_id: {}".format(worker_lease_id))
print("Done!")

In [None]:
# start condor worker
worker_cont = container.create_container(
    name="worker",
    image="ryantanaka/condor8-arm64-worker",
    # reservation_id must be the device reservation id, not the lease id.
    # The original passed worker_lease_id directly, unlike the other container
    # cells in this notebook, which look the reservation up from the lease.
    reservation_id=lease.get_device_reservation(worker_lease_id),
)


# wait (up to 600, presumably seconds) for the container to become active
print("waiting for worker container to start")
container.wait_for_active(worker_cont.uuid, timeout=600)
print("worker container_id: {}".format(worker_cont.uuid))
print("done!")

# TODO: need to handle error when container can't properly start
# TODO: need to handle error when worker can't connect to cm (add in timeout..)

In [None]:
# reconfigure worker with central manager ip
# (same "CONDOR_HOST = <ip>" line as sent to the central manager, so the
# worker is pointed at the central manager's floating ip)
CONDOR_HOST_CONFIG = "CONDOR_HOST = {}".format(central_manager_public_ip)
configure_condor(worker_cont.uuid, [CONDOR_HOST_CONFIG])

### Setup Condor9 Worker (connects to central manager on workflow)

In [None]:
# read the central manager token from disk; the path is resolved against the
# current working directory and surrounding whitespace is stripped
TOP_DIR = Path(".").resolve()
token = (TOP_DIR / "condor9-arm64-isi-worker/cm-token").read_text().strip()

In [None]:
# create a lease for a worker
# start/end describe a one-day lease window
start, end = lease.lease_duration(days=1)
reservations = []
lease.add_device_reservation(reservations, count=1, device_model="4")
# Pass the computed window explicitly; previously start/end were computed but
# never used, so the lease length silently depended on the library default.
container_lease = lease.create_lease(
    "condor9-worker",
    reservations,
    start_date=start,
    end_date=end,
)
worker_lease_id = container_lease["id"]
# the container is created against the device reservation, not the lease itself
worker_reservation_id = lease.get_device_reservation(worker_lease_id)

# block until the lease is active before any container is created on it
print("Waiting for lease to start ...")
lease.wait_for_active(worker_lease_id)
print("condor-worker lease_id: {}".format(worker_lease_id))
print("Done!")

In [None]:
# start condor worker
# TOKEN and CONDOR_HOST are the two environment variables the
# condor9-arm64-isi-worker image requires (see the Notes section above).
worker_cont = container.create_container(
                    name="condor9-isi-worker",
                    image="ryantanaka/condor9-arm64-isi-worker",
                    reservation_id=worker_reservation_id,
                    environment={
                        # token read from condor9-arm64-isi-worker/cm-token
                        "TOKEN": token,
                        # central manager this worker connects to
                        "CONDOR_HOST": "workflow.isi.edu"
                    }
                )


# wait (up to 600, presumably seconds) for the container to become active
print("waiting for worker container to start")
container.wait_for_active(worker_cont.uuid, timeout=600)
print("worker container_id: {}".format(worker_cont.uuid))
print("done!")

# TODO: need to handle error when container can't properly start
# TODO: need to handle error when worker can't connect to cm (add in timeout..)

### Check for Available Slot

In [None]:
# requires htcondor to be installed
import htcondor
import classad

def is_slot_available(
    hostname: str,
    *,
    collector_address: str = "workflow.isi.edu:9618",
) -> bool:
    """Check whether or not the given hostname has an available slot.

    Queries the collector for startd ads (projected to ``Name``, ``Activity``
    and ``State``) and looks for a slot whose ``Name`` ends with
    ``@<hostname>`` and whose ``State`` is ``"Unclaimed"`` and ``Activity`` is
    ``"Idle"``. The name of the first matching slot is printed.

    Args:
        hostname: Host name to match against the part of the slot name that
            follows the ``@``.
        collector_address: Address passed to ``htcondor.Collector``. Defaults
            to the collector this notebook was written against, so existing
            callers are unaffected.

    Returns:
        True if a matching unclaimed, idle slot was found, otherwise False.
    """
    col = htcondor.Collector(collector_address)
    slots = col.query(
        htcondor.AdTypes.Startd,
        projection=["Name", "Activity", "State"],
    )

    # the suffix does not change between ads, so build it once
    slot_suffix = "@{}".format(hostname)

    for s in slots:
        activity = s["Activity"]
        state = s["State"]
        name = s["Name"]

        if name.endswith(slot_suffix) and state == "Unclaimed" and activity == "Idle":
            print(name)
            return True

    return False