# Pegasus SAGE Fabric Setup

In [1]:
import sys
import os
import json
import traceback
import time
from datetime import datetime, timedelta
from dateutil import tz
from ipaddress import ip_network, ip_address, IPv4Address, IPv6Address, IPv4Network, IPv6Network

## Fabric Imports

In [2]:
# FablibManager is the FABlib entry point; it is aliased to `fablib_manager` here.
from fabrictestbed_extensions.fablib.fablib import FablibManager as fablib_manager

# One shared manager instance: every later cell reaches the testbed through `fablib`
# (list_sites, new_slice, get_slice).
# NOTE(review): presumably reads credentials/config from the local environment — confirm.
fablib = fablib_manager()

In [3]:
# Resource columns to display for every site, in display order.
site_fields = [
    "name",
    "cores_available",
    "ram_available",
    "disk_available",
    "nic_basic_available",
    "nic_connectx_5_available",
    "nic_connectx_6_available",
    "nvme_available",
    "tesla_t4_available",
    "rtx6000_available",
    "a30_available",
    "a40_available",
    "fpga_u280_available",
]

# Show current availability so the sites chosen below can be checked against it.
# force_refresh=True asks FABlib not to reuse a previously fetched listing.
try:
    fablib.list_sites(force_refresh=True, fields=site_fields)
except Exception as e:
    print(f"Exception: {e}")

Name,Cores Available,RAM Available,Disk Available,Basic NIC Available,ConnectX-5 Available,ConnectX-6 Available,NVMe Available,Tesla T4 Available,RTX6000 Available,A30 Available,A40 Available,U280 Available
PSC,339,1400,55948,367,2,2,10,0,0,4,0,0
EDUKY,73611,8690,2188,4369,0,0,0,0,0,0,0,0
SEAT,148,772,4156,246,2,0,8,0,0,0,0,0
FIU,458,1950,57945,612,4,2,15,0,4,0,0,0
MAX,538,2278,106618,614,4,2,16,4,6,0,0,1
GPN,457,1870,59510,616,4,2,16,1,6,0,0,1
CERN,74,1564,45498,719,4,6,22,0,0,5,2,0
GATECH,578,1912,57869,604,1,2,16,0,0,7,0,1
WASH,316,1282,63181,309,1,2,10,2,3,0,0,1
SRI,360,1458,56318,378,2,2,10,0,0,4,0,1


## Fabric Variables

In [5]:
# Sites that host worker nodes. The commented list keeps the other candidate
# sites handy; swap entries in as availability allows.
#site_names = ['INDI', 'LOSA', 'UCSD', 'GATECH', 'PSC', 'STAR', 'SALT', 'UTAH', 'MICH', 'FIU', 'GPN', 'WASH', 'RUTG', 'DALL', 'CLEM', 'AMST']
site_names = ['UCSD', 'CLEM']

# FABRIC config: every resource name in the slice shares this prefix.
fabric_prefix = "pegasus-submit-sage-"
fabric_slice_name = fabric_prefix+'experiment'
fabric_os_image = 'default_ubuntu_20'

# Submit node sizing (cores / RAM / disk as passed to add_node).
fabric_submit_name = fabric_prefix+'submit'
fabric_submit_site = 'LOSA'
fabric_submit_cores = 16
fabric_submit_ram = 32
fabric_submit_disk = 500
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
fabric_submit_ip_addr = "192.168.1.10"

# Number of worker nodes requested at each site; workers are numbered from 1.
workers_per_site = 1

# One request record per worker. The keys mirror the add_node() keyword
# arguments used by the slice-creation cell.
# NOTE(review): the recorded slice listing shows 64 for worker RAM although 48
# is requested here — presumably the testbed rounds up to an available size.
worker_nodes = [
    {
        "name": f"{site}-worker-{index}",
        "site": site,
        "cores": 24,
        "ram": 48,
        "disk": 500,
    }
    for site in site_names
    for index in range(1, workers_per_site + 1)
]

## Create Fabric Slice

In [6]:
# Build the slice request (one submit node plus every configured worker, each
# with add_fabnet() called on it) and submit it.
try:
    # Start from an empty slice request
    fabric_slice = fablib.new_slice(name=fabric_slice_name)

    # Submit node
    fabric_submit = fabric_slice.add_node(
        name=fabric_submit_name,
        site=fabric_submit_site,
        image=fabric_os_image,
        cores=fabric_submit_cores,
        ram=fabric_submit_ram,
        disk=fabric_submit_disk,
    )
    fabric_submit.add_fabnet()

    # Worker nodes, one per entry in worker_nodes
    for worker_spec in worker_nodes:
        worker_node = fabric_slice.add_node(
            name=worker_spec["name"],
            site=worker_spec["site"],
            image=fabric_os_image,
            cores=worker_spec["cores"],
            ram=worker_spec["ram"],
            disk=worker_spec["disk"],
        )
        worker_node.add_fabnet()

    # Send the request to the testbed
    fabric_slice.submit()
except Exception as e:
    print(f"Exception: {e}")
    traceback.print_exc()


Retry: 12, Time: 349 sec


0,1
ID,8bd37b04-f0d0-4451-afba-45745b2ab6c9
Name,pegasus-submit-sage-experiment
Lease Expiration (UTC),2024-02-09 02:04:02 +0000
Lease Start (UTC),2024-02-08 02:04:02 +0000
Project ID,1a0f06eb-75ea-4258-9b92-ff4d8cf999b0
State,StableOK


ID,Name,Cores,RAM,Disk,Image,Image Type,Host,Site,Username,Management IP,State,Error,SSH Command,Public SSH Key File,Private SSH Key File
ffed63a2-db50-40b8-87c3-683f5c6e9d92,CLEM-worker-1,24,64,500,default_ubuntu_20,qcow2,clem-w1.fabric-testbed.net,CLEM,ubuntu,2620:103:a006:12:f816:3eff:fe0e:2153,Active,,ssh ${Username}@${Management IP},/home/fabric/work/fabric_config/keys/fabric_sliver_key.pub,/home/fabric/work/fabric_config/keys/fabric_sliver_key
f2147481-86d9-4381-9ad0-e4d853ee4eb7,UCSD-worker-1,24,64,500,default_ubuntu_20,qcow2,ucsd-w5.fabric-testbed.net,UCSD,ubuntu,132.249.252.136,Active,,ssh ${Username}@${Management IP},/home/fabric/work/fabric_config/keys/fabric_sliver_key.pub,/home/fabric/work/fabric_config/keys/fabric_sliver_key
ff4f3270-0d11-4f85-9c9b-4cb7e950e14d,pegasus-submit-sage-submit,16,32,500,default_ubuntu_20,qcow2,losa-w2.fabric-testbed.net,LOSA,ubuntu,2001:400:a100:3070:f816:3eff:fe77:ec51,Active,,ssh ${Username}@${Management IP},/home/fabric/work/fabric_config/keys/fabric_sliver_key.pub,/home/fabric/work/fabric_config/keys/fabric_sliver_key


ID,Name,Layer,Type,Site,Subnet,Gateway,State,Error
04e631b4-fe95-4630-a540-f16126caea3f,FABNET_IPv4_CLEM,L3,FABNetv4,CLEM,10.136.9.0/24,10.136.9.1,Active,
7847f734-5656-4c04-b27e-4d613be5d34e,FABNET_IPv4_LOSA,L3,FABNetv4,LOSA,10.137.3.0/24,10.137.3.1,Active,
07443828-9933-42d0-b279-b74b026cd086,FABNET_IPv4_UCSD,L3,FABNetv4,UCSD,10.134.134.0/24,10.134.134.1,Active,


Name,Short Name,Node,Network,Bandwidth,Mode,VLAN,MAC,Physical Device,Device,IP Address,Numa Node
pegasus-submit-sage-submit-FABNET_IPv4_LOSA_nic-p1,p1,pegasus-submit-sage-submit,FABNET_IPv4_LOSA,100,auto,,0A:4C:01:D1:D8:41,enp6s0,enp6s0,10.137.3.2,4
UCSD-worker-1-FABNET_IPv4_UCSD_nic-p1,p1,UCSD-worker-1,FABNET_IPv4_UCSD,100,auto,,02:EF:08:B0:50:5B,enp6s0,enp6s0,10.134.134.2,4
CLEM-worker-1-FABNET_IPv4_CLEM_nic-p1,p1,CLEM-worker-1,FABNET_IPv4_CLEM,100,auto,,06:E6:7C:D5:FB:76,enp6s0,enp6s0,10.136.9.2,6



Time to print interfaces 350 seconds


## Sanity Check: Ping All Nodes From Submit

In [7]:
# Connectivity check: ping every worker's FABNet IPv4 address from the submit node.
# Relies on `fabric_slice` from the slice-creation cell.
try:
    worker_ip_addresses = []
    fabric_submit = fabric_slice.get_node(name=fabric_submit_name)
    submit_network = f'FABNET_IPv4_{fabric_submit.get_site()}'
    fabric_submit_addr = fabric_submit.get_interface(network_name=submit_network).get_ip_addr()

    # Each node's network is named after its own site.
    for w in worker_nodes:
        worker_node = fabric_slice.get_node(name=w["name"])
        worker_network = f'FABNET_IPv4_{worker_node.get_site()}'
        worker_interface = worker_node.get_interface(network_name=worker_network)
        worker_ip_addresses.append(worker_interface.get_ip_addr())

    # Two echo requests per worker; the ping output is shown in the cell output.
    for worker_node_addr in worker_ip_addresses:
        stdout, stderr = fabric_submit.execute(f'ping -c 2 {worker_node_addr}')
except Exception as e:
    print(f"Exception: {e}")
    traceback.print_exc()

PING 10.134.134.2 (10.134.134.2) 56(84) bytes of data.
64 bytes from 10.134.134.2: icmp_seq=1 ttl=62 time=3.98 ms
64 bytes from 10.134.134.2: icmp_seq=2 ttl=62 time=2.32 ms

--- 10.134.134.2 ping statistics ---
2 packets transmitted, 2 received, 0% packet loss, time 1001ms
rtt min/avg/max/mdev = 2.319/3.151/3.984/0.832 ms
PING 10.136.9.2 (10.136.9.2) 56(84) bytes of data.
64 bytes from 10.136.9.2: icmp_seq=1 ttl=59 time=80.5 ms
64 bytes from 10.136.9.2: icmp_seq=2 ttl=59 time=79.4 ms

--- 10.136.9.2 ping statistics ---
2 packets transmitted, 2 received, 0% packet loss, time 1001ms
rtt min/avg/max/mdev = 79.350/79.911/80.473/0.561 ms


## Install Software on Fabric

In [8]:
# Prepends three extra resolvers to /etc/resolv.conf on nodes whose management
# address is IPv6.
# NOTE(review): these look like public DNS64 resolvers (nat64.net) — confirm.
#
# The `\n` sequences must reach sed as a literal backslash + "n": a real newline
# inside the s/// replacement makes GNU sed fail with "unterminated `s' command",
# and quiet=True would hide that error. Hence the raw string.
# NOTE(review): re-running this cell inserts the resolvers again (not idempotent).
dns64_sed_command = (
    "sudo sed -i "
    r"'s/nameserver/nameserver 2a01:4f9:c010:3f02::1\nnameserver 2a00:1098:2c::1\nnameserver 2a00:1098:2b::1\nnameserver/'"
    " /etc/resolv.conf"
)

# Stage the role-specific setup script on every node as /home/ubuntu/setup.sh
# and patch DNS on IPv6-managed nodes.
try:
    fabric_slice = fablib.get_slice(name=fabric_slice_name)

    for fabric_node in fabric_slice.get_nodes():
        # The submit node and the workers get different scripts under the same remote name.
        if fabric_node.get_name() == fabric_submit_name:
            fabric_node.upload_file("fabric-submit.sh", "/home/ubuntu/setup.sh")
        else:
            fabric_node.upload_file("fabric-worker.sh", "/home/ubuntu/setup.sh")

        if isinstance(ip_address(fabric_node.get_management_ip()), IPv6Address):
            fabric_node.execute(dns64_sed_command, quiet=True)

except Exception as e:
    print(f"Exception: {e}")
    traceback.print_exc()

In [9]:
# Run /home/ubuntu/setup.sh on every node, then wait for all of them to finish.
try:
    fabric_slice = fablib.get_slice(name=fabric_slice_name)

    fabric_submit = fabric_slice.get_node(name=fabric_submit_name)
    submit_network = f'FABNET_IPv4_{fabric_submit.get_site()}'
    fabric_submit_addr = fabric_submit.get_interface(network_name=submit_network).get_ip_addr()

    # Start setup.sh on each node; execute_thread() returns a handle whose
    # result() is collected in the loop below.
    execute_threads = {}
    for fabric_node in fabric_slice.get_nodes():
        node_network = f'FABNET_IPv4_{fabric_node.get_site()}'
        os_interface = fabric_node.get_interface(network_name=node_network).get_os_interface()
        # Script arguments: the node's OS interface, the submit node's address, the submit node's name.
        config_command = f"sudo bash /home/ubuntu/setup.sh {os_interface} {fabric_submit_addr} {fabric_submit_name}"
        print(f"Starting config on node {fabric_node.get_name()}")
        execute_threads[fabric_node] = fabric_node.execute_thread(config_command)

    # Wait on each handle in the order the jobs were started.
    for fabric_node, thread in execute_threads.items():
        print(f"Waiting for result from node {fabric_node.get_name()}")
        stdout, stderr = thread.result()

    print("All Done!")
except Exception as e:
    print(f"Exception: {e}")
    traceback.print_exc()

Starting config on node pegasus-submit-sage-submit
Starting config on node UCSD-worker-1
Starting config on node CLEM-worker-1
Waiting for result from node pegasus-submit-sage-submit
Waiting for result from node UCSD-worker-1
Waiting for result from node CLEM-worker-1
All Done!


In [10]:
# Copy the workflow archive to the submit node and unpack it there.
try:
    fabric_slice = fablib.get_slice(name=fabric_slice_name)
    fabric_submit = fabric_slice.get_node(name=fabric_submit_name)

    local_archive = "beehive-cloud-processing.tar.gz"
    remote_archive = "/home/ubuntu/beehive-cloud-processing.tar.gz"
    extract_command = "tar -xzvf /home/ubuntu/beehive-cloud-processing.tar.gz"

    fabric_submit.upload_file(local_archive, remote_archive)
    # -v lists every extracted path in the cell output.
    fabric_submit.execute(extract_command)
except Exception as e:
    print(f"Fail: {e}")
    traceback.print_exc()


beehive-cloud-processing/
beehive-cloud-processing/workflow.py
beehive-cloud-processing/executables/
beehive-cloud-processing/executables/raingauge_totals.py
beehive-cloud-processing/executables/temperature_stats.py


In [11]:
# Launch the workflow script on the submit node.
try:
    fabric_slice = fablib.get_slice(name=fabric_slice_name)
    # Resolve the submit node from the freshly fetched slice so this cell does
    # not depend on a `fabric_submit` left over from an earlier cell.
    fabric_submit = fabric_slice.get_node(name=fabric_submit_name)

    fabric_submit.execute("/home/ubuntu/beehive-cloud-processing/workflow.py")
except Exception as e:
    print(f"Fail: {e}")
    traceback.print_exc()

[[1;32m#########################[0m] 100.0% ..Success ([1;34mUnready: 0[0m, [1;32mCompleted: 58[0m, [1;33mQueued: 0[0m, [1;36mRunning: 0[0m, [1;31mFailed: 0[0m)
[31m 2024.02.08 04:19:18.887 UTC: [ERROR]  Unable to load from directory  /home/ubuntu/inputs class java.lang.RuntimeException: Unable to connect to replica catalog implementation edu.isi.pegasus.planner.catalog.replica.impl.Directory with props {directory=/home/ubuntu/inputs}
2024.02.08 04:19:19.113 UTC:
2024.02.08 04:19:19.119 UTC:   -----------------------------------------------------------------------
2024.02.08 04:19:19.124 UTC:   File for submitting this DAG to HTCondor           : SageCloudWorkflow-0.dag.condor.sub
2024.02.08 04:19:19.129 UTC:   Log of DAGMan debugging messages                 : SageCloudWorkflow-0.dag.dagman.out
2024.02.08 04:19:19.134 UTC:   Log of HTCondor library output                     : SageCloudWorkflow-0.dag.lib.out
2024.02.08 04:19:19.139 UTC:   Log of HTCondor library error mes

## Extend Fabric Slice

In [None]:
# Extend the slice lease so it ends `lease_extension_days` days from now (UTC).
lease_extension_days = 14
# Timestamp format passed to renew(), e.g. "2024-02-09 02:04:02 +0000".
end_date = (datetime.now(tz=tz.tzutc()) + timedelta(days=lease_extension_days)).strftime("%Y-%m-%d %H:%M:%S %z")
try:
    fabric_slice = fablib.get_slice(name=fabric_slice_name)
    fabric_slice.renew(end_date)

    # Fetch the slice again so the printed lease end reflects the renewal.
    fabric_slice = fablib.get_slice(name=fabric_slice_name)
    print(f'New lease end time: {fabric_slice.get_lease_end()}')
except Exception as e:
    print(f"Fail: {e}")
    traceback.print_exc()

## Cleanup Fabric (This Deletes The Deployment)

In [None]:
# Delete the slice named `fabric_slice_name`. Run only when the deployment is
# no longer needed.
try:
    # Look the slice up by name so this cell works without earlier cells' state
    # (other than the config and fablib cells).
    fabric_slice = fablib.get_slice(fabric_slice_name)
    fabric_slice.delete()
except Exception as e:
    # Unlike the other cells, only the message is printed here (no traceback).
    print(f"Exception: {e}")