# PyTorch


A simple PyTorch example.

## Container

An Apptainer image is used to install all the software pieces. The definition of the container can be found in `containers/pytorch.def`. To make this example run quickly, an already built version of the image is downloaded.

## Compute Job

The compute job will run on a remote host, using a GPU. The job runs `bin/my-code.py`, which is where the user code should go.

## Workflow


In [None]:
#!/usr/bin/env python3

import os
import sys
import logging
import tarfile
import requests
import numpy as np
from pathlib import Path
from argparse import ArgumentParser
from datetime import datetime
from datetime import timedelta

# --- Import Pegasus API -----------------------------------------------------------
from Pegasus.api import *
logging.basicConfig(level=logging.DEBUG)

# --- Main workflow class ----------------------------------------------------------
class PyTorchWorkflow():
    """Assemble, write and drive a one-job Pegasus workflow named "pytorch".

    The ``create_*`` methods each build one Pegasus object (properties, site
    catalog, transformation catalog, replica catalog, workflow) and store it
    on the instance. ``write()`` then serialises them, and the remaining
    methods forward to the corresponding ``Workflow`` client calls.

    Client calls that raise ``PegasusClientError`` have the error printed
    instead of propagated, so a failing call does not raise out of the method.
    """

    # Pegasus objects; each one stays None until its create_* method has run.
    wf = sc = tc = rc = props = None

    # Per-instance settings; the real values are assigned in __init__.
    dagfile = wf_dir = shared_scratch_dir = local_storage_dir = None
    wf_name = "pytorch"

    # --- Init ---------------------------------------------------------------------
    def __init__(self, dagfile="workflow.yml"):
        """Record the workflow file name and derive the working directories.

        Args:
            dagfile: Name stored on the instance as ``self.dagfile``. No
                method of this class reads it back.
        """
        self.dagfile = dagfile
        # Everything is laid out relative to the resolved current directory.
        base_dir = str(Path(".").resolve())
        self.wf_dir = base_dir
        self.shared_scratch_dir = os.path.join(base_dir, "scratch")
        self.local_storage_dir = os.path.join(base_dir, "output")

    # --- Shared error handling for Pegasus client calls ---------------------------
    def _run_client_call(self, call, **kwargs):
        """Invoke ``call(**kwargs)``; print a PegasusClientError instead of raising.

        The result of ``call`` is intentionally discarded, so every public
        wrapper built on this helper returns None.
        """
        try:
            call(**kwargs)
        except PegasusClientError as e:
            print(e)

    # --- Write files in directory -------------------------------------------------
    def write(self):
        """Write the catalogs, the properties and finally the workflow itself.

        The site catalog is optional and skipped when it was never created.
        The properties, replica catalog and transformation catalog must
        already exist.
        """
        if self.sc is not None:
            self.sc.write()
        for component in (self.props, self.rc, self.tc):
            component.write()

        self._run_client_call(self.wf.write)

    # --- Plan and Submit the workflow ----------------------------------------------
    def plan_submit(self):
        """Plan the workflow and submit it (``Workflow.plan(submit=True)``)."""
        self._run_client_call(self.wf.plan, submit=True)

    # --- Get status of the workflow -----------------------------------------------
    def status(self):
        """Request the long-form status (``Workflow.status(long=True)``)."""
        self._run_client_call(self.wf.status, long=True)

    # --- Wait for the workflow to finish -----------------------------------------------
    def wait(self):
        """Call ``Workflow.wait()`` on the workflow."""
        self._run_client_call(self.wf.wait)

    # --- Get statistics of the workflow -----------------------------------------------
    def statistics(self):
        """Call ``Workflow.statistics()`` on the workflow."""
        self._run_client_call(self.wf.statistics)

    # --- Configuration (Pegasus Properties) ---------------------------------------
    def create_pegasus_properties(self):
        """Create the Pegasus properties with integrity checking set to "none"."""
        self.props = Properties()
        self.props["pegasus.integrity.checking"] = "none"

    # --- Site Catalog -------------------------------------------------------------
    def create_sites_catalog(self, exec_site_name="condorpool"):
        """Create a site catalog holding the "local" site and one execution site.

        Args:
            exec_site_name: Name given to the execution site.
        """
        self.sc = SiteCatalog()

        # Local site: a shared scratch and a local storage directory, each
        # served through a file:// URL pointing at the same path.
        scratch = Directory(Directory.SHARED_SCRATCH, self.shared_scratch_dir)
        scratch.add_file_servers(
            FileServer("file://" + self.shared_scratch_dir, Operation.ALL)
        )
        storage = Directory(Directory.LOCAL_STORAGE, self.local_storage_dir)
        storage.add_file_servers(
            FileServer("file://" + self.local_storage_dir, Operation.ALL)
        )
        local_site = Site("local")
        local_site.add_directories(scratch, storage)

        # Execution site: container universe, condor style, a GPU requirement
        # and unbuffered Python output in the job environment.
        exec_site = Site(exec_site_name)
        exec_site.add_condor_profile(universe="container")
        exec_site.add_pegasus_profile(style="condor")
        exec_site.add_profiles(Namespace.CONDOR, key="require_gpus", value="Capability >= 7.0")
        exec_site.add_profiles(Namespace.ENV, PYTHONUNBUFFERED='1')

        # To exclude the ACCESS Pegasus TestPool, enable:
        #exec_site.add_condor_profile(requirements="TestPool =!= True")

        # If you want to run on OSG, please specify your OSG ProjectName. For
        # testing, feel free to use the USC_Deelman project (the PI of the
        # Pegasus project). For production work, please use your own project.
        exec_site.add_profiles(Namespace.CONDOR, key="+ProjectName", value="\"USC_Deelman\"")

        self.sc.add_sites(local_site, exec_site)

    # --- Transformation Catalog (Executables and Containers) ----------------------
    def create_transformation_catalog(self, exec_site_name="condorpool"):
        """Create the transformation catalog: one container and one executable.

        Args:
            exec_site_name: Accepted for symmetry with ``create_sites_catalog``;
                this method does not use it.
        """
        self.tc = TransformationCatalog()

        # Pre-built image fetched over HTTPS.
        container = Container(
            "pytorch_container",
            container_type=Container.SINGULARITY,
            image="https://download.pegasus.isi.edu/containers/pytorch/pytorch.sif",
            image_site="web",
        )

        # Main job wrapper: a stageable script that lives on the local site
        # and runs inside the container.
        user_code = Transformation(
            "my-code",
            site="local",
            pfn=self.wf_dir+"/bin/my-code.py",
            is_stageable=True,
            container=container,
        )
        # Note how GPUs and other resources are requested.
        user_code.add_pegasus_profiles(cores=1, gpus=1, memory="6 GB", diskspace="15 GB")

        self.tc.add_containers(container)
        self.tc.add_transformations(user_code)

    # --- Replica Catalog ----------------------------------------------------------
    def create_replica_catalog(self):
        """Create an empty replica catalog (this example has no input files)."""
        self.rc = ReplicaCatalog()

        # Workflow inputs would be registered here, for example:
        #self.rc.add_replica("local", "data.txt", \
        #                             os.path.join(self.wf_dir, "inputs/data.txt"))

    # --- Create Workflow ----------------------------------------------------------
    def create_workflow(self):
        """Create the workflow and add its single "my-code" job."""
        self.wf = Workflow(self.wf_name, infer_dependencies=True)

        # Inputs and outputs can be attached to the job like this:
        #   .add_inputs(data_txt)
        #   .add_outputs(output_txt, stage_out=True)
        user_job = Job("my-code")

        self.wf.add_jobs(user_job)

            
dagfile = "workflow.yml"
workflow = PyTorchWorkflow(dagfile=dagfile)

# Each generation step is announced on stdout right before it runs; the
# steps execute in the order listed here.
generation_steps = (
    ("Creating execution sites...",
     lambda: workflow.create_sites_catalog("condorpool")),
    ("Creating workflow properties...",
     lambda: workflow.create_pegasus_properties()),
    ("Creating transformation catalog...",
     lambda: workflow.create_transformation_catalog("condorpool")),
    ("Creating replica catalog...",
     lambda: workflow.create_replica_catalog()),
    ("Creating workflow dag...",
     lambda: workflow.create_workflow()),
)
for announcement, run_step in generation_steps:
    print(announcement)
    run_step()

# Serialise the catalogs, the properties and the workflow.
workflow.write()
print("Workflow has been generated!")

## Plan and Submit the Workflow

We will now plan and submit the workflow for execution. By default, jobs run on the **condorpool** site, i.e., the selected ACCESS resource.

In [None]:
# Plan the workflow and submit it (Workflow.plan(submit=True)); a
# PegasusClientError is printed by the wrapper rather than raised.
workflow.plan_submit()

After the workflow has been successfully planned and submitted, you can use the Python `Workflow` object to monitor its status. The status output shows the number of jobs in each state, including whether jobs are idle or running.

In [None]:
# Request the long-form status report (Workflow.status(long=True)).
workflow.status()

## Wait for the workflow to finish

In [None]:
# Calls Workflow.wait(); a PegasusClientError is printed by the wrapper
# rather than raised.
workflow.wait()