In [4]:
import logging, os, sys
import numpy as np
import urllib.request
import shutil
import subprocess
import zipfile
import pickle
from pathlib import Path
import pandas as pd
from Pegasus.api import *

### Use  the requirements .txt file to get the needed packages
! sudo pip3 install opencv-python <br>
! sudo pip3 install --upgrade setuptools <br>
! sudo pip3 install opencv-python <br>
! sudo pip3 install optuna==2.0.0 <br>
! sudo pip3 install matplotlib <br>
! sudo pip3 install torch <br>
! sudo pip3 install scikit-image <br>
! sudo pip3 install torchvision <br>
! sudo pip3 install pytorchtools <br>
! sudo pip3 install joblib

In [25]:
# JUST TO BE SURE: check the imports of the following

import glob, os
import argparse
import tarfile
import time
import signal
import joblib


import pickle
import cv2

import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import nn, optim
from skimage import io, transform
import torchvision.transforms as transforms
import optuna

#custom utils
from utils.pytorchtools import EarlyStopping
from utils.util_checkpoint import extract_checkpoints, checkpoints_tar
from utils.model_selection import BasicNet, PretrainedVGG16, PretrainedDenseNet121
from utils.data_loader import CatDogsDataset


from IPython import embed

In [26]:
from util_workflow import download_data, unzip_flatten_data, return_corrupted_files, return_input_files
from util_workflow import add_input_wf_files, add_output_job1, add_output_job2, return_filenames_job2
from util_workflow import split_data_filenames, add_input_tune_model,create_tar_and_pkl, create_pkl

In [27]:
# --- Import Pegasus API ---
from Pegasus.api import *
logging.basicConfig(level=logging.DEBUG)
props = Properties()
props["dagman.retry"] = "2"
props["pegasus.transfer.arguments"] = "-m 1"
props.write()

In [28]:
dataset_link = "https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_3367a.zip"
zip_data = "kagglecatsanddogs_3367a.zip"
directory_to_extract_to = "."

DOWNLOAD_DATA = False
DATASET_SIZE = 12
DATA_DIR = "dev_data/"
UTILS_DIR = "utils/"
DATA_SPLIT_FILE = "data_split_id_list.pickle"

arch_names = ["basicnet", "densenet121", "vgg16"]
CATS = "PetImages/Cat"
DOGS = "PetImages/Dog"
LABELS = {CATS: 0, DOGS: 1}
NUM_EPOCHS = 4
NUM_TRIALS = 3

if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)   
    
if DOWNLOAD_DATA == True:
    download_data(dataset_link)

In [29]:
# Avoid corrupted files for now
corrupted_files = return_corrupted_files("corrupted_files.txt")

# Get names of image files that will serve as inputs to the workflow
input_file_names = return_input_files(corrupted_files, DATASET_SIZE, DATA_DIR, LABELS)

rc = ReplicaCatalog()

input_preprocessing1 = add_input_wf_files(input_file_names, DATA_DIR,rc)
input_preprocessing2 = add_output_job1(input_file_names)

output_filenames_preprocessing2 =  return_filenames_job2(input_file_names)
output_preprocessing2 = add_output_job2(output_filenames_preprocessing2)


train_filenames,val_filenames,test_filenames,files_split_dict = split_data_filenames(output_filenames_preprocessing2)

data_split_filenames = train_filenames + val_filenames + test_filenames
output_data_split = add_input_tune_model(data_split_filenames)

tune_model_files = train_filenames + val_filenames
input_tune_model = add_input_tune_model(tune_model_files)

In [30]:
DATA_SPLIT_FILE

'data_split_id_list.pickle'

In [31]:
with open(DATA_SPLIT_FILE, 'wb') as filename:
    pickle.dump(files_split_dict, filename)
    
data_split_file = File(DATA_SPLIT_FILE)
rc.add_replica("local", DATA_SPLIT_FILE, str(Path(".").resolve() / DATA_SPLIT_FILE))

<Pegasus.api.replica_catalog.ReplicaCatalog at 0x7f310c01a208>

In [32]:
# ADDITIONAL PYTHON SCRIPS NEEDED BY TUNE_MODEL
data_loader_fn = "data_loader.py"
data_loader_file = File(data_loader_fn )
rc.add_replica("local", data_loader_fn, os.path.join(os.getcwd(), UTILS_DIR + data_loader_fn ))

model_selction_fn = "model_selection.py"
model_selction_file = File(model_selction_fn )
rc.add_replica("local", model_selction_fn, os.path.join(os.getcwd(), UTILS_DIR + model_selction_fn ))

util_checkpoint_fn = "util_checkpoint.py"
util_checkpoint_file = File(util_checkpoint_fn )
rc.add_replica("local", util_checkpoint_fn, os.path.join(os.getcwd(), UTILS_DIR + util_checkpoint_fn ))

early_stopping_fn = "pytorchtools.py"
early_stopping_file = File(early_stopping_fn )
rc.add_replica("local", early_stopping_fn, os.path.join(os.getcwd(), UTILS_DIR + early_stopping_fn ))

<Pegasus.api.replica_catalog.ReplicaCatalog at 0x7f310c01a208>

In [33]:
create_pkl("vgg16")

'hpo_study_checkpoint_vgg16.pkl'

In [34]:
# FILES FOR TUNE_MODEL.py VGG 16
#----------------------------------------------------------------------------------------------------------------
vgg16_pkl = create_pkl("vgg16")
final_vgg16_pkl =  "final_hpo_study_checkpoint_vgg16.pkl"

vgg16_pkl_file = File(vgg16_pkl)
rc.add_replica("local", vgg16_pkl, os.path.join(os.getcwd(), vgg16_pkl))

final_vgg16_pkl_file = File(final_vgg16_pkl)

rc.write()

In [35]:
# Create and add our transformations to the TransformationCatalog.
tc = TransformationCatalog()

# Data preprocessing part 1 
preprocess1 = Transformation(
                "preprocess1",
                site="local",
                pfn = str(Path(".").parent.resolve() / "bin/data_preprocessing1.py"), 
                is_stageable= True
            )

# Data preprocessing part 2
preprocess2 = Transformation(
                    "preprocess2", 
                   site = "local", 
                    pfn = str(Path(".").parent.resolve() / "bin/data_preprocessing2.py"), 
                    is_stageable = True
              )

# Data Split
data_split = Transformation(
                    "data_split", 
                    site = "local", 
                    pfn = str(Path(".").parent.resolve() / "bin/data_split.py"), 
                    is_stageable = True
                )


# Tune models
tune_model_vgg16 = Transformation(
                    "tune_model_vgg16", 
                    site = "local", 
                   pfn = str(Path(".").parent.resolve() / "bin/tune_model.py"), 
                    is_stageable = True
                )

# Choose best model and hyperparameters
choose_best_model = Transformation(
                    "choose_best_model", 
                    site = "local", 
                    pfn = str(Path(".").parent.resolve() / "bin/choose_best_model.py"), 
                    is_stageable = True
                )

tc.add_transformations(preprocess1 , preprocess2,data_split,tune_model_vgg16,choose_best_model)
tc.write()

In [36]:
# --- Workflow -----------------------------------------------------------------
# Set infer_dependencies=True so that they are inferred based on job input and output file usage.
wf = Workflow("catVsdog-test-wf", infer_dependencies=True)

# Create Jobs. These objects store just that. The transformation (executable) used by the job.
#The arguments passed to the executable. The input files used and the output files produced. 


job_preprocess_1 = Job(preprocess1)\
                    .add_inputs(*input_preprocessing1)\
                    .add_outputs(*input_preprocessing2)


job_preprocess_2 = Job(preprocess2)\
                    .add_inputs(*input_preprocessing2)\
                    .add_outputs(*output_preprocessing2 )


job_data_split = Job(data_split)\
                    .add_inputs(data_split_file,*output_preprocessing2)\
                    .add_outputs(*output_data_split) 



job_tune_model_vgg16 = Job(tune_model_vgg16)\
                    .add_args("-a","vgg16","-arch", "vgg16", NUM_EPOCHS, NUM_TRIALS)\
                    .add_checkpoint(vgg16_pkl_file, stage_out=True)\
                    .add_inputs(*input_tune_model,data_loader_file,model_selction_file, early_stopping_file,util_checkpoint_file )\
                    .add_outputs(final_vgg16_pkl_file)\
                    .set_stdout("output_vgg16.txt")\
                    .add_profiles(Namespace.PEGASUS, key="checkpoint.time", value=1)\
                    .add_profiles(Namespace.PEGASUS, key="maxwalltime", value=2)

job_choose_best_model = Job(choose_best_model)\
                    .add_args("-sf","final_hpo_study_checkpoint_vgg16.pkl")\
                    .add_inputs(final_vgg16_pkl_file)\
                    .add_outputs(File("best_model.txt")) 

In [37]:
wf.add_jobs(
    job_preprocess_1,
    job_preprocess_2,
    job_data_split,
    job_tune_model_vgg16,
    job_choose_best_model
)


<Pegasus.api.workflow.Workflow at 0x7f310bfef6a0>

In [38]:
try:
    wf.plan(submit=True)\
    .wait()\
    .analyze()\
    .statistics()
except PegasusClientError as e:
    print(e.output)


################
# pegasus-plan #
################
2021.02.08 06:25:14.574 UTC:
2021.02.08 06:25:14.579 UTC:   -----------------------------------------------------------------------
2021.02.08 06:25:14.585 UTC:   File for submitting this DAG to HTCondor           : catVsdog-test-wf-0.dag.condor.sub
2021.02.08 06:25:14.590 UTC:   Log of DAGMan debugging messages                 : catVsdog-test-wf-0.dag.dagman.out
2021.02.08 06:25:14.595 UTC:   Log of HTCondor library output                     : catVsdog-test-wf-0.dag.lib.out
2021.02.08 06:25:14.600 UTC:   Log of HTCondor library error messages             : catVsdog-test-wf-0.dag.lib.err
2021.02.08 06:25:14.605 UTC:   Log of the life of condor_dagman itself          : catVsdog-test-wf-0.dag.dagman.log
2021.02.08 06:25:14.610 UTC:
2021.02.08 06:25:14.616 UTC:   -no_submit given, not submitting DAG to HTCondor.  You can do this with:
2021.02.08 06:25:14.626 UTC:   -----------------------------------------------------------------------


[[1;32m########################[0m------------]  67.7% ..Failure ([1;32mCompleted: 21[0m, [1;33mQueued: 0[0m, [1;36mRunning: 0[0m, [1;31mFailed: 1[0m)



####################
# pegasus-analyzer #
####################
Your database is compatible with Pegasus version: 5.1.0dev

************************************Summary*************************************

Submit Directory   : /home/scitech/shared-data/pegasus-catdog-wf-master/scitech/pegasus/catVsdog-test-wf/run0009
Total jobs         :     31 (100.00%)
# jobs succeeded   :     21 (67.74%)
# jobs failed      :      1 (3.23%)
# jobs held        :      0 (0.00%)
# jobs unsubmitted :      9 (29.03%)

******************************Failed jobs' details******************************


last state: POST_SCRIPT_FAILED
site: condorpool
submit file: 00/00/tune_model_vgg16_ID0000004.sub
output file: 00/00/tune_model_vgg16_ID0000004.out.002
error file: 00/00/tune_model_vgg16_ID0000004.err.002

-------------------------------Task #1 - Summary--------------------------------

site        : condorpool
hostname    : cae6425c697f
executable  : /var/lib/condor/execute/dir_6757/tune_model_vgg16
arguments

Your database is compatible with Pegasus version: 5.1.0dev

************************************Summary*************************************

 Submit Directory   : /home/scitech/shared-data/pegasus-catdog-wf-master/scitech/pegasus/catVsdog-test-wf/run0009
 Total jobs         :     31 (100.00%)
 # jobs succeeded   :     21 (67.74%)
 # jobs failed      :      1 (3.23%)
 # jobs held        :      0 (0.00%)
 # jobs unsubmitted :      9 (29.03%)

******************************Failed jobs' details******************************


 last state: POST_SCRIPT_FAILED
       site: condorpool
submit file: 00/00/tune_model_vgg16_ID0000004.sub
output file: 00/00/tune_model_vgg16_ID0000004.out.002
 error file: 00/00/tune_model_vgg16_ID0000004.err.002

-------------------------------Task #1 - Summary--------------------------------

site        : condorpool
hostname    : cae6425c697f
executable  : /var/lib/condor/execute/dir_6757/tune_model_vgg16
arguments   : -
exitcode    : 3
working dir : /var/lib/con

In [39]:
## CONFIRM best_model.txt contains hyperparameters for training best model

show_results = open("./wf-output/best_model.txt", "r")
show_results.readlines()

FileNotFoundError: [Errno 2] No such file or directory: './wf-output/best_model.txt'