# BUILDER Wildfire detection system | UKW

In [None]:
%load_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv


# load packages
import json
import logging
import sys
import sys
from pathlib import Path
from uuid import uuid4
from pathlib import Path
from dotenv import load_dotenv
import os
import boto3
from botocore.exceptions import ClientError
import sagemaker
import boto3
from functools import wraps

# Configure the root logger at INFO level, then create this module's
# named logger for use in the cells below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

## Utilities

In [None]:
# Load variables from a .env file into the process environment.
load_dotenv()

# Switch between local execution and execution on AWS; read by later cells.
LOCAL_MODE = False

# Folder that receives the container sources written by the %%writefile
# cells: <parent of the working directory>/infrastructure/system_files.
CODE_FOLDER = Path.cwd().parent.absolute() / 'infrastructure' / 'system_files'

# sys.path entries must be strings: the import system ignores other types,
# so the Path object itself must not be inserted. The membership test keeps
# the list free of duplicates when this cell is re-run.
if str(CODE_FOLDER) not in sys.path:
    sys.path.append(str(CODE_FOLDER))


# Client / project identifiers; every resource name below is derived from them.
CLIENT = os.environ["CLIENT"]
PROJECT = os.environ["PROJECT"]
CLIENT_PROJECT_PAIR = f"{CLIENT}-{PROJECT}"

AWS_USER_NAME = f"{CLIENT_PROJECT_PAIR}-PrincipalDeveloper"
# NOTE(review): 'develoepr' looks like a typo but presumably matches the
# name of the deployed IAM role -- confirm before renaming.
AWS_ROLE_NAME = f'{CLIENT_PROJECT_PAIR}-sagemaker-develoepr'

# The role ARN and credentials come from the environment only. No
# account-specific ARN is hard-coded here: it would be overwritten by the
# environment value anyway.
AWS_ROLE_ARN = os.environ["AWS_ROLE_ARN"]
AWS_ACCESS_KEY_ID = os.environ["AWS_ACCESS_KEY_ID"]
AWS_SECRET_ACCESS_KEY = os.environ["AWS_SECRET_ACCESS_KEY"]
AWS_REGION = os.environ["AWS_REGION"]

CAMERA_NAME = "collon_cura"

# RTSP stream of the camera.
# NOTE(review): user and password are interpolated verbatim; a password
# containing '@', ':' or '/' would need percent-encoding -- confirm how the
# values are stored in the environment.
RTSP_IP = os.environ['RTSP_IP']
RTSP_PORT = os.environ['RTSP_PORT']
RTSP_USER = os.environ['RTSP_USER']
RTSP_PASS = os.environ['RTSP_PASS']
RTSP_URL = f"rtsp://{RTSP_USER}:{RTSP_PASS}@{RTSP_IP}:{RTSP_PORT}/h264/ch1/main/av_stream"


# --- S3 locations -----------------------------------------------------------
# Each name is assigned exactly once. Earlier revisions assigned several of
# them twice in a row, so only the second value was ever used.
BUCKET = "ukw-wildfire-projects"
PROJECT_S3_LOCATION = f"s3://{BUCKET}/wildfire-detector"
MODEL_S3_LOCATION = f"{PROJECT_S3_LOCATION}/model/latest/inference_model.tar.gz"
# NOTE(review): these two names contain underscores, which S3 bucket naming
# rules presumably do not allow -- confirm before creating buckets from them.
POSITIVE_INFERENCE_BUCKET = f'{CLIENT}_{PROJECT}_{CAMERA_NAME}_positive_predictions'
SAMPLING_BUCKET = f'{CLIENT}_{PROJECT}_{CAMERA_NAME}_sampling'


# --- Docker image names (the *_LOWER variants are used as image tags) -------
PROCESSING_IMAGE_NAME = f"{CLIENT}-{PROJECT}-processing"
PROCESSING_IMAGE_NAME_LOWER = PROCESSING_IMAGE_NAME.lower()
MODEL_DEPLOYMENT_IMAGE_NAME = f"{CLIENT}-{PROJECT}-model-deployment"
MODEL_DEPLOYMENT_IMAGE_NAME_LOWER = MODEL_DEPLOYMENT_IMAGE_NAME.lower()

# MODIFY: model and dataset versions used for the next training run.
MODEL_VERSION = 'smokeab'

DATASET_VERSION = 'full_v3_smoke_ab'

# Suffix that distinguishes this training image from other variants.
MODEL_TRAINING_IMAGE_SUFFIX = 'from'
MODEL_TRAINING_IMAGE_NAME = f"{CLIENT}-{PROJECT}-model-training-{MODEL_TRAINING_IMAGE_SUFFIX}"
MODEL_TRAINING_IMAGE_NAME_LOWER = MODEL_TRAINING_IMAGE_NAME.lower()


ENDPOINT_NAME = f"{CLIENT}-{PROJECT}-model"

DYNAMO_TABLE = f'{CLIENT}-{PROJECT}-{CAMERA_NAME}-positive-predictions'


## SageMaker configuration

In [None]:
# By default, The SageMaker SDK logs events related to the default
# configuration using the INFO level. To prevent these from spoiling
# the output of this notebook cells, we can change the logging
# level to ERROR instead.
logging.getLogger("sagemaker.config").setLevel(logging.ERROR)



LOCAL_MODE = False


sagemaker_session = sagemaker.session.Session()
region = boto3.Session().region_name
sagemaker_client = boto3.client("sagemaker")
iam_client = boto3.client("iam")


# # We can retrieve the architecture of the local
# # computer using the `uname -m` command.
architecture = !(uname -m)
IS_ARM64_ARCHITECTURE = architecture[0] == "arm64"

#### Caching Configuration

In [None]:
from sagemaker.workflow.steps import CacheConfig

# Step-caching settings: caching enabled, expiry string "15d".
# NOTE(review): in the visible cells the only use of `cache_config` is a
# commented-out argument of the TrainingStep -- confirm whether caching is
# meant to be active.
cache_config = CacheConfig(enable_caching=True, expire_after="15d")

#### Pipeline Configuration

In [None]:
from sagemaker.workflow.parameters import ParameterString
from sagemaker.workflow.pipeline_definition_config import PipelineDefinitionConfig

# Pipeline parameter for the dataset location; defaults to the project's
# S3 prefix.
dataset_location = ParameterString(name="dataset_location", default_value=PROJECT_S3_LOCATION)

# Pipeline definition settings with custom job prefixes switched on.
pipeline_definition_config = PipelineDefinitionConfig(use_custom_job_prefix=True)

Se define una configuración para SageMaker que depende de si se ejecuta en LOCAL_MODE o no. Incluye los recursos.


In [None]:
from sagemaker.workflow.pipeline_context import LocalPipelineSession, PipelineSession

# Remote runs share one PipelineSession; in Local Mode it is left as None.
pipeline_session = PipelineSession(default_bucket=BUCKET) if not LOCAL_MODE else None

# `config` maps a step kind ('processing' / 'training') to the settings used
# when building that step. Both modes expose the same keys, so cells that
# read config[...]['role'] or config[...]['instance_count'] work in Local
# Mode as well (those keys used to be missing there).
if LOCAL_MODE:
    # We need to use a custom Docker image when we run the pipeline
    # in Local Mode on an ARM64 machine.
    local_image = "sagemaker-tensorflow-toolkit-local" if IS_ARM64_ARCHITECTURE else None

    # Each step kind gets its own LocalPipelineSession.
    config = {
        step_kind: {
            "session": LocalPipelineSession(default_bucket=BUCKET),
            "instance_type": "local",
            "instance_count": 1,
            "image": local_image,
            "py_version": "py310",
            "role": AWS_ROLE_ARN,
        }
        for step_kind in ('processing', 'training')
    }
else:
    config = {
        'processing': {
            "session": pipeline_session,
            "instance_type": "ml.t3.medium",
            "instance_count": 1,
            "image": None,
            "py_version": "py310",
            "role": AWS_ROLE_ARN,
        },
        'training': {
            "session": pipeline_session,
            "instance_type": "ml.g4dn.xlarge",
            "instance_count": 1,
            "image": None,
            "py_version": "py310",
            "role": AWS_ROLE_ARN,
        },
    }

### Sagemaker

#### Training Step

##### scripts


In [None]:
%%writefile {CODE_FOLDER}/containers/training/config/resourceconfig.json
{
    "current_host": "algo-1",
    "hosts": ["algo-1","algo-2","algo-3"],
    "network_interface_name":"eth1"
}


In [None]:
%%writefile {CODE_FOLDER}/containers/training/config/hyp.finetune.yaml

# NOTE(review): the `train` script written by this notebook calls train.py
# without a --hyp argument, so this file is copied into the image (under
# config/) but is presumably not read during training -- confirm.
# Hyperparameters for VOC finetuning
# python train.py --batch 64 --weights yolov5m.pt --data VOC.yaml --img 512 --epochs 50
# See tutorials for hyperparameter evolution https://github.com/ultralytics/yolov5#tutorials


# Hyperparameter Evolution Results
# Generations: 306
#                   P         R     mAP.5 mAP.5:.95       box       obj       cls
# Metrics:        0.6     0.936     0.896     0.684    0.0115   0.00805   0.00146

lr0: 0.0032
lrf: 0.12
momentum: 0.843
weight_decay: 0.00036
warmup_epochs: 2.0
warmup_momentum: 0.5
warmup_bias_lr: 0.05
box: 0.0296
cls: 0.243
cls_pw: 0.631
obj: 0.301
obj_pw: 0.911
iou_t: 0.2
anchor_t: 2.91
# anchors: 3.63
fl_gamma: 0.0
hsv_h: 0.0138
hsv_s: 0.664
hsv_v: 0.464
degrees: 0.373
translate: 0.245
scale: 0.898
shear: 0.602
perspective: 0.0
flipud: 0.00856
fliplr: 0.5
mosaic: 1.0
mixup: 0.243
copy_paste: 0.0


In [None]:
# %%writefile {CODE_FOLDER}/containers/training/config/data.yaml
# test: /opt/ml/input/data/training/test/images
# train: /opt/ml/input/data/training/train/images
# val: /opt/ml/input/data/training/valid/images

# nc: 9
# names:
#   - 'Fire-D'
#   - 'Fire-N'
#   - 'Smoke-0'
#   - 'Smoke-1'
#   - 'Smoke-2'
#   - 'Smoke-3'
#   - 'Smoke-C1'
#   - 'Smoke-C2'
#   - 'Smoke-N'

In [None]:
%%writefile {CODE_FOLDER}/containers/training/config/data.yaml
# Dataset definition with 11 classes (`nc` equals the number of `names`).
# NOTE: a later cell of this notebook writes the same data.yaml path with a
# 3-class definition, so this content is replaced when the cells run in order.
test: /opt/ml/input/data/training/test/images
train: /opt/ml/input/data/training/train/images
val: /opt/ml/input/data/training/valid/images

nc: 11
names:
- 'Fire-D'
- 'Fire-N'
- 'Humo-0.5'
- 'Humo-2.5'
- 'Smoke-0'
- 'Smoke-1'
- 'Smoke-2'
- 'Smoke-3'
- 'Smoke-C1'
- 'Smoke-C2'
- 'Smoke-N'

In [None]:
%%writefile {CODE_FOLDER}/containers/training/config/data.yaml
# Dataset definition with 3 classes; image folders for each split live under
# /opt/ml/input/data/training.
# NOTE(review): the `train` script passes
# --data /opt/ml/input/data/training/data.yaml, i.e. a data.yaml located next
# to the dataset, not this file (which the Dockerfile copies to
# /opt/ml/code/config/) -- confirm the two definitions agree.
test: /opt/ml/input/data/training/test/images
train: /opt/ml/input/data/training/train/images
val: /opt/ml/input/data/training/valid/images

nc: 3
names: ['Fire', 'Smoke-A', 'Smoke-B']

In [None]:
%%writefile {CODE_FOLDER}/containers/training/config/train-args.json
{
   "FP_DATA": "/opt/ml/code/config/data.yaml",
   "FP_WEIGHT": "yolov5s.pt",
   "NAME": "wildfire-detector-model",
   "IMG_SIZE": "640",
   "EPOCHS": "35",
   "BATCH": "32"
}

In [None]:
%%writefile {CODE_FOLDER}/containers/training/train
#!/usr/bin/env bash
# Training entry point of the container. The Dockerfile puts /opt/ml/code on
# PATH and sets SAGEMAKER_PROGRAM to this file.
#
# Fine-tunes YOLOv5 starting from the best.pt checkpoint found next to the
# dataset and copies the resulting run directory to /opt/ml/model/.
#
# NOTE: every argument is hard-coded below; config/train-args.json is not
# read by this script.

# Stop at the first failing command. Without this, a training run that
# crashes after creating its run directory would still end with a
# successful `cp` and therefore with exit status 0.
set -euo pipefail

readonly DATA_DIR=/opt/ml/input/data/training
readonly RUN_NAME=wildfire-model

# Diagnostic listings only: a missing folder here must not abort the job.
echo training
ls "${DATA_DIR}" || true
echo valid
ls "${DATA_DIR}/valid/" || true

python /opt/ml/code/yolov5/train.py \
    --weights "${DATA_DIR}/best.pt" \
    --img 640 \
    --epochs 35 \
    --batch-size 32 \
    --data "${DATA_DIR}/data.yaml" \
    --name "${RUN_NAME}"

cp -r "/opt/ml/code/yolov5/runs/train/${RUN_NAME}" /opt/ml/model/

##### requirements

In [None]:
%%writefile {CODE_FOLDER}/containers/training/requirements.txt
# | filename: requirements.txt
# | code-line-numbers: true

# sagemaker-training
comet-ml
matplotlib
numpy
opencv-python
pillow
psutil
PyYAML
requests
scipy
thop
torch
torchvision
tqdm
# shutil is part of the Python standard library, not a pip-installable
# distribution, so it must not be listed here.
# ultralytics


In [None]:
%%writefile {CODE_FOLDER}/containers/training/start_with_right_hostname.sh
#!/usr/bin/env bash
# Container start wrapper.
#
# When called with "train": builds libchangehostname.so from changehostname.c
# with the current host name baked in, then runs `train` with that library
# preloaded. With any other arguments, evaluates them as a command.
#
# Relative paths assume the working directory is /opt/ml/code (the
# Dockerfile's WORKDIR), which is also where LD_PRELOAD looks for the library.

if [[ "${1:-}" = "train" ]]; then
     # No `jq -r`: the surrounding double quotes are kept on purpose so the
     # substituted value is a valid C string literal.
     CURRENT_HOST=$(jq .current_host /opt/ml/input/config/resourceconfig.json)
     # `-i -e`, not `-ie`: GNU sed parses `-ie` as "-i with backup suffix e"
     # and would leave a stray changehostname.ce file behind.
     sed -i -e "s/PLACEHOLDER_HOSTNAME/$CURRENT_HOST/g" changehostname.c
     gcc -o changehostname.o -c -fPIC -Wall changehostname.c
     gcc -o libchangehostname.so -shared -export-dynamic changehostname.o -ldl
     LD_PRELOAD=/opt/ml/code/libchangehostname.so train
else
     eval "$@"
fi

In [None]:
%%writefile {CODE_FOLDER}/containers/training/changehostname.c
#include <stdio.h>
#include <string.h>

/*
 * Modifies gethostname to return algo-1, algo-2, etc. when running on SageMaker.
 *
 * Without this gethostname() on SageMaker returns 'aws', leading NCCL/MPI to think there is only one host,
 * not realizing that it needs to use NET/Socket.
 *
 * When docker container starts we read 'current_host' value from /opt/ml/input/config/resourceconfig.json
 * and replace PLACEHOLDER_HOSTNAME with it before compiling this code into a shared library.
 *
 * The function is defined exactly once: a second, identical definition in
 * the same file is a redefinition error and stops the library from building.
 */
int gethostname(char *name, size_t len)
{
  /* Replaced by a quoted string literal (e.g. "algo-1") before compilation. */
  const char *val = PLACEHOLDER_HOSTNAME;

  strncpy(name, val, len);
  /* strncpy does not terminate the buffer when val has len or more chars. */
  if (len > 0)
    name[len - 1] = '\0';
  return 0;
}

##### Dockerfile

In [None]:
%%writefile {CODE_FOLDER}/containers/training/Dockerfile

# Stage 1: AWS PyTorch training image plus the system tools used at build
# and start-up time (jq and gcc are needed by start_with_right_hostname.sh).
FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.2.0-gpu-py310-cu121-ubuntu20.04-sagemaker AS base

# Remove the apt package lists in the same layer so they do not end up in
# the image.
RUN apt-get update \
 && apt-get install -y --no-install-recommends --allow-unauthenticated \
    jq git gcc libgl1-mesa-dev wget gsutil libglib2.0-0 \
 && rm -rf /var/lib/apt/lists/*

## fix /usr/local/cuda-10.0/compat/libcuda.so
## RUN bash -c 'echo "/usr/local/cuda-10.0/compat" > /etc/ld.so.conf.d/cuda.conf'
RUN ldconfig -v
RUN pip install tensorboard torch torchvision --upgrade


# Stage 2: YOLOv5 sources and their Python requirements.
FROM base AS yolo

# /opt/ml and all subdirectories are utilized by SageMaker, we use the /code subdirectory to store our user code.
WORKDIR /opt/ml/code

ENV PATH="/opt/ml/code:${PATH}"

RUN git clone https://github.com/ultralytics/yolov5
RUN pip install -r yolov5/requirements.txt


# Stage 3: project code on top of YOLOv5.
FROM yolo AS training

# Copy the whole build context. This already includes `train`,
# `changehostname.c` and the config/ folder, so they are not copied
# individually.
COPY . /opt/ml/code
# COPY input/data.yaml /opt/ml/data.yaml
# COPY input/config/train_arguments.json /opt/ml/input/train_arguments.json

COPY start_with_right_hostname.sh /usr/local/bin/start_with_right_hostname.sh

# Also makes the `train` script executable.
RUN chmod 777 -R /opt/ml


WORKDIR /opt/ml/code/

# this environment variable is used by the SageMaker PyTorch container to determine our user code directory.
ENV SAGEMAKER_SUBMIT_DIRECTORY=/opt/ml/code

# this environment variable is used by the SageMaker PyTorch container to determine our program entry point
# for training and serving.
# For more information: https://github.com/aws/sagemaker-pytorch-container
ENV SAGEMAKER_PROGRAM=train

In [None]:
# login to AWS DLC
!aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-1.amazonaws.com

if not LOCAL_MODE:
    # If we aren't running the code in Local Mode, we need
    # to specify we want to build the Docker image for the
    # linux/amd64 architecture before uploading it to ECR.
    print("Building Docker image for linux/amd64 architecture...")
    
    !docker build --platform="linux/amd64" -t $MODEL_TRAINING_IMAGE_NAME_LOWER \
        $CODE_FOLDER/containers/training/ 
else:
    # If we are running in Local Mode, we can use the
    # default Docker build command.
    print("Building Docker image for arm64 architecture...")

    !docker build -t $MODEL_TRAINING_IMAGE_NAME_LOWER \
        $CODE_FOLDER/containers/training/ 

##### docker image

In [None]:
# CHANGE THE VERSION! Check these values before building and pushing.
for setting in (MODEL_VERSION, MODEL_TRAINING_IMAGE_NAME, DATASET_VERSION):
    print(setting)


In [None]:
%%bash -s "$LOCAL_MODE" "$MODEL_TRAINING_IMAGE_NAME_LOWER"
# | eval: false
#
# Pushes the locally built training image to ECR.
#   $1  value of LOCAL_MODE ("True" / "False")
#   $2  image name, also used as the ECR repository name
#
# Both arguments are taken from the notebook variables, so this cell tags
# and pushes exactly the image the build cell produced and the estimator
# cell references. They used to be hard-coded and had to be edited by hand.

local_mode=$1
algorithm_name=$2
account=$(aws sts get-caller-identity --query Account --output text)

# Prefer AWS_REGION from the environment (the Python cells build the image
# URI from it), then the CLI configuration, then us-east-1.
region=${AWS_REGION:-$(aws configure get region)}
region=${region:-us-east-1}

registry="${account}.dkr.ecr.${region}.amazonaws.com"
repository="${registry}/${algorithm_name}:latest"

echo "${local_mode}"
echo "${algorithm_name}"
echo "${repository}"

# We only want to push the Docker image to ECR if
# we are not running in Local Mode.
if [ "${local_mode}" = "False" ]
then
    # Create the repository if it doesn't exist in ECR
    if ! aws ecr describe-repositories \
        --repository-names "${algorithm_name}" > /dev/null 2>&1
    then
        aws ecr create-repository \
            --repository-name "${algorithm_name}" > /dev/null
    fi

    # Log Docker in to the registry host before pushing.
    aws ecr get-login-password --region "${region}" \
        | docker login --username AWS --password-stdin "${registry}"

    # Push the Docker image to the ECR repository
    docker tag "${algorithm_name}" "${repository}"
    docker push "${repository}"
fi

In [None]:
# Account that owns the ECR repository holding the training image.
account_id = boto3.client("sts").get_caller_identity().get("Account")
tag = ":latest"

# The China regions use a different domain suffix for ECR endpoints.
uri_suffix = "amazonaws.com"
if AWS_REGION in ["cn-north-1", "cn-northwest-1"]:
    uri_suffix = "amazonaws.com.cn"

# Local Mode uses the locally built image by name; otherwise the full ECR
# URI is used. `uri_suffix` and `tag` are now part of the URI; they used to
# be computed and then ignored.
training_container_image = (
    MODEL_TRAINING_IMAGE_NAME_LOWER
    if LOCAL_MODE
    else f"{account_id}.dkr.ecr.{AWS_REGION}.{uri_suffix}/{MODEL_TRAINING_IMAGE_NAME_LOWER}{tag}"
)

training_container_image

##### Sagemaker estimator

In [None]:
from sagemaker.estimator import Estimator

# Settings of the training step (see the `config` cell above).
training_config = config['training']

# Estimator that runs the custom training image.
# `role` and `instance_count` fall back to defaults because the Local Mode
# configuration may not define them; a plain lookup raised KeyError there.
estimator = Estimator(
    image_uri=training_container_image,
    role=training_config.get('role', AWS_ROLE_ARN),
    instance_count=training_config.get('instance_count', 1),
    instance_type=training_config['instance_type'],
    input_mode='File',
    output_path=f"{PROJECT_S3_LOCATION}/model/",
    base_job_name=f"{CLIENT}-{PROJECT}-{MODEL_VERSION}",
)

##### Sagemaker step

In [None]:
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep

# S3 prefix of the aggregated dataset for the selected DATASET_VERSION.
training_data_uri = f"{PROJECT_S3_LOCATION}/input/data/dataset_agg/{DATASET_VERSION}/"

# Single input channel named 'training'; the `train` script reads its data
# from /opt/ml/input/data/training (presumably where this channel is
# mounted -- confirm).
inputs = {'training': TrainingInput(s3_data=training_data_uri)}

# Pipeline step wrapping the estimator. No cache configuration is passed.
training_step = TrainingStep(
    name=f"{CLIENT}-{PROJECT}-{MODEL_VERSION}",
    estimator=estimator,
    inputs=inputs,
)



In [None]:
# training_step.estimator.fit(inputs)

### Build training Pipeline

In [None]:
from sagemaker.workflow.pipeline import Pipeline

# The pipeline currently consists of the training step only.
pipeline_name = f"{CLIENT}-{PROJECT}-{MODEL_VERSION}"
pipeline_steps = [training_step]

model_pipeline = Pipeline(
    name=pipeline_name,
    parameters=[dataset_location],
    steps=pipeline_steps,
    pipeline_definition_config=pipeline_definition_config,
)

In [None]:
# Create or update the pipeline definition, but only for non-local runs.
if not LOCAL_MODE:
    model_pipeline.upsert(role_arn=AWS_ROLE_ARN)

In [None]:
# Start an execution of the pipeline; the returned object is shown as the
# cell output.
# NOTE(review): this runs regardless of LOCAL_MODE, whereas the preceding
# upsert cell is skipped in Local Mode -- confirm this is intended.
model_pipeline.start()

In [None]:
# Show the executions recorded for this pipeline as the cell output.
model_pipeline.list_executions()