In [17]:
!pip install awswrangler -q

In [1]:
# Print the Dockerfile that defines the custom LightGBM container image.
!cat container/Dockerfile

FROM ubuntu:20.04

# MAINTAINER is deprecated; LABEL carries the same metadata.
LABEL maintainer="Amazon AI <sage-learner@amazon.com>"

# System packages: Python tooling, nginx, wget, CA certificates and libgomp1.
# The apt package lists are removed in the same layer to keep the image small.
RUN apt-get -y update && apt-get install -y --no-install-recommends \
         libgomp1 \
         wget \
         python3-pip \
         python3-setuptools \
         nginx \
         ca-certificates \
    && rm -rf /var/lib/apt/lists/*

# Expose python3 under the plain `python` name.
RUN ln -s /usr/bin/python3 /usr/bin/python

# Python dependencies. lightgbm is pinned to 3.3.5, the version that appears in
# the image build log recorded in this notebook, so rebuilds stay reproducible.
RUN pip --no-cache-dir install numpy scikit-learn pandas flask gunicorn lightgbm==3.3.5

# PYTHONUNBUFFERED: flush Python output immediately so it shows up in the logs.
# PYTHONDONTWRITEBYTECODE: do not write .pyc files inside the container.
# PATH: makes the programs copied to /opt/program callable by name.
ENV PYTHONUNBUFFERED=TRUE
ENV PYTHONDONTWRITEBYTECODE=TRUE
ENV PATH="/opt/program:${PATH}"

# Set up the program in the image
COPY lightgbm /opt/program
WORKDIR /opt/program


In [10]:
%%sh
# Build the LightGBM container image locally and push it to Amazon ECR.

# Stop at the first failing command (and on unset variables). Without this a
# failed `docker build` would still be followed by `docker tag`/`docker push`,
# silently publishing a stale image.
set -eu

# The name of our algorithm
algorithm_name=lightgbm-algorithm

cd container

# The container entry points must be executable.
chmod +x lightgbm/train lightgbm/serve

account=$(aws sts get-caller-identity --query Account --output text)

# Get the region defined in the current configuration (default to us-west-2 if none defined).
# `aws configure get` exits non-zero when the value is not set, so `|| true`
# keeps `set -e` from aborting before the default is applied.
region=$(aws configure get region || true)
region=${region:-us-west-2}

registry="${account}.dkr.ecr.${region}.amazonaws.com"
fullname="${registry}/${algorithm_name}:latest"

# If the repository doesn't exist in ECR, create it. The region is passed
# explicitly so these calls target the same region as the image URI.
if ! aws ecr describe-repositories --region "${region}" --repository-names "${algorithm_name}" > /dev/null 2>&1
then
    aws ecr create-repository --region "${region}" --repository-name "${algorithm_name}" > /dev/null
fi

# Authenticate Docker against the ECR registry host.
aws ecr get-login-password --region "${region}" | docker login --username AWS --password-stdin "${registry}"

# Build the docker image locally with the image name and then push it to ECR
# with the full name.
docker build -t "${algorithm_name}" .
docker tag "${algorithm_name}" "${fullname}"

docker push "${fullname}"

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



Login Succeeded


#0 building with "default" instance using docker driver

#1 [internal] load build definition from Dockerfile
#1 transferring dockerfile: 760B done
#1 DONE 0.0s

#2 [internal] load metadata for docker.io/library/ubuntu:20.04
#2 DONE 0.1s

#3 [internal] load .dockerignore
#3 transferring context: 2B done
#3 DONE 0.0s

#4 [1/6] FROM docker.io/library/ubuntu:20.04@sha256:8feb4d8ca5354def3d8fce243717141ce31e2c428701f6682bd2fafe15388214
#4 DONE 0.0s

#5 [internal] load build context
#5 transferring context: 6.99kB done
#5 DONE 0.0s

#6 [2/6] RUN apt-get -y update && apt-get install -y --no-install-recommends          libgomp1          wget          python3-pip          python3-setuptools          nginx          ca-certificates     && rm -rf /var/lib/apt/lists/*
#6 CACHED

#7 [3/6] RUN ln -s /usr/bin/python3 /usr/bin/python
#7 CACHED

#8 [4/6] RUN pip --no-cache-dir install numpy scikit-learn pandas flask gunicorn lightgbm==3.3.5
#8 CACHED

#9 [5/6] COPY lightgbm /opt/program
#9 DONE 0.0s

#1

The push refers to repository [084828584964.dkr.ecr.us-east-1.amazonaws.com/lightgbm-algorithm]
5f70bf18a086: Preparing
ab7e3eae2760: Preparing
5780eb6aff2e: Preparing
0788e1eae391: Preparing
40a9b2d59653: Preparing
470b66ea5123: Preparing
470b66ea5123: Waiting
5780eb6aff2e: Layer already exists
0788e1eae391: Layer already exists
5f70bf18a086: Layer already exists
40a9b2d59653: Layer already exists
470b66ea5123: Layer already exists
ab7e3eae2760: Pushed
latest: digest: sha256:b894fb347313e2693e2833aaf187f0941f233e9f2001ee0ed9a777a38a668c0c size: 1574


In [None]:
# Define the SageMaker session and the IAM execution role
import boto3
import re
import os
from sagemaker import get_execution_role
import sagemaker

# `sess` is referenced further down in the notebook but was never created,
# which raises NameError on a fresh kernel. Create it once here.
sess = sagemaker.Session()

# IAM role that SageMaker assumes for training, hosting and batch transform.
role = get_execution_role(sagemaker_session=sess)

In [None]:
# ECR URI of the image pushed by the build cell. The account and region are
# looked up instead of hand-edited, using the same us-west-2 fallback as the
# build script, so the URI always points at the image that was just pushed.
account_id = boto3.client("sts").get_caller_identity()["Account"]
region = boto3.session.Session().region_name or "us-west-2"
lightgbm_container = f"{account_id}.dkr.ecr.{region}.amazonaws.com/lightgbm-algorithm:latest"

# Create testing data 

In [18]:
import pandas as pd
import numpy as np
import awswrangler as wr

In [21]:
# Shape of the synthetic dataset: one target column plus 79 feature columns.
total_columns = 80

# Column names: 'target' first, then feature_1 ... feature_79.
columns = ['target', *(f'feature_{idx}' for idx in range(1, total_columns))]

In [None]:
# Seeded generator so the synthetic training set is identical on every
# Restart & Run All (values are uniform in [0, 1), as before).
data_train = np.random.default_rng(42).random((5000, total_columns))

# Build the training DataFrame with the shared column names.
df_train = pd.DataFrame(data_train, columns=columns)

# Upload as CSV with a header row and without the index column.
path_to_s3 = "s3://xxxxx-analytics-artifact/generic/data/train/file.csv"
wr.s3.to_csv(df_train, path_to_s3, index=False)

{'paths': ['s3://anta-acoe-san-084828584964-analytics-artifact/generic/data/train/file.csv'],
 'partitions_values': {}}

In [None]:
# Generate the synthetic test set. The generator is seeded for reproducibility;
# keep this seed distinct from any seed used for the training data so the two
# sets do not share rows.
data_test = np.random.default_rng(43).random((1000, total_columns))

# Build the test DataFrame with the shared column names.
df_test = pd.DataFrame(data_test, columns=columns)

# Upload as CSV with a header row and without the index column.
path_to_s3 = "s3://xxxx-analytics-artifact/generic/data/test/file.csv"
wr.s3.to_csv(df_test, path_to_s3, index=False)

{'paths': ['s3://anta-acoe-san-084828584964-analytics-artifact/generic/data/test/file.csv'],
 'partitions_values': {}}

In [None]:
# Batch-inference input: the test rows without the first ('target') column,
# written as CSV with neither index nor header row.
path_to_s3 = "s3://xxxxx-analytics-artifact/generic/data/input-inference-batch/file.csv"
wr.s3.to_csv(
    df=df_test.iloc[:, 1:],
    path=path_to_s3,
    index=False,
    header=None,
)

{'paths': ['s3://anta-acoe-san-084828584964-analytics-artifact/generic/data/input-inference-batch/file.csv'],
 'partitions_values': {}}

# Testing the container with a training job

In [None]:
# Estimator backed by the custom LightGBM image: a single ml.m5.4xlarge
# instance, with the trained model artifact written under `output_path`.
model = sagemaker.estimator.Estimator(
    image_uri=lightgbm_container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.4xlarge',
    output_path="s3://xxxxx-analytics-artifact/generic/modelo/lightgbm/",
)

# Each key is a channel name; the training log shows the channels mounted
# under /opt/ml/input/data/<channel> inside the container.
model.fit(
    inputs={
        "training": "s3://xxxxxx-analytics-artifact/generic/data/train/file.csv",
        "test": "s3://xxxxxx-analytics-artifact/generic/data/test/file.csv",
    }
)

INFO:sagemaker:Creating training-job with name: lightgbm-algorithm-2025-06-22-05-43-12-300


2025-06-22 05:43:13 Starting - Starting the training job...
2025-06-22 05:43:47 Downloading - Downloading input data...
2025-06-22 05:44:18 Training - Training image download completed. Training in progress.
2025-06-22 05:44:18 Uploading - Uploading generated training model[34mStarting the training[0m
[34m/opt/ml/input/data/training[0m
[34mreading files[0m
[34m/opt/ml/input/data/training/file.csv[0m
[34m<class 'str'>[0m
[34mreading files[0m
[34m/opt/ml/input/data/test/file.csv[0m
[34m<class 'str'>[0m
[34m     target  feature_1  feature_2  ...  feature_77  feature_78  feature_79[0m
[34m0  0.977474   0.415885   0.414363  ...    0.865150    0.454088    0.282860[0m
[34m1  0.569410   0.750682   0.738963  ...    0.986457    0.816591    0.201643[0m
[34m2  0.207959   0.739869   0.990133  ...    0.492181    0.813890    0.773281[0m
[34m3  0.680000   0.371214   0.114991  ...    0.407690    0.299906    0.702279[0m
[34m4  0.070847   0.982181   0.074998  ...    0.777711   

## Hosting your model


In [29]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer

In [31]:
# Host the trained model on a real-time endpoint with a fixed name.
# deploy() returns a Predictor, which is shown as the cell output.
model.deploy(
    endpoint_name="endpoint-model-test",
    instance_type="ml.m5.xlarge",
    initial_instance_count=1,
)

INFO:sagemaker:Creating model with name: lightgbm-algorithm-2025-06-22-05-49-18-150
INFO:sagemaker:Creating endpoint-config with name endpoint-model-test
INFO:sagemaker:Creating endpoint with name endpoint-model-test


----!

<sagemaker.base_predictor.Predictor at 0x7f9cdf2c2890>

In [32]:
# Single-record payload: the row at position 1 of the test set with its
# first ('target') column dropped, as a NumPy array.
payload = df_test.iloc[1].iloc[1:].to_numpy()
payload

array([0.39646043, 0.67273293, 0.34177952, 0.79123849, 0.91877489,
       0.99424983, 0.34845433, 0.81030144, 0.73127898, 0.49009087,
       0.72411716, 0.76985971, 0.74107603, 0.64647794, 0.72859426,
       0.05564635, 0.43500586, 0.61628677, 0.96164726, 0.3226066 ,
       0.77311804, 0.15910234, 0.42238824, 0.33505477, 0.60007015,
       0.66398401, 0.15131276, 0.28474064, 0.84391049, 0.78777699,
       0.10742814, 0.20223064, 0.2346628 , 0.90742386, 0.36613274,
       0.89073359, 0.22602661, 0.72955125, 0.30493497, 0.04543578,
       0.10689167, 0.69138402, 0.0041289 , 0.9541923 , 0.09086866,
       0.2354061 , 0.54604247, 0.77151693, 0.47511718, 0.26794378,
       0.38284594, 0.93962889, 0.72934966, 0.33998096, 0.28955782,
       0.56457175, 0.88730349, 0.04773594, 0.41697635, 0.50234576,
       0.8316042 , 0.54286695, 0.56721722, 0.0700925 , 0.86782505,
       0.1459624 , 0.84088783, 0.8906023 , 0.96724628, 0.36522054,
       0.13306133, 0.88691683, 0.5792379 , 0.03905112, 0.13924

In [35]:
# Client for the deployed endpoint; the payload is serialized as CSV.
predictor = Predictor("endpoint-model-test", serializer=CSVSerializer())

# The endpoint replies with raw bytes containing the predicted value.
print(predictor.predict(payload))

b'0.4223045980621642\n'


In [None]:
# Tear down the endpoint so it stops incurring charges.
# The original call used an undefined `sess` and the `Predictor.endpoint`
# attribute, which SageMaker SDK v2 renamed to `endpoint_name`.
# Predictor.delete_endpoint() also removes the endpoint configuration by
# default, so the fixed endpoint name can be deployed again on a re-run.
predictor.delete_endpoint()

## Run Batch Transform Job

In [None]:
# S3 *prefix* for the batch transform results. It must be a full s3:// URI
# (the original value had no scheme) and is treated as a prefix, not an object
# name: each result is stored under it, named after its input object plus a
# ".out" suffix.
output_path = "s3://xxxxx-analytics-artifact/generic/data/output-inference-batch"

# Transformer built from the trained estimator; output records are joined
# line by line and returned as CSV.
transformer = model.transformer(
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=output_path,
    assemble_with='Line',
    accept='text/csv',
)

INFO:sagemaker:Creating model with name: lightgbm-algorithm-2025-06-22-05-56-40-626


We call `transform()` on the transformer to get inference results for the data that we uploaded. You can use these options when invoking the transformer:

* The __data_location__ which is the location of input data
* The __content_type__ which is the content type set when making HTTP request to container to get prediction
* The __split_type__ which is the delimiter used for splitting input data 
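* The __input_filter__ which indicates the first column (ID) of the input will be dropped before making the HTTP request to the container (it is left commented out in the call below, because the uploaded batch input already omits the first column)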

In [None]:
# Run the batch transform over the uploaded CSV, sending the input to the
# container split line by line.
# input_filter='$[1:]' is deliberately left out of this call.
transformer.transform(
    data="s3://xxxxxxxxx-analytics-artifact/generic/data/input-inference-batch/file.csv",
    split_type='Line',
    content_type='text/csv',
)
# Block until the transform job has finished.
transformer.wait()

For more information on the configuration options, see [CreateTransformJob API](https://docs.aws.amazon.com/sagemaker/latest/dg/API_CreateTransformJob.html)

### View Output
Let's read the results of the above transform job from S3 and print them.

In [None]:
# Batch transform stores each result under the transformer's output prefix,
# named after the input object plus ".out" (input "file.csv" -> "file.csv.out").
output_uri = f"{transformer.output_path.rstrip('/')}/file.csv.out"

# download_file takes (Bucket, Key, Filename), not an s3:// URI, so split it.
bucket, _, key = output_uri.replace("s3://", "", 1).partition("/")

# Reuse the transformer's own session; the original referenced an undefined `sess`.
s3_client = transformer.sagemaker_session.boto_session.client('s3')
s3_client.download_file(bucket, key, '/tmp/output.csv.out')

with open('/tmp/output.csv.out') as f:
    results = f.readlines()
print("Transform results: \n{}".format(''.join(results)))