In [None]:
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.processing import SKLearnProcessor

# Execution role handed to the processor further down, and the AWS region of
# the default boto3 session.
role = get_execution_role()
region = boto3.Session().region_name

In [None]:
%%writefile batch_ingest_sm_sdk.py
import pandas as pd
import os
import glob
import subprocess
import sys
import argparse

# Install the SageMaker Python SDK with the running interpreter's pip before
# importing it below (presumably the processing image does not ship it —
# TODO confirm).
# NOTE(review): the install is unpinned, so every run pulls whatever version is
# current; consider pinning a version for reproducibility.
subprocess.check_call([sys.executable, "-m", "pip", "install", "sagemaker"])
import sagemaker as sm

from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.session import Session
import boto3

# A single boto3 session backs both service clients and the SageMaker session
# that ingest_data() hands to the FeatureGroup.
boto_session = boto3.Session()

# The `sagemaker` client and the `sagemaker-featurestore-runtime` client are
# both created from that shared session.
sagemaker_client = boto_session.client('sagemaker')
featurestore_runtime = boto_session.client('sagemaker-featurestore-runtime')

feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

def ingest_data(args, input_dir='/opt/ml/processing/input'):
    """Load every CSV file in ``input_dir`` and ingest the rows into a feature group.

    Args:
        args: Parsed arguments exposing ``feature_group_name``,
            ``num_processes`` and ``num_workers``.
        input_dir: Directory searched (non-recursively) for ``*.csv`` files.
            Defaults to the path the original script read from.

    Returns:
        None. When no CSV files are found, nothing is ingested and the
        function returns immediately.
    """
    # Read data locally
    file_list = glob.glob(os.path.join(input_dir, '*.csv'))
    print(f'***** Starting processing job, received the following input files: \n{file_list}')

    # pd.concat raises ValueError on an empty list, so bail out cleanly when
    # this run has nothing to do (e.g. the job's input was sharded across more
    # instances than there are files).
    if not file_list:
        print('***** No input files found, nothing to ingest')
        return

    df = pd.concat([pd.read_csv(f) for f in file_list], ignore_index=True)

    print(f'***** ingesting {df.shape[0]} total rows from {len(file_list)} files')
    print(f'      into {args.feature_group_name}, using {args.num_processes} processes and {args.num_workers} workers...\n')

    # feature_store_session is the module-level SageMaker session built above.
    fg = FeatureGroup(name=args.feature_group_name, sagemaker_session=feature_store_session)
    fg.ingest(data_frame=df, max_processes=args.num_processes, max_workers=args.num_workers, wait=True)

def parse_args():
    """Parse the script's command-line options.

    Returns:
        argparse.Namespace with ``num_processes`` (int, default 1),
        ``num_workers`` (int, default 1) and ``feature_group_name`` (str).

    Raises:
        SystemExit: If ``--feature_group_name`` is missing or an option value
            cannot be converted to its declared type.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--num_processes", type=int, default=1,
                        help="value passed to FeatureGroup.ingest as max_processes")
    parser.add_argument("--num_workers", type=int, default=1,
                        help="value passed to FeatureGroup.ingest as max_workers")
    # Required: without it the name would be None and the failure would only
    # surface later, inside the feature store call.
    parser.add_argument("--feature_group_name", type=str, required=True,
                        help="name of the feature group to ingest into")

    # parse_known_args tolerates extra arguments that this script does not define.
    args, _ = parser.parse_known_args()
    return args

if __name__ == '__main__':
    # Script entry point: parse the command line, run the ingestion, then
    # log completion.
    ingest_data(parse_args())
    print('Finished ingesting data')

In [None]:
# Target feature group, and the S3 prefix that is passed to the processing job
# as its input source.
feature_group_name = "trans-both-fg"
s3_uri_prefix = "s3://roymark-ohio/feature-store/raw-by-day"

In [None]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Processing cluster that runs the ingestion script.
sklearn_processor = SKLearnProcessor(
    framework_version='0.20.0',
    role=role,
    instance_type='ml.m5.4xlarge',
    instance_count=8,
    # Reuse the region resolved in the setup cell instead of opening another
    # boto3 session. Presumably the script's boto3.Session() relies on this
    # variable to pick its region — TODO confirm.
    env={'AWS_DEFAULT_REGION': region},
)

# The S3 prefix is mounted at the path the script reads its CSV files from.
# ShardedByS3Key distributes the objects under the prefix across the
# instances rather than copying all of them to each one (see the
# ProcessingInput documentation).
raw_data_input = ProcessingInput(
    source=s3_uri_prefix,
    destination='/opt/ml/processing/input',
    s3_data_type='S3Prefix',
    s3_data_distribution_type='ShardedByS3Key',
)

# --num_workers is not passed, so the script falls back to its default of 1.
sklearn_processor.run(
    code='batch_ingest_sm_sdk.py',
    arguments=['--num_processes', '24',
               '--feature_group_name', feature_group_name],
    inputs=[raw_data_input],
)