In [43]:
import pandas as pd
# Load the ADE dataset from the local CSV file.
df = pd.read_csv(filepath_or_buffer='data/ade.csv')
# Show ten randomly chosen rows as a quick sanity check (unseeded, so rows vary per run).
df.sample(n=10)

Unnamed: 0,text,label
13139,Riluzole is a new drug representing the first ...,0
12906,St Jude's Total XIII protocol was started.,0
12270,Body temperature was reduced to 26 degrees C a...,0
14727,Both patients completed the planned course of ...,0
6886,CONCLUSIONS: PHD may play an important role in...,0
10441,"Toxic manifestation includes myoclonus, ataxia...",0
4171,Pancytopenia associated with 5-aminosalicylic ...,1
8825,"Thalidomide induces immunomodulator, anti-infl...",0
15939,Possible mechanisms of action are discussed.,0
8973,Pursuing a diagnosis in a Caribbean man.,0


In [44]:
# Sum of the `label` column divided by the total number of rows.
df['label'].sum() / df.shape[0]

0.20439318529862174

In [2]:
from sagemaker.huggingface.processing import HuggingFaceProcessor
import sagemaker
from sagemaker import get_execution_role

In [35]:
# Session object used below for the account id, the region and bucket creation.
sess = sagemaker.Session()
# Execution role later passed to the processor.
role = sagemaker.get_execution_role()
# Bucket name is suffixed with the account id reported by the session.
bucket = f"az-ade-{sess.account_id()}"
# NOTE(review): `_create_s3_bucket_if_it_does_not_exist` is a private SDK helper and may
# change between SDK releases. The region is read through the public `boto_region_name`
# property instead of the private `_region_name` attribute.
sess._create_s3_bucket_if_it_does_not_exist(bucket_name=bucket, region=sess.boto_region_name)
# S3 URI under which the dataset CSV is stored.
filepath = f"s3://{bucket}/data/ade.csv"

In [24]:
# Display the S3 URI held in `filepath`.
filepath

's3://az-ade-905847418383/data/ade.csv'

In [25]:
!aws s3 cp data/ade.csv $filepath

upload: data/ade.csv to s3://az-ade-905847418383/data/ade.csv    


In [30]:
# Processor settings collected in one mapping, then unpacked into the constructor.
processor_settings = dict(
    role=role,
    base_job_name="az-ade",
    instance_count=1,
    instance_type="ml.p3.2xlarge",
    transformers_version='4.6',
    pytorch_version='1.7',
)
hf_processor = HuggingFaceProcessor(**processor_settings)

In [31]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Single input: the dataset at `filepath`, made available under the container's input directory.
inputs = [
    ProcessingInput(
        source=filepath,
        destination="/opt/ml/processing/input",
    )
]

# One output per split: (output name, container sub-directory the split is read from).
# Each output is sent to s3://<bucket>/processing_output/<output name>.
outputs = [
    ProcessingOutput(
        output_name=split_name,
        source=f"/opt/ml/processing/{container_dir}",
        destination=f"s3://{bucket}/processing_output/{split_name}",
    )
    for split_name, container_dir in (
        ("train_data", "training"),
        ("validation_data", "validation"),
        ("test_data", "test"),
    )
]

# Command-line arguments for the processing script, flattened to [flag, value, flag, value, ...].
arguments = [
    token
    for flag, value in {
        "--file-name": "ade.csv",
        "--model-name": "distilbert-base-uncased",
        "--train-ratio": "0.7",
        "--val-ratio": "0.15",
    }.items()
    for token in (flag, value)
]

In [32]:
# Launch the processing job: run the preprocessing script with the inputs,
# outputs and script arguments configured earlier in the notebook.
hf_processor.run(
    code="scripts/preprocess.py",
    arguments=arguments,
    inputs=inputs,
    outputs=outputs,
)


Job Name:  az-ade-2021-10-04-19-01-33-564
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://az-ade-905847418383/data/ade.csv', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-905847418383/az-ade-2021-10-04-19-01-33-564/source/sourcedir.tar.gz', 'LocalPath': '/opt/ml/processing/input/code/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'entrypoint', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-905847418383/az-ade-2021-10-04-19-01-33-564/source/runproc.sh', 'LocalPath': '/opt/ml/processing/input/entrypoint', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outp

In [33]:
# Describe the last job in the processor's job list.
preprocessing_job_description = hf_processor.jobs[-1].describe()
output_config = preprocessing_job_description['ProcessingOutputConfig']

# Print the S3 URI of every configured output, one per line.
for output in output_config['Outputs']:
    s3_uri = output['S3Output']['S3Uri']
    print(s3_uri)

s3://az-ade-905847418383/processing_output/train_data
s3://az-ade-905847418383/processing_output/validation_data
s3://az-ade-905847418383/processing_output/test_data
