### Load dataset and have a peek:

In [None]:
import pandas as pd
# Read the ADE dataset from the local CSV file into a DataFrame.
df = pd.read_csv("data/ade.csv")

# Show ten randomly chosen rows as a quick sanity check of the contents.
df.sample(n=10)

### Determine ratio of positive ADE phrases compared to total dataset

In [None]:
# Sum of the `label` column divided by the total number of rows.
# NOTE(review): this equals the share of positive rows only if `label` is 0/1 — confirm.
df["label"].sum() / df.shape[0]

### Initialise SageMaker variables and copy data into S3 bucket

In [None]:
from sagemaker.huggingface.processing import HuggingFaceProcessor
import sagemaker
from sagemaker import get_execution_role

In [None]:
# Create the SageMaker session and look up the execution role used by the jobs below.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

# The bucket name embeds the AWS account id (presumably to keep it unique per account).
bucket = f"az-ade-{sess.account_id()}"

# Create the bucket if it is missing. The region is read through the public
# `boto_region_name` property instead of the private `_region_name` attribute.
# NOTE(review): `_create_s3_bucket_if_it_does_not_exist` is a private SDK helper
# and may change between SDK releases — confirm against the pinned version.
sess._create_s3_bucket_if_it_does_not_exist(
    bucket_name=bucket,
    region=sess.boto_region_name,
)

# S3 URI the dataset is stored under.
filepath = f"s3://{bucket}/data/ade.csv"

In [None]:
# Display the S3 URI of the dataset so it can be checked before uploading.
filepath

In [None]:
# Shell escape: copy the local CSV to the S3 URI held in `filepath` with the AWS CLI.
!aws s3 cp data/ade.csv $filepath

### Save the name of the S3 bucket for later sessions

In [None]:
# Persist `bucket` with IPython's %store magic so later sessions can restore it.
%store bucket

### Set up processing job

In [None]:
# Processor that runs the preprocessing script as a SageMaker processing job:
# one ml.p3.2xlarge instance, Transformers 4.6 on PyTorch 1.7.
hf_processor = HuggingFaceProcessor(
    # Job identity and permissions.
    base_job_name="az-ade",
    role=role,
    # Compute resources.
    instance_count=1,
    instance_type="ml.p3.2xlarge",
    # Framework versions.
    transformers_version="4.6",
    pytorch_version="1.7",
)

In [None]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# The dataset at `filepath` is made available to the job under /opt/ml/processing/input.
inputs = [
    ProcessingInput(
        source=filepath,
        destination="/opt/ml/processing/input",
    )
]

# One output per data split, given as (output name, directory under /opt/ml/processing).
# Each is written to s3://<bucket>/processing_output/<output name>.
outputs = [
    ProcessingOutput(
        output_name=name,
        source=f"/opt/ml/processing/{local_dir}",
        destination=f"s3://{bucket}/processing_output/{name}",
    )
    for name, local_dir in (
        ("train_data", "training"),
        ("validation_data", "validation"),
        ("test_data", "test"),
    )
]

# Flag/value pairs handed to the processing job as arguments.
# NOTE(review): the remaining 0.15 is presumably the test split — confirm in the script.
arguments = [
    "--file-name", "ade.csv",
    "--model-name", "distilbert-base-uncased",
    "--train-ratio", "0.7",
    "--val-ratio", "0.15",
]

In [None]:
# Launch the processing job: run scripts/preprocess.py with the configured
# arguments, inputs and outputs.
hf_processor.run(
    code="scripts/preprocess.py",
    arguments=arguments,
    inputs=inputs,
    outputs=outputs,
)

In [None]:
# Fetch the description of the most recent job started by this processor.
preprocessing_job_description = hf_processor.jobs[-1].describe()
output_config = preprocessing_job_description["ProcessingOutputConfig"]

# Print the S3 URI of every output listed in the job description.
for output in output_config["Outputs"]:
    s3_output = output["S3Output"]
    print(s3_output["S3Uri"])