In [None]:
!pip install datasets -q

In [None]:
!pip install sagemaker -U -q

In [None]:
!pip install s3fs==0.4.2 -U -q

### Load dataset and have a peek:

In [None]:
# This cell is required in SageMaker Studio, otherwise the download of the dataset will throw an error.
# After running this cell, the kernel needs to be restarted. After restarting tthe kernel, continue with the cell below (loading the dataset)
%%capture
import IPython
!conda install -c conda-forge ipywidgets -y
IPython.Application.instance().kernel.do_shutdown(True) # has to restart kernel so changes are used

In [2]:
import pandas as pd
from datasets import load_dataset

# Fetch the classification subset of the ADE corpus (the cached copy is reused
# when one is available, as the recorded output below shows).
dataset = load_dataset(path='ade_corpus_v2', name='Ade_corpus_v2_classification')

# Turn the training split into a DataFrame for quick inspection.
train_split = dataset['train']
df = pd.DataFrame(train_split)

# Fixed seed so the preview shows the same five rows on every run.
df.sample(n=5, random_state=124)

Reusing dataset ade_corpus_v2 (/root/.cache/huggingface/datasets/ade_corpus_v2/Ade_corpus_v2_classification/1.0.0/940d61334dbfac6b01ac5d00286a2122608b8dc79706ee7e9206a1edb172c559)


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,text,label
2148,Eruptive epidermoid cysts resulting from treat...,1
8891,"Diagnosis, therapy, and complications of thera...",0
7660,This case report describes the diagnostic work...,0
7507,"During IFN therapy, serum aminotransferases fe...",0
8174,Reversal of post-reperfusion coagulopathy by p...,0


### Determine ratio of positive ADE phrases compared to total dataset

In [3]:
# Share of rows flagged as ADE: the sum of the label column divided by the row
# count (the sample above suggests labels are 0/1 — TODO confirm).
n_positive = df['label'].sum()
n_total = len(df)
n_positive / n_total

0.29005783296478993

### Initialise SageMaker variables and copy data into S3 bucket

In [4]:
from sagemaker.huggingface.processing import HuggingFaceProcessor
import sagemaker
from sagemaker import get_execution_role

In [5]:
# Session and execution role shared by the SageMaker calls below.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

# The bucket name embeds the AWS account id of the current session.
account_id = sess.account_id()
bucket = f"az-ade-{account_id}"

# NOTE(review): `_create_s3_bucket_if_it_does_not_exist` and `_region_name` are
# underscore-prefixed (private) members of the session object and may change
# between SDK releases — confirm they exist in the installed version.
sess._create_s3_bucket_if_it_does_not_exist(
    bucket_name=bucket,
    region=sess._region_name,
)

# S3 destination for the raw dataset CSV.
filepath = f"s3://{bucket}/data/ade.csv"

In [6]:
# Display the S3 URI the dataset CSV will be uploaded to.
filepath

's3://az-ade-167374998168/data/ade.csv'

In [7]:
!aws s3 cp data/ade.csv $filepath


The user-provided path data/ade.csv does not exist.


### Save the name of the S3 bucket for later sessions

In [8]:
# Persist `bucket` with IPython's %store magic so a later notebook/kernel can
# restore it via `%store -r bucket`.
%store bucket

Stored 'bucket' (str)


### Set up processing job

In [9]:
# Processor that will run the preprocessing script as a SageMaker processing job.
hf_processor = HuggingFaceProcessor(
    # Job identity: names are prefixed with "az-ade" (see the job name printed
    # by the run cell) and the job executes under the notebook's role.
    base_job_name="az-ade",
    role=role,
    # Compute: one instance of the given type.
    instance_type="ml.p3.2xlarge",
    instance_count=1,
    # Framework versions requested for the job.
    transformers_version='4.6',
    pytorch_version='1.7',
)

In [10]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# (output name, directory under /opt/ml/processing inside the container).
# Each output is uploaded to s3://<bucket>/processing_output/<output name>.
output_specs = [
    ("train_data", "training"),
    ("validation_data", "validation"),
    ("test_data", "test"),
]
outputs = [
    ProcessingOutput(
        output_name=output_name,
        source=f"/opt/ml/processing/{container_dir}",
        destination=f"s3://{bucket}/processing_output/{output_name}",
    )
    for output_name, container_dir in output_specs
]

# Command-line options handed to the processing script, flattened into the
# ["--flag", "value", ...] list form.
# NOTE(review): train 0.7 + val 0.15 presumably leaves 0.15 for test — confirm
# against scripts/preprocess.py.
script_options = {
    "--dataset-name": "ade_corpus_v2",
    "--datasubset-name": "Ade_corpus_v2_classification",
    "--model-name": "distilbert-base-uncased",
    "--train-ratio": "0.7",
    "--val-ratio": "0.15",
}
arguments = [token for option in script_options.items() for token in option]

In [11]:
# Launch the processing job: run scripts/preprocess.py with the arguments
# prepared above and upload the declared outputs to S3.
hf_processor.run(
    arguments=arguments,
    outputs=outputs,
    code="scripts/preprocess.py",
)


Job Name:  az-ade-2021-10-07-13-51-31-200
Inputs:  [{'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-167374998168/az-ade-2021-10-07-13-51-31-200/source/sourcedir.tar.gz', 'LocalPath': '/opt/ml/processing/input/code/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'entrypoint', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-167374998168/az-ade-2021-10-07-13-51-31-200/source/runproc.sh', 'LocalPath': '/opt/ml/processing/input/entrypoint', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'train_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://az-ade-167374998168/processing_output/train_data', 'LocalPath': '/opt/ml/processing/training', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'validation_data', 'AppManaged': False, 'S3Output

In [12]:
# Describe the most recently launched processing job.
latest_job = hf_processor.jobs[-1]
preprocessing_job_description = latest_job.describe()

# Print the S3 URI of every output declared for that job, one per line.
output_config = preprocessing_job_description['ProcessingOutputConfig']
for output in output_config['Outputs']:
    s3_output = output['S3Output']
    print(s3_output['S3Uri'])

s3://az-ade-167374998168/processing_output/train_data
s3://az-ade-167374998168/processing_output/validation_data
s3://az-ade-167374998168/processing_output/test_data
