In [None]:
# Download the gzipped Digital Software reviews TSV from the amazon-reviews-pds
# S3 bucket into the current working directory (uses the AWS CLI via a shell escape).
!aws s3 cp s3://amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Software_v1_00.tsv.gz .

In [1]:
import csv

import pandas as pd

# Load the gzipped, tab-separated review dump for a quick local preview.
# The Amazon reviews dump is documented as tab-separated text *without* quote
# or escape characters, so pandas' default quote handling is disabled here:
# otherwise a stray double quote inside a review can swallow the following
# fields/rows or trigger a tokenizing error.
df = pd.read_csv(
    'amazon_reviews_us_Digital_Software_v1_00.tsv.gz',
    compression='gzip',
    sep='\t',
    quoting=csv.QUOTE_NONE,
)
df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,17747349,R2EI7QLPK4LF7U,B00U7LCE6A,106182406,CCleaner Free [Download],Digital_Software,4,0,0,N,Y,Four Stars,So far so good,2015-08-31
1,US,10956619,R1W5OMFK1Q3I3O,B00HRJMOM4,162269768,ResumeMaker Professional Deluxe 18,Digital_Software,3,0,0,N,Y,Three Stars,Needs a little more work.....,2015-08-31
2,US,13132245,RPZWSYWRP92GI,B00P31G9PQ,831433899,Amazon Drive Desktop [PC],Digital_Software,1,1,2,N,Y,One Star,Please cancel.,2015-08-31
3,US,35717248,R2WQWM04XHD9US,B00FGDEPDY,991059534,Norton Internet Security 1 User 3 Licenses,Digital_Software,5,0,0,N,Y,Works as Expected!,Works as Expected!,2015-08-31
4,US,17710652,R1WSPK2RA2PDEF,B00FZ0FK0U,574904556,SecureAnywhere Intermet Security Complete 5 De...,Digital_Software,4,1,2,N,Y,Great antivirus. Worthless customer support,I've had Webroot for a few years. It expired a...,2015-08-31


In [2]:
from sagemaker.huggingface.processing import HuggingFaceProcessor
import sagemaker
from sagemaker import get_execution_role

In [15]:
# SageMaker session and the IAM role the processing job will run under.
sess = sagemaker.Session()
role = get_execution_role()

# S3 location the raw review archive is uploaded to and later read from.
# NOTE(review): the bucket name is hard-coded; the upload cell and the
# processing job both need access to it -- confirm before re-running.
filepath = "s3://sm-hf-processing/reviews.tsv.gz"

In [None]:
# Upload the downloaded archive to the S3 location held in the Python variable
# `filepath` (IPython substitutes `$filepath` before running the shell command).
!aws s3 cp amazon_reviews_us_Digital_Software_v1_00.tsv.gz $filepath

In [17]:
# Processor that runs a user script in a Hugging Face container
# (Transformers 4.6 / PyTorch 1.7) on a single ml.p3.2xlarge instance,
# using the execution role resolved earlier in the notebook.
hf_processor = HuggingFaceProcessor(
    role=role,
    instance_count=1,
    instance_type="ml.p3.2xlarge",
    transformers_version="4.6",
    pytorch_version="1.7",
)

In [18]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
# Input: the archive at `filepath` is made available to the job under
# /opt/ml/processing/input.
inputs = [
    ProcessingInput(source=filepath, destination="/opt/ml/processing/input"),
]

# Outputs: one named output per container directory.
outputs = [
    ProcessingOutput(source="/opt/ml/processing/training", output_name="train_data"),
    ProcessingOutput(source="/opt/ml/processing/validation", output_name="validation_data"),
    ProcessingOutput(source="/opt/ml/processing/test", output_name="test_data"),
]

# Command-line flags handed to the processing script; every value must be a string.
# NOTE(review): presumably the remaining 0.15 of the data becomes the test
# split and --star-threshold separates the label classes -- confirm against
# scripts/preprocessing-hf.py.
arguments = [
    "--file-name", "reviews.tsv.gz",
    "--model-name", "distilbert-base-uncased",
    "--train-ratio", "0.7",
    "--val-ratio", "0.15",
    "--star-threshold", "4",
]

In [None]:
# Launch the preprocessing job: run scripts/preprocessing-hf.py with the
# flags, input and outputs configured in the previous cell.
hf_processor.run(
    code="scripts/preprocessing-hf.py",
    arguments=arguments,
    inputs=inputs,
    outputs=outputs,
)

In [None]:
# Describe the most recently launched processing job and list the S3 URI
# of each of its outputs, one per line.
latest_job = hf_processor.jobs[-1]
preprocessing_job_description = latest_job.describe()
output_config = preprocessing_job_description['ProcessingOutputConfig']

for output in output_config['Outputs']:
    print(output['S3Output']['S3Uri'])