### Imports and configuration

In [3]:
import boto3
import sagemaker
import pandas as pd
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

# SageMaker session, execution role and default S3 bucket used by the cells below.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sess.default_bucket()

# Source language forwarded to the processing script via --language.
# Set to "de" only if the input data is written in German.
language = "en"

# Name of the CSV file forwarded to the processing script via --output-file-name.
output_file_name = "augmented_reviews.csv"

In [3]:
# S3 URI of the input CSV, located in the session's default bucket.
input_data = f"s3://{bucket}/sm-nlp-data-aug/data/imdb.csv"

### Instantiate a HuggingFaceProcessor with Job configurations

In [None]:
# Processor that runs the augmentation script on a single GPU instance,
# using the HuggingFace container for the pinned framework versions.
hf_processor = HuggingFaceProcessor(
    role=role,
    # Compute resources for the job.
    instance_count=1,
    instance_type="ml.p3.2xlarge",
    # Framework versions that select the container image.
    transformers_version="4.6",
    pytorch_version="1.7",
)

### Set data and arguments configuration
We decouple the choice of source language from the processing script; the `language` parameter can be either `de` or `en`.

In [None]:
# Input channel: the CSV at `input_data` is made available to the job
# under /opt/ml/processing/input.
inputs = [
    ProcessingInput(
        source=input_data,
        destination="/opt/ml/processing/input",
    ),
]

# Output channel: content of /opt/ml/processing/output is sent to the
# S3 output prefix in the default bucket.
outputs = [
    ProcessingOutput(
        output_name="augemented-text",
        source="/opt/ml/processing/output",
        destination=f"s3://{bucket}/sm-nlp-data-aug/output/",
    ),
]

# Command-line arguments forwarded to the processing script.
arguments = [
    "--file-name", "imdb.csv",
    "--output-file-name", output_file_name,
    "--language", language,
]

### Start the Processing Job

In [None]:
# Launch the processing job that runs the augmentation script.
# Block until the job finishes: the "Check the results" cells download the
# job's output from S3, which does not exist until the job has completed.
# With wait=False those cells fail when the notebook is run top to bottom.
hf_processor.run(
    code = 'scripts/aug-hf.py',
    inputs = inputs, 
    outputs = outputs, 
    arguments = arguments,
    wait = True
)

### Check the results

In [None]:
! aws s3 cp s3://"$bucket"/sm-nlp-data-aug/output/"$output_file_name" .

In [8]:
# Load the file downloaded in the previous cell. It is saved locally under
# `output_file_name`, so read that name rather than a hard-coded one that
# can drift out of sync with the configuration cell.
df_results = pd.read_csv(output_file_name)

Display every row that shares its `ID` with the row at the selected index of the dataframe, i.e. the original sentence together with all of its translations.

In [None]:
# Positional index of the row whose ID group should be displayed.
selected_index = 1

# Keep every row whose ID matches the ID of the selected row.
same_id_mask = df_results.ID == df_results.iloc[selected_index].ID
df_results[same_id_mask]