In [None]:
%%sh
# Install spaCy quietly, download the English model through the 'en'
# shortcut, then run spaCy's validate command.
# NOTE(review): the 'en' shortcut is deprecated in spaCy v3 and the install is
# unpinned -- confirm the download still resolves for the installed version.
pip -q install spacy
python -m spacy download en
python -m spacy validate

## Inspect and process data manually

In [None]:
%%sh
# Copy the gzipped Camera reviews TSV from S3 into /tmp.
aws s3 cp s3://amazon-reviews-pds/tsv/amazon_reviews_us_Camera_v1_00.tsv.gz /tmp

In [1]:
import pandas as pd

In [None]:
# Load the gzipped, tab-separated reviews file with every column read as a
# string. Malformed rows are dropped with a warning instead of aborting.
reviews_path = '/tmp/amazon_reviews_us_Camera_v1_00.tsv.gz'
read_options = dict(sep='\t', compression='gzip', dtype='str')
try:
    # pandas >= 1.3: `error_bad_lines` was deprecated in 1.3 and removed in 2.0.
    # 'warn' matches the old error_bad_lines=False / warn_bad_lines=True default.
    data = pd.read_csv(reviews_path, on_bad_lines='warn', **read_options)
except TypeError:
    # Older pandas releases do not know `on_bad_lines`; use the legacy flag.
    data = pd.read_csv(reviews_path, error_bad_lines=False, **read_options)

In [None]:
# Preview the first rows of the loaded DataFrame.
data.head()

In [None]:
# Keep only the rows in which every column has a value, then report the size.
data = data.dropna(axis=0, how='any')
print(data.shape)

In [None]:
# Cap the dataset at the first 100,000 rows (positional slice).
data = data.iloc[:100000]

In [None]:
# Keep only the review text column (the result is still a DataFrame).
data = data[['review_body']]

In [None]:
# Preview the frame after narrowing it to the review_body column.
data.head()

In [None]:
import spacy

# Load the English pipeline by its full package name. The 'en' shortcut is
# rejected by spacy.load() from spaCy v3 onwards, whereas the full name loads
# on both v2 and v3 once the download cell has run.
spacy_nlp = spacy.load('en_core_web_sm')

def tokenize(text):
    """Tokenize ``text`` with spaCy and return it as one lower-cased string.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The token texts joined by single spaces, converted to lower case.
    """
    # Only the tokenizer is invoked; the rest of the pipeline is not run.
    return " ".join(token.text for token in spacy_nlp.tokenizer(text)).lower()

In [None]:
%%time
data['review_body'] = data['review_body'].apply(tokenize)

In [None]:
# Preview the reviews after tokenization.
data.head()

In [None]:
import numpy as np

# Write the frame's rows to a plain-text training file, formatted as strings.
np.savetxt('/tmp/training.txt', data.to_numpy(), fmt='%s')

In [None]:
# Show the first five lines of the training file written to /tmp.
!head -5 /tmp/training.txt

## Training

In [2]:
import boto3
import sagemaker

# Show which version of the SageMaker SDK is installed.
print(sagemaker.__version__)

# Create the SageMaker session and look up its default S3 bucket; later cells
# build their S3 locations from `bucket`.
session = sagemaker.Session()
bucket = session.default_bucket()

2.0.0rc1


In [None]:
# Use this cell when the training file was produced by the manual processing cells.

prefix = 'amazon-reviews-word2vec'

# Upload the local training file under <prefix>/input/train in the session bucket.
s3_train_path = session.upload_data(
    path='/tmp/training.txt',
    bucket=bucket,
    key_prefix=f'{prefix}/input/train',
)
# Location that the estimator is given as its output path.
s3_output = f's3://{bucket}/{prefix}/output/'

print(s3_train_path)
print(s3_output)

In [3]:
# Run this cell if you want to use the data processed by SageMaker Processing

prefix = 'amazon-reviews-word2vec'

# Name of the SageMaker Processing job that produced the training data.
# Replace it with the name of your own job.
processing_job_name = 'sagemaker-scikit-learn-2021-03-27-08-00-51-443'

# Build the input location from the session's bucket rather than hard-coding
# one account's bucket name, so the cell is not tied to a single AWS account.
# Assumes the Processing job wrote to the session's default bucket -- adjust
# the path if a custom output destination was used.
s3_train_path = 's3://{}/{}/output/train_data'.format(bucket, processing_job_name)
s3_output = 's3://{}/{}/output/'.format(bucket, prefix)

print(s3_train_path)
print(s3_output)

s3://sagemaker-us-east-1-886035371869/sagemaker-scikit-learn-2021-03-27-08-00-51-443/output/train_data
s3://sagemaker-us-east-1-886035371869/amazon-reviews-word2vec/output/


In [4]:
from sagemaker import image_uris

# Resolve the BlazingText container image for the region of the default boto3 session.
region = boto3.Session().region_name
container = image_uris.retrieve(framework='blazingtext', region=region)
print(container)

811284229777.dkr.ecr.us-east-1.amazonaws.com/blazingtext:1


In [5]:
# Look up the execution role through the SageMaker SDK.
role = sagemaker.get_execution_role()

# Configure a generic Estimator around the container image retrieved earlier:
# a single ml.p3.2xlarge instance, with output_path set to s3_output.
bt = sagemaker.estimator.Estimator(container,
                                   role, 
                                   instance_count=1, 
                                   instance_type='ml.p3.2xlarge',
                                   output_path=s3_output)

In [6]:
# Set the algorithm hyperparameters: skipgram mode with subwords enabled.
bt.set_hyperparameters(mode='skipgram', subwords=True)

In [7]:
# Describe the training input: plain-text objects under the S3 prefix,
# fully replicated to the training instance.
train_data = sagemaker.TrainingInput(
    s3_data=s3_train_path,
    distribution='FullyReplicated',
    content_type='text/plain',
    s3_data_type='S3Prefix',
)

# Channel mapping handed to fit(); the data is exposed as the 'train' channel.
s3_channels = dict(train=train_data)

In [8]:
# Start the training job, passing the channel mapping as its inputs.
bt.fit(inputs=s3_channels)

2021-03-27 08:19:46 Starting - Starting the training job...
2021-03-27 08:19:48 Starting - Launching requested ML instances.........
2021-03-27 08:21:21 Starting - Preparing the instances for training......
2021-03-27 08:22:39 Downloading - Downloading input data
2021-03-27 08:22:39 Training - Downloading the training image...
2021-03-27 08:23:01 Training - Training image download completed. Training in progress.[34mArguments: train[0m
[34m[03/27/2021 08:23:02 INFO 140065586574720] nvidia-smi took: 0.12607836723327637 secs to identify 1 gpus[0m
[34m[03/27/2021 08:23:02 INFO 140065586574720] Running BlazingText on singe GPU using skipgram[0m
[34mNumber of CPU sockets found in instance is  1[0m
[34m[03/27/2021 08:23:02 INFO 140065586574720] Processing /opt/ml/input/data/train/training.txt . File size: 24.75409507751465 MB[0m
[34mRead 5M words[0m
[34mNumber of words:  15958[0m
[34mInitialized GPU 0 successfully! Now starting training....[0m
[34m##### Alpha: 0.0489  Progre

In [None]:
%%bash -s "$s3_output"
# List every object under the training output location. The location arrives
# as $1 and is quoted so the shell passes it to the CLI as a single word,
# without word splitting or glob expansion.
aws s3 ls --recursive "$1"