## Inspect and processing data manually

In [2]:
%%sh
aws s3 cp s3://amazon-reviews-pds/tsv/amazon_reviews_us_Camera_v1_00.tsv.gz /tmp

download: s3://amazon-reviews-pds/tsv/amazon_reviews_us_Camera_v1_00.tsv.gz to ../../../../../../tmp/amazon_reviews_us_Camera_v1_00.tsv.gz


In [1]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv('/tmp/amazon_reviews_us_Camera_v1_00.tsv.gz', sep='\t', 
                   compression='gzip', error_bad_lines=False, dtype='str')
data.dropna(inplace=True)

b'Skipping line 85458: expected 15 fields, saw 22\nSkipping line 91161: expected 15 fields, saw 22\n'
b'Skipping line 166123: expected 15 fields, saw 22\n'
b'Skipping line 225458: expected 15 fields, saw 22\nSkipping line 229936: expected 15 fields, saw 22\nSkipping line 259297: expected 15 fields, saw 22\n'
b'Skipping line 284728: expected 15 fields, saw 22\nSkipping line 286334: expected 15 fields, saw 22\nSkipping line 293400: expected 15 fields, saw 22\nSkipping line 294415: expected 15 fields, saw 22\nSkipping line 308150: expected 15 fields, saw 22\nSkipping line 315022: expected 15 fields, saw 22\nSkipping line 315730: expected 15 fields, saw 22\nSkipping line 316071: expected 15 fields, saw 22\nSkipping line 326729: expected 15 fields, saw 22\n'
b'Skipping line 329101: expected 15 fields, saw 22\nSkipping line 333077: expected 15 fields, saw 22\nSkipping line 377031: expected 15 fields, saw 22\nSkipping line 389496: expected 15 fields, saw 22\nSkipping line 390486: expected 15 

In [4]:
print(data.shape)
print(data.columns)

(1800755, 15)
Index(['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date'],
      dtype='object')


In [5]:
data.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,2975964,R1NBG94582SJE2,B00I01JQJM,860486164,GoPro Rechargeable Battery 2.0 (HERO3/HERO3+ o...,Camera,5,0,0,N,Y,Five Stars,ok,2015-08-31
1,US,23526356,R273DCA6Y0H9V7,B00TCO0ZAA,292641483,Professional 58mm Center Pinch Lens Cap for CA...,Camera,5,0,0,N,Y,Love it!!!,"Perfect, even sturdier than the original!",2015-08-31
2,US,52764145,RQVOXO7WUOFK6,B00B7733E0,75825744,Spy Tec Z12 Motion Activated Intelligent Secur...,Camera,2,1,1,N,Y,Another Motion Detect Fail,"If the words, &#34;Cheap Chinese Junk&#34; com...",2015-08-31
3,US,47348933,R1KWKSF21PO6HO,B006ZN4U34,789352955,"Celestron UpClose G2 10x25 Monocular, Black (7...",Camera,5,0,0,N,Y,Exactly what I wanted and expected.,Exactly what I wanted and expected. Perfect fo...,2015-08-31
4,US,33680700,R38H3UO1J190GI,B00HUEBGMU,19067902,Vidpro XM-L Wired Lavalier microphone - 20' Au...,Camera,5,1,1,N,Y,Good mic at a Good Price...Not Canon Though.,I will look past the fact that they tricked me...,2015-08-31


In [6]:
data = data[:100000]

In [7]:
data = data[['star_rating', 'review_body']]

In [8]:
data.star_rating.unique()

array(['5', '2', '3', '1', '4'], dtype=object)

In [9]:
data['label'] = data.star_rating.map({
    '1': '__label__negative__',
    '2': '__label__negative__',
    '3': '__label__neutral__',
    '4': '__label__positive__',
    '5': '__label__positive__'})

In [10]:
data = data.drop(['star_rating'], axis=1)

In [11]:
data = data[['label', 'review_body']]

In [12]:
data.head()

Unnamed: 0,label,review_body
0,__label__positive__,ok
1,__label__positive__,"Perfect, even sturdier than the original!"
2,__label__negative__,"If the words, &#34;Cheap Chinese Junk&#34; com..."
3,__label__positive__,Exactly what I wanted and expected. Perfect fo...
4,__label__positive__,I will look past the fact that they tricked me...


In [13]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/sagemaker-
[nltk_data]     user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [14]:
%%time
data['review_body'] = data['review_body'].apply(nltk.word_tokenize)

CPU times: user 57 s, sys: 369 ms, total: 57.4 s
Wall time: 59.3 s


In [15]:
%%time
data['review_body'] = data.apply(lambda row: " ".join(row['review_body']).lower(), axis=1)

CPU times: user 1.51 s, sys: 38.8 ms, total: 1.55 s
Wall time: 1.69 s


In [16]:
data.head()

Unnamed: 0,label,review_body
0,__label__positive__,ok
1,__label__positive__,"perfect , even sturdier than the original !"
2,__label__negative__,"if the words , & # 34 ; cheap chinese junk & #..."
3,__label__positive__,exactly what i wanted and expected . perfect f...
4,__label__positive__,i will look past the fact that they tricked me...


In [17]:
from sklearn.model_selection import train_test_split

training, validation = train_test_split(data, test_size=0.05)

In [18]:
print(training.shape)
print(validation.shape)

(95000, 2)
(5000, 2)


In [19]:
np.savetxt('/tmp/training.txt', training.values, fmt='%s')
np.savetxt('/tmp/validation.txt', validation.values, fmt='%s')

In [20]:
!head -5 /tmp/training.txt

__label__neutral__ i thought i would like it more . it 's as advertised for sure , i just kind of prefer the regular straps and will look for another one to replace my broken old one rather than keep using this strap .
__label__positive__ good quality , lightweight , and easy to adjust .
__label__neutral__ the eyepieces are good quality . its a nice set . but the case that you get is not the one displayed . you get a black case instead of the brushed aluminum . amazon , please show the correct case ! ! ! ! ! !
__label__neutral__ have n't used them yet , but expect they will work every bit as the original did .
__label__positive__ works great !


## Training

In [21]:
import boto3
import sagemaker

print(sagemaker.__version__)

session = sagemaker.Session()
bucket = session.default_bucket()

2.0.0rc1


In [None]:
# Run this cell if you want to use the data you processed manually

prefix = 'amazon-reviews'

s3_train_path = session.upload_data(path='/tmp/training.txt', bucket=bucket, key_prefix=prefix+'/input/train')
s3_val_path = session.upload_data(path='/tmp/validation.txt', bucket=bucket, key_prefix=prefix+'/input/validation')
s3_output = 's3://{}/{}/output/'.format(bucket, prefix)

print(s3_train_path)
print(s3_val_path)
print(s3_output)

In [23]:
# Run this cell if you want to use the data processed by SageMaker Processing

prefix = 'amazon-reviews'

s3_train_path = 's3://sagemaker-us-east-1-886035371869/sagemaker-scikit-learn-2021-03-27-05-43-09-515/output/train_data'
s3_val_path = 's3://sagemaker-us-east-1-886035371869/sagemaker-scikit-learn-2021-03-27-05-43-09-515/output/validation_data'
s3_output = 's3://{}/{}/output/'.format(bucket, prefix)

print(s3_train_path)
print(s3_val_path)
print(s3_output)

s3://sagemaker-us-east-1-886035371869/sagemaker-scikit-learn-2021-03-27-05-43-09-515/output/train_data
s3://sagemaker-us-east-1-886035371869/sagemaker-scikit-learn-2021-03-27-05-43-09-515/output/validation_data
s3://sagemaker-us-east-1-886035371869/amazon-reviews/output/


In [24]:
from sagemaker import image_uris

region = boto3.Session().region_name    
container = image_uris.retrieve('blazingtext', region)
print(container)

811284229777.dkr.ecr.us-east-1.amazonaws.com/blazingtext:1


In [25]:
role = sagemaker.get_execution_role()

bt = sagemaker.estimator.Estimator(container,
                                   role, 
                                   instance_count=1, 
                                   instance_type='ml.c5.2xlarge',
                                   output_path=s3_output)

In [26]:
bt.set_hyperparameters(mode='supervised')

In [27]:
train_data = sagemaker.TrainingInput(s3_train_path, 
                      distribution='FullyReplicated', 
                      content_type='text/plain',
                      s3_data_type='S3Prefix')

validation_data = sagemaker.TrainingInput(s3_val_path,
                      distribution='FullyReplicated', 
                      content_type='text/plain', 
                      s3_data_type='S3Prefix')

s3_channels = {'train': train_data, 'validation': validation_data}

In [28]:
bt.fit(inputs=s3_channels)

2021-03-27 08:14:19 Starting - Starting the training job...
2021-03-27 08:14:23 Starting - Launching requested ML instances......
2021-03-27 08:15:34 Starting - Preparing the instances for training...
2021-03-27 08:16:18 Downloading - Downloading input data......
2021-03-27 08:17:11 Training - Training image download completed. Training in progress.[34mArguments: train[0m
[34m[03/27/2021 08:17:12 INFO 140089906472576] nvidia-smi took: 0.02521038055419922 secs to identify 0 gpus[0m
[34m[03/27/2021 08:17:12 INFO 140089906472576] Running single machine CPU BlazingText training using supervised mode.[0m
[34mNumber of CPU sockets found in instance is  1[0m
[34m[03/27/2021 08:17:12 INFO 140089906472576] Processing /opt/ml/input/data/train/training.txt . File size: 740.6727781295776 MB[0m
[34m[03/27/2021 08:17:12 INFO 140089906472576] Processing /opt/ml/input/data/validation/validation.txt . File size: 39.17148780822754 MB[0m
[34mRead 10M words[0m
[34mRead 20M words[0m
[34mRe

In [29]:
bt_predictor = bt.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

---------------!

In [30]:
import json
import pprint

sentences = ['This is a bad camera it doesnt work at all , i want a refund  .', 
             'The camera works , the pictures are decent quality, nothing special to say about it .',
             'Very happy to have bought this , exactly what I needed']

payload = {"instances" : sentences, "configuration": {"k": 3}}

bt_predictor.serializer = sagemaker.serializers.JSONSerializer()
response = bt_predictor.predict(payload)

print(response)

b'[{"label": ["__label__negative__", "__label__neutral__", "__label__positive__"], "prob": [0.9976259469985962, 0.0023937320802360773, 1.0349267540732399e-05]}, {"label": ["__label__positive__", "__label__neutral__", "__label__negative__"], "prob": [0.6081520318984985, 0.38039788603782654, 0.01148008368909359]}, {"label": ["__label__positive__", "__label__neutral__", "__label__negative__"], "prob": [0.9998488426208496, 0.00017054460477083921, 1.0639761967468075e-05]}]'


In [None]:
bt_predictor.delete_endpoint()