#### Data source
* Amazon ratings (Beauty products) dataset on Kaggle: https://www.kaggle.com/skillsmuggler/amazon-ratings

In [16]:
import pandas as pd
import numpy as np
# Widen pandas' text display limits so wide or long frames are shown in full.
pd.set_option('display.width', 170)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 900)

In [17]:
def print_info(df):
    """Print a short overview of ``df``: its shape, column names and dtypes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.  It is only read.

    Returns
    -------
    None
        The overview is written to standard output.
    """
    frame_shape = df.shape
    header_names = list(df.columns.values)
    column_types = df.dtypes

    # Banner first, then one print per fact.
    for banner_line in ("\n\n---------------------",
                        "Dataset INFORMATION",
                        "---------------------"):
        print(banner_line)
    print("Shape of data set:", frame_shape, "\n")
    print("Column Headers:", header_names, "\n")
    print(column_types)
    
def print_detailed_report(df):
    """Print a column-by-column profile of ``df`` followed by a summary.

    For every column the number of unique values and up to the first ten of
    them are printed.  After all columns, two lists are printed: the columns
    that contain missing values (with the number missing) and the columns that
    contain at least one non-missing value whose text form is not a plain
    number.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile.  It is only read.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # Imported locally so the function does not depend on the notebook's
    # import cell bringing in ``re``.
    import re

    # Optional sign, digits with an optional fractional part (or a bare
    # fraction such as ".5"), and an optional exponent -- the shapes str()
    # produces for ints and floats, including negatives and values like 1e-07.
    numeric_text = re.compile(r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?')

    missing_values = []
    nonumeric_values = []

    print("Dataset INFORMATION")
    print("========================\n")

    for column in df:
        # Find all the unique feature values
        uniq = df[column].unique()
        print("'{}' has {} unique values" .format(column, uniq.size))
        if uniq.size > 10:
            print("~~Listing up to 10 unique values~~")
        print(uniq[0:10])
        print("\n-----------------------------------------------------------------------\n")

        # Find features with missing values
        is_missing = pd.isnull(uniq)
        if is_missing.any():
            s = "{} has {} missing" .format(column, pd.isnull(df[column]).sum())
            missing_values.append(s)

        # Find features with non-numeric values.  Every unique value is
        # inspected, starting with the first one; missing entries are skipped
        # (not treated as the end of the scan) so that a gap in the data cannot
        # hide a later non-numeric value.
        for value, value_is_missing in zip(uniq, is_missing):
            if value_is_missing:
                continue
            # fullmatch: the whole text must be a number, not just a prefix.
            if not numeric_text.fullmatch(str(value)):
                nonumeric_values.append(column)
                break

    print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
    print("Features with missing values:\n{}\n\n" .format(missing_values))
    print("Features with non-numeric values:\n{}" .format(nonumeric_values))
    print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")


In [18]:
# Load the raw ratings CSV from S3 into a DataFrame.
ratings_csv_uri = "s3://devpost-aws/ratings_Beauty.csv"
df = pd.read_csv(ratings_csv_uri)

In [19]:
# Quick overview of the raw ratings frame: shape, column names and dtypes.
print_info(df)



---------------------
Dataset INFORMATION
---------------------
Shape of data set: (2023070, 4) 

Column Headers: ['UserId', 'ProductId', 'Rating', 'Timestamp'] 

UserId        object
ProductId     object
Rating       float64
Timestamp      int64
dtype: object


In [20]:
# Per-column profile: unique values, missing values and non-numeric columns.
print_detailed_report(df)

Dataset INFORMATION

'UserId' has 1210271 unique values
~~Listing up to 10 unique values~~
['A39HTATAQ9V7YF' 'A3JM6GV9MNOF9X' 'A1Z513UWSAAO0F' 'A1WMRR494NWEWV'
 'A3IAAVS479H7M7' 'AKJHHD5VEH7VG' 'A1BG8QW55XHN6U' 'A22VW0P4VZHDE3'
 'A3V3RE4132GKRO' 'A327B0I7CYTEJC']

-----------------------------------------------------------------------

'ProductId' has 249274 unique values
~~Listing up to 10 unique values~~
['0205616461' '0558925278' '0733001998' '0737104473' '0762451459'
 '1304139212' '1304139220' '130414089X' '130414643X' '1304146537']

-----------------------------------------------------------------------

'Rating' has 5 unique values
[5. 3. 4. 1. 2.]

-----------------------------------------------------------------------

'Timestamp' has 4231 unique values
~~Listing up to 10 unique values~~
[1369699200 1355443200 1404691200 1382572800 1274227200 1404518400
 1371945600 1373068800 1401840000 1389052800]

-----------------------------------------------------------------------


~~~~~

### Timestamp is Unix epoch time in seconds — convert it to a day-of-week feature

In [21]:
# 'Timestamp' is read as seconds since the Unix epoch; convert it to datetimes
# and keep only the day of the week (pandas numbers Monday as 0 through Sunday as 6).
rating_datetime = pd.to_datetime(df['Timestamp'], unit='s')
df['weekday'] = rating_datetime.dt.weekday

In [22]:
# Collapse the ratings to one row per product:
#   UserId  -> count of UserId entries, i.e. how many ratings the product received
#              (the column keeps the name 'UserId')
#   Rating  -> median rating
#   weekday -> largest weekday number on which the product was rated
# 'max' is pandas' built-in reducer; it replaces a Python lambda that copied every
# group into a list before taking the maximum.
# NOTE(review): the sample output shows weekday == 6 for every product, so the
# maximum may carry little information — confirm this is the intended feature.
dfg = df.groupby('ProductId').agg(
    {
        'UserId': 'count',
        'Rating': 'median',
        'weekday': 'max',
    }
)

In [23]:
# Keep only products with more than 25 and fewer than 100 ratings
# ('UserId' holds the per-product rating count after the aggregation).
rating_count = dfg['UserId']
count_in_range = (rating_count > 25) & (rating_count < 100)
dfg = dfg[count_in_range]

In [24]:
# Cast the per-product median rating to an integer column.  assign() builds a new
# frame rather than writing a column into `dfg`, which at this point is a
# boolean-mask selection of the grouped frame; writing into such a selection is
# what triggers pandas' SettingWithCopyWarning.
# NOTE(review): astype('int') truncates, so a median of 4.5 becomes 4 — confirm
# that truncation (rather than rounding) is intended.
dfg = dfg.assign(Rating=dfg['Rating'].astype('int'))

In [25]:
# Preview the first rows of the per-product feature table.
dfg.head()

Unnamed: 0_level_0,UserId,Rating,weekday
ProductId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7806397051,35,3,6
9746427962,41,5,6
9759091062,40,3,6
9788071198,36,5,6
9788072216,34,5,6


In [27]:
# Write the per-product features to S3 as the input file for the batch transform.
# index=False leaves out the ProductId index and header=None leaves out the column
# names, so each line holds only the rating count, median rating and weekday.
training_csv_uri = 's3://******/training_data/amazon_beauty.csv'
dfg.to_csv(
    training_csv_uri,
    index=False,
    header=None,
    mode='wb',
)

In [35]:
import sagemaker as sage
from sagemaker import get_execution_role

# ARN of the model package to run.  It is shown in the SageMaker console once
# you have subscribed to the model package.
model_package_arn = 'arn:aws:sagemaker:us-east-1:*****:model-package/use-uber-reg-basic-f5e7663b6e187913cecca2eaa77a85b8'

# SageMaker session and execution role used by the cells below.
sagemaker_session = sage.Session()
role = get_execution_role()

In [36]:
# S3 location of the CSV that the batch transform job reads as its input.
transform_input = "s3://sagemaker-us-east-****/training_data/amazon_beauty.csv"

In [43]:
#print(f'transform input: {transform_input}')

In [44]:
# Create model from model package
# Create the model from the model package, reusing the execution role and the
# session created in the setup cell instead of looking them up a second time.
model = sage.ModelPackage(
    role=role,
    model_package_arn=model_package_arn,
    sagemaker_session=sagemaker_session)

# Create the transformer.
# A variety of parameters may be specified here, including the output path where
# SageMaker will send the results of the transform. Since no output path is given,
# SageMaker leaves the results in the default bucket; that location is read back
# from the transformer afterwards so the output can be inspected.
transformer = model.transformer(instance_count=1, instance_type='ml.m4.xlarge')

# Run the transform job.
# By default, the output file contains only the inference result for each row.
# output_filter can be used to include any of the input columns, and input_filter
# to select which input columns are passed to the model; together they give a lot
# of flexibility.
transformer.transform(transform_input, content_type='text/csv')

# transform() may return before the job has finished (by default, the API does not
# wait for completion). The following cells read the job's output from S3, so block
# here until the job is done; this returns promptly if it has already completed.
transformer.wait()

# The transform job sets the output path in the output_path member.
#print(f'Transform output: {transformer.output_path}')

In [41]:
from urllib.parse import urlparse
# Split the transform job's S3 output URI into its parts
# (netloc is the bucket, path is the key prefix).
transform_output_uri = transformer.output_path
parsed_url = urlparse(transform_output_uri)
#parsed_url.path

In [42]:
# Locate the transform result in S3.  The key is built from the job's own output
# path plus the input file name with ".out" appended, instead of a literal that
# embeds one particular job's name and therefore breaks on every re-run.
# NOTE(review): assumes the output object is "<output prefix>/<input file name>.out",
# which is the layout of the key this cell previously hard-coded — confirm.
file_key = "{}/{}.out".format(
    parsed_url.path.strip('/'),
    transform_input.rsplit('/', 1)[-1],
)

s3_client = sagemaker_session.boto_session.client('s3')

# Read from the bucket named in the output path so bucket and key always come from
# the same job.
response = s3_client.get_object(Bucket=parsed_url.netloc, Key=file_key)
# Despite its name, response_bytes holds the decoded text (one prediction per line).
response_bytes = response['Body'].read().decode('utf-8')
print(response_bytes)

0.6498289704322815
0.3018924295902252
0.6498289704322815
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.6554905772209167
0.6554905772209167
0.3018924295902252
0.5651708841323853
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.5651708841323853
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295902252
0.3018924295