# Make user, item, and interaction datasets
This notebook creates the official Amazon Personalize dataset resources for the
car search recommendations demo.

In [92]:
account_num = '<your-account-num>'

In [93]:
import json
import boto3
import time
import sagemaker

bucket   = sagemaker.Session().default_bucket() # or replace with your preferred s3 bucket
print(bucket)
prefix   = 'personalize/car'

region   = boto3.Session().region_name # or replace with your preferred region
print(region)

sagemaker-us-east-1-355151823911
us-east-1


In [94]:
dataset_group_name = 'car-dg'

CAR_INTERACTION_SCHEMA_NAME = 'car-interactions-schema'
CAR_INTERACTION_SCHEMA_ARN  = 'arn:aws:personalize:{}:{}:schema/'.format(region, account_num) + \
                                CAR_INTERACTION_SCHEMA_NAME
    
CAR_ITEM_SCHEMA_NAME = 'car-items-schema'
CAR_ITEM_SCHEMA_ARN  = 'arn:aws:personalize:{}:{}:schema/'.format(region, account_num) + \
                                CAR_ITEM_SCHEMA_NAME

CAR_USER_SCHEMA_NAME = 'car-users-schema'
CAR_USER_SCHEMA_ARN  = 'arn:aws:personalize:{}:{}:schema/'.format(region, account_num) + \
                                CAR_USER_SCHEMA_NAME

cars_filename         = 'car_items.csv'
users_filename        = 'users.csv'
interactions_filename = 'interactions.csv'

MAX_WAIT_TIME = time.time() + 60*60 # 1 hour

In [95]:
personalize = boto3.client('personalize')
personalize_runtime = boto3.client('personalize-runtime')

## Set up IAM role and allow Personalize to access your bucket

In [96]:
def allow_bucket_access():
    s3 = boto3.client('s3')

    policy = {
        "Version": "2012-10-17",
        "Id": "PersonalizeS3BucketAccessPolicy",
        "Statement": [
            {
                "Sid": "PersonalizeS3BucketAccessPolicy",
                "Effect": "Allow",
                "Principal": {
                    "Service": "personalize.amazonaws.com"
                },
                "Action": [
                    "s3:GetObject",
                    "s3:ListBucket"
                ],
                "Resource": [
                    "arn:aws:s3:::{}".format(bucket),
                    "arn:aws:s3:::{}/*".format(bucket)
                ]
            }
        ]
    }

    s3.put_bucket_policy(Bucket=bucket, Policy=json.dumps(policy))

In [97]:
allow_bucket_access()

In [98]:
def create_personalize_role():
    iam = boto3.client('iam')

    role_name = 'PersonalizeS3Role'
    assume_role_policy_document = {
        "Version": "2012-10-17",
        "Statement": [
            {
              "Effect": "Allow",
              "Principal": {
                "Service": "personalize.amazonaws.com"
              },
              "Action": "sts:AssumeRole"
            }
        ]
    }

    try:
        print('Creating role: {}...'.format(role_name))
        create_role_response = iam.create_role(
            RoleName = role_name,
            AssumeRolePolicyDocument = json.dumps(assume_role_policy_document)
        )
    except Exception as e:
        print('role creation failed. Likely already existed.')

    # AmazonPersonalizeFullAccess provides access to any S3 bucket with a name that includes "personalize" or "Personalize" 
    # if you would like to use a bucket with a different name, please consider creating and attaching a new policy
    # that provides read access to your bucket or attaching the AmazonS3ReadOnlyAccess policy to the role
    print('Attaching Personalize full access policy...')
    pers_policy_arn = 'arn:aws:iam::aws:policy/service-role/AmazonPersonalizeFullAccess'
    iam.attach_role_policy(
        RoleName  = role_name,
        PolicyArn = pers_policy_arn
    )
    print('Attaching S3 read-only access policy...')
    s3_policy_arn = 'arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess'
    iam.attach_role_policy(
        RoleName  = role_name,
        PolicyArn = s3_policy_arn
    )

    print('Waiting for policy attachment to propagate...')
    time.sleep(60) # wait for a minute to allow IAM role policy attachment to propagate

    role_arn = 'arn:aws:iam::{}:role/{}'.format(account_num, role_name)
    return role_arn

In [99]:
role_arn = create_personalize_role()
print(role_arn)

Creating role: PersonalizeS3Role...
role creation failed. Likely already existed.
Attaching Personalize full access policy...
Attaching S3 read-only access policy...
Waiting for policy attachment to propagate...
arn:aws:iam::355151823911:role/PersonalizeS3Role


## Create the INTERACTIONS schema
Create the INTERACTIONS schema if it is not in place already.

In [100]:
try:
    # first see if the schema is already in place
    arn = CAR_INTERACTION_SCHEMA_ARN
    response = personalize.describe_schema(schemaArn=arn)
    interactions_schema_arn = response['schema']['schemaArn']
    print(interactions_schema_arn)
except Exception as e:
    print('Schema {} did not exist, creating it...'.format(arn))
    schema = {
        "type": "record",
        "name": "Interactions",
        "namespace": "com.amazonaws.personalize.schema",
        "fields": [
            {
                "name": "USER_ID",
                "type": "string"
            },
            {
                "name": "ITEM_ID",
                "type": "string"
            },
            {
                "name": "TIMESTAMP",
                "type": "long"
            }
        ],
        "version": "1.0"
    }

    create_schema_response = personalize.create_schema(
        name   = CAR_INTERACTION_SCHEMA_NAME,
        schema = json.dumps(schema)
    )

    interactions_schema_arn = create_schema_response['schemaArn']
    print(json.dumps(create_schema_response, indent=2))

Schema arn:aws:personalize:us-east-1:355151823911:schema/car-interactions-schema did not exist, creating it...
{
  "schemaArn": "arn:aws:personalize:us-east-1:355151823911:schema/car-interactions-schema",
  "ResponseMetadata": {
    "RequestId": "8f971450-7213-498f-b963-47b3e5649a99",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Wed, 17 Jul 2019 15:19:29 GMT",
      "x-amzn-requestid": "8f971450-7213-498f-b963-47b3e5649a99",
      "content-length": "89",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


## Create the ITEMS schema
Create the ITEMS schema if it is not in place already.

In [101]:
try:
    arn = CAR_ITEM_SCHEMA_ARN
    response = personalize.describe_schema(schemaArn=arn)
    items_schema_arn = response['schema']['schemaArn']
    print(items_schema_arn)
except Exception as e:
    print('schema not found, creating new...')
    schema = {
        "type": "record",
        "name": "Items",
        "namespace": "com.amazonaws.personalize.schema",
        "fields": [
            {
                "name": "ITEM_ID",
                "type": "string"
            },
            {
                "name": "MAKE",
                "type": "string",
                "categorical": True
            },
            {
                "name": "MODEL",
                "type": "string",
                "categorical": True
            },
            {
                "name": "YEAR",
                "type": "int"
            },
            {
                "name": "MILEAGE",
                "type": "int"
            },
            {
                "name": "PRICE",
                "type": "int"
            }
            #,
#            {
#                "name": "COLOR",
#                "type": "string",
#                "categorical": True
#            }#,  Max of 5 metadata columns is the Personalize limit for now
    #        {
    #            "name": "LOCATION",
    #            "type": "string",
    #            "categorical": True
    #        }
        ],
        "version": "1.0"
    }

    create_schema_response = personalize.create_schema(
        name   = CAR_ITEM_SCHEMA_NAME,
        schema = json.dumps(schema)
    )

    items_schema_arn = create_schema_response['schemaArn']
    print(json.dumps(create_schema_response, indent=2))

schema not found, creating new...
{
  "schemaArn": "arn:aws:personalize:us-east-1:355151823911:schema/car-items-schema",
  "ResponseMetadata": {
    "RequestId": "67f9b1bc-d98a-4ca7-92f0-86e9f9892bf9",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Wed, 17 Jul 2019 15:19:29 GMT",
      "x-amzn-requestid": "67f9b1bc-d98a-4ca7-92f0-86e9f9892bf9",
      "content-length": "82",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


## Create the USERS schema
Create the USERS schema if it is not in place already.

In [102]:
try:
    arn = CAR_USER_SCHEMA_ARN
    response = personalize.describe_schema(schemaArn=arn)
    users_schema_arn = response['schema']['schemaArn']
    print(users_schema_arn)
except Exception as e:
    schema = {
        "type": "record",
        "name": "Users",
        "namespace": "com.amazonaws.personalize.schema",
        "fields": [
            {
                "name": "USER_ID",
                "type": "string"
            },
            {
                "name": "AGE",
                "type": "int"
            },
            {
                "name": "GENDER",
                "type": "string",
                "categorical": True
            },
            {
                "name": "LOCATION",
                "type": "string",
                "categorical": True
            },
            {
                "name": "SALARY",
                "type": "int"
            }
        ],
        "version": "1.0"
    }

    create_schema_response = personalize.create_schema(
        name   = CAR_USER_SCHEMA_NAME,
        schema = json.dumps(schema)
    )

    users_schema_arn = create_schema_response['schemaArn']
    print(json.dumps(create_schema_response, indent=2))

{
  "schemaArn": "arn:aws:personalize:us-east-1:355151823911:schema/car-users-schema",
  "ResponseMetadata": {
    "RequestId": "8b7d984d-dff2-48c5-8f1d-0918de7a64fa",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Wed, 17 Jul 2019 15:19:30 GMT",
      "x-amzn-requestid": "8b7d984d-dff2-48c5-8f1d-0918de7a64fa",
      "content-length": "82",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


## Create a dataset group and the datasets within it
This assumes you have done any cleanup of prior versions of the datasets and dataset group. If not, use
the cleanup notebook.

In [103]:
print('\nCreating new dataset group {}'.format(dataset_group_name))
create_dataset_group_response = personalize.create_dataset_group(
    name = dataset_group_name
)

dataset_group_arn = create_dataset_group_response['datasetGroupArn']
print(json.dumps(create_dataset_group_response, indent=2))


Creating new dataset group car-dg
{
  "datasetGroupArn": "arn:aws:personalize:us-east-1:355151823911:dataset-group/car-dg",
  "ResponseMetadata": {
    "RequestId": "c5e8869e-de50-452f-ac10-d57a29428af6",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Wed, 17 Jul 2019 15:19:29 GMT",
      "x-amzn-requestid": "c5e8869e-de50-452f-ac10-d57a29428af6",
      "content-length": "85",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [104]:
max_time = time.time() + MAX_WAIT_TIME
while time.time() < max_time:
    describe_dataset_group_response = personalize.describe_dataset_group(
        datasetGroupArn = dataset_group_arn
    )
    status = describe_dataset_group_response['datasetGroup']['status']
    print('DatasetGroup: {}'.format(status))
    
    if status == 'ACTIVE' or status == 'CREATE FAILED':
        break
        
    time.sleep(60)

DatasetGroup: CREATE PENDING
DatasetGroup: ACTIVE


### Create the INTERACTIONS dataset

In [105]:
dataset_type = 'INTERACTIONS'
create_dataset_response = personalize.create_dataset(
    name = 'car-interactions',
    datasetType = dataset_type,
    datasetGroupArn = dataset_group_arn,
    schemaArn = interactions_schema_arn
)

interactions_dataset_arn = create_dataset_response['datasetArn']
print(json.dumps(create_dataset_response, indent=2))

{
  "datasetArn": "arn:aws:personalize:us-east-1:355151823911:dataset/car-dg/INTERACTIONS",
  "ResponseMetadata": {
    "RequestId": "39569f62-a25c-4968-aa12-aefef49eeeb1",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Wed, 17 Jul 2019 15:20:30 GMT",
      "x-amzn-requestid": "39569f62-a25c-4968-aa12-aefef49eeeb1",
      "content-length": "87",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [106]:
print(items_schema_arn)
print(dataset_group_arn)

arn:aws:personalize:us-east-1:355151823911:schema/car-items-schema
arn:aws:personalize:us-east-1:355151823911:dataset-group/car-dg


### Create the ITEMS dataset

In [107]:
dataset_type = 'ITEMS'
create_dataset_response = personalize.create_dataset(
    name = 'car-items',
    datasetType = dataset_type,
    datasetGroupArn = dataset_group_arn,
    schemaArn = items_schema_arn
)

items_dataset_arn = create_dataset_response['datasetArn']
print(json.dumps(create_dataset_response, indent=2))

{
  "datasetArn": "arn:aws:personalize:us-east-1:355151823911:dataset/car-dg/ITEMS",
  "ResponseMetadata": {
    "RequestId": "b4768fc1-4d4a-4c46-b101-d309b99cdc2f",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Wed, 17 Jul 2019 15:20:30 GMT",
      "x-amzn-requestid": "b4768fc1-4d4a-4c46-b101-d309b99cdc2f",
      "content-length": "80",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


### Create the USERS dataset

In [108]:
dataset_type = 'USERS'
create_dataset_response = personalize.create_dataset(
    name = 'car-users',
    datasetType = dataset_type,
    datasetGroupArn = dataset_group_arn,
    schemaArn = users_schema_arn
)

users_dataset_arn = create_dataset_response['datasetArn']
print(json.dumps(create_dataset_response, indent=2))

{
  "datasetArn": "arn:aws:personalize:us-east-1:355151823911:dataset/car-dg/USERS",
  "ResponseMetadata": {
    "RequestId": "aad1a35a-8445-4a1d-968a-17e20f522bb9",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Wed, 17 Jul 2019 15:20:30 GMT",
      "x-amzn-requestid": "aad1a35a-8445-4a1d-968a-17e20f522bb9",
      "content-length": "80",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


## Create an import job for each of the datasets

In [109]:
create_dataset_import_job_response = personalize.create_dataset_import_job(
    jobName = '{}-car-interactions-import'.format(dataset_group_name),
    datasetArn = interactions_dataset_arn,
    dataSource = {
        "dataLocation": "s3://{}/{}/{}".format(bucket, prefix, interactions_filename)
    },
    roleArn = role_arn
)

interactions_dataset_import_job_arn = create_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_dataset_import_job_response, indent=2))

{
  "datasetImportJobArn": "arn:aws:personalize:us-east-1:355151823911:dataset-import-job/car-dg-car-interactions-import",
  "ResponseMetadata": {
    "RequestId": "0c0d77c4-6e7b-4ce4-86ef-989ef1538e65",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Wed, 17 Jul 2019 15:20:30 GMT",
      "x-amzn-requestid": "0c0d77c4-6e7b-4ce4-86ef-989ef1538e65",
      "content-length": "118",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [110]:
create_dataset_import_job_response = personalize.create_dataset_import_job(
    jobName = '{}-car-items-import'.format(dataset_group_name),
    datasetArn = items_dataset_arn,
    dataSource = {
        'dataLocation': 's3://{}/{}/{}'.format(bucket, prefix, cars_filename)
    },
    roleArn = role_arn
)

items_dataset_import_job_arn = create_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_dataset_import_job_response, indent=2))

{
  "datasetImportJobArn": "arn:aws:personalize:us-east-1:355151823911:dataset-import-job/car-dg-car-items-import",
  "ResponseMetadata": {
    "RequestId": "7c3b0394-16d0-49a4-bcdc-f8a40e9a2a0e",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Wed, 17 Jul 2019 15:20:30 GMT",
      "x-amzn-requestid": "7c3b0394-16d0-49a4-bcdc-f8a40e9a2a0e",
      "content-length": "111",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [111]:
create_dataset_import_job_response = personalize.create_dataset_import_job(
    jobName = '{}-car-users-import'.format(dataset_group_name),
    datasetArn = users_dataset_arn,
    dataSource = {
        'dataLocation': 's3://{}/{}/{}'.format(bucket, prefix, users_filename)
    },
    roleArn = role_arn
)

users_dataset_import_job_arn = create_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_dataset_import_job_response, indent=2))

{
  "datasetImportJobArn": "arn:aws:personalize:us-east-1:355151823911:dataset-import-job/car-dg-car-users-import",
  "ResponseMetadata": {
    "RequestId": "12ea2eaa-8e8c-4737-aaa6-21d0d618fb4b",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Wed, 17 Jul 2019 15:20:32 GMT",
      "x-amzn-requestid": "12ea2eaa-8e8c-4737-aaa6-21d0d618fb4b",
      "content-length": "111",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


### Wait for the dataset import jobs to complete

In [112]:
print('Waiting for ITEMS data import to complete...')
max_time = time.time() + MAX_WAIT_TIME
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(
        datasetImportJobArn = items_dataset_import_job_arn
    )
    status = describe_dataset_import_job_response['datasetImportJob']['status']
    print('DatasetImportJob: {}'.format(status))
    
    if status == 'ACTIVE' or status == 'CREATE FAILED':
        break
        
    time.sleep(60)
    if status == 'ACTIVE':
        print('ITEMS dataset is ACTIVE.')

Waiting for ITEMS data import to complete...
DatasetImportJob: CREATE PENDING
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: ACTIVE


In [113]:
print('Waiting for USERS data import to complete...')
max_time = time.time() + MAX_WAIT_TIME
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(
        datasetImportJobArn = users_dataset_import_job_arn
    )
    status = describe_dataset_import_job_response['datasetImportJob']['status']
    print('DatasetImportJob: {}'.format(status))
    
    if status == 'ACTIVE' or status == 'CREATE FAILED':
        break
        
    time.sleep(60)
    if status == 'ACTIVE':
        print('USERS dataset is ACTIVE.')

Waiting for USERS data import to complete...
DatasetImportJob: ACTIVE


In [114]:
print('Waiting for INTERACTIONS data import to complete...')
max_time = time.time() + MAX_WAIT_TIME
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(
        datasetImportJobArn = interactions_dataset_import_job_arn
    )
    status = describe_dataset_import_job_response['datasetImportJob']['status']
    print('DatasetImportJob: {}'.format(status))
    
    if status == 'ACTIVE' or status == 'CREATE FAILED':
        break
        
    time.sleep(60)
    if status == 'ACTIVE':
        print('INTERACTIONS dataset is ACTIVE.')

Waiting for INTERACTIONS data import to complete...
DatasetImportJob: ACTIVE
