# Make user, item, and interaction datasets
This notebook creates the official Amazon Personalize dataset resources for the
car search recommendations demo.

In [76]:
import json
import boto3
import time

# Names of the CSV files that the dataset import jobs read from S3.
cars_filename         = 'car_items.csv'
users_filename        = 'users.csv'
interactions_filename = 'interactions.csv'

dataset_group_name = 'car-dg10'

# Version suffix shared by the schema names and the S3 key prefix.
schema_version = 'v10'

bucket   = 'roymark-aws-ml'
prefix   = 'personalize/' + schema_version

# Maximum time to poll for a resource, as a DURATION in seconds (1 hour).
# The polling cells compute their own deadline as time.time() + MAX_WAIT_TIME,
# so this must not itself be an absolute timestamp (that would push the
# deadline decades into the future and effectively disable the timeout).
MAX_WAIT_TIME = 60*60

# IAM role passed to the dataset import jobs.
role_arn = 'arn:aws:iam::355151823911:role/PersonalizeS3Role'

# Schema names carry the version suffix; the ARNs are derived from the names
# so that an existing schema can be looked up before trying to create it.
CAR_INTERACTION_SCHEMA_NAME = 'car-interactions-schema-' + schema_version
CAR_INTERACTION_SCHEMA_ARN  = 'arn:aws:personalize:us-east-1:355151823911:schema/' + \
                                CAR_INTERACTION_SCHEMA_NAME

CAR_ITEM_SCHEMA_NAME = 'car-items-schema-' + schema_version
CAR_ITEM_SCHEMA_ARN  = 'arn:aws:personalize:us-east-1:355151823911:schema/' + \
                                CAR_ITEM_SCHEMA_NAME

CAR_USER_SCHEMA_NAME = 'car-users-schema-' + schema_version
CAR_USER_SCHEMA_ARN  = 'arn:aws:personalize:us-east-1:355151823911:schema/' + \
                                CAR_USER_SCHEMA_NAME

In [77]:
# boto3 clients used by this notebook: `personalize` drives every schema,
# dataset and import-job call below; `personalize_runtime` is created
# alongside it.
personalize         = boto3.client(service_name='personalize')
personalize_runtime = boto3.client(service_name='personalize-runtime')

In [78]:
# Get-or-create the interactions schema.
# Look the schema up by its expected ARN first and only create it when
# Personalize reports that it does not exist. Any other failure (credentials,
# throttling, invalid input, ...) is allowed to propagate instead of being
# mistaken for "schema missing".
arn = CAR_INTERACTION_SCHEMA_ARN
try:
    # first see if the schema is already in place
    response = personalize.describe_schema(schemaArn=arn)
    interactions_schema_arn = response['schema']['schemaArn']
    print(interactions_schema_arn)
except personalize.exceptions.ResourceNotFoundException:
    print('Schema {} did not exist, creating it...'.format(arn))
    # Avro record describing one interaction row: user, item and timestamp.
    schema = {
        "type": "record",
        "name": "Interactions",
        "namespace": "com.amazonaws.personalize.schema",
        "fields": [
            {
                "name": "USER_ID",
                "type": "string"
            },
            {
                "name": "ITEM_ID",
                "type": "string"
            },
            {
                "name": "TIMESTAMP",
                "type": "long"
            }
        ],
        "version": "1.0"
    }

    create_schema_response = personalize.create_schema(
        name   = CAR_INTERACTION_SCHEMA_NAME,
        schema = json.dumps(schema)
    )

    interactions_schema_arn = create_schema_response['schemaArn']
    print(json.dumps(create_schema_response, indent=2))

An error occurred (ResourceNotFoundException) when calling the DescribeSchema operation: Resource Arn arn:aws:personalize:us-east-1:355151823911:schema/car-interactions-schema-v10 does not exist.
Schema arn:aws:personalize:us-east-1:355151823911:schema/car-interactions-schema-v10 did not exist, creating it...
{
  "schemaArn": "arn:aws:personalize:us-east-1:355151823911:schema/car-interactions-schema-v10",
  "ResponseMetadata": {
    "RequestId": "cfc74463-a8a6-43e6-9e50-0535ae302090",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Thu, 11 Jul 2019 14:44:24 GMT",
      "x-amzn-requestid": "cfc74463-a8a6-43e6-9e50-0535ae302090",
      "content-length": "93",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [79]:
# Get-or-create the items schema.
# Only a "not found" answer from Personalize triggers creation; any other
# error propagates rather than being treated as a missing schema.
arn = CAR_ITEM_SCHEMA_ARN
try:
    response = personalize.describe_schema(schemaArn=arn)
    items_schema_arn = response['schema']['schemaArn']
    print(items_schema_arn)
except personalize.exceptions.ResourceNotFoundException:
    print('schema not found, creating new...')
    # Avro record describing one car. COLOR and LOCATION (both categorical
    # strings) are intentionally left out: max of 5 metadata columns is the
    # Personalize limit for now.
    schema = {
        "type": "record",
        "name": "Items",
        "namespace": "com.amazonaws.personalize.schema",
        "fields": [
            {
                "name": "ITEM_ID",
                "type": "string"
            },
            {
                "name": "MAKE",
                "type": "string",
                "categorical": True
            },
            {
                "name": "MODEL",
                "type": "string",
                "categorical": True
            },
            {
                "name": "YEAR",
                "type": "int"
            },
            {
                "name": "MILEAGE",
                "type": "int"
            },
            {
                "name": "PRICE",
                "type": "int"
            }
        ],
        "version": "1.0"
    }

    create_schema_response = personalize.create_schema(
        name   = CAR_ITEM_SCHEMA_NAME,
        schema = json.dumps(schema)
    )

    items_schema_arn = create_schema_response['schemaArn']
    print(json.dumps(create_schema_response, indent=2))

schema not found, creating new...
{
  "schemaArn": "arn:aws:personalize:us-east-1:355151823911:schema/car-items-schema-v10",
  "ResponseMetadata": {
    "RequestId": "8b9a7077-0e89-4eb3-9092-b821db8a4557",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Thu, 11 Jul 2019 14:44:24 GMT",
      "x-amzn-requestid": "8b9a7077-0e89-4eb3-9092-b821db8a4557",
      "content-length": "86",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [80]:
# Get-or-create the users schema.
# Only a "not found" answer from Personalize triggers creation; any other
# error propagates rather than being treated as a missing schema.
arn = CAR_USER_SCHEMA_ARN
try:
    response = personalize.describe_schema(schemaArn=arn)
    users_schema_arn = response['schema']['schemaArn']
    print(users_schema_arn)
except personalize.exceptions.ResourceNotFoundException:
    # Avro record describing one user and their metadata columns.
    schema = {
        "type": "record",
        "name": "Users",
        "namespace": "com.amazonaws.personalize.schema",
        "fields": [
            {
                "name": "USER_ID",
                "type": "string"
            },
            {
                "name": "AGE",
                "type": "int"
            },
            {
                "name": "GENDER",
                "type": "string",
                "categorical": True
            },
            {
                "name": "LOCATION",
                "type": "string",
                "categorical": True
            },
            {
                "name": "SALARY",
                "type": "int"
            }
        ],
        "version": "1.0"
    }

    create_schema_response = personalize.create_schema(
        name   = CAR_USER_SCHEMA_NAME,
        schema = json.dumps(schema)
    )

    users_schema_arn = create_schema_response['schemaArn']
    print(json.dumps(create_schema_response, indent=2))

{
  "schemaArn": "arn:aws:personalize:us-east-1:355151823911:schema/car-users-schema-v10",
  "ResponseMetadata": {
    "RequestId": "1af7e932-895c-4e0c-9da2-43752b60d7c0",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Thu, 11 Jul 2019 14:44:23 GMT",
      "x-amzn-requestid": "1af7e932-895c-4e0c-9da2-43752b60d7c0",
      "content-length": "86",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [82]:
# Best-effort teardown of anything left over from a previous run, followed by
# creation of a fresh dataset group. A missing resource is the normal case on
# a first run and is ignored; any other delete failure is reported (but does
# not abort the cell) so that it is not silently hidden.
stale_arn_prefix = 'arn:aws:personalize:us-east-1:355151823911'

for stale_dataset_type in ('INTERACTIONS', 'ITEMS', 'USERS'):
    stale_dataset_arn = '{}:dataset/{}/{}'.format(
        stale_arn_prefix, dataset_group_name, stale_dataset_type)
    try:
        personalize.delete_dataset(datasetArn=stale_dataset_arn)
    except personalize.exceptions.ResourceNotFoundException:
        pass  # nothing to clean up
    except Exception as e:
        print('Could not delete {}: {}'.format(stale_dataset_arn, e))

stale_group_arn = '{}:dataset-group/{}'.format(stale_arn_prefix, dataset_group_name)
try:
    personalize.delete_dataset_group(datasetGroupArn=stale_group_arn)
except personalize.exceptions.ResourceNotFoundException:
    pass  # nothing to clean up
except Exception as e:
    print('Could not delete {}: {}'.format(stale_group_arn, e))

# NOTE(review): the fixed 30 s pause presumably gives the delete requests
# time to complete before the group is recreated; polling until the old
# resources are gone would be more reliable -- confirm and consider replacing.
print('Waiting for previous dataset group to be deleted...')
time.sleep(30)

create_dataset_group_response = personalize.create_dataset_group(
    name = dataset_group_name
)

dataset_group_arn = create_dataset_group_response['datasetGroupArn']
print(json.dumps(create_dataset_group_response, indent=2))

Waiting for dataset group to be created...
{
  "datasetGroupArn": "arn:aws:personalize:us-east-1:355151823911:dataset-group/car-dg10",
  "ResponseMetadata": {
    "RequestId": "8d8d65fa-2ff2-4d8a-9f92-b23a743f070e",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Thu, 11 Jul 2019 14:46:49 GMT",
      "x-amzn-requestid": "8d8d65fa-2ff2-4d8a-9f92-b23a743f070e",
      "content-length": "87",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [83]:
# Poll the dataset group once a minute until it reaches a terminal status or
# the MAX_WAIT_TIME budget runs out. Anything other than ACTIVE raises, so the
# dataset-creation cells below never run against an unusable group.
max_time = time.time() + MAX_WAIT_TIME
status = None
while time.time() < max_time:
    describe_dataset_group_response = personalize.describe_dataset_group(
        datasetGroupArn = dataset_group_arn
    )
    status = describe_dataset_group_response['datasetGroup']['status']
    print('DatasetGroup: {}'.format(status))

    if status in ('ACTIVE', 'CREATE FAILED'):
        break

    time.sleep(60)

if status == 'CREATE FAILED':
    raise RuntimeError('Dataset group {} failed to create: {}'.format(
        dataset_group_arn,
        describe_dataset_group_response['datasetGroup'].get('failureReason', 'no failure reason reported')))
if status != 'ACTIVE':
    # The loop ended because the deadline passed (or never ran at all).
    raise RuntimeError('Timed out waiting for dataset group {}; last status: {}'.format(
        dataset_group_arn, status))

DatasetGroup: CREATE PENDING
DatasetGroup: ACTIVE


In [84]:
# Register the INTERACTIONS dataset in the dataset group, bound to the
# interactions schema, and remember its ARN for the import job.
dataset_type = 'INTERACTIONS'
interactions_dataset_request = dict(
    name='car-interactions',
    datasetType=dataset_type,
    datasetGroupArn=dataset_group_arn,
    schemaArn=interactions_schema_arn,
)
create_dataset_response = personalize.create_dataset(**interactions_dataset_request)

interactions_dataset_arn = create_dataset_response['datasetArn']
print(json.dumps(create_dataset_response, indent=2))

{
  "datasetArn": "arn:aws:personalize:us-east-1:355151823911:dataset/car-dg10/INTERACTIONS",
  "ResponseMetadata": {
    "RequestId": "ec4c84b2-e1b8-494d-8256-88a01ac95634",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Thu, 11 Jul 2019 14:47:49 GMT",
      "x-amzn-requestid": "ec4c84b2-e1b8-494d-8256-88a01ac95634",
      "content-length": "89",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [85]:
# Sanity check: show the ARNs the ITEMS dataset is about to be created with.
print(items_schema_arn, dataset_group_arn, sep='\n')

arn:aws:personalize:us-east-1:355151823911:schema/car-items-schema-v10
arn:aws:personalize:us-east-1:355151823911:dataset-group/car-dg10


In [86]:
# Register the ITEMS dataset in the dataset group, bound to the items schema,
# and remember its ARN for the import job.
dataset_type = 'ITEMS'
items_dataset_request = dict(
    name='car-items',
    datasetType=dataset_type,
    datasetGroupArn=dataset_group_arn,
    schemaArn=items_schema_arn,
)
create_dataset_response = personalize.create_dataset(**items_dataset_request)

items_dataset_arn = create_dataset_response['datasetArn']
print(json.dumps(create_dataset_response, indent=2))

{
  "datasetArn": "arn:aws:personalize:us-east-1:355151823911:dataset/car-dg10/ITEMS",
  "ResponseMetadata": {
    "RequestId": "68c9ccad-fc66-4bba-a35c-ce1103d84e2b",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Thu, 11 Jul 2019 14:47:48 GMT",
      "x-amzn-requestid": "68c9ccad-fc66-4bba-a35c-ce1103d84e2b",
      "content-length": "82",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [87]:
# Register the USERS dataset in the dataset group, bound to the users schema,
# and remember its ARN for the import job.
dataset_type = 'USERS'
users_dataset_request = dict(
    name='car-users',
    datasetType=dataset_type,
    datasetGroupArn=dataset_group_arn,
    schemaArn=users_schema_arn,
)
create_dataset_response = personalize.create_dataset(**users_dataset_request)

users_dataset_arn = create_dataset_response['datasetArn']
print(json.dumps(create_dataset_response, indent=2))

{
  "datasetArn": "arn:aws:personalize:us-east-1:355151823911:dataset/car-dg10/USERS",
  "ResponseMetadata": {
    "RequestId": "b9fd0e32-2825-4df4-ad9d-57dde848c7b5",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Thu, 11 Jul 2019 14:47:48 GMT",
      "x-amzn-requestid": "b9fd0e32-2825-4df4-ad9d-57dde848c7b5",
      "content-length": "82",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [88]:
# Start the import of the interactions CSV from S3 into the INTERACTIONS
# dataset and keep the job ARN so its progress can be polled later.
interactions_s3_uri = 's3://' + '/'.join([bucket, prefix, interactions_filename])
interactions_job_name = dataset_group_name + '-car-interactions-import'

create_dataset_import_job_response = personalize.create_dataset_import_job(
    jobName=interactions_job_name,
    datasetArn=interactions_dataset_arn,
    dataSource={"dataLocation": interactions_s3_uri},
    roleArn=role_arn,
)

interactions_dataset_import_job_arn = create_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_dataset_import_job_response, indent=2))

{
  "datasetImportJobArn": "arn:aws:personalize:us-east-1:355151823911:dataset-import-job/car-dg10-car-interactions-import",
  "ResponseMetadata": {
    "RequestId": "43cb2ca7-d9a8-4179-9ff3-26397fc7637c",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Thu, 11 Jul 2019 14:47:49 GMT",
      "x-amzn-requestid": "43cb2ca7-d9a8-4179-9ff3-26397fc7637c",
      "content-length": "120",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [89]:
# Start the import of the car items CSV from S3 into the ITEMS dataset and
# keep the job ARN so its progress can be polled later.
items_s3_uri = 's3://' + '/'.join([bucket, prefix, cars_filename])
items_job_name = dataset_group_name + '-car-items-import'

create_dataset_import_job_response = personalize.create_dataset_import_job(
    jobName=items_job_name,
    datasetArn=items_dataset_arn,
    dataSource={'dataLocation': items_s3_uri},
    roleArn=role_arn,
)

items_dataset_import_job_arn = create_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_dataset_import_job_response, indent=2))

{
  "datasetImportJobArn": "arn:aws:personalize:us-east-1:355151823911:dataset-import-job/car-dg10-car-items-import",
  "ResponseMetadata": {
    "RequestId": "388a7e88-fd72-4dbc-baf3-62014143ec66",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Thu, 11 Jul 2019 14:47:49 GMT",
      "x-amzn-requestid": "388a7e88-fd72-4dbc-baf3-62014143ec66",
      "content-length": "113",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [90]:
# Start the import of the users CSV from S3 into the USERS dataset and keep
# the job ARN so its progress can be polled later.
users_s3_uri = 's3://' + '/'.join([bucket, prefix, users_filename])
users_job_name = dataset_group_name + '-car-users-import'

create_dataset_import_job_response = personalize.create_dataset_import_job(
    jobName=users_job_name,
    datasetArn=users_dataset_arn,
    dataSource={'dataLocation': users_s3_uri},
    roleArn=role_arn,
)

users_dataset_import_job_arn = create_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_dataset_import_job_response, indent=2))

{
  "datasetImportJobArn": "arn:aws:personalize:us-east-1:355151823911:dataset-import-job/car-dg10-car-users-import",
  "ResponseMetadata": {
    "RequestId": "eacfd7a3-86a8-4d42-ad74-a353bcaf947e",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Thu, 11 Jul 2019 14:47:50 GMT",
      "x-amzn-requestid": "eacfd7a3-86a8-4d42-ad74-a353bcaf947e",
      "content-length": "113",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [None]:
# Poll the ITEMS import job once a minute until it reaches a terminal status
# or the MAX_WAIT_TIME budget runs out. Anything other than ACTIVE raises so a
# failed or stalled import is not mistaken for a finished one.
max_time = time.time() + MAX_WAIT_TIME
status = None
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(
        datasetImportJobArn = items_dataset_import_job_arn
    )
    status = describe_dataset_import_job_response['datasetImportJob']['status']
    print('DatasetImportJob: {}'.format(status))

    if status in ('ACTIVE', 'CREATE FAILED'):
        break

    time.sleep(60)

if status == 'CREATE FAILED':
    raise RuntimeError('Import job {} failed: {}'.format(
        items_dataset_import_job_arn,
        describe_dataset_import_job_response['datasetImportJob'].get('failureReason', 'no failure reason reported')))
if status != 'ACTIVE':
    # The loop ended because the deadline passed (or never ran at all).
    raise RuntimeError('Timed out waiting for import job {}; last status: {}'.format(
        items_dataset_import_job_arn, status))

DatasetImportJob: CREATE PENDING
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS


In [None]:
# Poll the USERS import job once a minute until it reaches a terminal status
# or the MAX_WAIT_TIME budget runs out. Anything other than ACTIVE raises so a
# failed or stalled import is not mistaken for a finished one.
max_time = time.time() + MAX_WAIT_TIME
status = None
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(
        datasetImportJobArn = users_dataset_import_job_arn
    )
    status = describe_dataset_import_job_response['datasetImportJob']['status']
    print('DatasetImportJob: {}'.format(status))

    if status in ('ACTIVE', 'CREATE FAILED'):
        break

    time.sleep(60)

if status == 'CREATE FAILED':
    raise RuntimeError('Import job {} failed: {}'.format(
        users_dataset_import_job_arn,
        describe_dataset_import_job_response['datasetImportJob'].get('failureReason', 'no failure reason reported')))
if status != 'ACTIVE':
    # The loop ended because the deadline passed (or never ran at all).
    raise RuntimeError('Timed out waiting for import job {}; last status: {}'.format(
        users_dataset_import_job_arn, status))

In [None]:
# Poll the INTERACTIONS import job once a minute until it reaches a terminal
# status or the MAX_WAIT_TIME budget runs out. Anything other than ACTIVE
# raises so a failed or stalled import is not mistaken for a finished one.
max_time = time.time() + MAX_WAIT_TIME
status = None
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(
        datasetImportJobArn = interactions_dataset_import_job_arn
    )
    status = describe_dataset_import_job_response['datasetImportJob']['status']
    print('DatasetImportJob: {}'.format(status))

    if status in ('ACTIVE', 'CREATE FAILED'):
        break

    time.sleep(60)

if status == 'CREATE FAILED':
    raise RuntimeError('Import job {} failed: {}'.format(
        interactions_dataset_import_job_arn,
        describe_dataset_import_job_response['datasetImportJob'].get('failureReason', 'no failure reason reported')))
if status != 'ACTIVE':
    # The loop ended because the deadline passed (or never ran at all).
    raise RuntimeError('Timed out waiting for import job {}; last status: {}'.format(
        interactions_dataset_import_job_arn, status))