# Make user, item, and interaction datasets
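This notebook creates the Amazon Personalize resources for the car search
recommendations demo: the INTERACTIONS, ITEMS, and USERS schemas, a dataset group
containing one dataset of each type, and an import job that loads each dataset from S3.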

In [22]:
import json
import boto3
import time

# Version tag baked into the schema names and the S3 prefix below.
schema_version = 'v12'
dataset_group_name = 'car-dg12'

# Account-specific settings -- replace every placeholder before running.
bucket      = '<your-bucket>'
prefix      = 'personalize/' + schema_version
region      = '<your-region>'
account_num = '<your-account>'
role_arn    = '<your-role-arn>'

# Schema ARNs are built from region, account number and schema name so the
# notebook can look a schema up before deciding whether to create it.
SCHEMA_ARN_PREFIX = 'arn:aws:personalize:{}:{}:schema/'.format(region, account_num)

CAR_INTERACTION_SCHEMA_NAME = 'car-interactions-schema-' + schema_version
CAR_INTERACTION_SCHEMA_ARN  = SCHEMA_ARN_PREFIX + CAR_INTERACTION_SCHEMA_NAME

CAR_ITEM_SCHEMA_NAME = 'car-items-schema-' + schema_version
CAR_ITEM_SCHEMA_ARN  = SCHEMA_ARN_PREFIX + CAR_ITEM_SCHEMA_NAME

CAR_USER_SCHEMA_NAME = 'car-users-schema-' + schema_version
CAR_USER_SCHEMA_ARN  = SCHEMA_ARN_PREFIX + CAR_USER_SCHEMA_NAME

# File names of the CSVs expected under s3://<bucket>/<prefix>/.
cars_filename         = 'car_items.csv'
users_filename        = 'users.csv'
interactions_filename = 'interactions.csv'

# Maximum time, in seconds, that a polling loop may wait (1 hour).
# This must be a duration, not an absolute timestamp: the polling cells
# compute their own deadline as `time.time() + MAX_WAIT_TIME`.
MAX_WAIT_TIME = 60 * 60

In [23]:
# boto3 clients for the Personalize service and its runtime endpoint.
# Both are created from boto3's default session configuration.
personalize = boto3.client(service_name='personalize')
personalize_runtime = boto3.client(service_name='personalize-runtime')

## Create the INTERACTIONS schema
Create the INTERACTIONS schema if it is not in place already.

In [24]:
# Reuse the INTERACTIONS schema when it already exists; otherwise create it.
# Either way, `interactions_schema_arn` is set for the dataset cell below.
arn = CAR_INTERACTION_SCHEMA_ARN
try:
    # first see if the schema is already in place
    response = personalize.describe_schema(schemaArn=arn)
    interactions_schema_arn = response['schema']['schemaArn']
    print(interactions_schema_arn)
except personalize.exceptions.ResourceNotFoundException:
    # Only a missing schema triggers creation. Any other error propagates
    # instead of being misreported as "did not exist".
    print('Schema {} did not exist, creating it...'.format(arn))
    schema = {
        "type": "record",
        "name": "Interactions",
        "namespace": "com.amazonaws.personalize.schema",
        "fields": [
            {"name": "USER_ID",   "type": "string"},
            {"name": "ITEM_ID",   "type": "string"},
            {"name": "TIMESTAMP", "type": "long"}
        ],
        "version": "1.0"
    }

    create_schema_response = personalize.create_schema(
        name   = CAR_INTERACTION_SCHEMA_NAME,
        schema = json.dumps(schema)
    )

    interactions_schema_arn = create_schema_response['schemaArn']
    print(json.dumps(create_schema_response, indent=2))

Schema arn:aws:personalize:us-east-1:355151823911:schema/car-interactions-schema-v12 did not exist, creating it...
{
  "schemaArn": "arn:aws:personalize:us-east-1:355151823911:schema/car-interactions-schema-v12",
  "ResponseMetadata": {
    "RequestId": "e92fa94b-e767-4d48-94fd-8f5189989add",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Sun, 14 Jul 2019 20:33:01 GMT",
      "x-amzn-requestid": "e92fa94b-e767-4d48-94fd-8f5189989add",
      "content-length": "93",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


## Create the ITEMS schema
Create the ITEMS schema if it is not in place already.

In [25]:
# Reuse the ITEMS schema when it already exists; otherwise create it.
# Either way, `items_schema_arn` is set for the dataset cell below.
arn = CAR_ITEM_SCHEMA_ARN
try:
    response = personalize.describe_schema(schemaArn=arn)
    items_schema_arn = response['schema']['schemaArn']
    print(items_schema_arn)
except personalize.exceptions.ResourceNotFoundException:
    # Only a missing schema triggers creation. Any other error propagates
    # instead of being misreported as "not found".
    print('schema not found, creating new...')
    schema = {
        "type": "record",
        "name": "Items",
        "namespace": "com.amazonaws.personalize.schema",
        "fields": [
            {"name": "ITEM_ID", "type": "string"},
            {"name": "MAKE",    "type": "string", "categorical": True},
            {"name": "MODEL",   "type": "string", "categorical": True},
            {"name": "YEAR",    "type": "int"},
            {"name": "MILEAGE", "type": "int"},
            {"name": "PRICE",   "type": "int"}
            # COLOR and LOCATION (both categorical strings) are deliberately
            # left out: when this was written, Personalize allowed a maximum
            # of 5 metadata columns.
        ],
        "version": "1.0"
    }

    create_schema_response = personalize.create_schema(
        name   = CAR_ITEM_SCHEMA_NAME,
        schema = json.dumps(schema)
    )

    items_schema_arn = create_schema_response['schemaArn']
    print(json.dumps(create_schema_response, indent=2))

schema not found, creating new...
{
  "schemaArn": "arn:aws:personalize:us-east-1:355151823911:schema/car-items-schema-v12",
  "ResponseMetadata": {
    "RequestId": "165ace4e-0960-4786-8a44-70da97c1951b",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Sun, 14 Jul 2019 20:33:01 GMT",
      "x-amzn-requestid": "165ace4e-0960-4786-8a44-70da97c1951b",
      "content-length": "86",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


## Create the USERS schema
Create the USERS schema if it is not in place already.

In [26]:
# Reuse the USERS schema when it already exists; otherwise create it.
# Either way, `users_schema_arn` is set for the dataset cell below.
arn = CAR_USER_SCHEMA_ARN
try:
    response = personalize.describe_schema(schemaArn=arn)
    users_schema_arn = response['schema']['schemaArn']
    print(users_schema_arn)
except personalize.exceptions.ResourceNotFoundException:
    # Only a missing schema triggers creation. Any other error propagates
    # instead of silently falling through to create_schema.
    schema = {
        "type": "record",
        "name": "Users",
        "namespace": "com.amazonaws.personalize.schema",
        "fields": [
            {"name": "USER_ID",  "type": "string"},
            {"name": "AGE",      "type": "int"},
            {"name": "GENDER",   "type": "string", "categorical": True},
            {"name": "LOCATION", "type": "string", "categorical": True},
            {"name": "SALARY",   "type": "int"}
        ],
        "version": "1.0"
    }

    create_schema_response = personalize.create_schema(
        name   = CAR_USER_SCHEMA_NAME,
        schema = json.dumps(schema)
    )

    users_schema_arn = create_schema_response['schemaArn']
    print(json.dumps(create_schema_response, indent=2))

{
  "schemaArn": "arn:aws:personalize:us-east-1:355151823911:schema/car-users-schema-v12",
  "ResponseMetadata": {
    "RequestId": "bbb8a74d-a465-44ed-94e7-8b8f9a3a56b8",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Sun, 14 Jul 2019 20:33:01 GMT",
      "x-amzn-requestid": "bbb8a74d-a465-44ed-94e7-8b8f9a3a56b8",
      "content-length": "86",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


## Create a dataset group and the datasets within it
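First delete the datasets and the dataset group if they already exist, then create a new
dataset group. **Note:** this step is destructive — any data previously imported into the
group is removed. The cell pauses for a fixed 20 seconds after each round of delete requests
before moving on.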

In [27]:
# Remove any earlier copy of the datasets and the dataset group, then create
# a fresh dataset group and keep its ARN in `dataset_group_arn`.
print('Deleting prior verions of the dataset...')

# Each dataset is deleted in its own try block so that one missing dataset
# (EVENT_INTERACTIONS, for example) does not prevent the others from being
# deleted.
for stale_dataset_type in ('EVENT_INTERACTIONS', 'INTERACTIONS', 'ITEMS', 'USERS'):
    stale_dataset_arn = 'arn:aws:personalize:{}:{}:dataset/{}/{}'.format(
        region, account_num, dataset_group_name, stale_dataset_type
    )
    try:
        personalize.delete_dataset(datasetArn=stale_dataset_arn)
    except personalize.exceptions.ResourceNotFoundException:
        # Nothing to delete -- expected on a first run.
        pass
    except Exception as e:
        # Stay best-effort, but report the problem instead of hiding it.
        print('Could not delete dataset {}: {}'.format(stale_dataset_arn, e))

print('Waiting for datasets to be deleted...')
time.sleep(20)

stale_dataset_group_arn = 'arn:aws:personalize:{}:{}:dataset-group/{}'.format(
    region, account_num, dataset_group_name
)
try:
    personalize.delete_dataset_group(datasetGroupArn=stale_dataset_group_arn)
except personalize.exceptions.ResourceNotFoundException:
    # Nothing to delete -- expected on a first run.
    pass
except Exception as e:
    print('Could not delete dataset group {}: {}'.format(stale_dataset_group_arn, e))

print('Waiting for dataset group to be deleted...')
time.sleep(20)

print('\nCreating new dataset group {}'.format(dataset_group_name))
create_dataset_group_response = personalize.create_dataset_group(
    name = dataset_group_name
)

dataset_group_arn = create_dataset_group_response['datasetGroupArn']
print(json.dumps(create_dataset_group_response, indent=2))

Waiting for datasets to be deleted...
Waiting for dataset group to be deleted...
Creating new dataset group car-dg12
{
  "datasetGroupArn": "arn:aws:personalize:us-east-1:355151823911:dataset-group/car-dg12",
  "ResponseMetadata": {
    "RequestId": "a1a38ffb-6a79-457b-a2e6-1a992d3d93ac",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Sun, 14 Jul 2019 20:33:41 GMT",
      "x-amzn-requestid": "a1a38ffb-6a79-457b-a2e6-1a992d3d93ac",
      "content-length": "87",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [28]:
# Poll the dataset group once a minute until it reaches a terminal status
# or the deadline derived from MAX_WAIT_TIME (configuration cell) passes.
max_time = time.time() + MAX_WAIT_TIME
status = None
while time.time() < max_time:
    describe_dataset_group_response = personalize.describe_dataset_group(
        datasetGroupArn = dataset_group_arn
    )
    status = describe_dataset_group_response['datasetGroup']['status']
    print('DatasetGroup: {}'.format(status))

    if status == 'ACTIVE' or status == 'CREATE FAILED':
        break

    time.sleep(60)

# Stop here on failure or timeout: the cells below need an ACTIVE dataset
# group and would otherwise fail with a less obvious error.
if status != 'ACTIVE':
    raise RuntimeError(
        'Dataset group {} did not become ACTIVE (last status: {})'.format(dataset_group_arn, status)
    )

DatasetGroup: CREATE PENDING
DatasetGroup: ACTIVE


### Create the INTERACTIONS dataset

In [29]:
# Create the INTERACTIONS dataset in the dataset group, bound to the
# interactions schema, and keep its ARN for the import job.
dataset_type = 'INTERACTIONS'
create_dataset_args = dict(
    name='car-interactions',
    datasetType=dataset_type,
    datasetGroupArn=dataset_group_arn,
    schemaArn=interactions_schema_arn,
)
create_dataset_response = personalize.create_dataset(**create_dataset_args)

interactions_dataset_arn = create_dataset_response['datasetArn']
print(json.dumps(create_dataset_response, indent=2))

{
  "datasetArn": "arn:aws:personalize:us-east-1:355151823911:dataset/car-dg12/INTERACTIONS",
  "ResponseMetadata": {
    "RequestId": "ac85d4cc-cb85-4513-a168-fb57913dd3bd",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Sun, 14 Jul 2019 20:34:42 GMT",
      "x-amzn-requestid": "ac85d4cc-cb85-4513-a168-fb57913dd3bd",
      "content-length": "89",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [30]:
# Sanity check: show the ARNs the ITEMS dataset cell is about to use.
for arn_to_show in (items_schema_arn, dataset_group_arn):
    print(arn_to_show)

arn:aws:personalize:us-east-1:355151823911:schema/car-items-schema-v12
arn:aws:personalize:us-east-1:355151823911:dataset-group/car-dg12


### Create the ITEMS dataset

In [31]:
# Create the ITEMS dataset in the dataset group, bound to the items schema,
# and keep its ARN for the import job.
dataset_type = 'ITEMS'
create_dataset_args = dict(
    name='car-items',
    datasetType=dataset_type,
    datasetGroupArn=dataset_group_arn,
    schemaArn=items_schema_arn,
)
create_dataset_response = personalize.create_dataset(**create_dataset_args)

items_dataset_arn = create_dataset_response['datasetArn']
print(json.dumps(create_dataset_response, indent=2))

{
  "datasetArn": "arn:aws:personalize:us-east-1:355151823911:dataset/car-dg12/ITEMS",
  "ResponseMetadata": {
    "RequestId": "ef7fe112-f08b-412f-955e-a3b7e43591a2",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Sun, 14 Jul 2019 20:34:42 GMT",
      "x-amzn-requestid": "ef7fe112-f08b-412f-955e-a3b7e43591a2",
      "content-length": "82",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


### Create the USERS dataset

In [32]:
# Create the USERS dataset in the dataset group, bound to the users schema,
# and keep its ARN for the import job.
dataset_type = 'USERS'
create_dataset_args = dict(
    name='car-users',
    datasetType=dataset_type,
    datasetGroupArn=dataset_group_arn,
    schemaArn=users_schema_arn,
)
create_dataset_response = personalize.create_dataset(**create_dataset_args)

users_dataset_arn = create_dataset_response['datasetArn']
print(json.dumps(create_dataset_response, indent=2))

{
  "datasetArn": "arn:aws:personalize:us-east-1:355151823911:dataset/car-dg12/USERS",
  "ResponseMetadata": {
    "RequestId": "2f954b6e-1bba-4b34-9e07-cf19879406ac",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Sun, 14 Jul 2019 20:34:42 GMT",
      "x-amzn-requestid": "2f954b6e-1bba-4b34-9e07-cf19879406ac",
      "content-length": "82",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


## Create an import job for each of the datasets

In [33]:
# Start the import job that loads the interactions CSV from S3 into the
# INTERACTIONS dataset; the job ARN is kept for the wait cell further down.
interactions_data_location = "s3://{}/{}/{}".format(bucket, prefix, interactions_filename)
interactions_job_name = '{}-car-interactions-import'.format(dataset_group_name)

create_dataset_import_job_response = personalize.create_dataset_import_job(
    jobName=interactions_job_name,
    datasetArn=interactions_dataset_arn,
    dataSource={"dataLocation": interactions_data_location},
    roleArn=role_arn,
)

interactions_dataset_import_job_arn = create_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_dataset_import_job_response, indent=2))

{
  "datasetImportJobArn": "arn:aws:personalize:us-east-1:355151823911:dataset-import-job/car-dg12-car-interactions-import",
  "ResponseMetadata": {
    "RequestId": "b2e3dd1d-3ddf-44c8-b13b-111e31cc40a7",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Sun, 14 Jul 2019 20:34:42 GMT",
      "x-amzn-requestid": "b2e3dd1d-3ddf-44c8-b13b-111e31cc40a7",
      "content-length": "120",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [34]:
# Start the import job that loads the car items CSV from S3 into the ITEMS
# dataset; the job ARN is kept for the wait cell further down.
items_data_location = 's3://{}/{}/{}'.format(bucket, prefix, cars_filename)
items_job_name = '{}-car-items-import'.format(dataset_group_name)

create_dataset_import_job_response = personalize.create_dataset_import_job(
    jobName=items_job_name,
    datasetArn=items_dataset_arn,
    dataSource={'dataLocation': items_data_location},
    roleArn=role_arn,
)

items_dataset_import_job_arn = create_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_dataset_import_job_response, indent=2))

{
  "datasetImportJobArn": "arn:aws:personalize:us-east-1:355151823911:dataset-import-job/car-dg12-car-items-import",
  "ResponseMetadata": {
    "RequestId": "c49b17e1-9657-4912-bee3-e1b6f2329415",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Sun, 14 Jul 2019 20:34:42 GMT",
      "x-amzn-requestid": "c49b17e1-9657-4912-bee3-e1b6f2329415",
      "content-length": "113",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [35]:
# Start the import job that loads the users CSV from S3 into the USERS
# dataset; the job ARN is kept for the wait cell further down.
users_data_location = 's3://{}/{}/{}'.format(bucket, prefix, users_filename)
users_job_name = '{}-car-users-import'.format(dataset_group_name)

create_dataset_import_job_response = personalize.create_dataset_import_job(
    jobName=users_job_name,
    datasetArn=users_dataset_arn,
    dataSource={'dataLocation': users_data_location},
    roleArn=role_arn,
)

users_dataset_import_job_arn = create_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_dataset_import_job_response, indent=2))

{
  "datasetImportJobArn": "arn:aws:personalize:us-east-1:355151823911:dataset-import-job/car-dg12-car-users-import",
  "ResponseMetadata": {
    "RequestId": "1c20bd3e-5c9b-4752-95ac-d091d6e948a0",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Sun, 14 Jul 2019 20:34:43 GMT",
      "x-amzn-requestid": "1c20bd3e-5c9b-4752-95ac-d091d6e948a0",
      "content-length": "113",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


### Wait for the dataset import jobs to complete

In [39]:
# Poll the ITEMS import job once a minute until it reaches a terminal status
# or the deadline derived from MAX_WAIT_TIME (configuration cell) passes.
print('Waiting for ITEMS data import to complete...')
max_time = time.time() + MAX_WAIT_TIME
status = None
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(
        datasetImportJobArn = items_dataset_import_job_arn
    )
    status = describe_dataset_import_job_response['datasetImportJob']['status']
    print('DatasetImportJob: {}'.format(status))

    if status == 'ACTIVE' or status == 'CREATE FAILED':
        break

    time.sleep(60)

# The success message has to come after the loop: inside it, the `break`
# above always fires first when the status is ACTIVE.
if status == 'ACTIVE':
    print('ITEMS dataset is ACTIVE.')
else:
    raise RuntimeError('ITEMS data import did not become ACTIVE (last status: {})'.format(status))

Waiting for ITEMS data import to complete...
DatasetImportJob: ACTIVE


In [37]:
# Poll the USERS import job once a minute until it reaches a terminal status
# or the deadline derived from MAX_WAIT_TIME (configuration cell) passes.
print('Waiting for USERS data import to complete...')
max_time = time.time() + MAX_WAIT_TIME
status = None
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(
        datasetImportJobArn = users_dataset_import_job_arn
    )
    status = describe_dataset_import_job_response['datasetImportJob']['status']
    print('DatasetImportJob: {}'.format(status))

    if status == 'ACTIVE' or status == 'CREATE FAILED':
        break

    time.sleep(60)

# The success message has to come after the loop: inside it, the `break`
# above always fires first when the status is ACTIVE.
if status == 'ACTIVE':
    print('USERS dataset is ACTIVE.')
else:
    raise RuntimeError('USERS data import did not become ACTIVE (last status: {})'.format(status))

DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: ACTIVE


In [38]:
# Poll the INTERACTIONS import job once a minute until it reaches a terminal
# status or the deadline derived from MAX_WAIT_TIME (configuration cell) passes.
print('Waiting for INTERACTIONS data import to complete...')
max_time = time.time() + MAX_WAIT_TIME
status = None
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(
        datasetImportJobArn = interactions_dataset_import_job_arn
    )
    status = describe_dataset_import_job_response['datasetImportJob']['status']
    print('DatasetImportJob: {}'.format(status))

    if status == 'ACTIVE' or status == 'CREATE FAILED':
        break

    time.sleep(60)

# The success message has to come after the loop: inside it, the `break`
# above always fires first when the status is ACTIVE.
if status == 'ACTIVE':
    print('INTERACTIONS dataset is ACTIVE.')
else:
    raise RuntimeError('INTERACTIONS data import did not become ACTIVE (last status: {})'.format(status))

DatasetImportJob: ACTIVE
