### Get the Personalize boto3 Client

In [17]:
import boto3

import json
import numpy as np
import pandas as pd
import time

# Client for the Personalize service API (schemas, datasets, solutions, campaigns below).
personalize = boto3.client(service_name='personalize')
# Client for the Personalize runtime API, used at the end to request recommendations.
personalize_runtime = boto3.client(service_name='personalize-runtime')

### Specify a Bucket and Data Output Location

In [54]:
# S3 bucket that receives the training data -- replace with the name of your S3 bucket.
bucket = "liam-movielens-20m"
# Name the dataset is saved under (local CSV file and S3 object key) -- replace as desired.
filename = "movie-lens-20M.csv"

### Download, Prepare, and Upload Training Data

#### Download and Explore the Dataset

In [56]:
!wget -N http://files.grouplens.org/datasets/movielens/ml-20m.zip
!unzip -o ml-20m.zip
data = pd.read_csv('./ml-20m/ratings.csv', sep=',', dtype={'userid': "int64", 'movieid': "int64", 'rating': "float64", 'timestamp': "int64"})
pd.set_option('display.max_rows', 25)
data.rename(columns = {'userId':'USER_ID','movieId':'ITEM_ID','rating':'RATING','timestamp':'TIMESTAMP'}, inplace = True)
data

Unnamed: 0,USER_ID,ITEM_ID,RATING,TIMESTAMP
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580
5,1,112,3.5,1094785740
6,1,151,4.0,1094785734
7,1,223,4.0,1112485573
8,1,253,4.0,1112484940
9,1,260,4.0,1112484826


#### Prepare and Upload Data

In [57]:
# Keep only interactions whose rating is strictly greater than 3.6, and only the
# columns that match the schema defined below.
data = data.loc[data['RATING'] > 3.6, ['USER_ID', 'ITEM_ID', 'TIMESTAMP']]
data.to_csv(filename, index=False)

# Upload the CSV to the bucket, using the file name as the object key.
(
    boto3.Session()
    .resource('s3')
    .Bucket(bucket)
    .Object(filename)
    .upload_file(filename)
)

### Create Schema

In [61]:
# Schema describing the interactions CSV: one entry per column, in file order.
schema = {
    "type": "record",
    "name": "Interactions",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": [
        {"name": field_name, "type": field_type}
        for field_name, field_type in (
            ("USER_ID", "string"),
            ("ITEM_ID", "string"),
            ("TIMESTAMP", "long"),
        )
    ],
    "version": "1.0",
}

# Register the schema (sent as a JSON string) and remember its ARN for the dataset cell.
create_schema_response = personalize.create_schema(name="DEMO-schema", schema=json.dumps(schema))
schema_arn = create_schema_response['schemaArn']

print(json.dumps(create_schema_response, indent=2))

{
  "schemaArn": "arn:aws:personalize:us-east-1:930444659029:schema/DEMO-schema",
  "ResponseMetadata": {
    "RequestId": "d8f7e9f2-a152-4362-b884-2de8c65d7725",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Tue, 03 Sep 2019 20:55:01 GMT",
      "x-amzn-requestid": "d8f7e9f2-a152-4362-b884-2de8c65d7725",
      "content-length": "77",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


### Create and Wait for Dataset Group

#### Create Dataset Group

In [62]:
# Create the dataset group and keep its ARN; later cells attach the dataset and
# the solution to it.
create_dataset_group_response = personalize.create_dataset_group(name="DEMO-dataset-group")
dataset_group_arn = create_dataset_group_response['datasetGroupArn']

print(json.dumps(create_dataset_group_response, indent=2))

{
  "datasetGroupArn": "arn:aws:personalize:us-east-1:930444659029:dataset-group/DEMO-dataset-group",
  "ResponseMetadata": {
    "RequestId": "bd067632-8ac8-47ad-a2a1-25318e16c630",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Tue, 03 Sep 2019 20:55:04 GMT",
      "x-amzn-requestid": "bd067632-8ac8-47ad-a2a1-25318e16c630",
      "content-length": "97",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


#### Wait for Dataset Group to Have ACTIVE Status

In [63]:
# Poll the dataset group once a minute, for at most 3 hours, until it is ACTIVE.
# A failed creation or an expired deadline raises, so that "Run All" stops here
# instead of continuing with an unusable dataset group.
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_dataset_group_response = personalize.describe_dataset_group(
        datasetGroupArn = dataset_group_arn
    )
    status = describe_dataset_group_response["datasetGroup"]["status"]
    print("DatasetGroup: {}".format(status))

    if status == "ACTIVE":
        break
    if status == "CREATE FAILED":
        raise RuntimeError(
            "DatasetGroup creation failed: {}".format(
                describe_dataset_group_response["datasetGroup"].get("failureReason", "no reason reported")
            )
        )

    time.sleep(60)
else:
    # Reached only when the loop condition expired without hitting `break`.
    raise TimeoutError("DatasetGroup did not become ACTIVE within 3 hours")

DatasetGroup: CREATE PENDING
DatasetGroup: ACTIVE


### Create Dataset

In [64]:
# Create the interactions dataset inside the dataset group, bound to the schema above.
dataset_type = "INTERACTIONS"
create_dataset_response = personalize.create_dataset(
    schemaArn=schema_arn,
    datasetGroupArn=dataset_group_arn,
    datasetType=dataset_type,
    name="DEMO-dataset",
)
dataset_arn = create_dataset_response['datasetArn']

print(json.dumps(create_dataset_response, indent=2))

{
  "datasetArn": "arn:aws:personalize:us-east-1:930444659029:dataset/DEMO-dataset-group/INTERACTIONS",
  "ResponseMetadata": {
    "RequestId": "b42c6b96-ec80-43c4-a81f-97cbed815b31",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Tue, 03 Sep 2019 20:58:06 GMT",
      "x-amzn-requestid": "b42c6b96-ec80-43c4-a81f-97cbed815b31",
      "content-length": "99",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


### Prepare, Create, and Wait for Dataset Import Job

#### Attach Policy to S3 Bucket

In [66]:
s3 = boto3.client("s3")

# Bucket policy allowing the Personalize service principal to list the bucket and
# read its objects. The resource list covers the bucket itself and every key in it.
policy = {
    "Version": "2012-10-17",
    "Id": "PersonalizeS3BucketAccessPolicy",
    "Statement": [
        {
            "Sid": "PersonalizeS3BucketAccessPolicy",
            "Effect": "Allow",
            "Principal": {"Service": "personalize.amazonaws.com"},
            "Action": ["s3:GetObject", "s3:ListBucket"],
            "Resource": [
                arn_pattern.format(bucket)
                for arn_pattern in ("arn:aws:s3:::{}", "arn:aws:s3:::{}/*")
            ],
        }
    ],
}

# The policy is sent as a JSON string; the response is shown as the cell output.
s3.put_bucket_policy(Policy=json.dumps(policy), Bucket=bucket)

{'ResponseMetadata': {'RequestId': 'C98C4A947D1564B5',
  'HostId': '7hWWTaXNg/xKIqFgWh8KegccQaurKgwm38aJYZEb+FpJ6mKzlo1/SLlJt6s8eRuBzm4loAuWqqw=',
  'HTTPStatusCode': 204,
  'HTTPHeaders': {'x-amz-id-2': '7hWWTaXNg/xKIqFgWh8KegccQaurKgwm38aJYZEb+FpJ6mKzlo1/SLlJt6s8eRuBzm4loAuWqqw=',
   'x-amz-request-id': 'C98C4A947D1564B5',
   'date': 'Tue, 03 Sep 2019 21:00:29 GMT',
   'server': 'AmazonS3'},
  'RetryAttempts': 0}}

#### Create Personalize Role

In [72]:
iam = boto3.client("iam")

role_name = "PersonalizeRole"

# Trust policy: allows the Personalize service to assume the role.
assume_role_policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {"Service": "personalize.amazonaws.com"},
            "Action": "sts:AssumeRole",
        }
    ],
}

create_role_response = iam.create_role(
    AssumeRolePolicyDocument=json.dumps(assume_role_policy_document),
    RoleName=role_name,
)
role_arn = create_role_response["Role"]["Arn"]

# AmazonPersonalizeFullAccess grants access to any S3 bucket whose name includes
# "personalize" or "Personalize". For a bucket with a different name, consider
# creating and attaching a new policy that provides read access to your bucket,
# or attaching the AmazonS3ReadOnlyAccess policy to the role.
policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonPersonalizeFullAccess"
iam.attach_role_policy(PolicyArn=policy_arn, RoleName=role_name)

# Wait for a minute to allow the IAM role policy attachment to propagate.
time.sleep(60)

print(role_arn)

arn:aws:iam::930444659029:role/PersonalizeRole


#### Create Dataset Import Job

In [73]:
# Start importing the uploaded CSV into the dataset; Personalize reads the object
# from S3 using the role created above.
create_dataset_import_job_response = personalize.create_dataset_import_job(
    roleArn=role_arn,
    dataSource={"dataLocation": "s3://{}/{}".format(bucket, filename)},
    datasetArn=dataset_arn,
    jobName="DEMO-dataset-import-job",
)
dataset_import_job_arn = create_dataset_import_job_response['datasetImportJobArn']

print(json.dumps(create_dataset_import_job_response, indent=2))

{
  "datasetImportJobArn": "arn:aws:personalize:us-east-1:930444659029:dataset-import-job/DEMO-dataset-import-job",
  "ResponseMetadata": {
    "RequestId": "a0014168-e017-4bbe-a4d8-48d2937b63c4",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Tue, 03 Sep 2019 21:04:54 GMT",
      "x-amzn-requestid": "a0014168-e017-4bbe-a4d8-48d2937b63c4",
      "content-length": "111",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


#### Wait for Dataset Import Job to Have ACTIVE Status

In [74]:
# Poll the import job once a minute, for at most 3 hours, until it is ACTIVE.
# A failed job or an expired deadline raises, so that "Run All" stops here
# instead of training on a dataset that was never imported.
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(
        datasetImportJobArn = dataset_import_job_arn
    )
    status = describe_dataset_import_job_response["datasetImportJob"]['status']
    print("DatasetImportJob: {}".format(status))

    if status == "ACTIVE":
        break
    if status == "CREATE FAILED":
        raise RuntimeError(
            "DatasetImportJob creation failed: {}".format(
                describe_dataset_import_job_response["datasetImportJob"].get("failureReason", "no reason reported")
            )
        )

    time.sleep(60)
else:
    # Reached only when the loop condition expired without hitting `break`.
    raise TimeoutError("DatasetImportJob did not become ACTIVE within 3 hours")

DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: ACTIVE


### Select Recipe

In [75]:
# aws-hrnn selected for demo purposes
recipe_arn = "arn:aws:personalize:::recipe/aws-hrnn"

# List the available recipes and show them as the cell output.
list_recipes_response = personalize.list_recipes()
list_recipes_response

{'recipes': [{'name': 'aws-hrnn',
   'recipeArn': 'arn:aws:personalize:::recipe/aws-hrnn',
   'status': 'ACTIVE',
   'creationDateTime': datetime.datetime(2019, 6, 10, 0, 0, tzinfo=tzlocal()),
   'lastUpdatedDateTime': datetime.datetime(2019, 6, 20, 0, 39, 17, 65000, tzinfo=tzlocal())},
  {'name': 'aws-hrnn-coldstart',
   'recipeArn': 'arn:aws:personalize:::recipe/aws-hrnn-coldstart',
   'status': 'ACTIVE',
   'creationDateTime': datetime.datetime(2019, 6, 10, 0, 0, tzinfo=tzlocal()),
   'lastUpdatedDateTime': datetime.datetime(2019, 6, 20, 0, 39, 17, 64000, tzinfo=tzlocal())},
  {'name': 'aws-hrnn-metadata',
   'recipeArn': 'arn:aws:personalize:::recipe/aws-hrnn-metadata',
   'status': 'ACTIVE',
   'creationDateTime': datetime.datetime(2019, 6, 10, 0, 0, tzinfo=tzlocal()),
   'lastUpdatedDateTime': datetime.datetime(2019, 6, 20, 0, 39, 17, 64000, tzinfo=tzlocal())},
  {'name': 'aws-personalized-ranking',
   'recipeArn': 'arn:aws:personalize:::recipe/aws-personalized-ranking',
   'stat

### Create and Wait for Solution

#### Create Solution

In [76]:
# Create a solution that applies the selected recipe to the dataset group.
create_solution_response = personalize.create_solution(
    recipeArn=recipe_arn,
    datasetGroupArn=dataset_group_arn,
    name="DEMO-solution",
)
solution_arn = create_solution_response['solutionArn']

print(json.dumps(create_solution_response, indent=2))

{
  "solutionArn": "arn:aws:personalize:us-east-1:930444659029:solution/DEMO-solution",
  "ResponseMetadata": {
    "RequestId": "22459a74-bc5e-4f1d-8235-84ab8ed760aa",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Tue, 03 Sep 2019 21:25:13 GMT",
      "x-amzn-requestid": "22459a74-bc5e-4f1d-8235-84ab8ed760aa",
      "content-length": "83",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


#### Create Solution Version

In [77]:
# Request a new version of the solution and keep its ARN for the polling,
# metrics and campaign cells.
create_solution_version_response = personalize.create_solution_version(solutionArn=solution_arn)
solution_version_arn = create_solution_version_response['solutionVersionArn']

print(json.dumps(create_solution_version_response, indent=2))

{
  "solutionVersionArn": "arn:aws:personalize:us-east-1:930444659029:solution/DEMO-solution/84ccb370",
  "ResponseMetadata": {
    "RequestId": "875f1d79-f4f8-44fb-9ba6-e257b583e74a",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Tue, 03 Sep 2019 21:25:18 GMT",
      "x-amzn-requestid": "875f1d79-f4f8-44fb-9ba6-e257b583e74a",
      "content-length": "99",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


#### Wait for Solution Version to Have ACTIVE Status

In [None]:
# Poll the solution version once a minute, for at most 3 hours, until it is ACTIVE.
# A failed creation or an expired deadline raises, so that "Run All" stops here
# instead of requesting metrics or a campaign for an unusable solution version.
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_solution_version_response = personalize.describe_solution_version(
        solutionVersionArn = solution_version_arn
    )
    status = describe_solution_version_response["solutionVersion"]["status"]
    print("SolutionVersion: {}".format(status))

    if status == "ACTIVE":
        break
    if status == "CREATE FAILED":
        raise RuntimeError(
            "SolutionVersion creation failed: {}".format(
                describe_solution_version_response["solutionVersion"].get("failureReason", "no reason reported")
            )
        )

    time.sleep(60)
else:
    # Reached only when the loop condition expired without hitting `break`.
    raise TimeoutError("SolutionVersion did not become ACTIVE within 3 hours")

SolutionVersion: CREATE PENDING
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS


#### Get Metrics of Solution

In [79]:
# Fetch the metrics reported for the solution version and pretty-print the response.
get_solution_metrics_response = personalize.get_solution_metrics(solutionVersionArn=solution_version_arn)
print(json.dumps(get_solution_metrics_response, indent=2))

{
  "solutionVersionArn": "arn:aws:personalize:us-east-1:930444659029:solution/DEMO-solution/84ccb370",
  "metrics": {
    "coverage": 0.2234,
    "mean_reciprocal_rank_at_25": 0.0915,
    "normalized_discounted_cumulative_gain_at_10": 0.1258,
    "normalized_discounted_cumulative_gain_at_25": 0.1573,
    "normalized_discounted_cumulative_gain_at_5": 0.1035,
    "precision_at_10": 0.0196,
    "precision_at_25": 0.0129,
    "precision_at_5": 0.0261
  },
  "ResponseMetadata": {
    "RequestId": "af37c7ed-b499-4958-a12f-9749fe8dfefe",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Tue, 03 Sep 2019 23:04:38 GMT",
      "x-amzn-requestid": "af37c7ed-b499-4958-a12f-9749fe8dfefe",
      "content-length": "397",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


### Create and Wait for Campaign

#### Create Campaign

In [80]:
# Deploy the solution version as a campaign with a minimum provisioned TPS of 1.
create_campaign_response = personalize.create_campaign(
    minProvisionedTPS=1,
    solutionVersionArn=solution_version_arn,
    name="DEMO-campaign",
)
campaign_arn = create_campaign_response['campaignArn']

print(json.dumps(create_campaign_response, indent=2))

{
  "campaignArn": "arn:aws:personalize:us-east-1:930444659029:campaign/DEMO-campaign",
  "ResponseMetadata": {
    "RequestId": "7cc17c7c-4018-4e28-9a6f-8fe1ca2307e6",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Tue, 03 Sep 2019 23:04:44 GMT",
      "x-amzn-requestid": "7cc17c7c-4018-4e28-9a6f-8fe1ca2307e6",
      "content-length": "83",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


#### Wait for Campaign to Have ACTIVE Status

In [81]:
# Poll the campaign once a minute, for at most 3 hours, until it is ACTIVE.
# A failed creation or an expired deadline raises, so that "Run All" stops here
# instead of requesting recommendations from an unusable campaign.
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_campaign_response = personalize.describe_campaign(
        campaignArn = campaign_arn
    )
    status = describe_campaign_response["campaign"]["status"]
    print("Campaign: {}".format(status))

    if status == "ACTIVE":
        break
    if status == "CREATE FAILED":
        raise RuntimeError(
            "Campaign creation failed: {}".format(
                describe_campaign_response["campaign"].get("failureReason", "no reason reported")
            )
        )

    time.sleep(60)
else:
    # Reached only when the loop condition expired without hitting `break`.
    raise TimeoutError("Campaign did not become ACTIVE within 3 hours")

Campaign: CREATE PENDING
Campaign: CREATE IN_PROGRESS
Campaign: CREATE IN_PROGRESS
Campaign: CREATE IN_PROGRESS
Campaign: CREATE IN_PROGRESS
Campaign: CREATE IN_PROGRESS
Campaign: CREATE IN_PROGRESS
Campaign: CREATE IN_PROGRESS
Campaign: CREATE IN_PROGRESS
Campaign: CREATE IN_PROGRESS
Campaign: CREATE IN_PROGRESS
Campaign: CREATE IN_PROGRESS
Campaign: ACTIVE


### Get Recommendations

#### Select a User and an Item

In [84]:
# Load the first two columns of the movie catalogue and give them schema-style names.
items = pd.read_csv('./ml-20m/movies.csv', sep=',', usecols=[0,1], header=0)
items.columns = ['ITEM_ID', 'TITLE']

# Pick one random interaction; the third value of the row (the timestamp) is not needed.
sampled_interaction = data.sample().values[0]
user_id, item_id, _ = sampled_interaction

# Look up the title of the sampled item (TITLE is the last column of `items`).
matching_items = items.loc[items['ITEM_ID'] == item_id]
item_title = matching_items.values[0][-1]

print("USER: {}".format(user_id))
print("ITEM: {}".format(item_title))

items

USER: 38957
ITEM: Sisterhood of the Traveling Pants, The (2005)


Unnamed: 0,ITEM_ID,TITLE
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)
5,6,Heat (1995)
6,7,Sabrina (1995)
7,8,Tom and Huck (1995)
8,9,Sudden Death (1995)
9,10,GoldenEye (1995)


#### Call GetRecommendations

In [85]:
# Ask the campaign for recommendations for the sampled user and item.
# Both ids are sent as strings.
get_recommendations_response = personalize_runtime.get_recommendations(
    campaignArn = campaign_arn,
    userId = str(user_id),
    itemId = str(item_id)
)

item_list = get_recommendations_response['itemList']
# Each returned itemId is converted to an integer to match items['ITEM_ID'].
# The builtin int() is used because the np.int alias was deprecated in NumPy 1.20
# and removed in NumPy 1.24, where it raises AttributeError.
title_list = [items.loc[items['ITEM_ID'] == int(item['itemId'])].values[0][-1] for item in item_list]

print("Recommendations: {}".format(json.dumps(title_list, indent=2)))

Recommendations: [
  "Proposal, The (2009)",
  "Jane Austen Book Club, The (2007)",
  "Atonement (2007)",
  "Sex and the City (2008)",
  "P.S. I Love You (2007)",
  "Holiday, The (2006)",
  "He's Just Not That Into You (2009)",
  "27 Dresses (2008)",
  "Definitely, Maybe (2008)",
  "In the Land of Women (2007)",
  "Pride and Prejudice (1995)",
  "Becoming Jane (2007)",
  "Sisterhood of the Traveling Pants 2, The (2008)",
  "Duchess, The (2008)",
  "Just Like Heaven (2005)",
  "Catch and Release (2006)",
  "Julie & Julia (2009)",
  "Other Boleyn Girl, The (2008)",
  "Made of Honor (2008)",
  "Sisterhood of the Traveling Pants, The (2005)",
  "August Rush (2007)",
  "No Reservations (2007)",
  "Waitress (2007)",
  "Enchanted (2007)",
  "Penelope (2006)"
]


In [86]:
# Show the campaign ARN for later reference.
print("Campaign ARN is: ", campaign_arn, sep="")

Campaign ARN is: arn:aws:personalize:us-east-1:930444659029:campaign/DEMO-campaign


In [87]:
# Show the dataset group ARN for later reference.
print("Dataset Group ARN is: ", dataset_group_arn, sep="")

Dataset Group ARN is: arn:aws:personalize:us-east-1:930444659029:dataset-group/DEMO-dataset-group


In [88]:
# Show the solution version ARN for later reference.
print("Solution Version ARN is: ", solution_version_arn, sep="")

Solution Version ARN is: arn:aws:personalize:us-east-1:930444659029:solution/DEMO-solution/84ccb370


In [89]:
# Show the solution ARN for later reference.
print("Solution ARN is: ", solution_arn, sep="")

Solution ARN is: arn:aws:personalize:us-east-1:930444659029:solution/DEMO-solution


In [90]:
# Show the interactions dataset ARN for later reference.
print("Dataset Interactions ARN is: ", dataset_arn, sep="")

Dataset Interactions ARN is: arn:aws:personalize:us-east-1:930444659029:dataset/DEMO-dataset-group/INTERACTIONS
