In [1]:
# This Notebook is an exercise by following Walker Rower tutorial at https://www.bmc.com/blogs/amazon-sagemaker/
%sc
!wget 'https://s3.amazonaws.com/crimexyz/crime.csv'

--2018-06-26 16:01:41--  https://s3.amazonaws.com/crimexyz/crime.csv
Resolving s3.amazonaws.com (s3.amazonaws.com)... 54.231.88.10
Connecting to s3.amazonaws.com (s3.amazonaws.com)|54.231.88.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1395 (1.4K) [text/csv]
Saving to: ‘crime.csv.10’


2018-06-26 16:01:41 (106 MB/s) - ‘crime.csv.10’ saved [1395/1395]



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the downloaded CSV into a DataFrame; header=0 takes the column names from the first row.
crime = pd.read_csv('crime.csv', header=0)
# Preview the first rows to check that the columns were parsed as expected.
print(crime.head())

        State  crimeCluster  Murder  Assault  UrbanPop  Rape
0     Alabama             4    13.2      236        58  21.2
1      Alaska             4    10.0      263        48  44.5
2     Arizona             4     8.1      294        80  31.0
3    Arkansas             3     8.8      190        50  19.5
4  California             4     9.0      276        91  40.6


In [3]:
def stateToNumber(s):
    """Encode a string as the sum of the Unicode code points of its characters.

    Used to turn state names into a numeric value so the whole table can be
    converted to a float matrix.

    Args:
        s: The string to encode (any iterable of single characters works).

    Returns:
        int: Sum of ``ord(ch)`` over every character; 0 for an empty string.

    Note:
        The encoding is not unique: strings made of the same characters in a
        different order produce the same number.
    """
    # ord() already yields the integer code point, so no hex round trip is needed.
    return sum(ord(ch) for ch in s)


In [4]:
# Keep the original state names in their own frame so they can be looked up by row position later.
xref = pd.DataFrame(crime['State'])

# Overwrite the State column with its numeric encoding.
# NOTE: this mutates `crime` in place, so re-running this cell without reloading the CSV
# will fail (stateToNumber iterates over its argument and the column then holds ints).
crime['State'] = crime['State'].apply(stateToNumber)

crime.head()


Unnamed: 0,State,crimeCluster,Murder,Assault,UrbanPop,Rape
0,671,4,13.2,236,58,21.2
1,589,4,10.0,263,48,44.5
2,724,4,8.1,294,80,31.0
3,820,3,8.8,190,50,19.5
4,1016,4,9.0,276,91,40.6


In [5]:
# Convert the (now fully numeric) frame to a float32 NumPy array.
# DataFrame.as_matrix() is deprecated and absent from newer pandas releases;
# .values is the replacement named by the deprecation warning and returns the same array.
crimeArray = crime.values.astype(np.float32)

  if __name__ == '__main__':


In [6]:
# Sanity check: show the (rows, columns) dimensions of the float32 array.
crimeArray.shape

(50, 6)

In [7]:
from sagemaker import KMeans
from sagemaker import get_execution_role

# Execution role of this notebook; it is passed to the KMeans estimator below.
# NOTE(review): printing the ARN stores the AWS account id in the saved notebook output.
role = get_execution_role()
print(role)

## Note (2018-06-26): on S3 I created the bucket "outputxyz"
## and two folders in it, "my_kmeans_example_data" and "my_kmeans_example_output".

bucket = "outputxyz"

## Note (2018-06-26): I ran into permission issues at this point while following the original post
## (https://www.bmc.com/blogs/amazon-sagemaker/), so I did the following:
### 1) Created, in IAM, a new policy called "testxyz" for S3, granting "List, Read, Write, Permissions management".
### 2) Created a new role called "xyzrole" and attached the "testxyz" policy to it.
### 3) Set the "xyzrole" role as the "IAM role ARN" for this notebook.

# S3 prefixes for the training input and for the training artifacts.
data_location = 's3://{}/my_kmeans_example_data'.format(bucket)
output_location = 's3://{}/my_kmeans_example_output'.format(bucket)

print('The training data will be uploaded to: {}'.format(data_location))
print('The training artifacts will be uploaded to: {}'.format(output_location))

# k is the number of clusters the estimator is asked to find.
kmeans = KMeans(role=role,
                train_instance_count=1,
                train_instance_type='ml.c4.8xlarge',
                output_path=output_location,
                k=10,
                data_location=data_location)

arn:aws:iam::007604646786:role/xyzrole
The training data will be uploaded to: s3://outputxyz/my_kmeans_example_data
The training artifacts will be uploaded to: s3://outputxyz/my_kmeans_example_output


In [8]:
# Training matrix: every row, columns 1 through 4 of crimeArray.
# NOTE(review): going by the head() output earlier in the notebook, columns 1-4 are
# crimeCluster, Murder, Assault and UrbanPop - i.e. the existing cluster label is included
# and Rape (column 5) is left out. Confirm this is the intended feature set.
# NOTE(review): the name `slice` shadows Python's built-in `slice`; later cells rely on this name.
slice=crimeArray[:,1:5]

In [None]:
%%time
# Wrap the NumPy matrix with record_set() and launch the training job (the log below shows
# "Creating training-job"). Presumably record_set also uploads the data to data_location - TODO confirm.
kmeans.fit(kmeans.record_set(slice))

INFO:sagemaker:Creating training-job with name: kmeans-2018-06-26-16-01-44-678


...............

In [None]:
%%time
# Deploy the trained model on a single ml.m4.xlarge instance and keep the returned predictor
# for the inference cells that follow.
# NOTE(review): presumably this creates a hosted endpoint that keeps running until deleted - confirm and clean up.
kmeans_predictor = kmeans.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge')

In [None]:
# Rebuild the feature matrix (same selection as used for training) and inspect it.
slice=crimeArray[:,1:5]
# A bare expression is only displayed when it is the last line of a cell,
# so the shape has to be printed explicitly to be visible.
print(slice.shape)
slice

In [None]:
# Single-row sample for a test prediction: row index 1 only (the second row of the table),
# kept two-dimensional with shape (1, 4) because a range is used instead of a scalar index.
s=slice[1:2]

In [None]:
%%time
result = kmeans_predictor.predict(s)
clusters = [r.label['closest_cluster'].float32_tensor.values[0] for r in result]
i = 0

for r in result:
    out = {
        "State" : crime['State'].iloc[i],
        "StateCode" : xref['State'].iloc[i],
        "closest_cluster" : r.label['closest_cluster'].float32_tensor.values[0],
        "crimeCluster" : crime['crimeCluster'].iloc[i],
        "Murder" : crime['Murder'].iloc[i],
        "Assault" : crime['Assault'].iloc[i],
        "UrbanPop" : crime['UrbanPop'].iloc[i],
        "Rape" : crime['Rape'].iloc[i]
    }
    print(out)
    i = i + 1
    

In [None]:
%%time
result = kmeans_predictor.predict(slice)
clusters = [r.label['closest_cluster'].float32_tensor.values[0] for r in result]
i = 0

for r in result:
    out = {
        "State" : crime['State'].iloc[i],
        "StateCode" : xref['State'].iloc[i],
        "closest_cluster" : r.label['closest_cluster'].float32_tensor.values[0],
        "crimeCluster" : crime['crimeCluster'].iloc[i],
        "Murder" : crime['Murder'].iloc[i],
        "Assault" : crime['Assault'].iloc[i],
        "UrbanPop" : crime['UrbanPop'].iloc[i],
        "Rape" : crime['Rape'].iloc[i]
    }
    print(out)
    i = i + 1
    