In [1]:
# This Notebook is an exercise by following Walker Rower tutorial at https://www.bmc.com/blogs/amazon-sagemaker/
%sc
!wget 'https://s3.amazonaws.com/crimexyz/crime.csv'

--2018-06-26 16:01:41--  https://s3.amazonaws.com/crimexyz/crime.csv
Resolving s3.amazonaws.com (s3.amazonaws.com)... 54.231.88.10
Connecting to s3.amazonaws.com (s3.amazonaws.com)|54.231.88.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1395 (1.4K) [text/csv]
Saving to: ‘crime.csv.10’


2018-06-26 16:01:41 (106 MB/s) - ‘crime.csv.10’ saved [1395/1395]



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the downloaded CSV; the first line of the file supplies the column names.
crime = pd.read_csv('crime.csv', header=0)
# Plain-text preview of the first five rows.
print(crime.head(n=5))

        State  crimeCluster  Murder  Assault  UrbanPop  Rape
0     Alabama             4    13.2      236        58  21.2
1      Alaska             4    10.0      263        48  44.5
2     Arizona             4     8.1      294        80  31.0
3    Arkansas             3     8.8      190        50  19.5
4  California             4     9.0      276        91  40.6


In [3]:
def stateToNumber(s):
    """Encode a string as the sum of the Unicode code points of its characters.

    Args:
        s: Iterable of single characters (typically a state name).

    Returns:
        int: Sum of ``ord(ch)`` over every character; 0 for an empty string.

    Note:
        The encoding ignores character order, so strings that are anagrams of
        each other map to the same number.
    """
    # ord() already yields the integer code point; no hex round-trip is needed.
    return sum(ord(ch) for ch in s)


In [4]:
# Keep the original state names in a one-column lookup frame before the column is overwritten.
# NOTE(review): this cell is not re-runnable as-is -- once State holds numbers,
# stateToNumber cannot iterate over them.
xref = crime['State'].to_frame()

# Replace every state name with its numeric encoding.
crime['State'] = crime['State'].apply(stateToNumber)

crime.head()


Unnamed: 0,State,crimeCluster,Murder,Assault,UrbanPop,Rape
0,671,4,13.2,236,58,21.2
1,589,4,10.0,263,48,44.5
2,724,4,8.1,294,80,31.0
3,820,3,8.8,190,50,19.5
4,1016,4,9.0,276,91,40.6


In [5]:
# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0; .values returns
# the same NumPy array and is available on both old and new pandas versions.
# Every column is numeric at this point, so the whole frame is cast to float32.
crimeArray = crime.values.astype(np.float32)

  if __name__ == '__main__':


In [6]:
# Sanity check: dimensions of the float32 array as (rows, columns).
crimeArray.shape

(50, 6)

In [7]:
from sagemaker import KMeans
from sagemaker import get_execution_role

# Execution role of this notebook; it is handed to the KMeans estimator below.
# NOTE(review): printing the role puts the account ID into the saved cell output.
role = get_execution_role()
print(role)

## Note (2018-06-26): on S3 I created the bucket "outputxyz"
## and two folders in it: "my_kmeans_example_data" and "my_kmeans_example_output".

bucket = "outputxyz"

## Note (2018-06-26): I ran into permission issues at this point while following the
## original post (https://www.bmc.com/blogs/amazon-sagemaker/), so I did the following:
### 1) Created, in IAM, a new S3 policy called "testxyz" granting "List, Read, Write, Permissions management".
### 2) Created a new role called "xyzrole" and attached the "testxyz" policy to it.
### 3) Set the "xyzrole" role as the "IAM role ARN" for this notebook.

# S3 prefixes for the training input and the training artifacts, both inside `bucket`.
data_location = 's3://{}/my_kmeans_example_data'.format(bucket)
output_location = 's3://{}/my_kmeans_example_output'.format(bucket)

print('The training data will be uploaded to: {}'.format(data_location))
print('The training artifacts will be uploaded to: {}'.format(output_location))

# K-means estimator: one training instance, k=10 clusters.
kmeans = KMeans(role=role,
               train_instance_count=1,
               train_instance_type='ml.c4.8xlarge',
               output_path=output_location,
               k=10,
               data_location=data_location)

arn:aws:iam::007604646786:role/xyzrole
The training data will be uploaded to: s3://outputxyz/my_kmeans_example_data
The training artifacts will be uploaded to: s3://outputxyz/my_kmeans_example_output


In [8]:
# Select columns 1-4 of crimeArray as the training features.
# NOTE(review): going by the crime.head() column order these are crimeCluster, Murder,
# Assault and UrbanPop -- the pre-assigned cluster label is included and Rape (column 5)
# is left out; confirm this is intended. The name also shadows the built-in `slice`.
slice=crimeArray[:,1:5]

In [9]:
%%time
# Wrap the feature array with record_set() and pass it to fit(), which starts the
# SageMaker training job (see the "Creating training-job" log line in the output).
kmeans.fit(kmeans.record_set(slice))

INFO:sagemaker:Creating training-job with name: kmeans-2018-06-26-16-01-44-678


....................
[31mDocker entrypoint called with argument(s): train[0m
[31m[06/26/2018 16:04:50 INFO 140138676823872] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/default-input.json: {u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'local_lloyd_num_trials': u'auto', u'_log_level': u'info', u'_kvstore': u'auto', u'local_lloyd_init_method': u'kmeans++', u'force_dense': u'true', u'epochs': u'1', u'init_method': u'random', u'local_lloyd_tol': u'0.0001', u'local_lloyd_max_iter': u'300', u'_disable_wait_to_read': u'false', u'extra_center_factor': u'auto', u'eval_metrics': u'["msd"]', u'_num_kv_servers': u'1', u'mini_batch_size': u'5000', u'half_life_time_size': u'0', u'_num_slices': u'1'}[0m
[31m[06/26/2018 16:04:50 INFO 140138676823872] Reading provided configuration from /opt/ml/input/config/hyperparameters.json: {u'feature_dim': u'4', u'k': u'10', u'force_dense': u'True'}[0m
[31m[06/26/2018 16:04:50 INFO 140138676823872] Final

===== Job Complete =====
Billable seconds: 99
CPU times: user 352 ms, sys: 12 ms, total: 364 ms
Wall time: 3min 43s


In [10]:
%%time
# Deploy the trained model to one ml.m4.xlarge instance and keep the returned predictor.
# NOTE(review): no endpoint cleanup is visible in this part of the notebook -- confirm the
# endpoint is deleted once the experiment is finished.
kmeans_predictor = kmeans.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge')

INFO:sagemaker:Creating model with name: kmeans-2018-06-26-16-05-26-456
INFO:sagemaker:Creating endpoint with name kmeans-2018-06-26-16-01-44-678


---------------------------------------------------------------------------!CPU times: user 292 ms, sys: 16 ms, total: 308 ms
Wall time: 6min 19s


In [11]:
# Rebuild the same four-column feature slice (columns 1-4 of crimeArray).
slice=crimeArray[:,1:5]
# This value is not displayed: only the last expression of a cell is echoed.
slice.shape
# Echo the full feature array.
slice

array([[  4. ,  13.2, 236. ,  58. ],
       [  4. ,  10. , 263. ,  48. ],
       [  4. ,   8.1, 294. ,  80. ],
       [  3. ,   8.8, 190. ,  50. ],
       [  4. ,   9. , 276. ,  91. ],
       [  3. ,   7.9, 204. ,  78. ],
       [  2. ,   3.3, 110. ,  77. ],
       [  4. ,   5.9, 238. ,  72. ],
       [  4. ,  15.4, 335. ,  80. ],
       [  3. ,  17.4, 211. ,  60. ],
       [  1. ,   5.3,  46. ,  83. ],
       [  2. ,   2.6, 120. ,  54. ],
       [  4. ,  10.4, 249. ,  83. ],
       [  2. ,   7.2, 113. ,  65. ],
       [  1. ,   2.2,  56. ,  57. ],
       [  2. ,   6. , 115. ,  66. ],
       [  2. ,   9.7, 109. ,  52. ],
       [  4. ,  15.4, 249. ,  66. ],
       [  1. ,   2.1,  83. ,  51. ],
       [  4. ,  11.3, 300. ,  67. ],
       [  3. ,   4.4, 149. ,  85. ],
       [  4. ,  12.1, 255. ,  74. ],
       [  1. ,   2.7,  72. ,  66. ],
       [  4. ,  16.1, 259. ,  44. ],
       [  3. ,   9. , 178. ,  70. ],
       [  2. ,   6. , 109. ,  53. ],
       [  2. ,   4.3, 102. ,  62. ],
 

In [12]:
# One-row batch for a single prediction: [1:2] keeps only row 1 of the feature array,
# i.e. the second row (Alaska in the crime.head() listing), not row 0.
s=slice[1:2]

In [13]:
%%time
result = kmeans_predictor.predict(s)
clusters = [r.label['closest_cluster'].float32_tensor.values[0] for r in result]
i = 0

for r in result:
    out = {
        "State" : crime['State'].iloc[i],
        "StateCode" : xref['State'].iloc[i],
        "closest_cluster" : r.label['closest_cluster'].float32_tensor.values[0],
        "crimeCluster" : crime['crimeCluster'].iloc[i],
        "Murder" : crime['Murder'].iloc[i],
        "Assault" : crime['Assault'].iloc[i],
        "UrbanPop" : crime['UrbanPop'].iloc[i],
        "Rape" : crime['Rape'].iloc[i]
    }
    print(out)
    i = i + 1
    

{'State': 671, 'StateCode': 'Alabama', 'closest_cluster': 2.0, 'crimeCluster': 4, 'Murder': 13.2, 'Assault': 236, 'UrbanPop': 58, 'Rape': 21.2}
CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 244 ms


In [14]:
%%time
result = kmeans_predictor.predict(slice)
clusters = [r.label['closest_cluster'].float32_tensor.values[0] for r in result]
i = 0

for r in result:
    out = {
        "State" : crime['State'].iloc[i],
        "StateCode" : xref['State'].iloc[i],
        "closest_cluster" : r.label['closest_cluster'].float32_tensor.values[0],
        "crimeCluster" : crime['crimeCluster'].iloc[i],
        "Murder" : crime['Murder'].iloc[i],
        "Assault" : crime['Assault'].iloc[i],
        "UrbanPop" : crime['UrbanPop'].iloc[i],
        "Rape" : crime['Rape'].iloc[i]
    }
    print(out)
    i = i + 1
    

{'State': 671, 'StateCode': 'Alabama', 'closest_cluster': 8.0, 'crimeCluster': 4, 'Murder': 13.2, 'Assault': 236, 'UrbanPop': 58, 'Rape': 21.2}
{'State': 589, 'StateCode': 'Alaska', 'closest_cluster': 2.0, 'crimeCluster': 4, 'Murder': 10.0, 'Assault': 263, 'UrbanPop': 48, 'Rape': 44.5}
{'State': 724, 'StateCode': 'Arizona', 'closest_cluster': 3.0, 'crimeCluster': 4, 'Murder': 8.1, 'Assault': 294, 'UrbanPop': 80, 'Rape': 31.0}
{'State': 820, 'StateCode': 'Arkansas', 'closest_cluster': 0.0, 'crimeCluster': 3, 'Murder': 8.8, 'Assault': 190, 'UrbanPop': 50, 'Rape': 19.5}
{'State': 1016, 'StateCode': 'California', 'closest_cluster': 3.0, 'crimeCluster': 4, 'Murder': 9.0, 'Assault': 276, 'UrbanPop': 91, 'Rape': 40.6}
{'State': 819, 'StateCode': 'Colorado', 'closest_cluster': 0.0, 'crimeCluster': 3, 'Murder': 7.9, 'Assault': 204, 'UrbanPop': 78, 'Rape': 38.7}
{'State': 1151, 'StateCode': 'Connecticut', 'closest_cluster': 4.0, 'crimeCluster': 2, 'Murder': 3.3, 'Assault': 110, 'UrbanPop': 77, '