In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from datetime import datetime
import boto3
from sagemaker import get_execution_role
import sagemaker.amazon.common as smac

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [4]:
role = get_execution_role()
bucket = 'ufo-sight-data'
prefix = 'input'
key = 'ufo_fullset.csv'
data_location = f's3://{bucket}/{prefix}/{key}'
data_location

's3://ufo-sight-data/input/ufo_fullset.csv'

In [5]:
df = pd.read_csv(data_location, low_memory=False)

In [6]:
df.head()

Unnamed: 0,reportedTimestamp,eventDate,eventTime,shape,duration,witnesses,weather,firstName,lastName,latitude,longitude,sighting,physicalEvidence,contact,researchOutcome
0,1977-04-04T04:02:23.340Z,1977-03-31,23:46,circle,4,1,rain,Ila,Bashirian,47.329444,-122.578889,Y,N,N,explained
1,1982-11-22T02:06:32.019Z,1982-11-15,22:04,disk,4,1,partly cloudy,Eriberto,Runolfsson,52.664913,-1.034894,Y,Y,N,explained
2,1992-12-07T19:06:52.482Z,1992-12-07,19:01,circle,49,1,clear,Miller,Watsica,38.951667,-92.333889,Y,N,N,explained
3,2011-02-24T21:06:34.898Z,2011-02-21,20:56,disk,13,1,partly cloudy,Clifton,Bechtelar,41.496944,-71.367778,Y,N,N,explained
4,1991-03-09T16:18:45.501Z,1991-03-09,11:42,circle,17,1,mostly cloudy,Jayda,Ebert,47.606389,-122.330833,Y,N,N,explained


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18000 entries, 0 to 17999
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   reportedTimestamp  18000 non-null  object 
 1   eventDate          18000 non-null  object 
 2   eventTime          18000 non-null  object 
 3   shape              17998 non-null  object 
 4   duration           18000 non-null  int64  
 5   witnesses          18000 non-null  int64  
 6   weather            18000 non-null  object 
 7   firstName          18000 non-null  object 
 8   lastName           18000 non-null  object 
 9   latitude           18000 non-null  float64
 10  longitude          18000 non-null  float64
 11  sighting           18000 non-null  object 
 12  physicalEvidence   18000 non-null  object 
 13  contact            18000 non-null  object 
 14  researchOutcome    18000 non-null  object 
dtypes: float64(2), int64(2), object(11)
memory usage: 2.1+ MB


In [8]:
df.shape

(18000, 15)

In [10]:
df.duplicated().any()

False

In [11]:
df.isna().any()

reportedTimestamp    False
eventDate            False
eventTime            False
shape                 True
duration             False
witnesses            False
weather              False
firstName            False
lastName             False
latitude             False
longitude            False
sighting             False
physicalEvidence     False
contact              False
researchOutcome      False
dtype: bool

In [12]:
df.isna().sum()

reportedTimestamp    0
eventDate            0
eventTime            0
shape                2
duration             0
witnesses            0
weather              0
firstName            0
lastName             0
latitude             0
longitude            0
sighting             0
physicalEvidence     0
contact              0
researchOutcome      0
dtype: int64

In [13]:
df_geo = df[['latitude', 'longitude']]

In [14]:
df_geo.head()

Unnamed: 0,latitude,longitude
0,47.329444,-122.578889
1,52.664913,-1.034894
2,38.951667,-92.333889
3,41.496944,-71.367778
4,47.606389,-122.330833


In [15]:
df_geo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18000 entries, 0 to 17999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   latitude   18000 non-null  float64
 1   longitude  18000 non-null  float64
dtypes: float64(2)
memory usage: 281.4 KB


In [16]:
missing_values = df_geo.isnull().values.any()
print('Are there any missing values? {}'.format(missing_values))
if(missing_values):
    df_geo[df_geo.isnull().any(axis=1)]

Are there any missing values? False


In [18]:
data = df_geo.values.astype('float32')

In [19]:
data

array([[  47.329445, -122.57889 ],
       [  52.664913,   -1.034894],
       [  38.951668,  -92.333885],
       ...,
       [  36.86639 ,  -83.888885],
       [  35.385834,  -94.39833 ],
       [  29.883055,  -97.94111 ]], dtype=float32)

In [21]:
from sagemaker import KMeans
clusters = 10
output_location = f's3://{bucket}/model-artifacts'
kmeans = KMeans(role=role, instance_count=2, instance_type='ml.c4.xlarge', output_path=output_location, k=clusters)

In [22]:
job_name = 'kmeans-geo-job-{}'.format(datetime.now().strftime("%Y%m%d%H%M%S"))
print('Here is the job name {}'.format(job_name))

Here is the job name kmeans-geo-job-20231224150919


In [23]:
%%time
kmeans.fit(kmeans.record_set(data), job_name=job_name)

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: kmeans-geo-job-20231224150919


2023-12-24 15:10:22 Starting - Starting the training job...
2023-12-24 15:10:38 Starting - Preparing the instances for training.........
2023-12-24 15:11:56 Downloading - Downloading input data...
2023-12-24 15:12:41 Downloading - Downloading the training image......
2023-12-24 15:13:47 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[12/24/2023 15:13:55 INFO 140437049112384] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-input.json: {'init_method': 'random', 'mini_batch_size': '5000', 'epochs': '1', 'extra_center_factor': 'auto', 'local_lloyd_max_iter': '300', 'local_lloyd_tol': '0.0001', 'local_lloyd_init_method': 'kmeans++', 'local_lloyd_num_trials': 'auto', 'half_life_time_size': '0', 'eval_metrics': '["msd"]', 'force_dense': 'true', '_disable_wait_to_read': 'false', '_enable_profiler': '

In [24]:
import os

model_key = f'model-artifacts/{job_name}/output/model.tar.gz'
boto3.resource('s3').Bucket(bucket).download_file(model_key, 'model.tar.gz')
os.system('tar -zxvf model.tar.gz')
os.system('unzip model_algo-1')

model_algo-1
Archive:  model_algo-1


tar: Ignoring unknown extended header keyword `LIBARCHIVE.creationtime'
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of model_algo-1 or
        model_algo-1.zip, and cannot find model_algo-1.ZIP, period.


2304

In [25]:
!pip install mxnet

Collecting mxnet
  Downloading mxnet-1.9.1-py3-none-manylinux2014_x86_64.whl (49.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.1/49.1 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting graphviz<0.9.0,>=0.8.1 (from mxnet)
  Downloading graphviz-0.8.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: graphviz, mxnet
Successfully installed graphviz-0.8.4 mxnet-1.9.1


In [26]:
import mxnet as mx
Kmeans_model_params = mx.ndarray.load('model_algo-1')

In [27]:
cluster_centroids_kmeans = pd.DataFrame(Kmeans_model_params[0].asnumpy())
cluster_centroids_kmeans.columns=df_geo.columns
cluster_centroids_kmeans

Unnamed: 0,latitude,longitude
0,39.53104,-118.351028
1,36.538239,-84.715729
2,16.144054,69.983337
3,52.039307,-0.332039
4,-37.596992,162.684753
5,41.049885,-74.562927
6,35.985737,-98.411423
7,64.884079,-154.374054
8,-18.852001,-54.809128
9,21.979956,-158.488373


In [28]:
from io import StringIO

csv_buffer = StringIO()
cluster_centroids_kmeans.to_csv(csv_buffer, index=False)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, 'results/ten_locations_kmeans.csv').put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': '55AH5H4D2H28ZZ30',
  'HostId': 'GTamHecVE9Y05i9zDT5Cs4mQvKPmp37Jr0+9MEklV/Af/mBSxsFkelEfShDZRsSUdbbNs2yn4G5FUQv8DnXsNckRcRvvSFNuELjWZ6L4wgA=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'GTamHecVE9Y05i9zDT5Cs4mQvKPmp37Jr0+9MEklV/Af/mBSxsFkelEfShDZRsSUdbbNs2yn4G5FUQv8DnXsNckRcRvvSFNuELjWZ6L4wgA=',
   'x-amz-request-id': '55AH5H4D2H28ZZ30',
   'date': 'Sun, 24 Dec 2023 15:20:07 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"531900088e62eabf5fe97d9d3c6258a8"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"531900088e62eabf5fe97d9d3c6258a8"',
 'ServerSideEncryption': 'AES256'}