In [None]:
# 00 Additional Ports Boundaries

In [2]:
import boto3
from datetime import datetime
import pandas as pd
import pyspark.sql.functions as F

from ais import functions as af
from adb_ais import functions as adb_af

import h3
import folium
from folium.plugins import HeatMap
import geopandas as gpd


# Pandas display configuration used for every DataFrame rendered in this notebook.
pd.options.display.max_columns = None  # never truncate columns
pd.options.display.max_rows = 100  # render at most 100 rows
pd.set_option('display.float_format', '{:.10f}'.format)  # floats shown with 10 decimal places

# Make every top-level expression statement in a cell render its value (IPython's default
# renders only the last one). Cells below that list several bare expressions, such as
# `polygon_hex_df.shape` followed by `polygon_hex_df.nunique()`, rely on this setting.
from IPython.core.interactiveshell import InteractiveShell #allow multiple outputs in one jupyter cell
InteractiveShell.ast_node_interactivity = "all"

# Helper functions

In [8]:


def save_ais_to_s3_daily(polygon_hex_df, start_date, end_date, columns=None):
    """Read AIS messages for a set of H3 cells and append them to S3 as Parquet.

    The data is fetched with ``af.get_ais`` through the notebook-level ``spark``
    session. The ``polygon_name`` column is renamed to ``Port`` and string
    ``year`` / ``month`` columns are derived from ``dt_pos_utc``; the result is
    written partitioned by ``Port`` / ``year`` / ``month``.

    Parameters
    ----------
    polygon_hex_df
        H3 cells to query, passed unchanged to ``af.get_ais``.
    start_date, end_date
        Bounds of the extraction window, passed unchanged to ``af.get_ais``.
    columns : list of str, optional
        AIS columns to keep. When omitted, the notebook-level ``keep_cols``
        list is used, which was the only behaviour before this parameter existed.

    Notes
    -----
    The write uses ``mode("append")``: running this again for a window that is
    already stored adds the same rows a second time. Make sure a window is
    exported only once.
    """
    s3path = "s3a://ungp-ais-data-historical-backup/user_temp/adb/new_ports_canals/"

    if columns is None:
        # Backward-compatible default: fall back to the notebook global.
        columns = keep_cols

    print(f"Reading AIS data from {start_date} to {end_date}")
    start_time = datetime.now().replace(microsecond=0)

    df = af.get_ais(spark, start_date, end_date, columns=columns, polygon_hex_df=polygon_hex_df)
    # No .cache() here: the DataFrame is consumed by a single action (the write
    # below), so caching would only occupy executor storage that is never reused
    # or released.
    df = (
        df.withColumnRenamed("polygon_name", "Port")
        .withColumn("year", F.date_format("dt_pos_utc", "yyyy"))
        .withColumn("month", F.date_format("dt_pos_utc", "MM"))
    )

    print("Saving in S3...")
    df.write.mode("append").partitionBy('Port', 'year', 'month').parquet(s3path)
    print("Save complete")

    end_time = datetime.now().replace(microsecond=0)
    print('Total time taken = ', end_time - start_time)
    
    
def create_data_for_heatmap(df, resolution=9):
    """Aggregate AIS messages per H3 cell into ``[lat, lng, count]`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the integer H3 column ``H3_int_index_<resolution>`` and
        ``mmsi``. No other column is required.
    resolution : int, default 9
        H3 resolution whose index column is used for grouping.

    Returns
    -------
    numpy.ndarray
        One row per H3 cell: the cell-centre latitude, the cell-centre
        longitude, and the number of non-null ``mmsi`` values in that cell.
    """
    hex_col = f'H3_int_index_{resolution}'

    # Only the grouping key and the counted column are needed; selecting more
    # would make the function fail on frames that lack unrelated columns.
    counts = (
        df[[hex_col, 'mmsi']]
        .groupby(hex_col, as_index=False)
        .count()
        .rename(columns={'mmsi': 'count_ais_messages'})
    )

    # Integer index -> string index -> (lat, lng) of the cell centre.
    # NOTE: h3_to_string / h3_to_geo are the h3-py v3 function names.
    centers = counts[hex_col].map(lambda idx: h3.h3_to_geo(h3.h3_to_string(idx)))

    # Explicit float conversion, to retain the decimal precision of lat/long.
    counts['hex_center_lat'] = centers.map(lambda center: float(center[0]))
    counts['hex_center_lng'] = centers.map(lambda center: float(center[1]))

    return counts[['hex_center_lat', 'hex_center_lng', 'count_ais_messages']].to_numpy()

def list_files_folders_s3(prefix, opt=3, bucket='ungp-ais-data-historical-backup'):
    '''Print the files and/or sub-folders directly under an S3 prefix (non-recursive).

    Parameters
    ----------
    prefix : str
        Key prefix to list; "/" is used as the delimiter, so only the first
        level below the prefix is reported.
    opt : int, default 3
        1 = files only, 2 = sub-folders only, 3 = both (files first).
        Any other value prints nothing.
    bucket : str
        Bucket to list. Defaults to the bucket this notebook works in.

    Returns
    -------
    None. The listing is printed.
    '''
    if opt not in (1, 2, 3):
        return

    client = boto3.client('s3')
    # A single list call returns at most 1000 entries; the paginator walks all pages.
    paginator = client.get_paginator('list_objects_v2')

    file_keys = []
    sub_folders = []
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter='/'):
        # 'Contents' / 'CommonPrefixes' are absent from a page with no matches,
        # so default to an empty list instead of iterating over None.
        file_keys.extend(item['Key'] for item in page.get('Contents', []))
        sub_folders.extend(item.get('Prefix') for item in page.get('CommonPrefixes', []))

    if opt == 3:
        print("Files: ")
    if opt in (1, 3):
        for key in file_keys:
            print(key)

    if opt == 3:
        print("\nSub-folders: ")
    if opt in (2, 3):
        for folder in sub_folders:
            print('sub folder : ', folder)
        
# total size, in bytes, of everything stored under an S3 prefix (callers convert to GB)
def get_size(bucket, path):
    """Return the summed ``size`` of every object in ``bucket`` whose key starts with ``path``.

    The result is the raw sum of the objects' ``size`` attributes (bytes for
    S3 objects); it is 0 when no object matches the prefix.
    """
    matching_objects = boto3.resource('s3').Bucket(bucket).objects.filter(Prefix=path)
    return sum(s3_object.size for s3_object in matching_objects)

# Extract LA/LB (Los Angeles / Long Beach) AIS using the 22 square buffer

In [5]:
# Load the pre-computed resolution-9 H3 cells stored for Port=LA and preview the first rows.
# NOTE(review): read_pickle unpickles arbitrary Python objects - only load files from a
# location you trust.
path_h3_ind = "s3a://ungp-ais-data-historical-backup/user_temp/adb/new_ports_canals/Port=LA/H3_indices/"
polygon_hex_df = pd.read_pickle(path_h3_ind+"buffer_res9.pkl")
polygon_hex_df.head()

Unnamed: 0,hex_id,polygon_name,hex_resolution
0,617725798794919935,LA,9
1,617725796647436287,LA,9
2,617725794458271743,LA,9
3,617725798669877247,LA,9
4,617725770752589823,LA,9


In [None]:
# Extraction window for the LA buffer (passed as-is to save_ais_to_s3_daily).
start_date = datetime.fromisoformat("2023-04-01")
end_date = datetime.fromisoformat("2023-05-31")

# no duplication: compare the row count with the per-column unique counts
polygon_hex_df.shape
polygon_hex_df.nunique()

# AIS attributes to keep, followed by the H3 indices for res 6 and upwards (6..15)
keep_cols = [
    'message_type', 'mmsi', 'dt_insert_utc', 'longitude', 'latitude',
    'imo', 'vessel_name', 'callsign',
    'vessel_type', 'vessel_type_code', 'vessel_type_cargo', 'vessel_class',
    'length', 'width', 'flag_country', 'flag_code',
    'destination', 'eta', 'draught',
    'sog', 'cog', 'rot', 'heading',
    'nav_status', 'nav_status_code', 'source',
    'dt_pos_utc', 'dt_static_utc',
    'vessel_type_main', 'vessel_type_sub', 'eeid',
] + [f'H3_int_index_{res}' for res in range(6, 16)]

save_ais_to_s3_daily(polygon_hex_df, start_date, end_date)

# extract Shanghai AIS using radial boundary

In [3]:
# Load the table of additional ports with their buffer polygons and display it in full.
# NOTE(review): read_pickle unpickles arbitrary Python objects - only load files from a
# location you trust.
gdf = pd.read_pickle('s3a://ungp-ais-data-historical-backup/user_temp/adb/new_ports_canals/additional_ports_buffers.pkl')
gdf

Unnamed: 0,COUNTRY,PORT_NAME,LATITUDE,LONGITUDE,HARBORTYPE,HARBORSIZE,Radius,epsg_dist,geometry,point_t,Port_boundary_t,Port_boundary
0,PA,BALBOA,8.95,-79.566667,LC,M,100.0,epsg:3857,POINT (-79.56667 8.95000),POINT (-8857320.855 1000386.096),"POLYGON ((-8757320.855 1000386.096, -8757802.3...","POLYGON ((-78.66835 8.95000, -78.67268 8.86301..."
1,SG,JURONG ISLAND,1.283333,103.733333,CN,L,150.0,epsg:3857,POINT (103.73333 1.28333),POINT (11547541.808 142871.923),"POLYGON ((11697541.808 142871.923, 11696819.51...","POLYGON ((105.08081 1.28333, 105.07432 1.15129..."
2,EG,BUR SAID (PORT SAID),31.266667,32.3,LC,L,250.0,epsg:3857,POINT (32.30000 31.26667),POINT (3595619.553 3667429.570),"POLYGON ((3845619.553 3667429.570, 3844415.734...","POLYGON ((34.54579 31.26667, 34.53497 31.07832..."
3,CN,SHANGHAI,31.216667,121.5,RN,L,300.0,epsg:3857,POINT (121.50000 31.21667),POINT (13525318.131 3660919.561),"POLYGON ((13825318.131 3660919.561, 13823873.5...","POLYGON ((124.19495 31.21667, 124.18197 30.990..."
4,NL,ROTTERDAM,51.9,4.483333,RB,L,100.0,epsg:3857,POINT (4.48333 51.90000),POINT (499082.347 6782064.329),"POLYGON ((599082.347 6782064.329, 598600.819 6...","POLYGON ((5.38165 51.90000, 5.37732 51.84564, ..."
5,US,LOS ANGELES,33.75,-118.25,CB,L,22.0,epsg:3857,POINT (-118.25000 33.75000),POINT (-13163529.786 3995282.330),"POLYGON ((-13141529.786 3995282.330, -13141635...","POLYGON ((-118.05237 33.75000, -118.05332 33.7..."
6,US,LONG BEACH,33.766667,-118.183333,CB,M,22.0,epsg:3857,POINT (-118.18333 33.76667),POINT (-13156108.450 3997513.971),"POLYGON ((-13134108.450 3997513.971, -13134214...","POLYGON ((-117.98570 33.76667, -117.98666 33.7..."


In [None]:
# Extraction window for Shanghai (passed as-is to save_ais_to_s3_daily).
start_date = datetime.fromisoformat("2023-03-01")
end_date = datetime.fromisoformat("2023-05-31")

# Select the port by name rather than by row position, so a change in the row
# order of gdf cannot silently export a different port.
shanghai_rows = gdf.loc[gdf['PORT_NAME'] == 'SHANGHAI']
assert len(shanghai_rows) == 1, f"expected exactly one SHANGHAI row in gdf, found {len(shanghai_rows)}"
shanghai_boundary = shanghai_rows.iloc[0]['Port_boundary']

# One [name, GeoJSON-style polygon] entry built from the boundary's exterior ring.
query_polys = [
    ["shanghai",
     {"type": "Polygon",
      "coordinates": [list(shanghai_boundary.exterior.coords)]}],
]

# res 9 giving kernel issues hence using res 6
polygon_hex_df = af.polygon_to_hex_df(polygons=query_polys, hex_resolution=6, overfill=True)
polygon_hex_df.head()

# no duplication: compare the row count with the per-column unique counts
polygon_hex_df.shape
polygon_hex_df.nunique()

# AIS attributes to keep, followed by the H3 indices for res 6 and upwards (6..15)
keep_cols = [
    'message_type', 'mmsi', 'dt_insert_utc', 'longitude', 'latitude',
    'imo', 'vessel_name', 'callsign',
    'vessel_type', 'vessel_type_code', 'vessel_type_cargo', 'vessel_class',
    'length', 'width', 'flag_country', 'flag_code',
    'destination', 'eta', 'draught',
    'sog', 'cog', 'rot', 'heading',
    'nav_status', 'nav_status_code', 'source',
    'dt_pos_utc', 'dt_static_utc',
    'vessel_type_main', 'vessel_type_sub', 'eeid',
] + [f'H3_int_index_{res}' for res in range(6, 16)]

save_ais_to_s3_daily(polygon_hex_df, start_date, end_date)

# extract Rotterdam AIS using radial boundary

In [None]:
# Extraction window for Rotterdam (passed as-is to save_ais_to_s3_daily).
start_date = datetime.fromisoformat("2023-01-01")
end_date = datetime.fromisoformat("2023-05-31")

# Select the port by name rather than by row position, so a change in the row
# order of gdf cannot silently export a different port.
rotterdam_rows = gdf.loc[gdf['PORT_NAME'] == 'ROTTERDAM']
assert len(rotterdam_rows) == 1, f"expected exactly one ROTTERDAM row in gdf, found {len(rotterdam_rows)}"
rotterdam_boundary = rotterdam_rows.iloc[0]['Port_boundary']

# One [name, GeoJSON-style polygon] entry built from the boundary's exterior ring.
query_polys = [
    ["rotterdam",
     {"type": "Polygon",
      "coordinates": [list(rotterdam_boundary.exterior.coords)]}],
]

# res 9 giving kernel issues hence using res 6
polygon_hex_df = af.polygon_to_hex_df(polygons=query_polys, hex_resolution=6, overfill=True)
polygon_hex_df.head()

# no duplication: compare the row count with the per-column unique counts
polygon_hex_df.shape
polygon_hex_df.nunique()

# AIS attributes to keep, followed by the H3 indices for res 6 and upwards (6..15)
keep_cols = [
    'message_type', 'mmsi', 'dt_insert_utc', 'longitude', 'latitude',
    'imo', 'vessel_name', 'callsign',
    'vessel_type', 'vessel_type_code', 'vessel_type_cargo', 'vessel_class',
    'length', 'width', 'flag_country', 'flag_code',
    'destination', 'eta', 'draught',
    'sog', 'cog', 'rot', 'heading',
    'nav_status', 'nav_status_code', 'source',
    'dt_pos_utc', 'dt_static_utc',
    'vessel_type_main', 'vessel_type_sub', 'eeid',
] + [f'H3_int_index_{res}' for res in range(6, 16)]

save_ais_to_s3_daily(polygon_hex_df, start_date, end_date)

### check path and size

In [7]:
# List the month partitions written for Rotterdam in 2023 (opt=2 prints sub-folders only).
prefix = "user_temp/adb/new_ports_canals/Port=rotterdam/year=2023/"
list_files_folders_s3(prefix, opt=2)

sub folder :  user_temp/adb/new_ports_canals/Port=rotterdam/year=2023/month=01/
sub folder :  user_temp/adb/new_ports_canals/Port=rotterdam/year=2023/month=02/
sub folder :  user_temp/adb/new_ports_canals/Port=rotterdam/year=2023/month=03/
sub folder :  user_temp/adb/new_ports_canals/Port=rotterdam/year=2023/month=04/
sub folder :  user_temp/adb/new_ports_canals/Port=rotterdam/year=2023/month=05/


In [9]:
# Validate the export: stored size of the Rotterdam data per year, in GB (10^9 bytes).
# The bare expression inside the loop is what gets displayed for each year; this relies on
# InteractiveShell.ast_node_interactivity being set to "all" at the top of the notebook.
for year in range(2019, 2024):
    get_size("ungp-ais-data-historical-backup", f"user_temp/adb/new_ports_canals/Port=rotterdam/year={year}") / 1_000_000_000

12.773276094

14.553574967

13.355768191

9.493736435

4.474982575

In [10]:
# Stop the Spark session once all extractions and checks are done.
# NOTE(review): `spark` is not created anywhere in this notebook - presumably it is provided
# by the kernel; confirm before running on a plain Python kernel.
spark.stop()