# Data Kapal

In [None]:
import pandas as pd
from pyspark.sql import functions as F

#For 3.3.1
#Register Sedona Functions to Spark
from sedona.register import SedonaRegistrator
SedonaRegistrator.registerAll(spark)


#For 3.3.2
from shapely.geometry import Point, Polygon, mapping
import h3.api.numpy_int as h3int 

In [None]:
basepath = "s3a://ungp-ais-data-historical-backup/exact-earth-data/transformed/prod/"

In [None]:
#Read Data January 1 2022
df = spark.read.parquet(basepath+ "year=2022/month=01")

In [None]:
# HHH coba mengurutkan berdasarkan mmsi
df.createOrReplaceTempView("temp_df")
dfH =  spark.sql("""
SELECT
    mmsi,
    MIN(dt_pos_utc) AS dt_pos_utc,
    H3_int_index_8,
    ROUND(latitude, 2) AS latitude,
    ROUND(longitude, 2) AS longitude
FROM
    temp_df
GROUP BY
    mmsi, H3_int_index_8, ROUND(latitude, 2), ROUND(longitude, 2)
ORDER BY
    mmsi
                """)
dfH.show(50)

In [None]:
# HHH
# melihat setiap mmsi muncul berapa kali
# Menambahkan DataFrame hasil penghitungan ke dalam sebuah DataFrame baru
result_df = spark.sql("""
    SELECT mmsi, COUNT(*) AS jumlah_muncul
    FROM temp_df
    GROUP BY mmsi
    ORDER BY mmsi
""")

# Menampilkan hasil
result_df.show(30)

In [None]:
import folium
from folium.plugins import MarkerCluster

m = folium.Map(location=[2, 120], zoom_start=4.2, tiles="CartoDB positron")
# latitude = -6.2088
# longitude = 106.8456
# folium.Marker([latitude, longitude], popup='Ini adalah titik').add_to(m)
dfH = dfH.toPandas()

# for _, r in dfH.iterrows():
#     lat = r["latitude"]
#     lon = r["longitude"]
#     folium.Marker(
#         location=[lat, lon],
#         popup="length: {} <br> area: {}".format(r["buffer_area"], r["buffer_area"]),
#     ).add_to(m)
m
# Simpan peta sebagai file HTML
# my_map.save('map_with_markers.html')


In [None]:
# Initialize

In [None]:
from ais import functions as af
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

In [None]:
import h3.api.numpy_int as h3int
from shapely.geometry import mapping, Polygon, Point

from multiprocessing import Pool
import tqdm

In [None]:
import geopandas as gpd
import pandas as pd
import numpy as np

import folium

In [None]:
pd.set_option('display.max_columns', None) #Show all columns in pandas df
pd.set_option('display.max_rows', 100) #Show 100 rows in pandas df
pd.options.display.float_format = '{:.10f}'.format #Show float with 10 decimal points in pandas df

from IPython.core.interactiveshell import InteractiveShell #allow multiple outputs in one jupyter cell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
#bucket = "ungp-ais-data-historical-backup"
#path = f"s3a://{bucket}/user_temp/adb/"
path = "s3a://ungp-ais-data-historical-backup/exact-earth-data/transformed/prod/"

# Buffer Polygons

In [None]:
def get_wpi():
    # wpi = gpd.read_file("https://msi.nga.mil/api/publications/download?key=16694622/SFH00000/WPI_Shapefile.zip")
    #wpi = pd.read_csv("https://raw.githubusercontent.com/muhammadhanief/cobapostgre/main/hasilgdf.csv")\
            .rename(columns={'latitude':'Latitude','longitude':'Longitude'})
# print(df4)
     wpi = pd.read_csv("https://msi.nga.mil/api/publications/download?type=view&key=16920959/SFH00000/UpdatedPub150.csv") \
             [['World Port Index Number','Main Port Name','UN/LOCODE','Country Code','Harbor Size','Harbor Type','Latitude','Longitude']] \
             .rename(columns={'Country Code':'Country','Main Port Name':'Port'})
    
    geometry=gpd.points_from_xy(wpi['Longitude'],wpi['Latitude'])
    
    wpi = gpd.GeoDataFrame(wpi, geometry=geometry, crs="epsg:4326")
    return wpi

In [None]:
def poly_to_h3(dfseries, h3_res=8):
    return dfseries.apply(lambda x: h3int.polyfill(mapping(x), h3_res, geo_json_conformant=True))

def h3_to_poly(df_series, crs='epsg:4326'):
    return gpd.GeoSeries(df_series.apply(lambda x: Polygon(h3int.h3_set_to_multi_polygon(x, geo_json=True)[0][0])), crs=crs)

def parallelize_dataframe(df, func,n_split=100, n_cores=4):
    df_split = np.array_split(df, n_split)
    pool = Pool(n_cores) 
    mapped_values = list(tqdm.tqdm(pool.imap_unordered(func, df_split), total=n_split))
    pool.close()
    pool.join()
    return pd.concat(mapped_values).sort_index()

In [None]:
def get_utm():
    url = 'https://opendata.arcgis.com/datasets/b294795270aa4fb3bd25286bf09edc51_0.zip'
    utm = gpd.read_file(url)
    utm['UTM'] = utm['ZONE'].astype(str) + utm['ROW_']
    south = ['A','B','C','D','E','F','G','H','J','K','L','M']
    north = ['N','P','Q','R','S','T','U','V','W','X','Y','Z']
    utm['NS'] = np.where(utm['ROW_'].isin(north),"N","S")
    utm['prefix'] = np.where(utm['ROW_'].isin(north),"326","327")
    utm['EPSG'] = 'epsg:' + utm['prefix'] + utm['ZONE'].astype(str).str.zfill(2)
    return utm

## Call Func

In [None]:
wpi = get_wpi()
utm = get_utm()

In [None]:
wpi.info()
utm.info()

In [None]:
wpi.head(20)

## Special Cases: 

In [None]:
wpi.info()

## Save

In [None]:
wpi.to_pickle(path+"ki/WPI.pkl")
utm.to_pickle(path+"ki/UTM.pkl")

## Generate buffers

In [None]:
# generate buffer HHH coba
import geopandas as gpd
from shapely.geometry import Point

# Contoh GeoDataFrame dengan kolom latitude dan longitude
gdf = gpd.GeoDataFrame(geometry=[Point(xy) for xy in zip(wpi['Longitude'], wpi['Latitude'])])

# Buat buffer dengan radius 0.1 derajat (harap disesuaikan sesuai kebutuhan)
buffer_radius = 0.1
gdf['buffer'] = gdf['geometry'].buffer(buffer_radius)

gdf_with_port = gdf.merge(pd.DataFrame(wpi), left_index=True, right_index=True)

# Tampilkan GeoDataFrame
print(gdf_with_port)
gdf_with_port.info()

In [None]:
# # generate buffer dengan radius kotak
# import geopandas as gpd
# import pandas as pd
# from shapely.geometry import Point, box

# # Contoh GeoDataFrame dengan kolom latitude dan longitude
# data = {'Latitude': [40.7128, 34.0522, 41.8781],
#         'Longitude': [-74.0060, -118.2437, -87.6298],
#         'Port': ['Port_A', 'Port_B', 'Port_C']}
# wpi = pd.DataFrame(data)

# # Buat GeoDataFrame dari DataFrame
# gdf = gpd.GeoDataFrame(geometry=[Point(xy) for xy in zip(wpi['Longitude'], wpi['Latitude'])])

# # Hitung faktor konversi dari kilometer ke derajat
# conversion_factor = 1 / 111.32  # Sekitar 111.32 km per derajat

# # Buat buffer persegi dengan sisi sepanjang 22 km
# buffer_radius_km = 22
# buffer_radius_deg = buffer_radius_km * conversion_factor
# gdf['buffer'] = gdf.apply(lambda row: box(row.geometry.x - buffer_radius_deg, row.geometry.y - buffer_radius_deg,
#                                           row.geometry.x + buffer_radius_deg, row.geometry.y + buffer_radius_deg), axis=1)

# # Gabungkan dengan DataFrame yang berisi kolom 'Port'
# gdf_with_port = gdf.merge(wpi, left_index=True, right_index=True)

# # Tampilkan GeoDataFrame
# print(gdf_with_port)
# gdf_with_port.info()


In [None]:
# ports_df.info()

In [None]:
# ports_df[ports_df['Country']=="Indonesia"]
#ports_df[ports_df['Country']=="Singapore"]['grouped_port'].iloc[0]

In [None]:
# ports_df = ports_df[ports_df['Country']=="Indonesia"]

In [None]:
# HHH started
# pip install geopandas

In [None]:
# pip install geodatasets

In [40]:
import geopandas as gpd
import geodatasets
import folium
import matplotlib.pyplot as plt

generated new fontManager


In [20]:
m = folium.Map(location=[2, 120], zoom_start=4.2, tiles="CartoDB positron")
# latitude = -6.2088
# longitude = 106.8456
# folium.Marker([latitude, longitude], popup='Ini adalah titik').add_to(m)
m

In [21]:
for _, r in gdf_with_port.iterrows():
    # Without simplifying the representation of each borough,
    # the map might not be displayed
    # sim_geo = gpd.GeoSeries(r["buffer_22KM"]).simplify(tolerance=0.001)
    sim_geo = gpd.GeoSeries(r["buffer"]).simplify(tolerance=0.001)
    geo_j = sim_geo.to_json()
    geo_j = folium.GeoJson(data=geo_j, style_function=lambda x: {"fillColor": "orange"})
    folium.Popup(r["Port"]).add_to(geo_j)
    geo_j.add_to(m)
    
m

<folium.map.Popup at 0x7f644c3e1eb0>

<folium.features.GeoJson at 0x7f6441b7fc70>

<folium.map.Popup at 0x7f6446be1100>

<folium.features.GeoJson at 0x7f6443801d90>

<folium.map.Popup at 0x7f64434832b0>

<folium.features.GeoJson at 0x7f6443801cd0>

<folium.map.Popup at 0x7f644394b310>

<folium.features.GeoJson at 0x7f644394b760>

<folium.map.Popup at 0x7f644394b370>

<folium.features.GeoJson at 0x7f644394b3a0>

<folium.map.Popup at 0x7f64416848b0>

<folium.features.GeoJson at 0x7f6441684670>

<folium.map.Popup at 0x7f644165ea00>

<folium.features.GeoJson at 0x7f644165e160>

<folium.map.Popup at 0x7f64416840a0>

<folium.features.GeoJson at 0x7f644165ef40>

<folium.map.Popup at 0x7f6441684340>

<folium.features.GeoJson at 0x7f6441684820>

<folium.map.Popup at 0x7f644165efa0>

<folium.features.GeoJson at 0x7f64416845e0>

<folium.map.Popup at 0x7f644161a370>

<folium.features.GeoJson at 0x7f6441684430>

<folium.map.Popup at 0x7f644161a820>

<folium.features.GeoJson at 0x7f644161a7c0>

<folium.map.Popup at 0x7f644161a940>

<folium.features.GeoJson at 0x7f644161a4c0>

<folium.map.Popup at 0x7f644161aa30>

<folium.features.GeoJson at 0x7f644161a6a0>

<folium.map.Popup at 0x7f644161ab80>

<folium.features.GeoJson at 0x7f644161a610>

<folium.map.Popup at 0x7f644161acd0>

<folium.features.GeoJson at 0x7f644161a490>

<folium.map.Popup at 0x7f644161ab20>

<folium.features.GeoJson at 0x7f64416bb190>

<folium.map.Popup at 0x7f644161a970>

<folium.features.GeoJson at 0x7f644161afa0>

<folium.map.Popup at 0x7f644161adf0>

<folium.features.GeoJson at 0x7f644161ad60>

<folium.map.Popup at 0x7f644161ae80>

<folium.features.GeoJson at 0x7f644161adc0>

<folium.map.Popup at 0x7f6441647220>

<folium.features.GeoJson at 0x7f644161a340>

<folium.map.Popup at 0x7f64416473a0>

<folium.features.GeoJson at 0x7f644161ac70>

<folium.map.Popup at 0x7f6441647160>

<folium.features.GeoJson at 0x7f64416474f0>

<folium.map.Popup at 0x7f64416478b0>

<folium.features.GeoJson at 0x7f64416471c0>

<folium.map.Popup at 0x7f64416470d0>

<folium.features.GeoJson at 0x7f6441647100>

<folium.map.Popup at 0x7f64416477f0>

<folium.features.GeoJson at 0x7f64416477c0>

<folium.map.Popup at 0x7f6441647940>

<folium.features.GeoJson at 0x7f6441647730>

<folium.map.Popup at 0x7f6441647a60>

<folium.features.GeoJson at 0x7f64416474c0>

<folium.map.Popup at 0x7f6441647b80>

<folium.features.GeoJson at 0x7f6441647610>

<folium.map.Popup at 0x7f6441647cd0>

<folium.features.GeoJson at 0x7f6441647490>

<folium.map.Popup at 0x7f6441647b20>

<folium.features.GeoJson at 0x7f644161aca0>

<folium.map.Popup at 0x7f64416473d0>

<folium.features.GeoJson at 0x7f6441647fa0>

<folium.map.Popup at 0x7f6441647df0>

<folium.features.GeoJson at 0x7f6441647d60>

<folium.map.Popup at 0x7f6441647e80>

<folium.features.GeoJson at 0x7f6441647dc0>

<folium.map.Popup at 0x7f648ea7a190>

<folium.features.GeoJson at 0x7f6441647520>

<folium.map.Popup at 0x7f648ea7a3a0>

<folium.features.GeoJson at 0x7f6441647c70>

<folium.map.Popup at 0x7f648ea7a160>

<folium.features.GeoJson at 0x7f648ea7a4f0>

<folium.map.Popup at 0x7f648ea7a5e0>

<folium.features.GeoJson at 0x7f648ea7a1f0>

<folium.map.Popup at 0x7f648ea7a700>

<folium.features.GeoJson at 0x7f648ea7a0d0>

<folium.map.Popup at 0x7f648ea7a820>

<folium.features.GeoJson at 0x7f648ea7a460>

<folium.map.Popup at 0x7f648ea7ad00>

<folium.features.GeoJson at 0x7f648ea7a730>

<folium.map.Popup at 0x7f648ea7a4c0>

<folium.features.GeoJson at 0x7f648ea7aa60>

<folium.map.Popup at 0x7f648ea7abb0>

<folium.features.GeoJson at 0x7f648ea7a610>

<folium.map.Popup at 0x7f648ea7aac0>

<folium.features.GeoJson at 0x7f648ea7aaf0>

<folium.map.Popup at 0x7f648ea7ab50>

<folium.features.GeoJson at 0x7f6441647ca0>

<folium.map.Popup at 0x7f648ea7a760>

<folium.features.GeoJson at 0x7f648ea7afd0>

<folium.map.Popup at 0x7f648ea7a430>

<folium.features.GeoJson at 0x7f648ea7ac40>

<folium.map.Popup at 0x7f648ea7ad90>

<folium.features.GeoJson at 0x7f648ea7a640>

<folium.map.Popup at 0x7f648ea29160>

<folium.features.GeoJson at 0x7f648ea7af70>

<folium.map.Popup at 0x7f648ea293d0>

<folium.features.GeoJson at 0x7f648ea29490>

<folium.map.Popup at 0x7f648ea29550>

<folium.features.GeoJson at 0x7f648ea294f0>

<folium.map.Popup at 0x7f648ea29640>

<folium.features.GeoJson at 0x7f648ea292e0>

<folium.map.Popup at 0x7f648ea29760>

<folium.features.GeoJson at 0x7f648ea294c0>

<folium.map.Popup at 0x7f648ea29880>

<folium.features.GeoJson at 0x7f648ea29130>

<folium.map.Popup at 0x7f648ea29cd0>

<folium.features.GeoJson at 0x7f648ea29700>

<folium.map.Popup at 0x7f648ea29df0>

<folium.features.GeoJson at 0x7f648ea298b0>

<folium.map.Popup at 0x7f648ea29f10>

<folium.features.GeoJson at 0x7f648ea299d0>

<folium.map.Popup at 0x7f648ea29a00>

<folium.features.GeoJson at 0x7f648ea29400>

<folium.map.Popup at 0x7f648ea29c40>

<folium.features.GeoJson at 0x7f648ea29520>

<folium.map.Popup at 0x7f648ea29f70>

<folium.features.GeoJson at 0x7f648ea291c0>

<folium.map.Popup at 0x7f648ea29dc0>

<folium.features.GeoJson at 0x7f648ea29b80>

<folium.map.Popup at 0x7f648ea29d90>

<folium.features.GeoJson at 0x7f648ea29e20>

<folium.map.Popup at 0x7f648e9d8370>

<folium.features.GeoJson at 0x7f648ea299a0>

<folium.map.Popup at 0x7f648e9d8130>

<folium.features.GeoJson at 0x7f648e9d80a0>

<folium.map.Popup at 0x7f648e9d82e0>

<folium.features.GeoJson at 0x7f648e9d8520>

<folium.map.Popup at 0x7f648e9d86d0>

<folium.features.GeoJson at 0x7f648e9d8460>

<folium.map.Popup at 0x7f648e9d8af0>

<folium.features.GeoJson at 0x7f648e9d8550>

<folium.map.Popup at 0x7f648e9d8c10>

<folium.features.GeoJson at 0x7f648e9d8700>

<folium.map.Popup at 0x7f648e9d8d30>

<folium.features.GeoJson at 0x7f648e9d8820>

<folium.map.Popup at 0x7f648e9d8e50>

<folium.features.GeoJson at 0x7f648e9d87c0>

<folium.map.Popup at 0x7f648e9d8940>

<folium.features.GeoJson at 0x7f648e9d8040>

<folium.map.Popup at 0x7f648e9d8b20>

<folium.features.GeoJson at 0x7f648e9d83d0>

<folium.map.Popup at 0x7f648e9d84c0>

<folium.features.GeoJson at 0x7f648e9d8be0>

<folium.map.Popup at 0x7f648e9d8dc0>

<folium.features.GeoJson at 0x7f648e9d88e0>

<folium.map.Popup at 0x7f648e9d8e80>

<folium.features.GeoJson at 0x7f648e9d8eb0>

<folium.map.Popup at 0x7f648ea04160>

<folium.features.GeoJson at 0x7f648e9d8df0>

<folium.map.Popup at 0x7f648ea043d0>

<folium.features.GeoJson at 0x7f648ea04460>

<folium.map.Popup at 0x7f648ea04430>

<folium.features.GeoJson at 0x7f648ea044c0>

<folium.map.Popup at 0x7f648ea04610>

<folium.features.GeoJson at 0x7f648ea04100>

<folium.map.Popup at 0x7f648ea04760>

<folium.features.GeoJson at 0x7f648ea04490>

<folium.map.Popup at 0x7f648ea04850>

<folium.features.GeoJson at 0x7f648ea04640>

<folium.map.Popup at 0x7f648ea04670>

<folium.features.GeoJson at 0x7f648ea04040>

<folium.map.Popup at 0x7f648ea04a90>

<folium.features.GeoJson at 0x7f648ea04790>

<folium.map.Popup at 0x7f648ea04b80>

<folium.features.GeoJson at 0x7f648ea049d0>

<folium.map.Popup at 0x7f648ea04a30>

<folium.features.GeoJson at 0x7f648e9d8c40>

<folium.map.Popup at 0x7f648ea04a60>

<folium.features.GeoJson at 0x7f648ea04c10>

<folium.map.Popup at 0x7f648ea04fd0>

<folium.features.GeoJson at 0x7f648ea04e20>

<folium.map.Popup at 0x7f648ea04f40>

<folium.features.GeoJson at 0x7f648ea04e50>

<folium.map.Popup at 0x7f648ea04ac0>

<folium.features.GeoJson at 0x7f648ea04ee0>

<folium.map.Popup at 0x7f648e9af310>

<folium.features.GeoJson at 0x7f648ea04130>

<folium.map.Popup at 0x7f648e9af430>

<folium.features.GeoJson at 0x7f648e9af040>

<folium.map.Popup at 0x7f648e9af100>

<folium.features.GeoJson at 0x7f648e9af520>

<folium.map.Popup at 0x7f648e9af610>

<folium.features.GeoJson at 0x7f648e9af0a0>

<folium.map.Popup at 0x7f648e9af730>

<folium.features.GeoJson at 0x7f648e9af4c0>

<folium.map.Popup at 0x7f648e9af8e0>

<folium.features.GeoJson at 0x7f648e9af070>

<folium.map.Popup at 0x7f648e9afa00>

<folium.features.GeoJson at 0x7f648e9af460>

<folium.map.Popup at 0x7f648e9afac0>

<folium.features.GeoJson at 0x7f648e9af550>

<folium.map.Popup at 0x7f648e9afbe0>

<folium.features.GeoJson at 0x7f648e9af3a0>

<folium.map.Popup at 0x7f648e9af9d0>

<folium.features.GeoJson at 0x7f648ea04970>

<folium.map.Popup at 0x7f648e9affd0>

<folium.features.GeoJson at 0x7f648e9af6d0>

<folium.map.Popup at 0x7f648e9afca0>

<folium.features.GeoJson at 0x7f648e9aff40>

<folium.map.Popup at 0x7f648e9af160>

<folium.features.GeoJson at 0x7f648e9afd00>

<folium.map.Popup at 0x7f648e9af250>

<folium.features.GeoJson at 0x7f648e9afd60>

<folium.map.Popup at 0x7f648e95c340>

<folium.features.GeoJson at 0x7f648e9af9a0>

<folium.map.Popup at 0x7f648e95c490>

<folium.features.GeoJson at 0x7f648e95c100>

<folium.map.Popup at 0x7f648e95c5e0>

<folium.features.GeoJson at 0x7f648e95c520>

<folium.map.Popup at 0x7f648e95c670>

<folium.features.GeoJson at 0x7f648e95c460>

<folium.map.Popup at 0x7f648e95c7f0>

<folium.features.GeoJson at 0x7f648e95c550>

<folium.map.Popup at 0x7f648e95c910>

<folium.features.GeoJson at 0x7f648e95c040>

<folium.map.Popup at 0x7f648e95ca30>

<folium.features.GeoJson at 0x7f648e95c4c0>

<folium.map.Popup at 0x7f648e95cb50>

<folium.features.GeoJson at 0x7f648e95c220>

<folium.map.Popup at 0x7f648e95c610>

<folium.features.GeoJson at 0x7f648e95cc10>

<folium.map.Popup at 0x7f648e95caf0>

<folium.features.GeoJson at 0x7f648e9aff70>

<folium.map.Popup at 0x7f648e95cd30>

<folium.features.GeoJson at 0x7f648e95cfa0>

<folium.map.Popup at 0x7f648e95ca90>

<folium.features.GeoJson at 0x7f648e95cdc0>

In [22]:
for _, r in gdf_with_port.iterrows():
    lat = r["Latitude"]
    lon = r["Longitude"]
    folium.Marker(
        location=[lat, lon],
        # popup="length: {} <br> area: {}".format(r["buffer_area"], r["buffer_area"]),
    ).add_to(m)

m

<folium.map.Marker at 0x7f648e95c310>

<folium.map.Marker at 0x7f6441684a90>

<folium.map.Marker at 0x7f6443844df0>

<folium.map.Marker at 0x7f6441684940>

<folium.map.Marker at 0x7f648e95ca60>

<folium.map.Marker at 0x7f644169d730>

<folium.map.Marker at 0x7f64416840d0>

<folium.map.Marker at 0x7f648e98c310>

<folium.map.Marker at 0x7f644169dfd0>

<folium.map.Marker at 0x7f6443844af0>

<folium.map.Marker at 0x7f648e98c250>

<folium.map.Marker at 0x7f6443483190>

<folium.map.Marker at 0x7f648e8a7a00>

<folium.map.Marker at 0x7f64416847c0>

<folium.map.Marker at 0x7f648e8a7eb0>

<folium.map.Marker at 0x7f6445fd56a0>

<folium.map.Marker at 0x7f648e8a79a0>

<folium.map.Marker at 0x7f648e8a7bb0>

<folium.map.Marker at 0x7f648e8a76d0>

<folium.map.Marker at 0x7f648e98c160>

<folium.map.Marker at 0x7f64416846d0>

<folium.map.Marker at 0x7f648e8a7f70>

<folium.map.Marker at 0x7f644165e640>

<folium.map.Marker at 0x7f64416846a0>

<folium.map.Marker at 0x7f644165e910>

<folium.map.Marker at 0x7f644165ec70>

<folium.map.Marker at 0x7f644165e370>

<folium.map.Marker at 0x7f648e8a78b0>

<folium.map.Marker at 0x7f644169de80>

<folium.map.Marker at 0x7f648e8a7b80>

<folium.map.Marker at 0x7f644165e220>

<folium.map.Marker at 0x7f648e8a7d30>

<folium.map.Marker at 0x7f648e8d2220>

<folium.map.Marker at 0x7f648e8d2040>

<folium.map.Marker at 0x7f648e8d22b0>

<folium.map.Marker at 0x7f648e8d22e0>

<folium.map.Marker at 0x7f648e8d20a0>

<folium.map.Marker at 0x7f648e8d2340>

<folium.map.Marker at 0x7f648e8d2370>

<folium.map.Marker at 0x7f648e8d23a0>

<folium.map.Marker at 0x7f648e8d23d0>

<folium.map.Marker at 0x7f648e8d2400>

<folium.map.Marker at 0x7f648e8d2430>

<folium.map.Marker at 0x7f648e8d2460>

<folium.map.Marker at 0x7f648e8d2490>

<folium.map.Marker at 0x7f648e8d24c0>

<folium.map.Marker at 0x7f648e8d24f0>

<folium.map.Marker at 0x7f648e8d2520>

<folium.map.Marker at 0x7f648e8d2280>

<folium.map.Marker at 0x7f648e8d25b0>

<folium.map.Marker at 0x7f648e8d25e0>

<folium.map.Marker at 0x7f648e8d2610>

<folium.map.Marker at 0x7f648e8d2640>

<folium.map.Marker at 0x7f648e8d2670>

<folium.map.Marker at 0x7f648e8d26a0>

<folium.map.Marker at 0x7f648e8d26d0>

<folium.map.Marker at 0x7f648e8d2700>

<folium.map.Marker at 0x7f648e8d2730>

<folium.map.Marker at 0x7f648e8d2760>

<folium.map.Marker at 0x7f648e8d2790>

<folium.map.Marker at 0x7f648e8d27c0>

<folium.map.Marker at 0x7f648e8d27f0>

<folium.map.Marker at 0x7f648e8d2820>

<folium.map.Marker at 0x7f648e8d2850>

<folium.map.Marker at 0x7f648e8d2880>

<folium.map.Marker at 0x7f648e8d28b0>

<folium.map.Marker at 0x7f648e8d28e0>

<folium.map.Marker at 0x7f648e8d2910>

<folium.map.Marker at 0x7f648e8d2940>

<folium.map.Marker at 0x7f648e8d2970>

<folium.map.Marker at 0x7f648e8d29a0>

<folium.map.Marker at 0x7f648e8d29d0>

<folium.map.Marker at 0x7f648e8d2a00>

<folium.map.Marker at 0x7f648e8d2a30>

<folium.map.Marker at 0x7f648e8d2a60>

<folium.map.Marker at 0x7f648e8d2100>

<folium.map.Marker at 0x7f648e8d2ac0>

<folium.map.Marker at 0x7f648e8d2af0>

<folium.map.Marker at 0x7f648e8d2b20>

<folium.map.Marker at 0x7f648e8d2b50>

<folium.map.Marker at 0x7f648e8d2b80>

<folium.map.Marker at 0x7f648e8d2bb0>

<folium.map.Marker at 0x7f648e8d2be0>

<folium.map.Marker at 0x7f648e8d2c10>

<folium.map.Marker at 0x7f648e8d2550>

<folium.map.Marker at 0x7f648e8d2c70>

<folium.map.Marker at 0x7f648e8d2ca0>

<folium.map.Marker at 0x7f648e8d2cd0>

<folium.map.Marker at 0x7f648e8d2580>

<folium.map.Marker at 0x7f648e8d2130>

<folium.map.Marker at 0x7f648e8d2d60>

<folium.map.Marker at 0x7f648e8d2d90>

<folium.map.Marker at 0x7f648e8d2dc0>

<folium.map.Marker at 0x7f648e8d2df0>

<folium.map.Marker at 0x7f648e8d2e20>

<folium.map.Marker at 0x7f648e8d2e50>

<folium.map.Marker at 0x7f648e8d2e80>

<folium.map.Marker at 0x7f648e8d2eb0>

<folium.map.Marker at 0x7f648e8d2ee0>

<folium.map.Marker at 0x7f648e8d2f10>

<folium.map.Marker at 0x7f648e8d2f40>

<folium.map.Marker at 0x7f644165e4f0>

<folium.map.Marker at 0x7f648e8d2310>

<folium.map.Marker at 0x7f644165e430>

<folium.map.Marker at 0x7f648e8a7f10>

<folium.map.Marker at 0x7f648e8d2d00>

<folium.map.Marker at 0x7f648e8d21f0>

<folium.map.Marker at 0x7f648e8d2a90>

<folium.map.Marker at 0x7f644165e310>

<folium.map.Marker at 0x7f648e8d2160>

<folium.map.Marker at 0x7f648e8d2f70>

<folium.map.Marker at 0x7f648e8d2250>

<folium.map.Marker at 0x7f648e8d2d30>

<folium.map.Marker at 0x7f648e861190>

<folium.map.Marker at 0x7f648e8611c0>

In [None]:
from shapely.geometry import Point
import geopandas as gpd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import BooleanType

# Fungsi untuk menentukan apakah titik berada di dalam salah satu port
def is_inside_port(row, ports):
    point = Point(row['longitude'], row['latitude'])
    for port in ports:
        if point.within(port):
            return True
    return False

# Convert gdf_with_port to GeoDataFrame
gdf_with_port = gpd.GeoDataFrame(gdf_with_port, geometry='geometry_x')

# Ambil kolom geometri dari GeoDataFrame port
ports = gdf_with_port['geometry_x'].tolist()

# UDF (User Defined Function) untuk menerapkan fungsi pada setiap baris DataFrame Spark
@F.udf(returnType=BooleanType())
def is_inside_port_udf(latitude, longitude):
    point = Point(float(longitude), float(latitude))
    for port in ports:
        if point.within(port):
            return True
    return False

# Tambahkan kolom baru 'in_port' ke DataFrame kapal
dfH = dfH.withColumn('in_port', is_inside_port_udf(col('latitude'), col('longitude')))

# Tampilkan DataFrame hasil
dfH.show(10)


In [1]:
gdf_with_port.info()
type(dfH)

NameError: name 'gdf_with_port' is not defined

In [27]:
# ports_df[ports_df['buffer_grouped_id'].isin(ports_df[ports_df['Port']=="Shanghai"].buffer_grouped_id)]

In [30]:
ports_df.to_pickle(path+"ki/wpi_22KM_v2.pkl")

In [31]:
for i in range(8,13):
    ports_df[f'H3_int_index_{i}'] = ports_df['location'].apply(lambda x: h3int.geo_to_h3(x.y, x.x, i))

In [32]:
ports_df[['port_id','H3_int_index_8','H3_int_index_9','H3_int_index_10','H3_int_index_11','H3_int_index_12']].drop_duplicates() \
    .to_parquet(path+"ki/global_point")

In [33]:
multiple_ports = ports_df[ports_df['grouped_port'].str.len() > 1]['buffer_grouped_id'].unique()

In [34]:
ports_grouped_df = ports_grouped_df.merge(
    ports_df[~ports_df['buffer_grouped_id'].isin(multiple_ports)][['buffer_grouped_id','port_id']],
    on=['buffer_grouped_id'],
    how='left')

In [35]:
ports_grouped_df.info()
ports_grouped_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7304278 entries, 0 to 7304277
Data columns (total 3 columns):
 #   Column             Dtype  
---  ------             -----  
 0   buffer_grouped_h3  object 
 1   buffer_grouped_id  int64  
 2   port_id            float64
dtypes: float64(1), int64(1), object(1)
memory usage: 222.9+ MB


Unnamed: 0,buffer_grouped_h3,buffer_grouped_id,port_id
0,615323611256848383,0,
1,615323620899553279,0,
2,615323654649020415,0,
3,615323622252216319,0,
4,615323613060399103,0,


In [36]:
ports_grouped_df.to_parquet(f"{path}ki/wpi_22KM_grouped/")

# Manually drawn boundaries for passageways

drawn from https://geojson.io/

## babel 

In [105]:
babel1 = Polygon([
          [
            43.27895975093108,
            12.479248963885425
          ],
          [
            43.60145612579805,
            12.755308659453817
          ],
          [
            44.08237177252951,
            12.589708545824834
          ],
          [
            43.38645854255341,
            11.98160302124407
          ],
          [
            43.27895975093108,
            12.479248963885425
          ]
        ]
)

In [106]:
babel2 = Polygon([
          [
            42.368054706734966,
            13.38358837752314
          ],
          [
            43.15449323491927,
            13.944351521664089
          ],
          [
            42.9621269762267,
            14.45445207099813
          ],
          [
            42.03990050073,
            13.763074483095224
          ],
          [
            42.368054706734966,
            13.38358837752314
          ],
        ])

## Hormuz 

In [107]:
hormuz1 =Polygon( [
          [
            56.54209088104923,
            26.345701990099286
          ],
          [
            57.052493823986566,
            26.44681101991337
          ],
          [
            57.18991000093121,
            26.094746776191414
          ],
          [
            56.48074437348609,
            25.98230244908771
          ],
          [
            56.54209088104923,
            26.345701990099286
          ]
        ])

In [108]:
hormuz2 = Polygon([[
            56.14664785983311,
            26.144080251788964
          ],
          [
            55.53733355176945,
            26.77871964243961
          ],
          [
            55.15065331780599,
            26.723786588768462
          ],
          [
            55.97674290854613,
            25.83600264889769
          ],
          [
            56.14664785983311,
            26.144080251788964
          ]])

## Bering

In [109]:
bering1 = Polygon([
          [
            -170.73805004714242,
            65.56530230079281
          ],
          [
            -167.4732181822378,
            65.33405060664742
          ],
          [
            -166.26813960465665,
            64.455343087779
          ],
          [
            -172.2669123721415,
            64.67003594428746
          ],
          [
            -170.73805004714242,
            65.56530230079281
          ]])

In [110]:
bering2 = Polygon([
          [
            -170.35531957639424,
            66.36461724180995
          ],
          [
            -167.20860235541656,
            65.92759008186346
          ],
          [
            -165.30051047721545,
            66.44104425186944
          ],
          [
            -171.618533288113,
            66.97481018244048
          ],
          [
            -170.35531957639424,
            66.36461724180995
          ]
        ])

## png

In [111]:
png1 = Polygon([
          [
            143.54747970550113,
            -8.959293027678143
          ],
          [
            142.89046905873647,
            -11.405714245437778
          ],
          [
            143.62825963140688,
            -12.783360006146125
          ],
          [
            145.26514781705595,
            -7.845220282519449
          ],
          [
            143.52183304603545,
            -8.934225968514397
          ],
          [
            143.48343990990293,
            -8.934472529535554
          ]
        ])

In [112]:
png2 = Polygon([
          [
            141.39271814215806,
            -9.189806079027534
          ],
          [
            142.09358699203204,
            -11.11538407055042
          ],
          [
            141.6541520914178,
            -12.345911576286795
          ],
          [
            139.92154089960428,
            -8.231751508509248
          ],
          [
            141.379938290852,
            -9.21494394104458
          ]
        ])

## japan

In [113]:
jpn1 = Polygon([
          [
            142.2350265373035,
            45.273705203409946
          ],
          [
            142.20296982512048,
            46.0911353961655
          ],
          [
            143.4291390661225,
            46.1910875705841
          ],
          [
            143.29289803934455,
            44.39853853651127
          ],
          [
            142.2350265373035,
            45.26242459433013
          ]
        ])

In [114]:
jpn2 = Polygon([
          [
            141.8856631131332,
            46.13701668299231
          ],
          [
            141.6131810595772,
            45.30899071377837
          ],
          [
            140.8117632550007,
            45.263882887638744
          ],
          [
            140.77970654281762,
            46.66755482490876
          ],
          [
            141.90169146922472,
            46.13701668299231
          ]
        ])

## Japan South Korea

In [115]:
sokor1 = Polygon([
          [
            129.33668711989776,
            35.421842495600686
          ],
          [
            130.9810206803253,
            34.362152214650436
          ],
          [
            132.39044944640608,
            35.140196165223344
          ],
          [
            129.46882106671785,
            36.762381401369495
          ],
          [
            129.38073176883773,
            35.4038943526013
          ]
        ])

In [116]:
sokor2 = Polygon([
          [
            127.91991870561918,
            34.712878558134406
          ],
          [
            129.44679963162667,
            33.27661791984407
          ],
          [
            128.55122510317955,
            32.5865643647203
          ],
          [
            126.88486921828195,
            34.174092141396684
          ],
          [
            127.91991870561918,
            34.712878558134406
          ]
        ])

In [117]:
sg1 = Polygon([
          [
            104.24203225859111,
            1.6505628383036992
          ],
          [
            104.2788192333881,
            1.3715119639307147
          ],
          [
            104.38918015777926,
            1.1832960825392007
          ],
          [
            104.62937746380851,
            1.1616612164212796
          ],
          [
            104.24203225859111,
            1.6505628383036992
          ]
        ])

In [118]:
sg2 = Polygon([
          [
            103.15560681340821,
            0.9016872915739782
          ],
          [
            103.49289539213794,
            1.284020633524463
          ],
          [
            103.34895695408125,
            1.5503324363120328
          ],
          [
            102.99448169618063,
            1.0842682575379996
          ],
          [
            103.15560681340821,
            0.9016872915739782
          ]
        ])

In [119]:
sg3 = Polygon(
    [
          [
            98.23683440342586,
            4.4179433814712326
          ],
          [
            100.39630762290955,
            5.682757950093304
          ],
          [
            99.722551978431,
            6.901925781850082
          ],
          [
            97.13982200792748,
            5.27003248678588
          ],
          [
            98.23683440342586,
            4.4179433814712326
          ]
        ]
)

In [120]:
# Danish
db1 = Polygon(
     [
          [
            10.447154042413331,
            56.548141239881005
          ],
          [
            12.428792950137222,
            56.89586359969775
          ],
          [
            11.872694559580765,
            57.745245752027785
          ],
          [
            10.435630899003229,
            57.52269766661777
          ],
          [
            10.447154042413331,
            56.548141239881005
          ]
        ])
db2 = Polygon(
    [
          [
            13.115907618875838,
            55.37982784568749
          ],
          [
            12.858896486789348,
            54.34444711605286
          ],
          [
            14.315292901946151,
            53.96820165710426
          ],
          [
            14.279596911378064,
            55.436571345079244
          ],
          [
             13.115907618875838,
            55.37982784568749
          ]
        ])

## All

In [121]:
passthru_manual = gpd.GeoDataFrame([
    [babel1,"Bab El-Mandeb SE", "Bab El-Mandeb Strait",1,1],
    [babel2,"Bab El-Mandeb NW", "Bab El-Mandeb Strait",2,1],
    [hormuz1,"Hormuz E","Strait of Hormuz",1,2],
    [hormuz2,"Hormuz W","Strait of Hormuz",2,2],
    [bering1, "Bering S","Bering Strait",1,3],
    [bering2,"Bering N","Bering Strait", 2,3],
    [png1, "Torres E", "Torres Strait",1,4],
    [png2, "Torres W", "Torres Strait",2,4],
    [jpn1, "La Pérouse E","La Pérouse Strait",1,5],
    [jpn2, "La Pérouse W","La Pérouse Strait",2,5],
    [sokor1, "Korea NE", "Korea Strait", 1, 6],
    [sokor2,"Korea SW", "Korea Strait", 2, 6],
    [sg1,"Singapore 1", "Singapore Strait", 1, 17],
    [sg2,"Singapore 2", "Singapore Strait", 2, 17],
    [sg3,"Singapore 3", "Singapore Strait", 3, 17],
    [db1,"Danish N","Danish Straits",1,18],
    [db2,"Danish S","Danish Straits",2,18]
    
],
columns=['geometry','Passage_Part','Passage','passage_part_id','passage_id'],
crs="epsg:4326")

passthru_manual.to_pickle(f"{path}ki/Passthru.pkl")

In [122]:
passthru_manual['h3'] = poly_to_h3(passthru_manual.geometry)
passthru_h3 = passthru_manual[['h3','passage_part_id','passage_id']].explode("h3", ignore_index=True)

In [123]:
passthru_h3.to_parquet(f"{path}ki/passage_manual/")

In [124]:
passthru_h3 = None

In [125]:
# ports_df[ports_df['buffer_grouped_id'].isin(passthru_buffer['buffer_grouped_id'])][['Country','Port','buffer_grouped_id']].drop_duplicates()

In [126]:
passthru_buffer = pd.DataFrame([
    [868, "Suez Canal Anchorage N", "Suez Canal",1,7],
    [864, "Suez Canal Anchorage S", "Suez Canal",2,7],
    [865, "Suez Canal Great Bitter Lake", "Suez Canal",3,7],
    [579, "Gibraltar Strait","Gibraltar Strait",1,8],
    [642,"English Channel","English Channel",1,10],
    [885,"Bosphorus Strait","Bosphorus Strait",1,11],
    [798, "Dardanelles Strait", "Dardanelles Strait", 1,12],
    [730, "Cape of Good Hope", "Cape of Good Hope", 1,13],
    [213, "Panama Canal N", "Panama Canal",1,14],
    [211,"Panama Canal S", "Panama Canal",2,14],
    [309, "Magellan Strait N", "Magellan Strait", 1,15],
    [306, "Magellan Strait S", "Magellan Strait", 2, 15],
], columns = ["buffer_grouped_id","Passage Part", "Passage","passage_part_id","passage_id"]
)
    


In [127]:
passthru_buffer.to_pickle(f"{path}ki/Passthru_Buffer.pkl")

# Overlapping ports reference

In [6]:
ports_df = pd.read_pickle(path+"ki/wpi_22KM_v2.pkl")
multiple_ports = ports_df[ports_df['grouped_port'].apply(lambda x: len(x)) > 1].buffer_grouped_id.unique()

NameError: name 'path' is not defined

In [129]:
ports_overlap = ports_df[ports_df['buffer_grouped_id'].isin(multiple_ports)].set_index(['buffer_grouped_id','port_id'])['buffer_22KM']
ports_overlap_h3 = parallelize_dataframe(ports_overlap, poly_to_h3,100,4) \
                        .reset_index() \
                        .explode("buffer_22KM", ignore_index=True)

Closing down clientserver connection
Closing down clientserver connection
Closing down clientserver connection
Closing down clientserver connection


100%|██████████| 100/100 [00:05<00:00, 19.95it/s]


In [130]:
ports_overlap_h3.to_parquet(f"{path}ki/overlapping/")

# Combine All

In [133]:
#all buffer grouped id with multiple ports

overlap_sdf = spark.read.parquet(f"{path}ki/overlapping/")
overlap_sdf.printSchema()
overlap_sdf.count()
overlap_sdf.show()

root
 |-- buffer_grouped_id: long (nullable = true)
 |-- port_id: double (nullable = true)
 |-- buffer_22KM: decimal(20,0) (nullable = true)



8159729

+-----------------+-------+------------------+
|buffer_grouped_id|port_id|       buffer_22KM|
+-----------------+-------+------------------+
|                7|55770.0|614902042860716031|
|                7|55770.0|614899994710769663|
|                7|55770.0|614902026330963967|
|                7|55770.0|614900011240521727|
|                7|55770.0|614902044999811071|
|                7|55770.0|614902044708306943|
|                7|55770.0|614899992571674623|
|                7|55770.0|614899988876492799|
|                7|55770.0|614899995002273791|
|                7|55770.0|614899988293484543|
|                7|55770.0|614902044416802815|
|                7|55770.0|614899991015587839|
|                7|55770.0|614900010657513471|
|                7|55770.0|614902042277707775|
|                7|55770.0|614899995392344063|
|                7|55770.0|614899993154682879|
|                7|55770.0|614899993253249023|
|                7|55770.0|614899991114153983|
|            

In [134]:
overlap_sdf.select(F.countDistinct("buffer_22KM")).show()

+---------------------------+
|count(DISTINCT buffer_22KM)|
+---------------------------+
|                    4808538|
+---------------------------+



In [135]:
overlap_agg_sdf = \
overlap_sdf.withColumnRenamed("buffer_22KM","H3_int_index_8") \
            .groupBy("H3_int_index_8") \
            .agg(F.collect_set("port_id").alias("port_id_list"),
                 F.first("buffer_grouped_id").alias("buffer_grouped_id"),
                 F.countDistinct("port_id").alias("port_count")
                ) \
            .withColumn("port_id", F.when(F.col("port_count")==1, F.col("port_id_list").getItem(0)))
overlap_agg_sdf.printSchema()
overlap_agg_sdf.count()

root
 |-- H3_int_index_8: decimal(20,0) (nullable = true)
 |-- port_id_list: array (nullable = false)
 |    |-- element: double (containsNull = false)
 |-- buffer_grouped_id: long (nullable = true)
 |-- port_count: long (nullable = false)
 |-- port_id: double (nullable = true)



4808538

In [136]:
overlap_agg_sdf.show(n=10)

+------------------+------------+-----------------+----------+-------+
|    H3_int_index_8|port_id_list|buffer_grouped_id|port_count|port_id|
+------------------+------------+-----------------+----------+-------+
|612509340533784575|   [20910.0]|              841|         1|20910.0|
|612509340542173183|   [20910.0]|              841|         1|20910.0|
|612509340546367487|   [20910.0]|              841|         1|20910.0|
|612509340561047551|   [20910.0]|              841|         1|20910.0|
|612509340571533311|   [20910.0]|              841|         1|20910.0|
|612509340577824767|   [20910.0]|              841|         1|20910.0|
|612509340579921919|   [20910.0]|              841|         1|20910.0|
|612509340600893439|   [20910.0]|              841|         1|20910.0|
|612509340634447871|   [20910.0]|              841|         1|20910.0|
|612509340642836479|   [20910.0]|              841|         1|20910.0|
+------------------+------------+-----------------+----------+-------+
only s

In [137]:
#manually drawn polygons for passageways
passthru_sdf = spark.read.parquet(f"{path}ki/passage_manual/") \
                    .withColumnRenamed("h3","H3_int_index_8") \
                    .withColumnRenamed("passage_part_id","passage_part_id_manual") \
                    .withColumnRenamed("passage_id","passage_id_manual")
passthru_sdf.printSchema()
passthru_sdf.count()

root
 |-- H3_int_index_8: decimal(20,0) (nullable = true)
 |-- passage_part_id_manual: long (nullable = true)
 |-- passage_id_manual: long (nullable = true)



421193

In [138]:
passthru_sdf.select(F.countDistinct("H3_int_index_8")).show()

+------------------------------+
|count(DISTINCT H3_int_index_8)|
+------------------------------+
|                        421193|
+------------------------------+



In [140]:
passthru_buffer_sdf = spark.createDataFrame(pd.read_pickle(f"{path}ki/Passthru_Buffer.pkl") \
                                               [['buffer_grouped_id','passage_part_id','passage_id']] \
                                                .rename(columns={'passage_part_id':'passage_part_id_buffer',
                                                                 'passage_id':'passage_id_buffer'}
                                                       )
                                           )
passthru_buffer_sdf.printSchema()
passthru_buffer_sdf.count()

root
 |-- buffer_grouped_id: long (nullable = true)
 |-- passage_part_id_buffer: long (nullable = true)
 |-- passage_id_buffer: long (nullable = true)





12

In [141]:
#22KM buffer grouped, with port ids attached for buffers with single port
grouped_sdf = spark.read.parquet(f"{path}ki/wpi_22KM_grouped/") \
                    .select("buffer_grouped_h3","buffer_grouped_id","port_id") \
                    .withColumnRenamed("buffer_grouped_h3","H3_int_index_8")
grouped_sdf.printSchema()
grouped_sdf.count()

root
 |-- H3_int_index_8: decimal(20,0) (nullable = true)
 |-- buffer_grouped_id: long (nullable = true)
 |-- port_id: double (nullable = true)



7304278

In [142]:
grouped_sdf.select("H3_int_index_8","port_id").distinct().count()

7304278

In [143]:
grouped_sdf.select(F.countDistinct("H3_int_index_8")).show()

+------------------------------+
|count(DISTINCT H3_int_index_8)|
+------------------------------+
|                       7304278|
+------------------------------+



In [144]:
grouped_sdf.groupby("H3_int_index_8").count().filter(F.col("count")>1).show(n=10)

+--------------+-----+
|H3_int_index_8|count|
+--------------+-----+
+--------------+-----+



In [73]:
grouped_sdf.select(F.max("buffer_grouped_id")).show()

+----------------------+
|max(buffer_grouped_id)|
+----------------------+
|                  1508|
+----------------------+



In [74]:
combined_sdf =grouped_sdf \
            .join(overlap_agg_sdf.drop("buffer_grouped_id").withColumnRenamed("port_id","single_port_id"), #buffer with multiple port, fill port id for hexes with single port
                   on="H3_int_index_8",
                   how="left"
                  ) \
            .join(passthru_sdf, #manually drawn passageway
                   on = "H3_int_index_8",
                   how="outer"
                  ) \
            .join(passthru_buffer_sdf, #buffers with passageways
                  on = "buffer_grouped_id",
                  how = "left"
                 ) \
            .withColumn("port_id",F.coalesce("port_id","single_port_id")) \
            .withColumn("passage_part_id",F.coalesce("passage_part_id_manual","passage_part_id_buffer")) \
            .withColumn("passage_id", F.coalesce("passage_id_manual","passage_id_buffer")) \
            .withColumn("passage_id_temp", F.lit(2000) + F.col("passage_id")) \
            .withColumn("buffer_grouped_id", F.coalesce("buffer_grouped_id","passage_id_temp")) \
            .drop("single_port_id",
                  "passage_part_id_manual","passage_part_id_buffer","passage_id_manual","passage_id_buffer", "passage_id_temp") 

combined_sdf.printSchema()
combined_sdf.count()

root
 |-- buffer_grouped_id: long (nullable = true)
 |-- H3_int_index_8: decimal(20,0) (nullable = true)
 |-- port_id: double (nullable = true)
 |-- port_id_list: array (nullable = true)
 |    |-- element: double (containsNull = false)
 |-- port_count: long (nullable = true)
 |-- passage_part_id: long (nullable = true)
 |-- passage_id: long (nullable = true)



7694797

In [75]:
#should be same as count above, i.e. no duplicate hex
combined_sdf.select(F.countDistinct("H3_int_index_8")).show()

+------------------------------+
|count(DISTINCT H3_int_index_8)|
+------------------------------+
|                       7694797|
+------------------------------+



In [76]:
#all hexes have buffer_grouped_id, even the passageways
combined_sdf.filter(F.col("buffer_grouped_id").isNotNull()).count()

7694797

In [77]:
#suez canal
combined_sdf.filter(F.col("passage_id")==7).select("buffer_grouped_id","port_id","port_id_list","passage_id","passage_part_id").distinct().show()

+-----------------+-------+------------------+----------+---------------+
|buffer_grouped_id|port_id|      port_id_list|passage_id|passage_part_id|
+-----------------+-------+------------------+----------+---------------+
|              864|   null|[48120.0, 48121.0]|         7|              2|
|              865|   null|[47970.0, 47974.0]|         7|              3|
|              865|47974.0|         [47974.0]|         7|              3|
|              868|48104.0|         [48104.0]|         7|              1|
|              864|48121.0|         [48121.0]|         7|              2|
|              868|48106.0|         [48106.0]|         7|              1|
|              864|48120.0|         [48120.0]|         7|              2|
|              868|   null|[48104.0, 48106.0]|         7|              1|
|              868|   null|[48108.0, 48106.0]|         7|              1|
|              865|47970.0|         [47970.0]|         7|              3|
|              868|48108.0|         [4

In [78]:
combined_sdf.filter(F.col("passage_id")==6).select("buffer_grouped_id","port_id","port_id_list","passage_id","passage_part_id").distinct().show()

+-----------------+-------+------------------+----------+---------------+
|buffer_grouped_id|port_id|      port_id_list|passage_id|passage_part_id|
+-----------------+-------+------------------+----------+---------------+
|             2006|   null|              null|         6|              1|
|             2006|   null|              null|         6|              2|
|             1349|   null|[60370.0, 60376.0]|         6|              2|
|             1349|60370.0|         [60370.0]|         6|              2|
|             1352|60400.0|         [60400.0]|         6|              1|
|             1357|60410.0|              null|         6|              1|
|             1338|62340.0|              null|         6|              2|
|             1340|61720.0|         [61720.0]|         6|              1|
|             1350|61730.0|              null|         6|              1|
+-----------------+-------+------------------+----------+---------------+



In [79]:
combined_sdf.write.mode("overwrite").parquet(f"{path}ki/global_polygon/")

In [2]:
pip install holoviews --user

Closing down clientserver connection
[0mCollecting holoviews
  Using cached holoviews-1.17.1-py2.py3-none-any.whl.metadata (21 kB)
Collecting panel>=0.13.1 (from holoviews)
  Using cached panel-1.2.3-py2.py3-none-any.whl.metadata (22 kB)
Collecting bokeh<3.3.0,>=3.1.1 (from panel>=0.13.1->holoviews)
  Using cached bokeh-3.1.1-py3-none-any.whl (8.3 MB)
Using cached holoviews-1.17.1-py2.py3-none-any.whl (4.3 MB)
Using cached panel-1.2.3-py2.py3-none-any.whl (20.1 MB)
[0mInstalling collected packages: bokeh, panel, holoviews
[0mSuccessfully installed bokeh-3.1.1 holoviews-1.17.1 panel-1.2.3
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import holoviews as hv
from holoviews import opts, dim
from bokeh.sampledata.les_mis import data

hv.extension('bokeh')
hv.output(size=200)
links = pd.DataFrame(data['links'])
print(links.head(3))
hv.Chord(links)

Error while sending or receiving.
Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 503, in send_command
    self.socket.sendall(command.encode("utf-8"))
ConnectionResetError: [Errno 104] Connection reset by peer
Closing down clientserver connection
Exception while sending command.
Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 503, in send_command
    self.socket.sendall(command.encode("utf-8"))
ConnectionResetError: [Errno 104] Connection reset by peer

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 506, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetwor

ModuleNotFoundError: No module named 'holoviews'