# Initialization

In [1]:
import pandas as pd
from pyspark.sql import functions as F

# Register Sedona Functions to Spark
from sedona.register import SedonaRegistrator

# Import SparkSession
from pyspark.sql import SparkSession

# Initialise the Spark session (currently disabled: the builder below is commented out)
# spark = SparkSession.builder \
#     .appName("AIS Data Processing") \
#     .config("spark.rpc.message.maxSize", 512) \
#     .getOrCreate()

# Register Sedona functions on the Spark session.
# NOTE(review): `spark` is not assigned anywhere above this line in the notebook,
# so this presumably relies on the kernel providing a ready-made session — confirm.
SedonaRegistrator.registerAll(spark)

# For 3.3.2
from shapely.geometry import Point, Polygon, mapping
import h3.api.numpy_int as h3int



Closing down clientserver connection


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, count, countDistinct, when, expr
import calendar
from pyspark.sql.types import IntegerType
from pyspark.sql.window import Window

In [3]:
import pandas as pd
from IPython.display import HTML
import base64
from pyspark.sql.functions import year, month, dayofmonth, hour, minute, second
from pyspark.sql.functions import monotonically_increasing_id

In [4]:
## needed for git use
import os
import sys
import subprocess
import warnings

# GitLab credentials used to install the `ais` package.
# SECURITY: an access token used to be hard-coded in this cell. Anything committed
# together with the notebook has to be treated as leaked, so that token should be
# revoked and a fresh one supplied through the environment instead.
# The user name defaults to the read-only account for members of the AIS Task Team.
GITLAB_USER = os.environ.get("AIS_GITLAB_USER", "read_aistt")
GITLAB_TOKEN = os.environ.get("AIS_GITLAB_TOKEN", "")
if not GITLAB_TOKEN:
    # Warn instead of raising so the rest of the notebook can still be run
    # when the package is already installed in the kernel.
    warnings.warn(
        "AIS_GITLAB_TOKEN is not set; ais_package will not authenticate against GitLab."
    )

# pip-style VCS URL of the ais repository with the credentials embedded.
ais_package = f"git+https://{GITLAB_USER}:{GITLAB_TOKEN}@code.officialstatistics.org/trade-task-team-phase-1/ais.git"

In [5]:
### Newly installed packages
from ais import functions as af

In [6]:
###Packages within Kernel by default
import geopandas as gpd
import h3
import matplotlib
import matplotlib.pyplot as plt
from shapely.ops import transform
from shapely.geometry import Polygon
from datetime import datetime
import requests

generated new fontManager


In [7]:
#Sedona Imports
import sedona.sql
from sedona.register import SedonaRegistrator
from sedona.utils import SedonaKryoRegistrator, KryoSerializer
from sedona.core.SpatialRDD import PolygonRDD, PointRDD
from sedona.core.enums import FileDataSplitter

In [8]:
import folium
from shapely.geometry import box
from folium.plugins import MarkerCluster

# ASEAN

In [9]:
# Bounding-box coordinates (decimal degrees) for Southeast Asian waters
asia_tenggara_bbox = dict(
    min_lon=95.0,    # minimum longitude
    min_lat=-10.0,   # minimum latitude
    max_lon=141.0,   # maximum longitude
    max_lat=25.0,    # maximum latitude
)

In [10]:
# Create the Folium map, centred at latitude 0, longitude 100
m = folium.Map(location=[0, 100], zoom_start=5)

# Draw the bounding box as a translucent blue rectangle and attach it to the map.
# Folium expects [lat, lon] pairs: south-west corner first, then north-east.
rectangle = folium.Rectangle(
    bounds=[
        [asia_tenggara_bbox['min_lat'], asia_tenggara_bbox['min_lon']],
        [asia_tenggara_bbox['max_lat'], asia_tenggara_bbox['max_lon']],
    ],
    color='blue',
    fill=True,
    fill_color='blue',
    fill_opacity=0.2,
).add_to(m)

# Write the map to an HTML file, then show it as the cell output
m.save('southeast_asia_bbox.html')
m


In [11]:
# Turn the bounding box into a polygon; box() takes (minx, miny, maxx, maxy)
bbox_polygon = box(
    *(asia_tenggara_bbox[key] for key in ('min_lon', 'min_lat', 'max_lon', 'max_lat'))
)

# Single-row GeoDataFrame whose geometry is that polygon
bbox_gdf = gpd.GeoDataFrame(geometry=[bbox_polygon])

In [12]:
# Desired H3 resolution; fill_with_h3 reads this module-level value
resolution = 8

def fill_with_h3(row):
    """Return the H3 cells covering ``row.geometry`` as a list of integers.

    The geometry's GeoJSON-style mapping is passed to ``h3.polyfill`` at the
    module-level ``resolution``; each resulting string index is then converted
    with ``h3.string_to_h3``.
    """
    geojson_geometry = row.geometry.__geo_interface__
    cell_strings = h3.polyfill(geojson_geometry, resolution, geo_json_conformant=True)
    return list(map(h3.string_to_h3, cell_strings))

# Compute the H3 cells for every row of the GeoDataFrame
bbox_gdf['h3_indexes'] = bbox_gdf.apply(fill_with_h3, axis=1)

# Merge the per-row lists into a single set so duplicates are dropped
h3_index_set = set().union(*bbox_gdf['h3_indexes'])

# Materialise the de-duplicated cells as a list
h3_indexes_int = list(h3_index_set)

# Get AIS Data for Southeast Asia

In [13]:
# Query period: 1 January 2022 to 31 December 2022, as naive datetimes at midnight
start_date = datetime(2022, 1, 1)
end_date = datetime(2022, 12, 31)

In [None]:
# Fetch AIS records through the ais helper package for the chosen
# date range, restricted to the H3 cells computed for the bounding box
ais_data = af.get_ais(
    spark,
    start_date=start_date,
    end_date=end_date,
    h3_list=h3_indexes_int,
)

In [None]:
# ais_data.count()

In [15]:
# Add a "tahun" (year) column formatted from dt_pos_utc, then keep only 2022 rows
ais_data = (
    ais_data
    .withColumn("tahun", date_format("dt_pos_utc", "yyyy"))
    .filter(col("tahun") == 2022)
)

In [None]:
# ais_data.count()

In [16]:
# Keep only the columns needed downstream. latitude/longitude are included because
# the map-plotting step (process_partition) reads row.latitude and row.longitude;
# without them the saved dataset cannot be visualised.
# NOTE(review): assumes ais_data exposes columns named exactly `latitude` and
# `longitude`, as process_partition implies — confirm against the get_ais schema.
aiss_data = ais_data.select('mmsi', 'vessel_type', 'dt_pos_utc', 'latitude', 'longitude')

In [17]:
# Keep only vessels whose type is "Passenger" or "Pleasure Craft"
data_ais = aiss_data.where(col("vessel_type").isin('Passenger', 'Pleasure Craft'))

In [19]:
# Storage locations: shared temp bucket prefix and this notebook's sub-folder
base_path = "s3a://ungp-ais-data-historical-backup/user_temp/"
path_unique = f"{base_path}222011349/"

In [21]:
#save as parquet
# data_ais.write.option("header",True).mode("overwrite").parquet(path_unique + "data-ais-asean-passenger-and-pleasurecraft-2022.parquet")

In [None]:
# Read the previously saved passenger / pleasure-craft dataset.
# Parquet files carry their own schema, so no `header` option is passed
# (that option belongs to the CSV reader).
data_ais = spark.read.parquet(path_unique + "data-ais-asean-passenger-and-pleasurecraft-2022.parquet")

In [28]:
# jumlah_record_per_bulan = data_ais.withColumn("months", date_format("dt_pos_utc", "MMMM")) \
#     .groupBy("months").agg(count("*").alias("jumlah_record_per_bulan"))

# jumlah_record_per_bulan.show(12)

In [20]:
# Derive year ("tahun"), month ("bulan") and day ("hari") string columns from dt_pos_utc
data_ais = (
    data_ais
    .withColumn("tahun", date_format(col("dt_pos_utc"), "yyyy"))
    .withColumn("bulan", date_format(col("dt_pos_utc"), "MM"))
    .withColumn("hari", date_format(col("dt_pos_utc"), "dd"))
)

# Date window used for the visualisation subset.
# NOTE(review): these rebind start_date / end_date, which hold datetime objects in
# the AIS download cell, to plain strings — re-running that cell afterwards would
# pass strings to get_ais.
start_date = "2022-01-01"
end_date = "2022-01-15"

# NOTE(review): if dt_pos_utc is a timestamp, the upper bound is presumably compared
# as midnight of 2022-01-15, which would leave out the rest of that day — confirm.
data_ais_filtered = data_ais.where(col("dt_pos_utc").between(start_date, end_date))

# Visualization

In [34]:
# Initialize Spark Session with updated configurations.
# spark.rpc.message.maxSize is a plain integer number of MiB; a size suffix such as
# "512m" is not a valid value for it, so the bare number is used (matching the
# commented-out builder in the first cell). spark.driver.maxResultSize does accept
# a size suffix.
# NOTE(review): getOrCreate() hands back an already-running session when one exists,
# in which case startup-time settings like these may not be applied — confirm.
spark = (
    SparkSession.builder
    .appName("AIS Data Processing")
    .config("spark.rpc.message.maxSize", "512")
    .config("spark.driver.maxResultSize", "2g")
    .getOrCreate()
)

In [None]:
# World-level Folium map plus a marker cluster layer attached to it
n = folium.Map(zoom_start=2, location=[0, 0])
marker_cluster = MarkerCluster()
marker_cluster.add_to(n)

def get_marker_color(vessel_type):
    """Map a vessel type to its marker colour.

    'Passenger' gives 'blue', 'Pleasure Craft' gives 'green', and any
    other value falls back to 'gray'.
    """
    known_colours = (
        ('Passenger', 'blue'),
        ('Pleasure Craft', 'green'),
    )
    for label, colour in known_colours:
        if vessel_type == label:
            return colour
    return 'gray'  # any other vessel type

def process_partition(iterator):
    """Add one circle marker per row to the module-level ``marker_cluster``.

    Each item of ``iterator`` must expose ``latitude``, ``longitude`` and
    ``vessel_type`` attributes; the coordinates are converted with ``float``
    and the marker colour comes from ``get_marker_color``.
    """
    for record in iterator:
        position = [float(record.latitude), float(record.longitude)]
        marker = folium.CircleMarker(
            location=position,
            radius=5,
            color=get_marker_color(record.vessel_type),
            fill=True,
        )
        marker.add_to(marker_cluster)

# Collect the filtered Spark DataFrame as a pandas DataFrame
ais_pandas = data_ais_filtered.toPandas()

# Hand the rows to process_partition in slices of batch_size rows
batch_size = 1000
for start in range(0, len(ais_pandas), batch_size):
    end = start + batch_size
    batch = ais_pandas.iloc[start:end]
    process_partition(batch.itertuples(index=False))

# Write the map to an HTML file
n.save('ais_visualization.html')

# Show the map as the cell output
n

# Saving File

In [None]:
# Save as Parquet.
# data_ais is read lazily from "data-ais-asean-passenger-and-pleasurecraft-2022.parquet"
# earlier in the notebook, and Spark refuses to overwrite a path that the same query is
# still reading from. The result (which now also carries the tahun / bulan / hari
# columns) is therefore written to a separate path.
# The `header` option is dropped because it only applies to CSV output.
data_ais.write.mode("overwrite").parquet(path_unique + "data-ais-asean-passenger-and-pleasurecraft-2022-with-date-parts.parquet")

# Stop Spark Session

In [24]:
# Stop the Spark session now that the notebook's work is finished
spark.stop()