In [1]:
# For 3.3.1
# Register Sedona's functions with the Spark session.
# NOTE(review): `spark` is not created anywhere in this notebook — presumably
# it is injected by the kernel/environment; confirm before a fresh run.
from sedona.register import SedonaRegistrator
SedonaRegistrator.registerAll(spark)

True

In [2]:
#For 3.3.2
from shapely.geometry import Point, Polygon, mapping
import h3.api.numpy_int as h3int

In [3]:
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.functions import col, when, date_format

In [4]:
# S3 prefix of the source AIS parquet data; the "year=2022" sub-path is read from it below.
basepath = "s3a://ungp-ais-data-historical-backup/exact-earth-data/transformed/prod/"

In [5]:
# Output locations: shared scratch prefix, then this notebook's own sub-folder
save_path = "s3a://ungp-ais-data-historical-backup/user_temp/"
path_unique = f"{save_path}222011349/"

# Data AIS

In [6]:
# Load the source parquet data stored under the "year=2022" sub-path of basepath
df_data = spark.read.parquet(f"{basepath}year=2022")

In [7]:
# Derive a "tahun" (year) column from the dt_pos_utc date/time column,
# then keep only the rows that belong to 2022.
df_data = (
    df_data
    .withColumn("tahun", date_format("dt_pos_utc", "yyyy"))
    .filter(col("tahun") == 2022)
)

In [8]:
# Persist the 2022 data as parquet, replacing any previous output at this path.
# The former .option("header", True) was removed: "header" is a CSV option and
# has no effect on the parquet writer, so it only misled the reader.
df_data.write.mode("overwrite").parquet(path_unique + "data-ais-dunia-2022.parquet")

In [6]:
# Reload the 2022 snapshot written above.
# The former header=True keyword was removed: parquet files carry their own
# schema and "header" is not a parquet read option.
data_ais = spark.read.parquet(path_unique + "data-ais-dunia-2022.parquet")

In [7]:
# Row count of the reloaded 2022 snapshot (baseline before any filtering).
data_ais.count()

8898333301

# Filter Data

In [7]:
# MMSI filter: keep rows whose mmsi lies in 100000000..999999999 (inclusive)
filtered_mmsi = data_ais.where(
    (F.col("mmsi") >= 100_000_000) & (F.col("mmsi") <= 999_999_999)
)

In [9]:
# Row count after the MMSI filter.
filtered_mmsi.count()

8897196247

In [8]:
# IMO filter: keep rows whose imo lies in 1000000..9999999 (inclusive)
filtered_imo = filtered_mmsi.where(
    (F.col("imo") >= 1_000_000) & (F.col("imo") <= 9_999_999)
)

In [11]:
# Row count after the IMO filter.
filtered_imo.count()

4805707952

In [9]:
# Navigation-status filter
# Step 1: the numeric status code must lie in 0..14 (inclusive)
filtered_nav_status_code = filtered_imo.where(F.col("nav_status_code").between(0, 14))

# Status labels that are kept
nav_status_values = [
    'Under Way Using Engine',
    'At Anchor',
    'Restricted Manoeuvrability',
    'Moored',
    'Engaged In Fishing',
    'Underway Sailing',
]

# Step 2: keep rows whose 'nav_status' label is one of the values above
filtered_nav_status = filtered_nav_status_code.where(F.col("nav_status").isin(nav_status_values))

In [13]:
# Row count after the navigation-status filter.
filtered_nav_status.count()

4559967734

In [10]:
# Vessel-type filter
# Step 1: the numeric vessel type code must lie in 1..255 (inclusive)
filtered_vessel_type_code = filtered_nav_status.where(
    (F.col("vessel_type_code") >= 1) & (F.col("vessel_type_code") <= 255)
)

# Vessel type labels that are kept
vessel_type_values = [
    'Sailing',
    'Tanker',
    'Other',
    'Pleasure Craft',
    'Passenger',
    'Fishing',
    'Port Tender',
    'Dredging',
    'Cargo',
]

# Step 2: keep rows whose 'vessel_type' label is one of the values above
filtered_vessel_type = filtered_vessel_type_code.where(F.col("vessel_type").isin(vessel_type_values))

In [15]:
# Row count after the vessel-type filter.
filtered_vessel_type.count()

3868353761

In [11]:
# Flag-state filter: keep rows whose flag_code lies in 201..775 (inclusive)
filtered_flag_country_code = filtered_vessel_type.where(F.col("flag_code").between(201, 775))

# Label each row by flag country: "Indonesia" for Indonesian-flagged vessels,
# "Asing" (foreign) for every other value of 'flag_country'
filtered_flag_country = filtered_flag_country_code.withColumn(
    "status_country",
    F.when(F.col("flag_country") == "Indonesia", "Indonesia").otherwise("Asing"),
)

In [17]:
# Row count after the flag-code filter (adding status_country does not remove rows).
filtered_flag_country.count()

3808952401

In [12]:
# Coordinate sanity filter (marked as not needed by the author: the recorded
# row count is the same before and after this step)
filtered_lat = filtered_flag_country.where(
    (F.col("latitude") >= -90) & (F.col("latitude") <= 90)
)
filtered_lat_long = filtered_lat.where(
    (F.col("longitude") >= -180) & (F.col("longitude") <= 180)
)

In [19]:
# Row count after the latitude/longitude range filter.
filtered_lat_long.count()

3808952401

In [13]:
# dt_pos_utc format filter (marked as not needed by the author)
#
# Expected shape: "<year>-<month>-<day> <hour>:<minute>:<second>", optionally
# followed by fractional seconds.
# Every alternation is wrapped in a group. In the previous pattern the bare
# `|` operators split the whole expression into loose fragments (e.g. just
# "0[1-9]"), so practically any string matched and the filter validated nothing.
# NOTE(review): assumes dt_pos_utc renders with a space between date and time
# when matched as a string — confirm against the actual column type/format.
pattern = (
    r"^[1-9][0-9]{0,3}"                  # year, no leading zero
    r"-(0?[1-9]|1[0-2])"                 # month 1-12
    r"-(0?[1-9]|[12][0-9]|3[01])"        # day 1-31
    r" ([01]?[0-9]|2[0-3])"              # hour 0-23
    r":[0-5][0-9]:[0-5][0-9]"            # minutes and seconds
    r"(\.[0-9]+)?$"                      # optional fractional seconds
)

# Keep only the rows whose 'dt_pos_utc' matches the pattern (i.e. the valid ones)
filtered_dt_pos_utc = filtered_lat_long.filter(F.col("dt_pos_utc").rlike(pattern))

In [21]:
# Row count after the dt_pos_utc pattern filter.
filtered_dt_pos_utc.count()

3808952401

In [14]:
# Duplicate-record filter

# Drop rows that are identical across every column
filtered_duplicates = filtered_dt_pos_utc.distinct()

In [None]:
# Row count after de-duplication.
# NOTE(review): the recorded output below is a KeyboardInterrupt traceback,
# i.e. this count was interrupted and never completed in the saved run.
filtered_duplicates.count()

Error while receiving.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
RuntimeError: reentrant call inside <_io.BufferedReader name=60>
Closing down clientserver connection
Closing down clientserver connection


ERROR: Exception while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
RuntimeError: reentrant call inside <_io.BufferedReader name=60>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR: KeyboardInterrupt while

In [None]:
# Anomalous-movement filter

# A record counts as anomalous when its navigation status contradicts its
# speed over ground ("sog"):
#   - status 'At Anchor' or 'Moored' while sog > 1, or
#   - one of the under-way style statuses listed below while sog < 1.
# NOTE(review): 'Not Under Command' and 'Aground' are not part of
# nav_status_values in the earlier navigation-status filter, so they
# presumably never occur at this point — confirm.
# NOTE(review): when sog or nav_status is null the negated condition can
# evaluate to null, and such rows are not kept by the filter — confirm intended.
kriteria_pergerakan_tidak_normal = (
    (F.col("nav_status").isin(['At Anchor', 'Moored']) & (F.col("sog") > 1))
    | (
        F.col("nav_status").isin([
            'Not Under Command',
            'Underway Sailing',
            'Under Way Using Engine',
            'Engaged In Fishing',
            'Restricted Manoeuvrability',
            'Aground',
        ])
        & (F.col("sog") < 1)
    )
)

# Normal movement is simply the negation of the anomalous criteria
kriteria_pergerakan_normal = ~kriteria_pergerakan_tidak_normal

# Keep only the rows with normal movement
filtered_pergerakan_normal = filtered_duplicates.where(kriteria_pergerakan_normal)

In [None]:
# Row count after removing anomalous-movement records.
filtered_pergerakan_normal.count()

## Save Data

In [17]:
# Final filtered dataset: alias for the output of the last filter step.
data_ais_filtered = filtered_pergerakan_normal

In [18]:
# Persist the filtered data as parquet, replacing any previous output at this path.
# The former .option("header", True) was removed: "header" is a CSV option and
# has no effect on the parquet writer.
data_ais_filtered.write.mode("overwrite").parquet(path_unique + "data-ais-filter-dunia-2022.parquet")

In [28]:
# Stop the Spark session; cells that use `spark` cannot run after this.
spark.stop()