# Initialialize

In [2]:
#Sedona Imports
import sedona.sql
from sedona.register import SedonaRegistrator
from sedona.utils import SedonaKryoRegistrator, KryoSerializer
from sedona.core.SpatialRDD import PolygonRDD, PointRDD
from sedona.core.enums import FileDataSplitter
import pyspark.sql.types as pst
from pyspark import StorageLevel
from pyspark.sql import SparkSession 

In [3]:
spark = SparkSession.builder \
    .appName('Vessel_Traffic_Indonesia') \
    .config("spark.serializer", KryoSerializer.getName) \
    .config("spark.kryo.registrator", SedonaKryoRegistrator.getName) \
    .config("spark.sql.parquet.enableVectorizedReader", "false") \
    .config('spark.jars.packages', 'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.1-incubating,org.apache.sedona:sedona-viz-3.0_2.12:1.0.1-incubating') \
    .getOrCreate()

SedonaRegistrator.registerAll(spark)

True

In [4]:
import subprocess
import sys

In [5]:
GITLAB_USER = "read aistt"
GITLAB_TOKEN = "J1KkstArfyXB6dZvFchN"
git_package = f"git+https://(GITLAB_USER):(GITLAB_TOKEN)@code.officialstatistics.org/trade-task-team-phase-1/ais.git"
std_out = subprocess.run([sys.executable, "-m", "pip", "install", git_package], capture_output=True, text=True) .stdout
print(std_out)

Collecting git+https://%28GITLAB_USER%29:****@code.officialstatistics.org/trade-task-team-phase-1/ais.git
  Cloning https://%28GITLAB_USER%29:****@code.officialstatistics.org/trade-task-team-phase-1/ais.git to /tmp/pip-req-build-fez5t5w5



In [6]:
GITLAB_USER = 'ml_group_read_only'
GITLAB_TOKEN = 'eac7ZwiseRdeLwmBsrsm'

# Main: for using from current issued version
git_package = f"git+https://{GITLAB_USER}:{GITLAB_TOKEN}@code.officialstatistics.org/mlpolygonsalgorithm/ml-group-polygons.git"

std_out = subprocess.run([sys.executable, "-m", "pip", "install",git_package], capture_output=True, text=True).stdout
print(std_out) 

Collecting git+https://ml_group_read_only:****@code.officialstatistics.org/mlpolygonsalgorithm/ml-group-polygons.git
  Cloning https://ml_group_read_only:****@code.officialstatistics.org/mlpolygonsalgorithm/ml-group-polygons.git to /tmp/pip-req-build-gvqw054f
  Resolved https://ml_group_read_only:****@code.officialstatistics.org/mlpolygonsalgorithm/ml-group-polygons.git to commit 89f1aab64fee28c2f86e86d6fa7b55118882b1e8
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: unece-ais
  Building wheel for unece-ais (setup.py): started
  Building wheel for unece-ais (setup.py): finished with status 'done'
  Created wheel for unece-ais: filename=unece_ais-0.0.4-py3-none-any.whl size=12493 sha256=55874ebc35e96e87add4807c68082c4686eb6dd104cb51879665007f1e5e45bc
  Stored in directory: /tmp/pip-ephem-wheel-cache-euzijljz/wheels/61/b5/f9/bcf024b104169c32950c03a4605d2d07ea9da07cae7bed5e3e
Successfully built u

In [7]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, count, countDistinct, when, expr, unix_timestamp
from pyspark.sql.functions import year, month, dayofmonth, hour, minute, second
from pyspark.sql.functions import monotonically_increasing_id, lead, lag, abs, row_number
from pyspark.sql.functions import concat_ws, split, lit, min, max
from pyspark.sql.types import IntegerType, StringType, StructType
from pyspark.sql.window import Window

from shapely.geometry import Point, Polygon, mapping
from IPython.display import HTML
from ais import functions as af
from unece_ais import unece_ais as un
from multiprocessing import Pool

In [8]:
import h3.api.numpy_int as h3int
import matplotlib.pyplot as plt
import geopandas as gpd
import seaborn as sns
import pandas as pd
import numpy as np
import calendar
import base64
import folium
import tqdm
import h3

generated new fontManager


In [9]:
pd.set_option('display.max_columns', None) #Show all columns in pandas df
pd.set_option('display.max_rows', 100) #Show 100 rows in pandas df
pd.options.display.float_format = '{:.10f}'.format #Show float with 10 decimal points in pandas df

from IPython.core.interactiveshell import InteractiveShell #allow multiple outputs in one jupyter cell
InteractiveShell.ast_node_interactivity = "all"

In [10]:
# Path
base_path = "s3a://ungp-ais-data-historical-backup/user_temp/"
path_unique = base_path + "222011349/"

# Data

## Port-AOI Indonesia

In [10]:
# Read Data
port_aoi = spark.read.parquet(path_unique + "ports_indonesia_v2.parquet", header=True)

In [11]:
#explode data port
port_aoi_exploded = port_aoi.select("Port", F.explode("h3_cluster_index").alias("boundary_h3"))

## Data AIS

In [12]:
# Read Data
data_ais_filter = spark.read.parquet(path_unique + "data-ais-ihs-indonesia-by-mmsi-filter-2022.parquet", header=True)

In [13]:
# Filter Record < 10

# Menghitung jumlah record per MMSI
record_counts = data_ais_filter.groupBy("mmsi").agg(count("*").alias("record_count"))

# Mendapatkan MMSI dengan record kurang dari 10
mmsi_less_than_10 = record_counts.filter(col("record_count") < 10).select("mmsi")

# Menghapus MMSI dengan record kurang dari 10 dari DataFrame asli menggunakan left_anti join
mmsi_with_record_great_10 = data_ais_filter.join(mmsi_less_than_10, on="mmsi", how="left_anti")

In [14]:
# Filter SOG > 3 berjumlah < 20 

# Filter data berdasarkan kondisi SOG lebih dari 3
filtered_data = mmsi_with_record_great_10.filter(col("sog") > 3)

# Kelompokkan data berdasarkan MMSI dan hitung jumlah catatan
grouped_data = filtered_data.groupBy("mmsi").agg(count("*").alias("record_count"))

# Filter MMSI yang memiliki SOG lebih dari 3 tetapi kurang dari 20
filtered_mmsi = grouped_data.filter((col("record_count") < 20)).select("mmsi")

# Hapus baris yang terkait dengan MMSI yang telah difilter dari DataFrame
mmsi_with_sog_greater3_greater20 = mmsi_with_record_great_10.join(filtered_mmsi, "mmsi", "left_anti")

In [15]:
# Select beberapa kolom 
data_ais = mmsi_with_sog_greater3_greater20.select("mmsi", "imo", "nav_status", "vessel_type", "flag_country", "status_country", "OperatorCountryOfRegistration", "OperatorCountryofDomicileName", "draught", "latitude", "longitude", "dt_pos_utc", "sog", "H3_int_index_8")

In [None]:
data_ais.count()

1443592577

# Masuk-Keluar Indonesia

## Match Port-AOI & Vessel

In [16]:
#Cek kecocokan H3 kapal dg port

# Gabungkan dua DataFrame berdasarkan kondisi
joined_data = data_ais.join(port_aoi_exploded, 
                             data_ais['H3_int_index_8'] == port_aoi_exploded['boundary_h3'], 
                             how='left')

# Tentukan nilai kolom 'position' berdasarkan hasil join
match_port_aoi = joined_data.withColumn("position", 
                                  when(col("boundary_h3").isNull(), "out port")
                                  .otherwise("in port"))

# Selecting relevant columns and filtering out null values
match_port_aoi_select = match_port_aoi.select("mmsi", "Port", "dt_pos_utc", col("flag_country").alias('fc_vessel'), col("status_country").alias('sc_vessel'), "vessel_type", col("nav_status").alias('ns_vessel'), "draught", "position")

In [17]:
# Drop Duplicate jika ada
match_port_aoi_select = match_port_aoi_select.dropDuplicates()

In [18]:
match_port_aoi_select = match_port_aoi_select.orderBy("mmsi", "dt_pos_utc", "Port")

## Arus Masuk-Keluar

### Filter Out Port

In [19]:
# Langkah 1: Buang MMSI yang hanya memiliki nilai "out port" (tanpa "in port")

mmsi_with_in_port_only = match_port_aoi_select.filter(match_port_aoi_select.position == "in port").select("mmsi").distinct()
mmsi_with_out_port_only = match_port_aoi_select.filter(match_port_aoi_select.position == "out port").select("mmsi").distinct()

mmsi_to_keep = mmsi_with_in_port_only.join(mmsi_with_out_port_only, "mmsi", "left")

# Buang MMSI yang hanya memiliki nilai "out port" (tanpa "in port"), tetapi pertahankan nilai "out port" jika ada setidaknya satu nilai "in port"
filtered_data1 = match_port_aoi_select.join(mmsi_to_keep, "mmsi", "inner")

In [20]:
filtered_data1 = filtered_data1.orderBy("mmsi", "dt_pos_utc", "Port")

In [21]:
# Langkah 2: Identifikasi waktu "in port" pertama dan terakhir untuk setiap MMSI
# Membuat window specification untuk mengelompokkan data berdasarkan MMSI dan mengurutkannya berdasarkan waktu
window_spec = Window.partitionBy("mmsi").orderBy("dt_pos_utc", "Port")

# Menetapkan flag untuk record "in port" pertama dan terakhir
filtered_data2 = filtered_data1.withColumn("first_in_port", F.min(F.when(F.col("position") == "in port", F.col("dt_pos_utc"))).over(window_spec))
filtered_data2 = filtered_data2.withColumn("last_in_port", F.max(F.when(F.col("position") == "in port", F.col("dt_pos_utc"))).over(window_spec))

# Menetapkan flag untuk record "out port" yang berada sebelum dan sesudah "in port" pertama dan terakhir, serta di antaranya
filtered_data2 = filtered_data2.withColumn("before_first_in_port", F.lead("position").over(window_spec) == "in port")
filtered_data2 = filtered_data2.withColumn("after_last_in_port", F.lag("position").over(window_spec) == "in port")

In [22]:
filtered_data2 = filtered_data2.orderBy("mmsi", "dt_pos_utc", "Port")

In [23]:
# Filter record "out port" yang memenuhi kriteria
filtered_data2 = filtered_data2.filter(
    (F.col("position") == "in port") |
    ((F.col("position") == "out port") & (F.col("before_first_in_port") 
                                          | F.col("after_last_in_port") 
                                         )
    )
)

In [24]:
# Daftar kolom yang ingin dijatuhkan
kolom_drop = ["first_in_port", "last_in_port", "before_first_in_port", "after_last_in_port"]

# Menjatuhkan kolom yang tidak diperlukan dari DataFrame
filtered_data2 = filtered_data2.drop(*kolom_drop)

In [25]:
filtered_data2 = filtered_data2.orderBy("mmsi", "dt_pos_utc", "Port")

### Filter In Port

In [26]:
# Definisikan window specification
window_spec = Window.partitionBy("mmsi").orderBy("dt_pos_utc", "Port")

# Tentukan apakah Port sama dengan baris sebelumnya
filter_data = filtered_data2.withColumn(
    "same_port_as_previous",
    F.when(
        F.lag("Port").over(window_spec) == F.col("Port"), 
        True
    ).otherwise(False)
)

In [27]:
# Kolom Port sama dengan baris sebelumnya untuk baris setelahnya
filter_data = filter_data.withColumn("same_port_next", F.lead("same_port_as_previous").over(window_spec))

In [28]:
# Membuat window specification untuk mengelompokkan data berdasarkan MMSI dan mengurutkannya berdasarkan waktu
window_spec = Window.partitionBy("mmsi").orderBy("dt_pos_utc", "Port")

# Menambahkan kolom baru untuk menghitung jumlah baris "in port" di antara dua baris "out port" berturut-turut
filter_data = filter_data.withColumn("in_port_count", F.sum(F.when(F.col("position") == "out port", 0).otherwise(1)).over(window_spec))

# Menetapkan flag untuk record "out port" pertama dan terakhir
filter_data = filter_data.withColumn("first_out_port", F.min(F.when(F.col("position") == "out port", F.col("dt_pos_utc"))).over(window_spec))
filter_data = filter_data.withColumn("last_out_port", F.max(F.when(F.col("position") == "out port", F.col("dt_pos_utc"))).over(window_spec))

# Menetapkan flag untuk record "in port" yang berada sebelum dan sesudah "out port" pertama dan terakhir, serta di antaranya
filter_data = filter_data.withColumn("before_first_out_port", F.lead("position").over(window_spec) == "out port")
filter_data = filter_data.withColumn("after_last_out_port", F.lag("position").over(window_spec) == "out port")

In [29]:
filter_data = filter_data.orderBy("mmsi", "dt_pos_utc", "Port")

In [30]:
# Filter record "in port" yang memenuhi kriteria
filter_final = filter_data.filter(
    (F.col("position") == "out port") |
    (
        (F.col("position") == "in port") 
        & 
        (
            (F.col("same_port_as_previous") == False)
            |
            (
                (F.col("same_port_as_previous") == True) 
                & ((F.col("before_first_out_port") == True) 
                    | (F.col("after_last_out_port") == True)
                   | (F.col("same_port_next") == False)
                  )
            )
        )
    )
)

In [31]:
# Daftar kolom yang ingin dijatuhkan
kolom_drop = ["same_port_as_previous", "same_port_next", "in_port_count", "first_out_port", "last_out_port", "before_first_out_port", "after_last_out_port"]

# Menjatuhkan kolom yang tidak diperlukan dari DataFrame
filter_final = filter_final.drop(*kolom_drop)

In [32]:
# Drop Duplicate jika ada
filter_final = filter_final.dropDuplicates()

In [33]:
filter_final = filter_final.orderBy("mmsi", "dt_pos_utc", "Port")

### Labeli In Port

In [34]:
# Membuat kolom-kolom baru dengan nilai awal '-'
after_filter = filter_final.withColumn("masuk_pelabuhan", lit("-")) \
                   .withColumn("keluar_pelabuhan", lit("-")) \
                   .withColumn("masuk_indo", lit("-")) \
                   .withColumn("keluar_indo", lit("-"))

In [35]:
# Mendapatkan baris-baris dengan urutan waktu
window_spec = Window.partitionBy("mmsi").orderBy("dt_pos_utc", "Port")

# Tambahkan kolom prev_position
after_filter = after_filter.withColumn("prev_position", F.lag("position", 1).over(window_spec))

# Tambahkan kolom next_position
after_filter = after_filter.withColumn("next_position", F.lead("position", 1).over(window_spec))

In [36]:
# Definisikan window specification
window_spec = Window.partitionBy("mmsi").orderBy("dt_pos_utc", "Port")

# Tentukan apakah Port sama dengan baris sebelumnya
after_filter = after_filter.withColumn(
    "same_port_as_previous",
    F.when(
        F.lag("Port").over(window_spec) == F.col("Port"), 
        True
    ).otherwise(False)
)

# Tentukan apakah Port sama dengan baris sebelumnya
after_filter = after_filter.withColumn("same_port_next", F.lead("same_port_as_previous").over(window_spec))

In [37]:
# Mendapatkan baris-baris dengan urutan waktu
window_spec = Window.partitionBy("mmsi").orderBy("dt_pos_utc", "Port")

match_port_aoi_select_in = after_filter.filter(col("position") == "in port")

# Menambahkan kolom baru untuk menandai baris pertama dengan nilai "in port" dari semua baris "in port" untuk suatu MMSI
match_port_aoi_select_in = match_port_aoi_select_in.withColumn("first_in_port_all", 
                                (lag("position", 1).over(window_spec).isNull()) & (col("position") == "in port"))

# Menambahkan kolom baru untuk mendeteksi baris terakhir dengan nilai "in port" dari semua baris "in port" untuk suatu MMSI
match_port_aoi_select_in = match_port_aoi_select_in.withColumn("last_in_port_all", 
                                (lead("position", 1).over(window_spec).isNull()) & (col("position") == "in port"))

In [38]:
# Gabungkan kembali dengan DataFrame asli
joined_data_port = after_filter.join(match_port_aoi_select_in, 
                             ["mmsi", "Port", "dt_pos_utc", "fc_vessel", "sc_vessel", "vessel_type", "ns_vessel", "draught", "position", "masuk_pelabuhan", "keluar_pelabuhan", "masuk_indo", "keluar_indo", "prev_position", "next_position", "same_port_as_previous", "same_port_next"], 
                             how='outer')

# Select kolom yang relevan dan isi nilai NULL dengan False
match_port = joined_data_port.select("mmsi", "Port", "dt_pos_utc", "fc_vessel", "sc_vessel", "vessel_type", "ns_vessel", "draught", "position", "masuk_pelabuhan", "keluar_pelabuhan", "masuk_indo", "keluar_indo", "prev_position", "next_position", "same_port_as_previous", "same_port_next", "first_in_port_all", "last_in_port_all")

In [39]:
match_port = match_port.filter(
    (F.col("position") == "out port") |
    (
        (F.col("position") == "in port") 
        & 
        (
            (F.col("first_in_port_all").isNotNull())
            &
            (F.col("last_in_port_all").isNotNull())
        )
    )
)

In [40]:
match_port = match_port.orderBy("mmsi", "dt_pos_utc", "Port")

In [41]:
# Mendefinisikan kondisi untuk setiap baris
condition1 = (match_port["position"] == "in port") & (((match_port["first_in_port_all"] == "True") & (match_port["last_in_port_all"] == "False")) & (match_port["prev_position"].isNull() & ((match_port["next_position"] == "in port") & (match_port["same_port_next"] == False))))                                                      
condition2 = (match_port["position"] == "in port") & (((match_port["first_in_port_all"] == "True") & (match_port["last_in_port_all"] == "False")) & (match_port["prev_position"].isNull() & (match_port["next_position"] == "out port")))                                                      

condition3 = (match_port["position"] == "in port") & (((match_port["first_in_port_all"] == "True") & (match_port["last_in_port_all"] == "False")) & ((match_port["prev_position"] == "out port") & (match_port["next_position"] == "out port")))
condition4 = (match_port["position"] == "in port") & (((match_port["first_in_port_all"] == "True") & (match_port["last_in_port_all"] == "False")) & ((match_port["prev_position"] == "out port") & ((match_port["next_position"] == "in port") & (match_port["same_port_next"] == False))))
condition5 = (match_port["position"] == "in port") & (((match_port["first_in_port_all"] == "True") & (match_port["last_in_port_all"] == "False")) & ((match_port["prev_position"] == "out port") & ((match_port["next_position"] == "in port") & (match_port["same_port_next"] == True))))

condition6 = (match_port["position"] == "in port") & (((match_port["first_in_port_all"] == "False") & (match_port["last_in_port_all"] == "False")) & ((match_port["prev_position"] == "out port") & (match_port["next_position"] == "out port")))
condition7 = (match_port["position"] == "in port") & (((match_port["first_in_port_all"] == "False") & (match_port["last_in_port_all"] == "False")) & ((match_port["prev_position"] == "out port") & ((match_port["next_position"] == "in port") & (match_port["same_port_next"] == False))))
condition8 = (match_port["position"] == "in port") & (((match_port["first_in_port_all"] == "False") & (match_port["last_in_port_all"] == "False")) & ((match_port["prev_position"] == "out port") & ((match_port["next_position"] == "in port") & (match_port["same_port_next"] == True))))

condition9 = (match_port["position"] == "in port") & (((match_port["first_in_port_all"] == "False") & (match_port["last_in_port_all"] == "False")) & ((match_port["prev_position"] == "in port") & (match_port["same_port_as_previous"] == True) & (match_port["next_position"] == "out port"))) 
condition10 = (match_port["position"] == "in port") & (((match_port["first_in_port_all"] == "False") & (match_port["last_in_port_all"] == "False")) & ((match_port["prev_position"] == "in port") & (match_port["same_port_as_previous"] == True) & ((match_port["next_position"] == "in port") & (match_port["same_port_next"] == False)))) 

condition11 = (match_port["position"] == "in port") & (((match_port["first_in_port_all"] == "False") & (match_port["last_in_port_all"] == "False")) & ((match_port["prev_position"] == "in port") & (match_port["same_port_as_previous"] == False) & (match_port["next_position"] == "out port"))) 
condition12 = (match_port["position"] == "in port") & (((match_port["first_in_port_all"] == "False") & (match_port["last_in_port_all"] == "False")) & ((match_port["prev_position"] == "in port") & (match_port["same_port_as_previous"] == False) & ((match_port["next_position"] == "in port") & (match_port["same_port_next"] == False)))) 
condition13 = (match_port["position"] == "in port") & (((match_port["first_in_port_all"] == "False") & (match_port["last_in_port_all"] == "False")) & ((match_port["prev_position"] == "in port") & (match_port["same_port_as_previous"] == False) & ((match_port["next_position"] == "in port") & (match_port["same_port_next"] == True)))) 

condition14 = (match_port["position"] == "in port") & (((match_port["first_in_port_all"] == "False") & (match_port["last_in_port_all"] == "False")) & ((match_port["prev_position"] == "out port") & (match_port["next_position"] == "out port")))
condition15 = (match_port["position"] == "in port") & (((match_port["first_in_port_all"] == "False") & (match_port["last_in_port_all"] == "False")) & (((match_port["prev_position"] == "in port") & (match_port["same_port_as_previous"] == False)) & (match_port["next_position"] == "out port")))
condition16 = (match_port["position"] == "in port") & (((match_port["first_in_port_all"] == "False") & (match_port["last_in_port_all"] == "False")) & (((match_port["prev_position"] == "in port") & (match_port["same_port_as_previous"] == True)) & (match_port["next_position"] == "out port")))

condition17 = (match_port["position"] == "in port") & (((match_port["first_in_port_all"] == "False") & (match_port["last_in_port_all"] == "False")) & ((match_port["prev_position"] == "out port") & (match_port["next_position"].isNull())))
condition18 = (match_port["position"] == "in port") & (((match_port["first_in_port_all"] == "False") & (match_port["last_in_port_all"] == "False")) & (((match_port["prev_position"] == "in port") & (match_port["same_port_as_previous"] == False) & (match_port["next_position"].isNull()))))

condition19 = (match_port["position"] == "in port") & (((match_port["first_in_port_all"] == "False") & (match_port["last_in_port_all"] == "True")) & ((match_port["prev_position"] == "out port") & (match_port["next_position"] == "out port")))
condition20 = (match_port["position"] == "in port") & (((match_port["first_in_port_all"] == "False") & (match_port["last_in_port_all"] == "True")) & (((match_port["prev_position"] == "in port") & (match_port["same_port_as_previous"] == False)) & (match_port["next_position"] == "out port")))
condition21 = (match_port["position"] == "in port") & (((match_port["first_in_port_all"] == "False") & (match_port["last_in_port_all"] == "True")) & (((match_port["prev_position"] == "in port") & (match_port["same_port_as_previous"] == True)) & (match_port["next_position"] == "out port")))

condition22 = (match_port["position"] == "in port") & (((match_port["first_in_port_all"] == "False") & (match_port["last_in_port_all"] == "True")) & ((match_port["prev_position"] == "out port") & (match_port["next_position"].isNull())))
condition23 = (match_port["position"] == "in port") & (((match_port["first_in_port_all"] == "False") & (match_port["last_in_port_all"] == "True")) & (((match_port["prev_position"] == "in port") & (match_port["same_port_as_previous"] == False) & (match_port["next_position"].isNull()))))

condition24 = (match_port["position"] == "in port") & (((match_port["first_in_port_all"] == "True") & (match_port["last_in_port_all"] == "True")) & ((match_port["prev_position"] == "out port")))
condition25 = (match_port["position"] == "in port") & (((match_port["first_in_port_all"] == "True") & (match_port["last_in_port_all"] == "True")) & ((match_port["next_position"] == "out port")))

In [42]:
# Mengisi kolom-kolom yang sesuai berdasarkan kondisi
port_traffic = match_port.withColumn("masuk_pelabuhan", when((condition3 | condition4 | condition5 | condition6 | condition7 | condition8 | condition11 | condition12 | condition13 | condition14 | condition15 | condition17 | condition18 | condition19 | condition20 | condition22 | condition23 | condition24), "masuk").otherwise("-")) \
                   .withColumn("keluar_pelabuhan", when((condition1 | condition2 | condition3 | condition4 | condition6 | condition7 | condition9 | condition10 | condition11 | condition12 | condition14 | condition15 | condition16 | condition19 | condition20 | condition21 | condition25), "keluar").otherwise("-")) \
                   .withColumn("masuk_indo", when((condition3 | condition4 | condition5 | condition24), "masuk").otherwise("-")) \
                   .withColumn("keluar_indo", when((condition19 | condition20 | condition21 | condition25), "keluar").otherwise("-"))

In [43]:
# Daftar kolom yang ingin dijatuhkan
kolom_drop = ["prev_position", "next_position", "same_port_as_previous", "same_port_next", "first_in_port_all", "last_in_port_all"]

# Menjatuhkan kolom yang tidak diperlukan dari DataFrame
port_traffic1 = port_traffic.drop(*kolom_drop)

In [44]:
port_traffic1 = port_traffic1.orderBy("mmsi", "dt_pos_utc", "Port")

In [45]:
port_traffic_in = port_traffic1.filter(col("position") == "in port")

### Labeli Out Port

In [46]:
# Mendapatkan baris-baris dengan urutan waktu
window_spec = Window.partitionBy("mmsi").orderBy("dt_pos_utc", "Port")

# Tambahkan kolom prev_position
port_traffic1 = port_traffic1.withColumn("prev_position", F.lag("position", 1).over(window_spec))

# Tambahkan kolom next_position
port_traffic1 = port_traffic1.withColumn("next_position", F.lead("position", 1).over(window_spec))

In [47]:
# Mendapatkan baris-baris dengan urutan waktu
window_spec = Window.partitionBy("mmsi").orderBy("dt_pos_utc", "Port")

match_port_aoi_select_out = port_traffic1.filter(col("position") == "out port")

# Menambahkan kolom baru untuk menandai baris pertama dengan nilai "out port" dari semua baris "out port" untuk suatu MMSI
match_port_aoi_select_out = match_port_aoi_select_out.withColumn("first_out_port_all", 
                                (lag("position", 1).over(window_spec).isNull()) & (col("position") == "out port"))

# Menambahkan kolom baru untuk mendeteksi baris terakhir dengan nilai "out port" dari semua baris "out port" untuk suatu MMSI
match_port_aoi_select_out = match_port_aoi_select_out.withColumn("last_out_port_all", 
                                (lead("position", 1).over(window_spec).isNull()) & (col("position") == "out port"))

In [48]:
# Gabungkan kembali dengan DataFrame asli
joined_data = port_traffic1.join(match_port_aoi_select_out, 
                             ["mmsi", "Port", "dt_pos_utc", "fc_vessel", "sc_vessel", "vessel_type", "ns_vessel", "draught", "position", "masuk_pelabuhan", "keluar_pelabuhan", "masuk_indo", "keluar_indo", "prev_position", "next_position"], 
                             how='outer')

# Select kolom yang relevan dan isi nilai NULL dengan False
port_traffic2 = joined_data.select("mmsi", "Port", "dt_pos_utc", "fc_vessel", "sc_vessel", "vessel_type", "ns_vessel", "draught", "position", "masuk_pelabuhan", "keluar_pelabuhan", "masuk_indo", "keluar_indo", "prev_position", "next_position", "first_out_port_all", "last_out_port_all")

In [49]:
port_traffic2 = port_traffic2.filter(
    (F.col("position") == "in port") |
    (
        (F.col("position") == "out port") 
        & 
        (
            (F.col("first_out_port_all").isNotNull())
            &
            (F.col("last_out_port_all").isNotNull())
        )
    )
)

In [50]:
port_traffic2 = port_traffic2.orderBy("mmsi", "dt_pos_utc", "Port")

In [51]:
# Mendefinisikan kondisi untuk setiap baris
condition1 = (port_traffic2["position"] == "out port") & ((port_traffic2["first_out_port_all"] == "True") & (port_traffic2["next_position"] == "in port"))
condition2 = (port_traffic2["position"] == "out port") & ((port_traffic2["last_out_port_all"]  == "True") & (port_traffic2["prev_position"] == "in port"))
condition3 = (port_traffic2["position"] == "out port") & ((port_traffic2["next_position"] == "in port"))
condition4 = (port_traffic2["position"] == "out port") & ((port_traffic2["prev_position"] == "in port"))

In [52]:
# Mengisi kolom-kolom yang sesuai berdasarkan kondisi
port_traffic3 = port_traffic2.withColumn("masuk_pelabuhan", when((condition1 | condition3), "masuk").otherwise("-")) \
                   .withColumn("keluar_pelabuhan", when((condition2 | condition4), "keluar").otherwise("-")) \
                   .withColumn("masuk_indo", when((condition1), "masuk").otherwise("-")) \
                   .withColumn("keluar_indo", when((condition2), "keluar").otherwise("-"))

In [53]:
# Daftar kolom yang ingin dijatuhkan
kolom_drop = ["prev_position", "next_position", "first_out_port_all", "last_out_port_all"]

# Menjatuhkan kolom yang tidak diperlukan dari DataFrame
port_traffic3 = port_traffic3.drop(*kolom_drop)

In [54]:
port_traffic3 = port_traffic3.orderBy("mmsi", "dt_pos_utc", "Port")

In [55]:
port_traffic_out = port_traffic3.filter(col("position") == "out port")

### Final Data

In [56]:
result_out_in = port_traffic_in.unionAll(port_traffic_out)

In [57]:
result_out_in = result_out_in.orderBy("mmsi", "dt_pos_utc", "Port")

In [None]:
result_out_in.count()

83573468

### Save Data

In [None]:
# Save Data
result_out_in.write.option("header", True).mode("overwrite").parquet(path_unique + "data-ais-ihs-indonesia-by-mmsi-masuk-keluar-indonesia-2022-cb-rev-v2.parquet")

### Read Data

In [11]:
# Read Data
result_out_in = spark.read.parquet(path_unique + "data-ais-ihs-indonesia-by-mmsi-masuk-keluar-indonesia-2022-cb-rev-v2.parquet", header=True)

### Masuk Pelabuhan

In [12]:
# Masuk
result_in_port = result_out_in.filter((col("masuk_pelabuhan") == "masuk") & (col("position") == "in port"))

In [12]:
result_in_port.count()

81359530

In [13]:
# Kapal Asing
result_in_port_asing = result_in_port.filter(col("sc_vessel") == "Asing")

In [14]:
result_in_port_asing.count()

64078207

### Keluar Pelabuhan

In [15]:
# Keluar
result_out_port = result_out_in.filter((col("keluar_pelabuhan") == "keluar") & (col("position") == "in port"))

In [16]:
result_out_port.count()

81359462

In [17]:
# Kapal Asing
result_out_port_asing = result_out_port.filter(col("sc_vessel") == "Asing")

In [18]:
result_out_port_asing.count()

64078185

### Masuk Indonesia

In [19]:
# Masuk
result_in_indo = result_out_in.filter((col("masuk_indo") == "masuk") & (col("position") == "in port"))

In [20]:
result_in_indo.count()

19752

In [21]:
# Kapal Asing
result_in_indo_asing = result_in_indo.filter(col("sc_vessel") == "Asing")

In [22]:
result_in_indo_asing.count()

18612

### Keluar Indonesia

In [23]:
# Keluar
result_out_indo = result_out_in.filter((col("keluar_indo") == "keluar") & (col("position") == "in port"))

In [24]:
result_out_indo.count()

19684

In [25]:
# Kapal Asing
result_out_indo_asing = result_out_indo.filter(col("sc_vessel") == "Asing")

In [26]:
result_out_indo.count()

19684

## Menghitung Jumlah per Bulan

### Masuk Pelabuhan

In [27]:
# Hitung jumlah kapal masuk
vessel_in_count_month = result_in_port.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(F.count("mmsi").alias("vessel_in"))

In [None]:
# Tampilkan hasil
vessel_in_count_month.show()

+---------+---------+
|   months|vessel_in|
+---------+---------+
|     July|  7232276|
| November|  4939254|
| February|  6352658|
|  January|  6409181|
|    March|  6626199|
|  October|  7788984|
|      May|  6434724|
|   August|  7290616|
|    April|  6390581|
|     June|  6738835|
| December|  7729024|
|September|  7427198|
+---------+---------+



In [None]:
# Hitung jumlah kapal masuk
vessel_in_count_country = result_in_port.select("fc_vessel","mmsi").groupBy("fc_vessel").agg(F.count("mmsi").alias("vessel_in"))

In [None]:
# Tampilkan hasil
vessel_in_count_country.show(vessel_in_count_country.count(), truncate = False)

+--------------------------------+---------+
|fc_vessel                       |vessel_in|
+--------------------------------+---------+
|Philippines                     |130903   |
|Singapore                       |27181230 |
|Germany                         |65837    |
|France                          |89459    |
|Dominica                        |21590    |
|Belgium                         |205513   |
|Bahamas                         |1032879  |
|Malta                           |1763983  |
|Marshall Islands                |4973921  |
|Cayman Islands                  |494662   |
|Netherlands Antilles            |73963    |
|Spain                           |9564     |
|Denmark                         |307911   |
|Barbados                        |41535    |
|Nauru                           |45       |
|Saint Kitts and Nevis           |98320    |
|USA                             |91438    |
|Cyprus                          |725437   |
|Liberia                         |5985521  |
|Honduras 

In [None]:
# Hitung jumlah kapal masuk
vessel_in_count_vess_type = result_in_port.select("vessel_type","mmsi").groupBy("vessel_type").agg(F.count("mmsi").alias("vessel_in"))

In [None]:
# Tampilkan hasil
vessel_in_count_vess_type.show()

+--------------+---------+
|   vessel_type|vessel_in|
+--------------+---------+
|       Sailing|   297545|
|        Tanker| 41014907|
|         Other|  5812723|
|Pleasure Craft|   844717|
|     Passenger|  3235710|
|       Fishing|    69816|
|   Port Tender|   113086|
|      Dredging|   608548|
|         Cargo| 29362478|
+--------------+---------+



In [None]:
# Di Pelabuhan Merak
result_in_port_merak = result_in_port.filter(col("Port") == "Tanjung Sekong")

# Passenger
result_in_port_merak_p = result_in_port_merak.filter(col("vessel_type") == "Passenger")

In [None]:
# Hitung jumlah kapal masuk
vessel_in_count_month = result_in_port_merak_p.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(F.count("mmsi").alias("vessel_in"))

In [None]:
# Tampilkan hasil
vessel_in_count_month.show()

+---------+---------+
|   months|vessel_in|
+---------+---------+
| November|    41162|
| December|    64484|
|September|    58585|
|     July|    30189|
| February|    30184|
|  January|    35158|
|    March|    34002|
|  October|    63175|
|      May|    23161|
|   August|    44452|
|    April|    27393|
|     June|    21502|
+---------+---------+



In [13]:
# Hitung jumlah kapal masuk Cargo & Passenger
vessel_in_count_month = result_in_port.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter((col("vessel_type") == 'Cargo') | (col("vessel_type") == 'Passenger') | (col("vessel_type") == 'Pleasure Craft')) \
    .groupBy("months").agg(F.count("mmsi").alias("vessel_in"))


In [14]:
vessel_in_count_month.show()

+---------+---------+
|   months|vessel_in|
+---------+---------+
|     July|  2923520|
| November|  2108278|
| February|  2539916|
|  January|  2640111|
|    March|  2579092|
|  October|  3404878|
|      May|  2485526|
|   August|  3105325|
|    April|  2551061|
|     June|  2664621|
| December|  3331719|
|September|  3108858|
+---------+---------+



### Keluar Pelabuhan

In [None]:
# Hitung jumlah kapal keluar
vessel_out_count_month = result_out_port.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(F.count("mmsi").alias("vessel_out"))

In [None]:
# Tampilkan hasil
vessel_out_count_month.show()

+---------+----------+
|   months|vessel_out|
+---------+----------+
|     July|   7232233|
| November|   4939240|
| February|   6352682|
|  January|   6410138|
|    March|   6626269|
|  October|   7788977|
|      May|   6434689|
|   August|   7290646|
|    April|   6390623|
|     June|   6738855|
| December|   7727946|
|September|   7427164|
+---------+----------+



In [None]:
# Hitung jumlah kapal keluar
vessel_out_count_country = result_out_port.select("fc_vessel","mmsi").groupBy("fc_vessel").agg(F.count("mmsi").alias("vessel_out"))

In [None]:
# Tampilkan hasil
vessel_out_count_country.show(vessel_out_count_country.count(), truncate = False)

+--------------------------------+----------+
|fc_vessel                       |vessel_out|
+--------------------------------+----------+
|Philippines                     |130900    |
|Singapore                       |27181244  |
|Germany                         |65837     |
|France                          |89459     |
|Dominica                        |21590     |
|Belgium                         |205512    |
|Bahamas                         |1032867   |
|Malta                           |1763981   |
|Marshall Islands                |4973933   |
|Cayman Islands                  |494667    |
|Netherlands Antilles            |73964     |
|Spain                           |9563      |
|Denmark                         |307904    |
|Barbados                        |41539     |
|Nauru                           |44        |
|Saint Kitts and Nevis           |98329     |
|USA                             |91437     |
|Cyprus                          |725439    |
|Liberia                         |

In [None]:
# Hitung jumlah kapal keluar
vessel_out_count_vess_type = result_out_port.select("vessel_type","mmsi").groupBy("vessel_type").agg(F.count("mmsi").alias("vessel_out"))

In [None]:
# Tampilkan hasil
vessel_out_count_vess_type.show()

+--------------+----------+
|   vessel_type|vessel_out|
+--------------+----------+
|       Sailing|    297542|
|        Tanker|  41014886|
|         Other|   5812710|
|Pleasure Craft|    844727|
|     Passenger|   3235713|
|       Fishing|     69825|
|   Port Tender|    113082|
|      Dredging|    608545|
|         Cargo|  29362432|
+--------------+----------+



In [None]:
# Di Pelabuhan Merak
result_out_port_merak = result_out_port.filter(col("Port") == "Tanjung Sekong")

# Passenger
result_out_port_merak_p = result_out_port_merak.filter(col("vessel_type") == "Passenger")

In [None]:
# Hitung jumlah kapal keluar
vessel_out_count_month = result_out_port_merak_p.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(F.count("mmsi").alias("vessel_out"))

In [None]:
# Tampilkan hasil
vessel_out_count_month.show()

+---------+----------+
|   months|vessel_out|
+---------+----------+
| November|     41159|
| December|     64450|
|September|     58584|
|     July|     30188|
| February|     30184|
|  January|     35158|
|    March|     34002|
|  October|     63175|
|      May|     23161|
|   August|     44452|
|    April|     27393|
|     June|     21501|
+---------+----------+



### Masuk Indonesia

In [None]:
# Hitung jumlah kapal masuk
vessel_in_count_month = result_in_indo.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(F.count("mmsi").alias("vessel_in"))

In [None]:
# Tampilkan hasil
vessel_in_count_month.show()

+---------+---------+
|   months|vessel_in|
+---------+---------+
|     July|      982|
| November|      525|
| February|     2738|
|  January|     6437|
|    March|     2155|
|  October|      770|
|      May|     1290|
|   August|      910|
|    April|     1502|
|     June|     1073|
| December|      643|
|September|      727|
+---------+---------+



In [None]:
# Hitung jumlah kapal masuk
vessel_in_count_country = result_in_indo.select("fc_vessel","mmsi").groupBy("fc_vessel").agg(F.count("mmsi").alias("vessel_in"))

In [None]:
# Tampilkan hasil
vessel_in_count_country.show(vessel_in_count_country.count(), truncate = False)

+--------------------------------+---------+
|fc_vessel                       |vessel_in|
+--------------------------------+---------+
|Philippines                     |80       |
|Singapore                       |1478     |
|Germany                         |30       |
|France                          |38       |
|Dominica                        |3        |
|Belgium                         |39       |
|Bahamas                         |453      |
|Malta                           |769      |
|Marshall Islands                |2511     |
|Cayman Islands                  |90       |
|Netherlands Antilles            |10       |
|Spain                           |10       |
|Denmark                         |200      |
|Barbados                        |45       |
|Nauru                           |4        |
|Saint Kitts and Nevis           |28       |
|USA                             |28       |
|Cyprus                          |311      |
|Liberia                         |2753     |
|Honduras 

In [None]:
# Hitung jumlah kapal masuk
vessel_in_count_vess_type = result_in_indo.select("vessel_type","mmsi").groupBy("vessel_type").agg(F.count("mmsi").alias("vessel_in"))

In [None]:
# Tampilkan hasil
vessel_in_count_vess_type.show()

### Keluar Indonesia

In [None]:
# Hitung jumlah kapal keluar
vessel_out_count_month = result_out_indo.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(F.count("mmsi").alias("vessel_out"))

In [None]:
# Tampilkan hasil
vessel_out_count_month.show()

+---------+----------+
|   months|vessel_out|
+---------+----------+
|     July|      1081|
| November|      2355|
| February|       528|
|  January|       521|
|    March|       674|
|  October|      2560|
|      May|       784|
|   August|      1292|
|    April|       736|
|     June|       903|
| December|      6651|
|September|      1599|
+---------+----------+



In [None]:
# Hitung jumlah kapal keluar
vessel_out_count_country = result_out_indo.select("fc_vessel","mmsi").groupBy("fc_vessel").agg(F.count("mmsi").alias("vessel_out"))

In [None]:
# Tampilkan hasil
vessel_out_count_country.show(vessel_out_count_country.count(), truncate = False)

+--------------------------------+----------+
|fc_vessel                       |vessel_out|
+--------------------------------+----------+
|Philippines                     |77        |
|Singapore                       |1492      |
|Germany                         |30        |
|France                          |38        |
|Dominica                        |3         |
|Belgium                         |38        |
|Bahamas                         |441       |
|Malta                           |767       |
|Marshall Islands                |2523      |
|Cayman Islands                  |95        |
|Netherlands Antilles            |11        |
|Spain                           |9         |
|Denmark                         |193       |
|Barbados                        |49        |
|Nauru                           |3         |
|Saint Kitts and Nevis           |37        |
|USA                             |27        |
|Cyprus                          |313       |
|Liberia                         |

In [None]:
# Hitung jumlah kapal keluar
vessel_out_count_vess_type = result_out_indo.select("vessel_type","mmsi").groupBy("vessel_type").agg(F.count("mmsi").alias("vessel_out"))

In [None]:
# Tampilkan hasil
vessel_out_count_vess_type.show()

+--------------+----------+
|   vessel_type|vessel_out|
+--------------+----------+
|       Sailing|        34|
|        Tanker|      5903|
|         Other|       479|
|Pleasure Craft|        71|
|     Passenger|       286|
|       Fishing|       222|
|      Dredging|        71|
|         Cargo|     12611|
|   Port Tender|         7|
+--------------+----------+



In [None]:
spark.stop()