# Initialialize

In [1]:
#Sedona Imports
import sedona.sql
from sedona.register import SedonaRegistrator
from sedona.utils import SedonaKryoRegistrator, KryoSerializer
from sedona.core.SpatialRDD import PolygonRDD, PointRDD
from sedona.core.enums import FileDataSplitter
import pyspark.sql.types as pst
from pyspark import StorageLevel
from pyspark.sql import SparkSession 

In [2]:
spark = SparkSession.builder \
    .appName('Vessel_Traffic_Indonesia') \
    .config("spark.serializer", KryoSerializer.getName) \
    .config("spark.kryo.registrator", SedonaKryoRegistrator.getName) \
    .config("spark.sql.parquet.enableVectorizedReader", "false") \
    .config('spark.jars.packages', 'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.1-incubating,org.apache.sedona:sedona-viz-3.0_2.12:1.0.1-incubating') \
    .getOrCreate()

SedonaRegistrator.registerAll(spark)

True

In [3]:
import subprocess
import sys

In [4]:
GITLAB_USER = "read aistt"
GITLAB_TOKEN = "J1KkstArfyXB6dZvFchN"
git_package = f"git+https://(GITLAB_USER):(GITLAB_TOKEN)@code.officialstatistics.org/trade-task-team-phase-1/ais.git"
std_out = subprocess.run([sys.executable, "-m", "pip", "install", git_package], capture_output=True, text=True) .stdout
print(std_out)

Collecting git+https://%28GITLAB_USER%29:****@code.officialstatistics.org/trade-task-team-phase-1/ais.git
  Cloning https://%28GITLAB_USER%29:****@code.officialstatistics.org/trade-task-team-phase-1/ais.git to /tmp/pip-req-build-o8k0vjeh



In [5]:
GITLAB_USER = 'ml_group_read_only'
GITLAB_TOKEN = 'eac7ZwiseRdeLwmBsrsm'

# Main: for using from current issued version
git_package = f"git+https://{GITLAB_USER}:{GITLAB_TOKEN}@code.officialstatistics.org/mlpolygonsalgorithm/ml-group-polygons.git"

std_out = subprocess.run([sys.executable, "-m", "pip", "install",git_package], capture_output=True, text=True).stdout
print(std_out) 

Collecting git+https://ml_group_read_only:****@code.officialstatistics.org/mlpolygonsalgorithm/ml-group-polygons.git
  Cloning https://ml_group_read_only:****@code.officialstatistics.org/mlpolygonsalgorithm/ml-group-polygons.git to /tmp/pip-req-build-kgap330z
  Resolved https://ml_group_read_only:****@code.officialstatistics.org/mlpolygonsalgorithm/ml-group-polygons.git to commit 89f1aab64fee28c2f86e86d6fa7b55118882b1e8
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: unece-ais
  Building wheel for unece-ais (setup.py): started
  Building wheel for unece-ais (setup.py): finished with status 'done'
  Created wheel for unece-ais: filename=unece_ais-0.0.4-py3-none-any.whl size=12493 sha256=c151611ca799e025a61043869b86bf0040478bc1bbe9a3710538215041017759
  Stored in directory: /tmp/pip-ephem-wheel-cache-e4whdvnd/wheels/61/b5/f9/bcf024b104169c32950c03a4605d2d07ea9da07cae7bed5e3e
Successfully built u

In [6]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, count, countDistinct, when, expr, unix_timestamp
from pyspark.sql.functions import year, month, dayofmonth, hour, minute, second
from pyspark.sql.functions import monotonically_increasing_id, lead, lag, abs, row_number
from pyspark.sql.functions import concat_ws, split, lit, min, max, first, desc, sum as _sum
from pyspark.sql.types import IntegerType, StringType, StructType
from pyspark.sql.window import Window

from shapely.geometry import Point, Polygon, mapping
from IPython.display import HTML
from ais import functions as af
# from unece_ais import unece_ais as un
from multiprocessing import Pool

In [7]:
import h3.api.numpy_int as h3int
import matplotlib.pyplot as plt
import geopandas as gpd
import seaborn as sns
import pandas as pd
import numpy as np
import calendar
import base64
import folium
import tqdm
import h3

generated new fontManager


In [8]:
pd.set_option('display.max_columns', None) #Show all columns in pandas df
pd.set_option('display.max_rows', 100) #Show 100 rows in pandas df
pd.options.display.float_format = '{:.10f}'.format #Show float with 10 decimal points in pandas df

from IPython.core.interactiveshell import InteractiveShell #allow multiple outputs in one jupyter cell
InteractiveShell.ast_node_interactivity = "all"

In [9]:
# Path
base_path = "s3a://ungp-ais-data-historical-backup/user_temp/"
path_unique = base_path + "222011349/"

### Read Data

In [10]:
# Read Data
result_out_in = spark.read.parquet(path_unique + "data-ais-ihs-indonesia-by-mmsi-masuk-keluar-indonesia-2022-rev-v3.parquet", header=True)

### Masuk Pelabuhan

In [11]:
# Masuk
result_in_port = result_out_in.filter((col("masuk_pelabuhan") == "masuk") & (col("position") == "in port"))

### Keluar Pelabuhan

In [12]:
# Keluar
result_out_port = result_out_in.filter((col("keluar_pelabuhan") == "keluar") & (col("position") == "in port"))

## Menghitung Jumlah per Bulan

In [13]:
# Daftar port yang dikecualikan
excluded_ports = ["Ampenan", "Anyer Lor", "Ardjuna Oil Field", "Belanak Field Terminal", "Belida Marine Terminal", "Blanglancang",
                  "Blinyu", "Cilacap", "Cinta Oil Terminal", "Gunung Batu Besar", "Jabung Batanghari Marine Terminal",
                  "Jakarta", "Kasim Terminal", "Kijang", "Kota Baru", "Kuala Kapus", "Kupang", "Lalang Marine Terminal",
                  "Lawi Lawi Oil Terminal", "Lingkas", "Mangkasa Oil Terminal", "Manokwari Road", "Merak Mas Terminal", 
                  "Merauke", "Miei", "Muntok", "North Pulau Laut Coal Terminal", "Pangkalpinang", "Poleng Oil Field", "Ramba",
                  "Sabang", "Sailolof", "Salawati", "Semarang", "Senipah Oil Terminal", "Stagen", "Sungaigerong", "Surabaya", 
                  "Tanah Merah", "Tanjung Arang  (Bunyu)", "Tanjung Gerem", "Tanjung Sangata", "Tanjung Sekong", "Tanjungpinang", 
                  "Tanjunguban", "Tarempah", "Teluk Beo", "Tuban", "Udang Oilfield", "Ujung Pandang", "Uleelheue", 
                  "Widuri Marine Terminal"]

In [18]:
# 1. Tambahkan kolom 'Bulan' dari 'dt_pos_utc'
result_out_in = result_out_in.withColumn("Bulan", date_format(col("dt_pos_utc"), "MMM"))

# 2. Filter untuk pelabuhan tertentu
# result_out_in = result_out_in.filter(col("Port").isin(excluded_ports))

# 3. Memfilter kapal yang masuk dan keluar
result_in_port = result_out_in.filter((col("masuk_pelabuhan") == "masuk") & (col("position") == "in port"))
result_out_port = result_out_in.filter((col("keluar_pelabuhan") == "keluar") & (col("position") == "in port"))

# 4. Menghitung jumlah 'Masuk' dan 'Keluar' berdasarkan 'Port', 'Bulan', dan 'sc_vessel'
grouped_in_df = result_in_port.groupBy("Bulan", "Port", "sc_vessel").agg(
    _sum(lit(1)).alias("Masuk")
)

grouped_out_df = result_out_port.groupBy("Bulan", "Port", "sc_vessel").agg(
    _sum(lit(1)).alias("Keluar")
)

# 5. Menggabungkan DataFrame masuk dan keluar
grouped_df = grouped_in_df.join(grouped_out_df, on=["Bulan", "Port", "sc_vessel"], how="outer").fillna(0)

# 6. Menambahkan kolom 'Kapal' ke grouped_df
grouped_df = grouped_df.withColumn("Kapal", when(col("sc_vessel") == "Indonesia", "DN").otherwise("LN"))

# 7. Mengatur kolom yang dibutuhkan untuk hasil akhir
grouped_df = grouped_df.select("Bulan", "Port", "sc_vessel", "Kapal", "Masuk", "Keluar")

# 8. Menghitung 'Total' untuk setiap 'Port' dan 'sc_vessel'
total_df = grouped_df.groupBy("Bulan", "Port").agg(
    _sum("Masuk").alias("Masuk"),
    _sum("Keluar").alias("Keluar")
).withColumn("Kapal", lit("Total")).withColumn("sc_vessel", lit("Total"))

# 9. Menambahkan baris 'Total' ke DataFrame
final_df = grouped_df.unionByName(total_df)

# 10. Mengatur kolom dan urutan sesuai dengan format yang diinginkan
final_df = final_df.select(col("Bulan"), col("Port").alias("Pelabuhan"), "Masuk", "Keluar", "sc_vessel", "Kapal")

In [19]:
# Menampilkan hasil akhir
final_df.show(final_df.count(), truncate = False)

+-----+---------------------------------+-----+------+---------+-----+
|Bulan|Pelabuhan                        |Masuk|Keluar|sc_vessel|Kapal|
+-----+---------------------------------+-----+------+---------+-----+
|Apr  |Ampenan                          |9    |9     |Indonesia|DN   |
|Apr  |Anyer Lor                        |204  |203   |Asing    |LN   |
|Apr  |Anyer Lor                        |17   |17    |Indonesia|DN   |
|Apr  |Ardjuna Oil Field                |66   |67    |Indonesia|DN   |
|Apr  |Belanak Field Terminal           |12   |12    |Indonesia|DN   |
|Apr  |Belida Marine Terminal           |26   |25    |Indonesia|DN   |
|Apr  |Blanglancang                     |97   |97    |Asing    |LN   |
|Apr  |Blanglancang                     |286  |286   |Indonesia|DN   |
|Apr  |Cilacap                          |8    |9     |Asing    |LN   |
|Apr  |Cilacap                          |98   |97    |Indonesia|DN   |
|Apr  |Cinta Oil Terminal               |15   |15    |Indonesia|DN   |
|Apr  

In [20]:
# 1. Tambahkan kolom 'Bulan' dari 'dt_pos_utc'
# result_out_in = result_out_in.withColumn("Bulan", date_format(col("dt_pos_utc"), "MMM"))

# 2. Filter untuk pelabuhan tertentu
# result_out_in = result_out_in.filter(col("Port").isin(excluded_ports))

# 3. Memfilter kapal yang masuk dan keluar
result_in_port = result_out_in.filter((col("masuk_pelabuhan") == "masuk") & (col("position") == "in port"))
result_out_port = result_out_in.filter((col("keluar_pelabuhan") == "keluar") & (col("position") == "in port"))

# 4. Menghitung jumlah 'Masuk' dan 'Keluar' berdasarkan 'Port', 'vessel_type', dan 'sc_vessel'
grouped_in_df = result_in_port.groupBy("vessel_type", "Port", "sc_vessel").agg(
    _sum(lit(1)).alias("Masuk")
)

grouped_out_df = result_out_port.groupBy("vessel_type", "Port", "sc_vessel").agg(
    _sum(lit(1)).alias("Keluar")
)

# 5. Menggabungkan DataFrame masuk dan keluar
grouped_df = grouped_in_df.join(grouped_out_df, on=["vessel_type", "Port", "sc_vessel"], how="outer").fillna(0)

# 6. Menambahkan kolom 'Kapal' ke grouped_df
grouped_df = grouped_df.withColumn("Kapal", when(col("sc_vessel") == "Indonesia", "DN").otherwise("LN"))

# 7. Mengatur kolom yang dibutuhkan untuk hasil akhir
grouped_df = grouped_df.select("vessel_type", "Port", "sc_vessel", "Kapal", "Masuk", "Keluar")

# 8. Menghitung 'Total' untuk setiap 'Port' dan 'sc_vessel'
total_df = grouped_df.groupBy("vessel_type", "Port").agg(
    _sum("Masuk").alias("Masuk"),
    _sum("Keluar").alias("Keluar")
).withColumn("Kapal", lit("Total")).withColumn("sc_vessel", lit("Total"))

# 9. Menambahkan baris 'Total' ke DataFrame
final_df = grouped_df.unionByName(total_df)

# 10. Mengatur kolom dan urutan sesuai dengan format yang diinginkan
final_df = final_df.select(col("vessel_type"), col("Port").alias("Pelabuhan"), "Masuk", "Keluar", "sc_vessel", "Kapal")

In [None]:
# Menampilkan hasil akhir
final_df.show(final_df.count(), truncate = False)

+--------------+---------------------------------+-----+------+---------+-----+
|vessel_type   |Pelabuhan                        |Masuk|Keluar|sc_vessel|Kapal|
+--------------+---------------------------------+-----+------+---------+-----+
|Cargo         |Ampenan                          |4    |4     |Indonesia|DN   |
|Cargo         |Anyer Lor                        |2614 |2613  |Asing    |LN   |
|Cargo         |Anyer Lor                        |5    |5     |Indonesia|DN   |
|Cargo         |Ardjuna Oil Field                |197  |198   |Indonesia|DN   |
|Cargo         |Belanak Field Terminal           |1    |1     |Asing    |LN   |
|Cargo         |Blanglancang                     |3    |3     |Asing    |LN   |
|Cargo         |Blanglancang                     |1    |1     |Indonesia|DN   |
|Cargo         |Cilacap                          |48   |50    |Asing    |LN   |
|Cargo         |Cilacap                          |77   |75    |Indonesia|DN   |
|Cargo         |Cinta Oil Terminal      

In [22]:
# 1. Filter untuk pelabuhan tertentu
# result_out_in = result_out_in.filter(col("Port").isin(excluded_ports))

# 2. Memfilter kapal yang masuk dan keluar
result_in_port = result_out_in.filter((col("masuk_pelabuhan") == "masuk") & (col("position") == "in port"))
result_out_port = result_out_in.filter((col("keluar_pelabuhan") == "keluar") & (col("position") == "in port"))

# 3. Menghitung jumlah 'Masuk' dan 'Keluar' berdasarkan 'Port', 'fc_vessel'
grouped_in_df = result_in_port.groupBy("Port", "fc_vessel").agg(
    _sum(lit(1)).alias("Masuk")
)

grouped_out_df = result_out_port.groupBy("Port", "fc_vessel").agg(
    _sum(lit(1)).alias("Keluar")
)

# 4. Menggabungkan DataFrame masuk dan keluar
grouped_df = grouped_in_df.join(grouped_out_df, on=["Port", "fc_vessel"], how="outer").fillna(0)

# 5. Isi nilai null dengan 0
final_df = grouped_df.fillna(0)

# 6. Hitung 'Total' untuk setiap 'Port' dan 'fc_vessel'
total_df = final_df.groupBy("Port").agg(
    _sum("Masuk").alias("Total_Masuk"),
    _sum("Keluar").alias("Total_Keluar")
)

# 7. Tambahkan baris 'Total' ke DataFrame
total_df = total_df.withColumn("fc_vessel", lit("Total")).select("Port", "fc_vessel", col("Total_Masuk").alias("Masuk"), col("Total_Keluar").alias("Keluar"))
final_df = final_df.select("Port", "fc_vessel", "Masuk", "Keluar").unionByName(total_df)

# 8. Mengatur kolom dan urutan sesuai dengan format yang diinginkan
final_df = final_df.select(col("Port").alias("Pelabuhan"), "fc_vessel", "Masuk", "Keluar")

In [23]:
# Menampilkan hasil akhir
final_df.show(final_df.count(), truncate = False)

+---------------------------------+--------------------------------+-----+------+
|Pelabuhan                        |fc_vessel                       |Masuk|Keluar|
+---------------------------------+--------------------------------+-----+------+
|Ampenan                          |Indonesia                       |162  |162   |
|Ampenan                          |Marshall Islands                |1    |1     |
|Anyer Lor                        |Antigua and Barbuda             |3    |3     |
|Anyer Lor                        |Bahamas                         |193  |193   |
|Anyer Lor                        |China                           |32   |32    |
|Anyer Lor                        |Cook Islands                    |14   |14    |
|Anyer Lor                        |Cyprus                          |2    |2     |
|Anyer Lor                        |Denmark                         |1    |1     |
|Anyer Lor                        |Gabon                           |1    |1     |
|Anyer Lor      

### Masuk Pelabuhan

In [16]:
# Memfilter DataFrame berdasarkan array port
result_in_port_same = result_in_port.filter(col("Port").isin(excluded_ports))

# Menambahkan kolom bulan dalam format teks
result_in_port_same = result_in_port_same.withColumn("month", F.date_format("dt_pos_utc", "MMMM"))

# Filter untuk hanya menyertakan tipe kapal tertentu dan sc_vessel "Asing"
result_in_port_filtered = result_in_port_same.filter(
#     (col("vessel_type").isin('Cargo', 'Passenger', 'Pleasure Craft')) &
    (col("sc_vessel") == "Asing")
)

# Menghitung jumlah kapal yang masuk ke setiap pelabuhan setiap bulan
port_month_count = result_in_port_filtered.groupBy("Port", "month").count()

# Memutar (pivot) DataFrame sehingga kolom adalah bulan dan baris adalah pelabuhan
pivot_df = port_month_count.groupBy("Port").pivot("month").agg(first("count"))

# Mengisi nilai null dengan 0
pivot_df = pivot_df.fillna(0)

In [17]:
# Tampilkan hasil
pivot_df.show(pivot_df.count(), truncate = False)

+---------------------+-----+------+--------+--------+-------+----+-----+-----+-----+--------+-------+---------+
|Port                 |April|August|December|February|January|July|June |March|May  |November|October|September|
+---------------------+-----+------+--------+--------+-------+----+-----+-----+-----+--------+-------+---------+
|Kolonodale           |0    |0     |0       |0       |0      |0   |0    |1    |0    |0       |0      |0        |
|Kuala Tanjung        |13   |42    |9       |8       |13     |20  |7    |5    |4    |6       |10     |14       |
|Samarinda            |0    |0     |0       |0       |0      |0   |0    |0    |0    |0       |2      |0        |
|Lhokseumawe          |101  |98    |2       |8       |249    |135 |459  |28   |133  |43      |92     |6        |
|Bitung               |7    |30    |35      |11      |29     |13  |18   |15   |15   |15      |122    |31       |
|Teluk Bayur          |33   |40    |35      |30      |29     |38  |37   |26   |21   |35      |67

In [18]:
# Memfilter DataFrame berdasarkan array port
result_in_port_same = result_in_port.filter(col("Port").isin(excluded_ports))

# Menambahkan kolom bulan dalam format teks
result_in_port_same = result_in_port_same.withColumn("month", F.date_format("dt_pos_utc", "MMMM"))

# Filter untuk hanya menyertakan tipe kapal tertentu dan sc_vessel "Asing"
result_in_port_filtered = result_in_port_same.filter(
#     (col("vessel_type").isin('Cargo', 'Passenger', 'Pleasure Craft')) &
    (col("sc_vessel") == "Indonesia")
)

# Menghitung jumlah kapal yang masuk ke setiap pelabuhan setiap bulan
port_month_count = result_in_port_filtered.groupBy("Port", "month").count()

# Memutar (pivot) DataFrame sehingga kolom adalah bulan dan baris adalah pelabuhan
pivot_df = port_month_count.groupBy("Port").pivot("month").agg(first("count"))

# Mengisi nilai null dengan 0
pivot_df = pivot_df.fillna(0)

In [19]:
# Tampilkan hasil
pivot_df.show(pivot_df.count(), truncate = False)

+---------------------+-----+------+--------+--------+-------+----+----+-----+----+--------+-------+---------+
|Port                 |April|August|December|February|January|July|June|March|May |November|October|September|
+---------------------+-----+------+--------+--------+-------+----+----+-----+----+--------+-------+---------+
|Fakfak               |43   |40    |53      |35      |41     |45  |45  |50   |39  |23      |46     |41       |
|Kolonodale           |4    |3     |4       |4       |4      |7   |3   |5    |5   |2       |2      |3        |
|Kuala Tanjung        |3    |10    |6       |6       |3      |9   |7   |3    |7   |5       |7      |10       |
|Samarinda            |12   |19    |23      |23      |18     |28  |21  |30   |28  |15      |22     |13       |
|Pekalongan           |1    |0     |0       |1       |1      |1   |0   |0    |0   |0       |1      |1        |
|Lhokseumawe          |289  |165   |448     |104     |172    |97  |467 |123  |226 |166     |263    |246      |
|

In [21]:
# Memfilter DataFrame berdasarkan array port
result_in_port_same = result_in_port.filter(~col("Port").isin(excluded_ports))

# Menambahkan kolom bulan dalam format teks
result_in_port_same = result_in_port_same.withColumn("month", F.date_format("dt_pos_utc", "MMMM"))

# Filter untuk hanya menyertakan tipe kapal tertentu dan sc_vessel "Asing"
# result_in_port_filtered = result_in_port_same.filter(
#     (col("vessel_type").isin('Cargo', 'Passenger', 'Pleasure Craft'))
# )

# Menghitung jumlah kapal yang masuk ke setiap pelabuhan setiap bulan
port_month_count = result_in_port_same.groupBy("Port", "month").count()

# Memutar (pivot) DataFrame sehingga kolom adalah bulan dan baris adalah pelabuhan
pivot_df = port_month_count.groupBy("Port").pivot("month").agg(first("count"))

# Mengisi nilai null dengan 0
pivot_df = pivot_df.fillna(0)

In [22]:
# Tampilkan hasil
pivot_df.show(pivot_df.count(), truncate = False)

+---------------------+-----+------+--------+--------+-------+-----+-----+-----+-----+--------+-------+---------+
|Port                 |April|August|December|February|January|July |June |March|May  |November|October|September|
+---------------------+-----+------+--------+--------+-------+-----+-----+-----+-----+--------+-------+---------+
|Fakfak               |43   |40    |53      |35      |41     |45   |45   |50   |39   |23      |46     |41       |
|Kolonodale           |4    |3     |4       |4       |4      |7    |3    |6    |5    |2       |2      |3        |
|Kuala Tanjung        |16   |52    |15      |14      |16     |29   |14   |8    |11   |11      |17     |24       |
|Samarinda            |12   |19    |23      |23      |18     |28   |21   |30   |28   |15      |24     |13       |
|Pekalongan           |1    |0     |0       |1       |1      |1    |0    |0    |0    |0       |1      |1        |
|Lhokseumawe          |390  |263   |450     |112     |421    |232  |926  |151  |359  |20

In [23]:
# Memfilter DataFrame berdasarkan array port
result_in_port_same = result_in_port.filter(~col("Port").isin(excluded_ports))

# Filter untuk hanya menyertakan tipe kapal tertentu dan sc_vessel "Asing"
result_in_port_filtered = result_in_port_same.filter(
    (col("sc_vessel") == "Asing")
)

# Menghitung jumlah kapal yang masuk ke setiap pelabuhan berdasarkan fc_vessel
port_vessel_count = result_in_port_filtered.groupBy("Port", "fc_vessel").count()

# Menghitung total kapal per fc_vessel di seluruh pelabuhan
total_vessel_count = port_vessel_count.groupBy("fc_vessel").agg(F.sum("count").alias("total_count"))

# Mendapatkan 10 fc_vessel dengan total kapal terbanyak
top_10_vessels = total_vessel_count.orderBy(desc("total_count")).limit(10).select("fc_vessel")

# Menggabungkan kembali untuk memfilter port_vessel_count hanya dengan 10 fc_vessel teratas
filtered_port_vessel_count = port_vessel_count.join(top_10_vessels, on="fc_vessel", how="inner")

# Memutar (pivot) DataFrame sehingga kolom adalah 10 fc_vessel teratas dan baris adalah pelabuhan
pivot_df = filtered_port_vessel_count.groupBy("Port").pivot("fc_vessel").agg(first("count"))

# Mengisi nilai null dengan 0
pivot_df = pivot_df.fillna(0)

In [24]:
# Tampilkan hasil
pivot_df.show(pivot_df.count(), truncate = False)

+---------------------+-----+------+---------+-------+--------+-----+----------------+------+---------+-------+
|Port                 |China|Cyprus|Hong Kong|Liberia|Malaysia|Malta|Marshall Islands|Panama|Singapore|Vietnam|
+---------------------+-----+------+---------+-------+--------+-----+----------------+------+---------+-------+
|Kuala Tanjung        |0    |0     |30       |6      |0       |0    |17              |23    |40       |5      |
|Samarinda            |0    |0     |0        |0      |2       |0    |0               |0     |0        |0      |
|Lhokseumawe          |1    |74    |41       |0      |9       |457  |44              |108   |102      |0      |
|Bitung               |0    |0     |10       |9      |1       |0    |3               |36    |27       |19     |
|Teluk Bayur          |0    |1     |44       |40     |0       |4    |30              |107   |52       |28     |
|Probolinggo          |0    |0     |3        |5      |0       |0    |1               |4     |2        |2

In [25]:
# Memfilter DataFrame berdasarkan array port
result_in_port_same = result_in_port.filter(~col("Port").isin(excluded_ports))

# Filter untuk hanya menyertakan tipe kapal tertentu dan sc_vessel "Asing"
result_in_port_filtered = result_in_port_same.filter(
    (col("sc_vessel") == "Indonesia")
)

# Menghitung jumlah kapal yang masuk ke setiap pelabuhan berdasarkan fc_vessel
port_vessel_count = result_in_port_filtered.groupBy("Port", "fc_vessel").count()

# Menghitung total kapal per fc_vessel di seluruh pelabuhan
total_vessel_count = port_vessel_count.groupBy("fc_vessel").agg(F.sum("count").alias("total_count"))

# Mendapatkan 10 fc_vessel dengan total kapal terbanyak
top_10_vessels = total_vessel_count.orderBy(desc("total_count")).limit(10).select("fc_vessel")

# Menggabungkan kembali untuk memfilter port_vessel_count hanya dengan 10 fc_vessel teratas
filtered_port_vessel_count = port_vessel_count.join(top_10_vessels, on="fc_vessel", how="inner")

# Memutar (pivot) DataFrame sehingga kolom adalah 10 fc_vessel teratas dan baris adalah pelabuhan
pivot_df = filtered_port_vessel_count.groupBy("Port").pivot("fc_vessel").agg(first("count"))

# Mengisi nilai null dengan 0
pivot_df = pivot_df.fillna(0)

In [26]:
# Tampilkan hasil
pivot_df.show(pivot_df.count(), truncate = False)

+---------------------+---------+
|Port                 |Indonesia|
+---------------------+---------+
|Fakfak               |501      |
|Kolonodale           |46       |
|Kuala Tanjung        |76       |
|Samarinda            |252      |
|Pekalongan           |6        |
|Lhokseumawe          |2766     |
|Bitung               |1847     |
|Teluk Bayur          |893      |
|Probolinggo          |107      |
|Tanjungpandan        |28       |
|Belawan              |1767     |
|Pontianak            |3991     |
|Luwuk                |620      |
|Sibolga              |283      |
|Pulau Sambu          |42338    |
|Sampit               |73       |
|Maumere              |263      |
|Tanjungredeb         |196      |
|Raha Roadstead       |149      |
|Bontang Lng Terminal |26       |
|Banjarmasin          |1319     |
|Cirebon              |312      |
|Tanjung Santan       |80       |
|Benoa                |699      |
|Sekupang             |39728    |
|Pangkalansusu        |5        |
|Kendari      

In [27]:
# Memfilter DataFrame berdasarkan array port
result_in_port_same = result_in_port.filter(~col("Port").isin(excluded_ports))

# Filter untuk hanya menyertakan tipe kapal tertentu dan sc_vessel "Asing"
result_in_port_filtered = result_in_port_same.filter(
    (col("sc_vessel") == "Asing")
)

# Menghitung jumlah kapal yang masuk ke setiap pelabuhan berdasarkan vessel_type
port_vessel_count = result_in_port_filtered.groupBy("Port", "vessel_type").count()

# Memutar (pivot) DataFrame sehingga kolom adalah vessel_type dan baris adalah pelabuhan
pivot_df = port_vessel_count.groupBy("Port").pivot("vessel_type").agg(first("count"))

# Mengisi nilai null dengan 0
pivot_df = pivot_df.fillna(0)

In [28]:
# Tampilkan hasil
pivot_df.show(pivot_df.count(), truncate = False)

+---------------------+-----+--------+-------+-----+---------+--------------+-----------+-------+------+
|Port                 |Cargo|Dredging|Fishing|Other|Passenger|Pleasure Craft|Port Tender|Sailing|Tanker|
+---------------------+-----+--------+-------+-----+---------+--------------+-----------+-------+------+
|Kolonodale           |0    |0       |0      |0    |0        |0             |0          |0      |1     |
|Kuala Tanjung        |65   |0       |0      |0    |0        |0             |0          |0      |86    |
|Samarinda            |0    |0       |0      |2    |0        |0             |0          |0      |0     |
|Lhokseumawe          |4    |0       |0      |1    |0        |0             |0          |0      |1349  |
|Bitung               |193  |0       |0      |1    |45       |1             |0          |0      |101   |
|Teluk Bayur          |191  |0       |0      |1    |0        |0             |0          |0      |236   |
|Probolinggo          |9    |0       |0      |0    |2  

In [29]:
# Memfilter DataFrame berdasarkan array port
result_in_port_same = result_in_port.filter(~col("Port").isin(excluded_ports))

# Filter untuk hanya menyertakan tipe kapal tertentu dan sc_vessel "Asing"
result_in_port_filtered = result_in_port_same.filter(
    (col("sc_vessel") == "Indonesia")
)

# Menghitung jumlah kapal yang masuk ke setiap pelabuhan berdasarkan vessel_type
port_vessel_count = result_in_port_filtered.groupBy("Port", "vessel_type").count()

# Memutar (pivot) DataFrame sehingga kolom adalah vessel_type dan baris adalah pelabuhan
pivot_df = port_vessel_count.groupBy("Port").pivot("vessel_type").agg(first("count"))

# Mengisi nilai null dengan 0
pivot_df = pivot_df.fillna(0)

In [30]:
# Tampilkan hasil
pivot_df.show(pivot_df.count(), truncate = False)

+---------------------+-----+--------+-------+-----+---------+--------------+-----------+-------+------+
|Port                 |Cargo|Dredging|Fishing|Other|Passenger|Pleasure Craft|Port Tender|Sailing|Tanker|
+---------------------+-----+--------+-------+-----+---------+--------------+-----------+-------+------+
|Fakfak               |62   |0       |0      |1    |406      |0             |0          |0      |32    |
|Kolonodale           |17   |0       |1      |0    |3        |0             |0          |0      |25    |
|Kuala Tanjung        |66   |0       |0      |2    |0        |0             |0          |0      |8     |
|Samarinda            |74   |0       |2      |26   |0        |74            |0          |0      |76    |
|Pekalongan           |0    |0       |0      |0    |0        |0             |0          |0      |6     |
|Lhokseumawe          |10   |0       |0      |1664 |0        |0             |0          |0      |1092  |
|Bitung               |469  |0       |15     |88   |635

In [31]:
# Memfilter DataFrame berdasarkan array port
result_in_port_same = result_in_port.filter(~col("Port").isin(excluded_ports))


# Menghitung jumlah kapal yang masuk ke setiap pelabuhan berdasarkan vessel_type
port_vessel_count = result_in_port_same.groupBy("Port", "vessel_type").count()

# Memutar (pivot) DataFrame sehingga kolom adalah vessel_type dan baris adalah pelabuhan
pivot_df = port_vessel_count.groupBy("Port").pivot("vessel_type").agg(first("count"))

# Mengisi nilai null dengan 0
pivot_df = pivot_df.fillna(0)

In [32]:
# Tampilkan hasil
pivot_df.show(pivot_df.count(), truncate = False)

+---------------------+-----+--------+-------+-----+---------+--------------+-----------+-------+------+
|Port                 |Cargo|Dredging|Fishing|Other|Passenger|Pleasure Craft|Port Tender|Sailing|Tanker|
+---------------------+-----+--------+-------+-----+---------+--------------+-----------+-------+------+
|Fakfak               |62   |0       |0      |1    |406      |0             |0          |0      |32    |
|Kolonodale           |17   |0       |1      |0    |3        |0             |0          |0      |26    |
|Kuala Tanjung        |131  |0       |0      |2    |0        |0             |0          |0      |94    |
|Samarinda            |74   |0       |2      |28   |0        |74            |0          |0      |76    |
|Pekalongan           |0    |0       |0      |0    |0        |0             |0          |0      |6     |
|Lhokseumawe          |14   |0       |0      |1665 |0        |0             |0          |0      |2441  |
|Bitung               |662  |0       |15     |89   |680

### Keluar Pelabuhan

In [33]:
# Memfilter DataFrame berdasarkan array port
result_out_port_same = result_out_port.filter(~col("Port").isin(excluded_ports))

# Menambahkan kolom bulan dalam format teks
result_out_port_same = result_out_port_same.withColumn("month", F.date_format("dt_pos_utc", "MMMM"))

# Filter untuk hanya menyertakan tipe kapal tertentu dan sc_vessel "Asing"
result_out_port_filtered = result_out_port_same.filter(
#     (col("vessel_type").isin('Cargo', 'Passenger', 'Pleasure Craft')) &
    (col("sc_vessel") == "Asing")
)

# Menghitung jumlah kapal yang masuk ke setiap pelabuhan setiap bulan
port_month_count = result_out_port_filtered.groupBy("Port", "month").count()

# Memutar (pivot) DataFrame sehingga kolom adalah bulan dan baris adalah pelabuhan
pivot_df = port_month_count.groupBy("Port").pivot("month").agg(first("count"))

# Mengisi nilai null dengan 0
pivot_df = pivot_df.fillna(0)

In [34]:
# Tampilkan hasil
pivot_df.show(pivot_df.count(), truncate = False)

+---------------------+-----+------+--------+--------+-------+----+-----+-----+-----+--------+-------+---------+
|Port                 |April|August|December|February|January|July|June |March|May  |November|October|September|
+---------------------+-----+------+--------+--------+-------+----+-----+-----+-----+--------+-------+---------+
|Kolonodale           |0    |0     |0       |0       |0      |0   |0    |1    |0    |0       |0      |0        |
|Kuala Tanjung        |12   |43    |9       |7       |13     |19  |8    |6    |4    |6       |11     |12       |
|Samarinda            |0    |0     |1       |0       |0      |0   |0    |0    |0    |0       |2      |0        |
|Lhokseumawe          |101  |98    |2       |8       |249    |135 |459  |28   |133  |43      |92     |6        |
|Bitung               |9    |28    |34      |12      |30     |14  |16   |13   |14   |17      |121    |34       |
|Teluk Bayur          |33   |40    |34      |30      |28     |41  |33   |27   |21   |34      |68

In [35]:
# Memfilter DataFrame berdasarkan array port
result_out_port_same = result_out_port.filter(~col("Port").isin(excluded_ports))

# Menambahkan kolom bulan dalam format teks
result_out_port_same = result_out_port_same.withColumn("month", F.date_format("dt_pos_utc", "MMMM"))

# Filter untuk hanya menyertakan tipe kapal tertentu dan sc_vessel "Asing"
result_out_port_filtered = result_out_port_same.filter(
#     (col("vessel_type").isin('Cargo', 'Passenger', 'Pleasure Craft')) &
    (col("sc_vessel") == "Indonesia")
)

# Menghitung jumlah kapal yang masuk ke setiap pelabuhan setiap bulan
port_month_count = result_out_port_filtered.groupBy("Port", "month").count()

# Memutar (pivot) DataFrame sehingga kolom adalah bulan dan baris adalah pelabuhan
pivot_df = port_month_count.groupBy("Port").pivot("month").agg(first("count"))

# Mengisi nilai null dengan 0
pivot_df = pivot_df.fillna(0)

In [36]:
# Tampilkan hasil
pivot_df.show(pivot_df.count(), truncate = False)

+---------------------+-----+------+--------+--------+-------+----+----+-----+----+--------+-------+---------+
|Port                 |April|August|December|February|January|July|June|March|May |November|October|September|
+---------------------+-----+------+--------+--------+-------+----+----+-----+----+--------+-------+---------+
|Fakfak               |43   |40    |53      |35      |41     |46  |45  |49   |39  |24      |47     |39       |
|Kolonodale           |4    |3     |4       |5       |3      |7   |3   |5    |5   |3       |1      |3        |
|Kuala Tanjung        |3    |9     |7       |6       |3      |10  |6   |3    |7   |5       |6      |11       |
|Samarinda            |13   |18    |23      |23      |22     |28  |21  |30   |28  |13      |22     |15       |
|Pekalongan           |1    |0     |0       |1       |1      |1   |0   |0    |0   |0       |1      |1        |
|Lhokseumawe          |290  |165   |449     |104     |172    |97  |467 |122  |226 |165     |263    |246      |
|

In [37]:
# Memfilter DataFrame berdasarkan array port
result_out_port_same = result_out_port.filter(~col("Port").isin(excluded_ports))

# Menambahkan kolom bulan dalam format teks
result_out_port_same = result_out_port_same.withColumn("month", F.date_format("dt_pos_utc", "MMMM"))

# Filter untuk hanya menyertakan tipe kapal tertentu dan sc_vessel "Asing"
# result_out_port_filtered = result_out_port_same.filter(
# #     (col("vessel_type").isin('Cargo', 'Passenger', 'Pleasure Craft')) &
#     (col("sc_vessel") == "Asing")
# )

# Menghitung jumlah kapal yang masuk ke setiap pelabuhan setiap bulan
port_month_count = result_out_port_same.groupBy("Port", "month").count()

# Memutar (pivot) DataFrame sehingga kolom adalah bulan dan baris adalah pelabuhan
pivot_df = port_month_count.groupBy("Port").pivot("month").agg(first("count"))

# Mengisi nilai null dengan 0
pivot_df = pivot_df.fillna(0)

In [38]:
# Tampilkan hasil
pivot_df.show(pivot_df.count(), truncate = False)

+---------------------+-----+------+--------+--------+-------+-----+-----+-----+-----+--------+-------+---------+
|Port                 |April|August|December|February|January|July |June |March|May  |November|October|September|
+---------------------+-----+------+--------+--------+-------+-----+-----+-----+-----+--------+-------+---------+
|Fakfak               |43   |40    |53      |35      |41     |46   |45   |49   |39   |24      |47     |39       |
|Kolonodale           |4    |3     |4       |5       |3      |7    |3    |6    |5    |3       |1      |3        |
|Kuala Tanjung        |15   |52    |16      |13      |16     |29   |14   |9    |11   |11      |17     |23       |
|Samarinda            |13   |18    |24      |23      |22     |28   |21   |30   |28   |13      |24     |15       |
|Pekalongan           |1    |0     |0       |1       |1      |1    |0    |0    |0    |0       |1      |1        |
|Lhokseumawe          |391  |263   |451     |112     |421    |232  |926  |150  |359  |20

In [39]:
# Memfilter DataFrame berdasarkan array port
result_out_port_same = result_out_port.filter(~col("Port").isin(excluded_ports))

# Filter untuk hanya menyertakan tipe kapal tertentu dan sc_vessel "Asing"
result_out_port_filtered = result_out_port_same.filter(
    (col("sc_vessel") == "Asing")
)

# Menghitung jumlah kapal yang masuk ke setiap pelabuhan berdasarkan fc_vessel
port_vessel_count = result_out_port_filtered.groupBy("Port", "fc_vessel").count()

# Menghitung total kapal per fc_vessel di seluruh pelabuhan
total_vessel_count = port_vessel_count.groupBy("fc_vessel").agg(F.sum("count").alias("total_count"))

# Mendapatkan 10 fc_vessel dengan total kapal terbanyak
top_10_vessels = total_vessel_count.orderBy(desc("total_count")).limit(10).select("fc_vessel")

# Menggabungkan kembali untuk memfilter port_vessel_count hanya dengan 10 fc_vessel teratas
filtered_port_vessel_count = port_vessel_count.join(top_10_vessels, on="fc_vessel", how="inner")

# Memutar (pivot) DataFrame sehingga kolom adalah 10 fc_vessel teratas dan baris adalah pelabuhan
pivot_df = filtered_port_vessel_count.groupBy("Port").pivot("fc_vessel").agg(first("count"))

# Mengisi nilai null dengan 0
pivot_df = pivot_df.fillna(0)

In [40]:
# Tampilkan hasil
pivot_df.show(pivot_df.count(), truncate = False)

+---------------------+-----+------+---------+-------+--------+-----+----------------+------+---------+-------+
|Port                 |China|Cyprus|Hong Kong|Liberia|Malaysia|Malta|Marshall Islands|Panama|Singapore|Vietnam|
+---------------------+-----+------+---------+-------+--------+-----+----------------+------+---------+-------+
|Kuala Tanjung        |0    |0     |30       |6      |0       |0    |17              |23    |40       |4      |
|Samarinda            |0    |0     |0        |0      |3       |0    |0               |0     |0        |0      |
|Lhokseumawe          |1    |74    |41       |0      |9       |457  |44              |108   |102      |0      |
|Bitung               |0    |0     |11       |9      |1       |0    |4               |35    |27       |19     |
|Teluk Bayur          |0    |1     |44       |40     |0       |4    |30              |107   |51       |28     |
|Probolinggo          |0    |0     |3        |4      |0       |0    |1               |4     |2        |2

In [41]:
# Memfilter DataFrame berdasarkan array port
result_out_port_same = result_out_port.filter(~col("Port").isin(excluded_ports))

# Filter untuk hanya menyertakan tipe kapal tertentu dan sc_vessel "Asing"
result_out_port_filtered = result_out_port_same.filter(
    (col("sc_vessel") == "Indonesia")
)

# Menghitung jumlah kapal yang masuk ke setiap pelabuhan berdasarkan fc_vessel
port_vessel_count = result_out_port_filtered.groupBy("Port", "fc_vessel").count()

# Menghitung total kapal per fc_vessel di seluruh pelabuhan
total_vessel_count = port_vessel_count.groupBy("fc_vessel").agg(F.sum("count").alias("total_count"))

# Mendapatkan 10 fc_vessel dengan total kapal terbanyak
top_10_vessels = total_vessel_count.orderBy(desc("total_count")).limit(10).select("fc_vessel")

# Menggabungkan kembali untuk memfilter port_vessel_count hanya dengan 10 fc_vessel teratas
filtered_port_vessel_count = port_vessel_count.join(top_10_vessels, on="fc_vessel", how="inner")

# Memutar (pivot) DataFrame sehingga kolom adalah 10 fc_vessel teratas dan baris adalah pelabuhan
pivot_df = filtered_port_vessel_count.groupBy("Port").pivot("fc_vessel").agg(first("count"))

# Mengisi nilai null dengan 0
pivot_df = pivot_df.fillna(0)

In [42]:
# Tampilkan hasil
pivot_df.show(pivot_df.count(), truncate = False)

+---------------------+---------+
|Port                 |Indonesia|
+---------------------+---------+
|Fakfak               |501      |
|Kolonodale           |46       |
|Kuala Tanjung        |76       |
|Samarinda            |256      |
|Pekalongan           |6        |
|Lhokseumawe          |2766     |
|Bitung               |1845     |
|Teluk Bayur          |895      |
|Probolinggo          |103      |
|Tanjungpandan        |28       |
|Belawan              |1762     |
|Pontianak            |3987     |
|Luwuk                |620      |
|Sibolga              |284      |
|Pulau Sambu          |42344    |
|Sampit               |74       |
|Maumere              |264      |
|Tanjungredeb         |196      |
|Raha Roadstead       |148      |
|Bontang Lng Terminal |26       |
|Banjarmasin          |1317     |
|Cirebon              |313      |
|Tanjung Santan       |79       |
|Benoa                |696      |
|Sekupang             |39737    |
|Pangkalansusu        |4        |
|Kendari      

In [43]:
# Memfilter DataFrame berdasarkan array port
result_out_port_port_same = result_out_port.filter(~col("Port").isin(excluded_ports))

# Filter untuk hanya menyertakan tipe kapal tertentu dan sc_vessel "Asing"
result_out_port_filtered = result_out_port_same.filter(
    (col("sc_vessel") == "Asing")
)

# Menghitung jumlah kapal yang masuk ke setiap pelabuhan berdasarkan vessel_type
port_vessel_count = result_out_port_filtered.groupBy("Port", "vessel_type").count()

# Memutar (pivot) DataFrame sehingga kolom adalah vessel_type dan baris adalah pelabuhan
pivot_df = port_vessel_count.groupBy("Port").pivot("vessel_type").agg(first("count"))

# Mengisi nilai null dengan 0
pivot_df = pivot_df.fillna(0)

In [44]:
# Tampilkan hasil
pivot_df.show(pivot_df.count(), truncate = False)

+---------------------+-----+--------+-------+-----+---------+--------------+-----------+-------+------+
|Port                 |Cargo|Dredging|Fishing|Other|Passenger|Pleasure Craft|Port Tender|Sailing|Tanker|
+---------------------+-----+--------+-------+-----+---------+--------------+-----------+-------+------+
|Kolonodale           |0    |0       |0      |0    |0        |0             |0          |0      |1     |
|Kuala Tanjung        |64   |0       |0      |0    |0        |0             |0          |0      |86    |
|Samarinda            |0    |0       |0      |3    |0        |0             |0          |0      |0     |
|Lhokseumawe          |4    |0       |0      |1    |0        |0             |0          |0      |1349  |
|Bitung               |193  |0       |0      |0    |46       |1             |0          |0      |102   |
|Teluk Bayur          |189  |0       |0      |1    |0        |0             |0          |0      |235   |
|Probolinggo          |9    |0       |0      |0    |2  

In [45]:
# Memfilter DataFrame berdasarkan array port
result_out_port_port_same = result_out_port.filter(~col("Port").isin(excluded_ports))

# Filter untuk hanya menyertakan tipe kapal tertentu dan sc_vessel "Asing"
result_out_port_filtered = result_out_port_same.filter(
    (col("sc_vessel") == "Indonesia")
)

# Menghitung jumlah kapal yang masuk ke setiap pelabuhan berdasarkan vessel_type
port_vessel_count = result_out_port_filtered.groupBy("Port", "vessel_type").count()

# Memutar (pivot) DataFrame sehingga kolom adalah vessel_type dan baris adalah pelabuhan
pivot_df = port_vessel_count.groupBy("Port").pivot("vessel_type").agg(first("count"))

# Mengisi nilai null dengan 0
pivot_df = pivot_df.fillna(0)

In [46]:
# Tampilkan hasil
pivot_df.show(pivot_df.count(), truncate = False)

+---------------------+-----+--------+-------+-----+---------+--------------+-----------+-------+------+
|Port                 |Cargo|Dredging|Fishing|Other|Passenger|Pleasure Craft|Port Tender|Sailing|Tanker|
+---------------------+-----+--------+-------+-----+---------+--------------+-----------+-------+------+
|Fakfak               |62   |0       |0      |1    |406      |0             |0          |0      |32    |
|Kolonodale           |17   |0       |1      |0    |3        |0             |0          |0      |25    |
|Kuala Tanjung        |66   |0       |0      |2    |0        |0             |0          |0      |8     |
|Samarinda            |78   |0       |2      |24   |0        |75            |0          |0      |77    |
|Pekalongan           |0    |0       |0      |0    |0        |0             |0          |0      |6     |
|Lhokseumawe          |10   |0       |0      |1664 |0        |0             |0          |0      |1092  |
|Bitung               |468  |0       |16     |89   |636

In [47]:
# Memfilter DataFrame berdasarkan array port
result_out_port_port_same = result_out_port.filter(~col("Port").isin(excluded_ports))


# Menghitung jumlah kapal yang masuk ke setiap pelabuhan berdasarkan vessel_type
port_vessel_count = result_out_port_port_same.groupBy("Port", "vessel_type").count()

# Memutar (pivot) DataFrame sehingga kolom adalah vessel_type dan baris adalah pelabuhan
pivot_df = port_vessel_count.groupBy("Port").pivot("vessel_type").agg(first("count"))

# Mengisi nilai null dengan 0
pivot_df = pivot_df.fillna(0)

In [48]:
# Tampilkan hasil
pivot_df.show(pivot_df.count(), truncate = False)

+---------------------+-----+--------+-------+-----+---------+--------------+-----------+-------+------+
|Port                 |Cargo|Dredging|Fishing|Other|Passenger|Pleasure Craft|Port Tender|Sailing|Tanker|
+---------------------+-----+--------+-------+-----+---------+--------------+-----------+-------+------+
|Fakfak               |62   |0       |0      |1    |406      |0             |0          |0      |32    |
|Kolonodale           |17   |0       |1      |0    |3        |0             |0          |0      |26    |
|Kuala Tanjung        |130  |0       |0      |2    |0        |0             |0          |0      |94    |
|Samarinda            |78   |0       |2      |27   |0        |75            |0          |0      |77    |
|Pekalongan           |0    |0       |0      |0    |0        |0             |0          |0      |6     |
|Lhokseumawe          |14   |0       |0      |1665 |0        |0             |0          |0      |2441  |
|Bitung               |661  |0       |16     |89   |682

In [53]:
spark.stop()