# Initialialize

In [1]:
#Sedona Imports
import sedona.sql
from sedona.register import SedonaRegistrator
from sedona.utils import SedonaKryoRegistrator, KryoSerializer
from sedona.core.SpatialRDD import PolygonRDD, PointRDD
from sedona.core.enums import FileDataSplitter
import pyspark.sql.types as pst
from pyspark import StorageLevel
from pyspark.sql import SparkSession 

In [2]:
spark = SparkSession.builder \
    .appName('Vessel_Traffic_Indonesia') \
    .config("spark.serializer", KryoSerializer.getName) \
    .config("spark.kryo.registrator", SedonaKryoRegistrator.getName) \
    .config("spark.sql.parquet.enableVectorizedReader", "false") \
    .config('spark.jars.packages', 'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.1-incubating,org.apache.sedona:sedona-viz-3.0_2.12:1.0.1-incubating') \
    .getOrCreate()

SedonaRegistrator.registerAll(spark)



Closing down clientserver connection


True

In [3]:
import subprocess
import sys

In [4]:
GITLAB_USER = "read aistt"
GITLAB_TOKEN = "J1KkstArfyXB6dZvFchN"
git_package = f"git+https://(GITLAB_USER):(GITLAB_TOKEN)@code.officialstatistics.org/trade-task-team-phase-1/ais.git"
std_out = subprocess.run([sys.executable, "-m", "pip", "install", git_package], capture_output=True, text=True) .stdout
print(std_out)

Collecting git+https://%28GITLAB_USER%29:****@code.officialstatistics.org/trade-task-team-phase-1/ais.git
  Cloning https://%28GITLAB_USER%29:****@code.officialstatistics.org/trade-task-team-phase-1/ais.git to /tmp/pip-req-build-b9vl13lv



In [5]:
GITLAB_USER = 'ml_group_read_only'
GITLAB_TOKEN = 'eac7ZwiseRdeLwmBsrsm'

# Main: for using from current issued version
git_package = f"git+https://{GITLAB_USER}:{GITLAB_TOKEN}@code.officialstatistics.org/mlpolygonsalgorithm/ml-group-polygons.git"

std_out = subprocess.run([sys.executable, "-m", "pip", "install",git_package], capture_output=True, text=True).stdout
print(std_out) 

Collecting git+https://ml_group_read_only:****@code.officialstatistics.org/mlpolygonsalgorithm/ml-group-polygons.git
  Cloning https://ml_group_read_only:****@code.officialstatistics.org/mlpolygonsalgorithm/ml-group-polygons.git to /tmp/pip-req-build-h5nbf_6q
  Resolved https://ml_group_read_only:****@code.officialstatistics.org/mlpolygonsalgorithm/ml-group-polygons.git to commit 89f1aab64fee28c2f86e86d6fa7b55118882b1e8
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: unece-ais
  Building wheel for unece-ais (setup.py): started
  Building wheel for unece-ais (setup.py): finished with status 'done'
  Created wheel for unece-ais: filename=unece_ais-0.0.4-py3-none-any.whl size=12493 sha256=275ca82ddfbfda654024424fdd08e9712682000b1a3aed4a541a3cefad245f1f
  Stored in directory: /tmp/pip-ephem-wheel-cache-culii53t/wheels/61/b5/f9/bcf024b104169c32950c03a4605d2d07ea9da07cae7bed5e3e
Successfully built u

In [6]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, count, countDistinct, when, expr, unix_timestamp
from pyspark.sql.functions import year, month, dayofmonth, hour, minute, second
from pyspark.sql.functions import monotonically_increasing_id, lead, lag, abs, row_number
from pyspark.sql.functions import concat_ws, split, lit, min, max, first, desc, sum as _sum, when
from pyspark.sql.types import IntegerType, StringType, StructType
from pyspark.sql.window import Window

from shapely.geometry import Point, Polygon, mapping
from IPython.display import HTML
from ais import functions as af
from unece_ais import unece_ais as un
from multiprocessing import Pool

In [7]:
import h3.api.numpy_int as h3int
import matplotlib.pyplot as plt
import geopandas as gpd
import seaborn as sns
import pandas as pd
import numpy as np
import calendar
import base64
import folium
import tqdm
import h3

generated new fontManager


In [8]:
pd.set_option('display.max_columns', None) #Show all columns in pandas df
pd.set_option('display.max_rows', 100) #Show 100 rows in pandas df
pd.options.display.float_format = '{:.10f}'.format #Show float with 10 decimal points in pandas df

from IPython.core.interactiveshell import InteractiveShell #allow multiple outputs in one jupyter cell
InteractiveShell.ast_node_interactivity = "all"

In [9]:
# Path
base_path = "s3a://ungp-ais-data-historical-backup/user_temp/"
path_unique = base_path + "222011349/"

### Read Data

In [10]:
# Read Data
result_out_in_ln = spark.read.parquet(path_unique + "data-ais-ihs-indonesia-by-mmsi-masuk-keluar-dunia-2022-cb-rev-v2.parquet", header=True)

## Menghitung Jumlah per Bulan

In [11]:
# 1. Tambahkan kolom 'Bulan' dari 'dt_pos_utc'
result_out_in_ln = result_out_in_ln.withColumn("Bulan", date_format(col("dt_pos_utc"), "MMM"))

# 2. Memfilter kapal yang masuk dan keluar
result_in_port = result_out_in_ln.filter((col("masuk_pelabuhan") == "masuk") & (col("position") == "in port") & (col("sc_vessel") == "Indonesia"))
result_out_port = result_out_in_ln.filter((col("keluar_pelabuhan") == "keluar") & (col("position") == "in port") & (col("sc_vessel") == "Indonesia"))

# 3. Menghitung jumlah 'Masuk' dan 'Keluar' berdasarkan 'Bulan' dan 'Port'
grouped_in_df = result_in_port.groupBy("Bulan", "Port").agg(
    _sum(lit(1)).alias("Masuk")
)

grouped_out_df = result_out_port.groupBy("Bulan", "Port").agg(
    _sum(lit(1)).alias("Keluar")
)

# 4. Menggabungkan DataFrame masuk dan keluar
grouped_df = grouped_in_df.join(grouped_out_df, on=["Bulan", "Port"], how="outer").fillna(0)

# 5. Isi nilai null dengan 0
final_df = grouped_df.fillna(0)

# 6. Tambahkan kolom 'Kapal' dengan nilai 'Total'
total_df = final_df.withColumn("Kapal", lit("Total")).select("Bulan", "Port", col("Masuk").alias("Masuk"), col("Keluar").alias("Keluar"), "Kapal")

# 7. Mengatur kolom dan urutan sesuai dengan format yang diinginkan
final_df = total_df.select("Bulan", col("Port").alias("Pelabuhan"), "Masuk", "Keluar", "Kapal")

In [13]:
# Menampilkan DataFrame hasil
final_df.show(final_df.count(), truncate = False)

+-----+-------------------------+-----+------+-----+
|Bulan|Pelabuhan                |Masuk|Keluar|Kapal|
+-----+-------------------------+-----+------+-----+
|Apr  |Al Jazeera Port          |1    |1     |Total|
|Apr  |Al Jubayl                |3    |3     |Total|
|Apr  |Bangkok                  |8    |8     |Total|
|Apr  |Batangas City            |4    |4     |Total|
|Apr  |Bedi                     |0    |1     |Total|
|Apr  |Benchamas Terminal       |2    |2     |Total|
|Apr  |Bintulu Port             |2    |2     |Total|
|Apr  |Bur Sa'id                |5    |5     |Total|
|Apr  |Busan                    |5    |5     |Total|
|Apr  |Cape Town                |1    |1     |Total|
|Apr  |Cat Lai                  |1    |1     |Total|
|Apr  |Cebu                     |3    |3     |Total|
|Apr  |Chittagong               |3    |3     |Total|
|Apr  |Colombo                  |11   |12    |Total|
|Apr  |Da Nang                  |6    |6     |Total|
|Apr  |Davao                    |7    |7     |

In [16]:
# 1. Tambahkan kolom 'Bulan' dari 'dt_pos_utc'
result_out_in = result_out_in_ln.withColumn("Bulan", date_format(col("dt_pos_utc"), "MMM"))

# 2. Memfilter kapal yang masuk dan keluar
result_in_port = result_out_in.filter((col("masuk_pelabuhan") == "masuk") & (col("position") == "in port") & (col("sc_vessel") == "Indonesia"))
result_out_port = result_out_in.filter((col("keluar_pelabuhan") == "keluar") & (col("position") == "in port") & (col("sc_vessel") == "Indonesia"))

# 3. Menghitung jumlah 'Masuk' dan 'Keluar' berdasarkan 'Port', 'vessel_type'
grouped_in_df = result_in_port.groupBy("Port", "vessel_type").agg(
    _sum(lit(1)).alias("Masuk")
)

grouped_out_df = result_out_port.groupBy("Port", "vessel_type").agg(
    _sum(lit(1)).alias("Keluar")
)

# 4. Menggabungkan DataFrame masuk dan keluar
grouped_df = grouped_in_df.join(grouped_out_df, on=["Port", "vessel_type"], how="outer").fillna(0)

# 5. Isi nilai null dengan 0
final_df = grouped_df.fillna(0)

# 6. Hitung 'Total' untuk setiap 'Port' dan 'vessel_type'
total_df = final_df.groupBy("Port", "vessel_type").agg(
    _sum("Masuk").alias("Total_Masuk"),
    _sum("Keluar").alias("Total_Keluar")
)

# 7. Tambahkan baris 'Total' ke DataFrame
total_df = total_df.withColumn("vessel_type", lit("Total")).select("Port", "vessel_type", col("Total_Masuk").alias("Masuk"), col("Total_Keluar").alias("Keluar"))
final_df = final_df.select("Port", "vessel_type", "Masuk", "Keluar").union(total_df)

# 8. Mengatur kolom dan urutan sesuai dengan format yang diinginkan
final_df = final_df.select(col("Port").alias("Pelabuhan"), "vessel_type", "Masuk", "Keluar")

In [17]:
# Menampilkan DataFrame hasil
final_df.show(final_df.count(), truncate = False)

+-------------------------+--------------+-----+------+
|Pelabuhan                |vessel_type   |Masuk|Keluar|
+-------------------------+--------------+-----+------+
|Abu Zaby                 |Cargo         |1    |1     |
|Aden                     |Cargo         |3    |4     |
|Al Fujayrah              |Tanker        |4    |4     |
|Al Jazeera Port          |Cargo         |1    |1     |
|Al Jubayl                |Cargo         |6    |6     |
|Al Mukalla               |Cargo         |5    |4     |
|Aparri                   |Other         |1    |1     |
|Apra Harbor              |Other         |7    |7     |
|As Suways                |Tanker        |1    |1     |
|Aviles                   |Tanker        |1    |1     |
|Bangkok                  |Cargo         |112  |111   |
|Bangkok                  |Other         |7    |9     |
|Bangkok                  |Tanker        |3    |3     |
|Basuo                    |Tanker        |3    |3     |
|Batangas City            |Cargo         |21   |

In [55]:
spark.stop()