# Initialialize

In [1]:
#Sedona Imports
import sedona.sql
from sedona.register import SedonaRegistrator
from sedona.utils import SedonaKryoRegistrator, KryoSerializer
from sedona.core.SpatialRDD import PolygonRDD, PointRDD
from sedona.core.enums import FileDataSplitter
import pyspark.sql.types as pst
from pyspark import StorageLevel
from pyspark.sql import SparkSession 

In [2]:
spark = SparkSession.builder \
    .appName('Vessel_Traffic_Indonesia') \
    .config("spark.serializer", KryoSerializer.getName) \
    .config("spark.kryo.registrator", SedonaKryoRegistrator.getName) \
    .config("spark.sql.parquet.enableVectorizedReader", "false") \
    .config('spark.jars.packages', 'org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.1-incubating,org.apache.sedona:sedona-viz-3.0_2.12:1.0.1-incubating') \
    .getOrCreate()

SedonaRegistrator.registerAll(spark)

True

In [3]:
import subprocess
import sys

In [4]:
GITLAB_USER = "read aistt"
GITLAB_TOKEN = "J1KkstArfyXB6dZvFchN"
git_package = f"git+https://(GITLAB_USER):(GITLAB_TOKEN)@code.officialstatistics.org/trade-task-team-phase-1/ais.git"
std_out = subprocess.run([sys.executable, "-m", "pip", "install", git_package], capture_output=True, text=True) .stdout
print(std_out)

Collecting git+https://%28GITLAB_USER%29:****@code.officialstatistics.org/trade-task-team-phase-1/ais.git
  Cloning https://%28GITLAB_USER%29:****@code.officialstatistics.org/trade-task-team-phase-1/ais.git to /tmp/pip-req-build-75yvmuoe



In [5]:
GITLAB_USER = 'ml_group_read_only'
GITLAB_TOKEN = 'eac7ZwiseRdeLwmBsrsm'

# Main: for using from current issued version
git_package = f"git+https://{GITLAB_USER}:{GITLAB_TOKEN}@code.officialstatistics.org/mlpolygonsalgorithm/ml-group-polygons.git"

std_out = subprocess.run([sys.executable, "-m", "pip", "install",git_package], capture_output=True, text=True).stdout
print(std_out) 

Collecting git+https://ml_group_read_only:****@code.officialstatistics.org/mlpolygonsalgorithm/ml-group-polygons.git
  Cloning https://ml_group_read_only:****@code.officialstatistics.org/mlpolygonsalgorithm/ml-group-polygons.git to /tmp/pip-req-build-_kv39q0m
  Resolved https://ml_group_read_only:****@code.officialstatistics.org/mlpolygonsalgorithm/ml-group-polygons.git to commit 89f1aab64fee28c2f86e86d6fa7b55118882b1e8
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: unece-ais
  Building wheel for unece-ais (setup.py): started
  Building wheel for unece-ais (setup.py): finished with status 'done'
  Created wheel for unece-ais: filename=unece_ais-0.0.4-py3-none-any.whl size=12493 sha256=ce416c7fba56866a36706495931a39c6831f5e43ac56f78710c03830a5671f0b
  Stored in directory: /tmp/pip-ephem-wheel-cache-2jsq5blw/wheels/61/b5/f9/bcf024b104169c32950c03a4605d2d07ea9da07cae7bed5e3e
Successfully built u

In [36]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, count, countDistinct, when, expr, unix_timestamp
from pyspark.sql.functions import year, month, dayofmonth, hour, minute, second
from pyspark.sql.functions import monotonically_increasing_id, lead, lag, abs, row_number
from pyspark.sql.functions import concat_ws, split, lit, min, max, coalesce, avg, expr
from pyspark.sql.types import IntegerType, StringType, StructType
from pyspark.sql.window import Window

from shapely.geometry import Point, Polygon, mapping
from IPython.display import HTML
from ais import functions as af
from unece_ais import unece_ais as un
from multiprocessing import Pool

In [7]:
import h3.api.numpy_int as h3int
import matplotlib.pyplot as plt
import geopandas as gpd
import seaborn as sns
import pandas as pd
import numpy as np
import calendar
import base64
import folium
import tqdm
import h3

generated new fontManager


In [8]:
pd.set_option('display.max_columns', None) #Show all columns in pandas df
pd.set_option('display.max_rows', 100) #Show 100 rows in pandas df
pd.options.display.float_format = '{:.10f}'.format #Show float with 10 decimal points in pandas df

from IPython.core.interactiveshell import InteractiveShell #allow multiple outputs in one jupyter cell
InteractiveShell.ast_node_interactivity = "all"

In [9]:
# Path
base_path = "s3a://ungp-ais-data-historical-backup/user_temp/"
path_unique = base_path + "222011349/"

# Data

## Masuk-Keluar Indonesia

In [10]:
# Read Data
result_out_in = spark.read.parquet(path_unique + "data-ais-ihs-indonesia-by-mmsi-masuk-keluar-indonesia-2022-cb-rev-v3.parquet", header=True)

# Waktu di Port Indonesia

In [11]:
# Membuat window specification
window_spec = Window.partitionBy("mmsi").orderBy("dt_pos_utc")

# Menambahkan kolom selisih waktu
result_diff = result_out_in.withColumn(
    "time",
    unix_timestamp(F.lead("dt_pos_utc").over(window_spec)) - unix_timestamp("dt_pos_utc")
)

## Hitung

In [12]:
# Mendapatkan baris-baris dengan urutan waktu
window_spec = Window.partitionBy("mmsi").orderBy("dt_pos_utc", "Port")

# Tambahkan kolom next_position
result_diff = result_diff.withColumn("next_position", F.lead("position", 1).over(window_spec))

In [13]:
# Definisikan window specification
window_spec = Window.partitionBy("mmsi").orderBy("dt_pos_utc", "Port")

# Tentukan apakah Port sama dengan baris sebelumnya
result_diff = result_diff.withColumn(
    "same_port_as_previous",
    F.when(
        F.lag("Port").over(window_spec) == F.col("Port"), 
        True
    ).otherwise(False)
)

In [14]:
# Kolom Port sama dengan baris sebelumnya untuk baris setelahnya
result_diff = result_diff.withColumn("same_port_next", F.lead("same_port_as_previous").over(window_spec))

In [15]:
# Tambahkan kolom next_position
result_diff = result_diff.withColumn("next_keluar", F.lead("keluar_pelabuhan", 1).over(window_spec))

In [16]:
# Hitung Durasi

# Definisikan window specification
window_spec = Window.partitionBy("mmsi").orderBy("dt_pos_utc", "Port")

# Menambah kolom baru dengan nilai diff_time dari baris saat ini ditambah diff_time dari baris sebelumnya
result_diff = result_diff.withColumn(
    "diff_time",
    result_diff["time"] + lead("time", 1).over(window_spec)
)

# Menentukan kondisi untuk menambahkan nilai diff_time dari baris sebelumnya
condition = (col("position") == "in port") & (col("next_position") == "in port") & (col("masuk_pelabuhan") == "masuk") & (col("same_port_next") == True) & (col("next_keluar") == "keluar")

# Mengatur nilai baru untuk baris yang memenuhi kondisi
result_diff = result_diff.withColumn("diff_time", when(condition, result_diff["diff_time"]).otherwise("-"))

In [17]:
# Daftar kolom yang ingin dijatuhkan
kolom_drop = ["next_position", "same_port_as_previous", "same_port_next", "next_keluar"]

# Menjatuhkan kolom yang tidak diperlukan dari DataFrame
result_diff = result_diff.drop(*kolom_drop)

In [18]:
# Filter in port
result_diff = result_diff.filter(col("position") == "in port")

In [19]:
# Filter diff_time
result_diff = result_diff.filter(col("diff_time") != "-")

## Konversi

In [20]:
# Konversi durasi ke jam
result_diff = result_diff.withColumn(
    "diff_time_hours",
    col("diff_time") / 3600
)
result_diff = result_diff.withColumn(
    "diff_time_minutes",
    col("diff_time") / 60
)

In [21]:
# Eliminasi Outlier

# Filter <= 72 jam
result_diff = result_diff.filter(col("diff_time_hours") <= 72)

In [22]:
# Skema kosong untuk DataFrame
schema = StructType([])

# Membuat DataFrame kosong
time_in_port = spark.createDataFrame([], schema)

time_in_port = result_diff.select("mmsi", "Port", "dt_pos_utc", "sc_vessel", "fc_vessel", "vessel_type", "ns_vessel", "diff_time", "diff_time_minutes", "diff_time_hours")

In [23]:
def create_download_link(df, title, filename):
    csv = df.to_csv(index = False)
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload, title=title, filename=filename)
    return HTML(html)

In [57]:
# 1. Tambahkan kolom 'Bulan' dari 'dt_pos_utc'
# time_in_port = time_in_port.withColumn("Bulan", date_format(col("dt_pos_utc"), "MMM"))

# 2. Gabungkan kolom durasi menjadi satu kolom
new_table = time_in_port.withColumn("durasi", col("diff_time"))
# new_table = time_in_port.withColumn("durasi", col("diff_time_minutes"))
# new_table = time_in_port.withColumn("durasi", col("diff_time_hours"))

# 3. Ubah nilai pada kolom 'sc_vessel'
new_table = new_table.withColumn("sc_vessel", 
                    when(col("sc_vessel") == "Asing", "Luar Negeri")
                    .when(col("sc_vessel") == "Indonesia", "Dalam Negeri")
                    .otherwise(col("sc_vessel")))

# 5. Pilih kolom yang dibutuhkan untuk hasil akhir
final_table = new_table.select("Port", "sc_vessel", "durasi")

In [58]:
# Konversi DataFrame Spark ke Pandas DataFrame
final_table_pd = final_table.toPandas()

In [59]:
# Tentukan jumlah baris per bagian
rows_per_part = 50000 

# Hitung jumlah partisi
num_parts = len(final_table_pd) // rows_per_part + 1

# Membagi DataFrame Pandas menjadi beberapa bagian
partitions = [final_table_pd[i:i+rows_per_part] for i in range(0, len(final_table_pd), rows_per_part)]

In [60]:
def create_download_link(df, title="Download CSV file", filename="data.csv"):
    csv = df.to_csv(index=False)
    href = f'<a href="data:text/csv;base64,{base64.b64encode(csv.encode()).decode()}" download="{filename}">{title}</a>'
    return HTML(href)

# Tampilkan link unduhan untuk setiap partisi
for i, part in enumerate(partitions):
    filename = f"final_table_part_{i+1}.csv"
    display(create_download_link(part, title=f"Download {filename}", filename=filename))

## Rata-Rata & Median

In [55]:
# 1. Tambahkan kolom 'Bulan' dari 'dt_pos_utc'
# time_in_port = time_in_port.withColumn("Bulan", date_format(col("dt_pos_utc"), "MMM"))

# 2. Gabungkan kolom durasi menjadi satu kolom
# new_table = time_in_port.withColumn("durasi", col("diff_time"))
# new_table = time_in_port.withColumn("durasi", col("diff_time_minutes"))
new_table = time_in_port.withColumn("durasi", col("diff_time_hours"))

# # 5. Ubah nilai pada kolom 'sc_vessel'
# new_table = new_table.withColumn("sc_vessel", 
#                     when(col("sc_vessel") == "Asing", "Luar Negeri")
#                     .when(col("sc_vessel") == "Indonesia", "Dalam Negeri")
#                     .otherwise(col("sc_vessel")))
                                 
# 6. Hitung rata-rata & Median durasi
avg_duration = new_table.groupBy("fc_vessel", "Port").agg(avg("durasi").alias("Rata_rata_Durasi"))
median_duration = new_table.groupBy("fc_vessel", "Port").agg(expr("percentile_approx(durasi, 0.5)").alias("Median_Durasi"))

# 7. Gabungkan hasil rata-rata dan median
final_result = avg_duration.join(median_duration, on=["fc_vessel", "Port"])

In [56]:
final_result.show(final_result.count(), truncate = False)

+--------------------------------+---------------------------------+-------------------+-------------------+
|fc_vessel                       |Port                             |Rata_rata_Durasi   |Median_Durasi      |
+--------------------------------+---------------------------------+-------------------+-------------------+
|Alaska                          |Panjang                          |41.80166666666667  |41.80166666666667  |
|Algeria                         |Pulau Sambu                      |0.16666666666666666|0.16666666666666666|
|Antigua and Barbuda             |Anyer Lor                        |33.378194444444446 |29.43861111111111  |
|Antigua and Barbuda             |Balikpapan                       |27.289768518518517 |19.052777777777777 |
|Antigua and Barbuda             |Banten                           |6.800185185185185  |0.31166666666666665|
|Antigua and Barbuda             |Dumai                            |7.369236111111111  |0.15               |
|Antigua and Barbud

In [None]:
spark.stop()