# Initialization

In [1]:
import pandas as pd
from pyspark.sql import functions as F

# For 3.3.1 (presumably a section number in the accompanying write-up — confirm)
# Register Sedona functions with the Spark session.
# NOTE(review): `spark` is not created in any cell shown in this notebook;
# presumably the kernel provides the session — confirm.
from sedona.register import SedonaRegistrator
SedonaRegistrator.registerAll(spark)


#For 3.3.2
from shapely.geometry import Point, Polygon, mapping
import h3.api.numpy_int as h3int 

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, count, countDistinct, when, expr, first, desc
import calendar
from pyspark.sql.types import IntegerType
from pyspark.sql.window import Window

In [3]:
import pandas as pd
from IPython.display import HTML
import base64
from pyspark.sql.functions import year, month, dayofmonth, hour, minute, second
from pyspark.sql.functions import monotonically_increasing_id

In [4]:
# Base S3 prefix, and the sub-folder (222011349) this notebook reads from and writes to.
base_path = "s3a://ungp-ais-data-historical-backup/user_temp/"
path_unique = f"{base_path}222011349/"

# Read 1% Data

In [5]:
# Read the 1% worldwide AIS sample for 2022.
# Parquet files carry their own schema, so the CSV-style `header` option is not passed.
data_sampel = spark.read.parquet(path_unique + "data-ais-1persen-dunia-2022.parquet")

# DIM Handling

In [6]:
# Keep only two months of data (November and December) and the columns used
# by the dimension handling below.
data_2endmonth = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .where(F.col("months").isin("November", "December"))
    .select("mmsi", "imo", "vessel_type_code", "flag_code", "dt_pos_utc")
)

## MMSI

### Default

In [7]:
# Pre-check: list every record whose MMSI equals the first default value (0).
default_value_1 = 0
mmsi_default_value_1 = data_2endmonth.where(F.col("mmsi") == default_value_1)

# Print all matching rows (row limit = number of matches) without truncating values.
mmsi_default_value_1.show(n=mmsi_default_value_1.count(), truncate=False)

+----+-------+-------------------+
|mmsi|imo    |dt_pos_utc         |
+----+-------+-------------------+
|0   |null   |2022-11-15 16:51:57|
|0   |null   |2022-11-15 01:30:25|
|0   |8138243|2022-11-15 09:29:27|
|0   |null   |2022-11-15 10:53:07|
|0   |null   |2022-11-19 19:15:50|
|0   |null   |2022-11-14 21:44:33|
|0   |null   |2022-11-19 18:32:42|
|0   |null   |2022-11-16 10:56:28|
|0   |null   |2022-11-19 13:10:36|
|0   |8138243|2022-11-19 23:36:03|
|0   |6924404|2022-11-09 19:07:59|
|0   |8138243|2022-11-11 14:42:41|
|0   |null   |2022-11-09 00:56:09|
|0   |null   |2022-11-19 13:12:05|
|0   |8138243|2022-11-16 10:06:44|
|0   |null   |2022-11-14 22:10:05|
|0   |null   |2022-11-17 10:23:02|
|0   |null   |2022-11-17 02:29:36|
|0   |8138243|2022-11-17 20:29:18|
|0   |null   |2022-11-11 22:30:07|
|0   |null   |2022-11-12 16:40:39|
|0   |8138243|2022-11-18 17:15:03|
|0   |null   |2022-11-19 12:09:31|
|0   |null   |2022-11-19 06:20:32|
|0   |null   |2022-11-17 18:10:23|
|0   |null   |2022-1

In [8]:
# Pre-check: list every record whose MMSI equals the second default value (1193046).
default_value_2 = 1193046
mmsi_default_value_2 = data_2endmonth.where(F.col("mmsi") == default_value_2)

# Print all matching rows (row limit = number of matches) without truncating values.
mmsi_default_value_2.show(n=mmsi_default_value_2.count(), truncate=False)

+-------+---------+-------------------+
|mmsi   |imo      |dt_pos_utc         |
+-------+---------+-------------------+
|1193046|303174162|2022-11-15 12:00:36|
|1193046|303174162|2022-11-14 00:40:57|
|1193046|303174162|2022-11-13 19:07:20|
|1193046|303174162|2022-11-09 13:37:57|
|1193046|303174162|2022-11-09 07:03:41|
|1193046|303174162|2022-11-17 20:49:34|
|1193046|303174162|2022-11-14 11:36:59|
|1193046|303174162|2022-11-12 16:01:18|
|1193046|303174162|2022-11-12 06:33:08|
|1193046|303174162|2022-11-14 04:41:41|
|1193046|303174162|2022-11-10 20:05:57|
|1193046|303174162|2022-11-17 15:04:31|
|1193046|303174162|2022-11-17 18:46:21|
|1193046|303174162|2022-11-17 06:37:30|
|1193046|303174162|2022-11-13 19:57:41|
|1193046|303174162|2022-11-13 11:56:10|
|1193046|303174162|2022-11-11 05:43:38|
|1193046|303174162|2022-11-11 05:06:37|
|1193046|303174162|2022-11-11 06:46:18|
+-------+---------+-------------------+



In [9]:
# Fungsi Handling MMSI Default

def process_default_mmsi(df_data_spark):
    """Replace default MMSI values (0 and 1193046) using the record's IMO.

    For every valid 7-digit IMO, the valid 9-digit MMSI that appears on the
    most records with that IMO is chosen as the replacement (ties go to the
    smallest MMSI so the result is deterministic). A record is updated only
    when its MMSI is one of the default values and its IMO has a replacement;
    all other records are left untouched.

    The lookup is built with one aggregation and applied with one left join,
    instead of collecting the default rows to the driver and launching
    separate Spark jobs for each of them.

    Parameters
    ----------
    df_data_spark : pyspark.sql.DataFrame
        Must contain the columns ``mmsi`` and ``imo``.

    Returns
    -------
    pyspark.sql.DataFrame
        The input columns (``imo`` is kept so later steps and checks can still
        use it) with ``mmsi`` corrected where a replacement exists.
    """
    imo_is_valid = F.col("imo").between(1000000, 9999999)
    mmsi_is_valid = F.col("mmsi").between(100000000, 999999999)
    mmsi_is_default = F.col("mmsi").isin(0, 1193046)

    # Number of records per (IMO, MMSI) pair. Only pairs where both values are
    # valid are candidates, so a default MMSI can never be picked as its own
    # replacement.
    pair_counts = (
        df_data_spark
        .filter(imo_is_valid & mmsi_is_valid)
        .groupBy("imo", "mmsi")
        .count()
    )

    # Keep the most frequent MMSI per IMO; the secondary sort key makes ties
    # deterministic.
    most_frequent_first = Window.partitionBy("imo").orderBy(F.desc("count"), F.asc("mmsi"))
    replacement_by_imo = (
        pair_counts
        .withColumn("_rank", F.row_number().over(most_frequent_first))
        .filter(F.col("_rank") == 1)
        .select(
            # Renamed so the join condition below is unambiguous.
            F.col("imo").alias("_lookup_imo"),
            F.col("mmsi").alias("mmsi_replacement"),
        )
    )

    with_replacement = df_data_spark.join(
        replacement_by_imo,
        on=F.col("imo") == F.col("_lookup_imo"),
        how="left",
    )
    fixed_mmsi = F.when(
        mmsi_is_default & F.col("mmsi_replacement").isNotNull(),
        F.col("mmsi_replacement"),
    ).otherwise(F.col("mmsi"))

    # Drop only the helper columns introduced by the join.
    return with_replacement.withColumn("mmsi", fixed_mmsi).drop("_lookup_imo", "mmsi_replacement")

In [10]:
# Run the default-MMSI handling on the November–December subset.

data_mmsid_hand = process_default_mmsi(data_2endmonth)

In [1]:
# Save

#data_mmsid_hand.write.option("header", True).mode("overwrite").parquet(path_unique + "data-ais-mmsid-handling-coba.parquet")

In [None]:
# Read

# data_mmsid_hand = spark.read.parquet(path_unique + "data-ais-mmsid-handling-coba.parquet", header=True)

In [2]:
# Re-check after handling: records that still carry the default MMSI 0.
default_value_1 = 0
mmsi_default_value_1 = data_mmsid_hand.where(F.col("mmsi") == default_value_1)

# Print all remaining matches without truncating values.
mmsi_default_value_1.show(n=mmsi_default_value_1.count(), truncate=False)

In [None]:
# Re-check after handling: records that still carry the default MMSI 1193046.
# NOTE(review): the select below needs `imo` to still be present in data_mmsid_hand.
default_value_2 = 1193046
mmsi_default_value_2 = (
    data_mmsid_hand
    .where(F.col("mmsi") == default_value_2)
    .select("mmsi", "imo")
)

# Print all remaining matches without truncating values.
mmsi_default_value_2.show(n=mmsi_default_value_2.count(), truncate=False)

### Invalid

In [None]:
# Pre-check: records whose MMSI is neither a 9-digit number nor one of the
# two default values (0, 1193046), sorted by MMSI.
filtered_data = (
    data_2endmonth
    .where(~(col('mmsi').between(100000000, 999999999) | col('mmsi').isin(0, 1193046)))
    .sort("mmsi")
)
filtered_data.show()

In [None]:
# Fungsi Handling MMSI Invalid

def process_invalid_mmsi(df_data_spark):
    """Replace invalid MMSI values using the record's IMO.

    An MMSI is treated as invalid when it is non-null, lies outside
    100000000–999999999 and is not one of the default values (0, 1193046);
    the defaults are left to ``process_default_mmsi``. For every valid
    7-digit IMO, the valid MMSI that appears on the most records with that IMO
    is the replacement (ties go to the smallest MMSI). Records without a
    replacement are left untouched.

    Compared with the previous row-by-row version this no longer iterates over
    the DataFrame object itself, no longer passes ``'count'`` as the
    ``ignorenulls`` argument of ``first``, and no longer fails when an IMO has
    no candidate MMSI.

    Parameters
    ----------
    df_data_spark : pyspark.sql.DataFrame
        Must contain the columns ``mmsi`` and ``imo``.

    Returns
    -------
    pyspark.sql.DataFrame
        The input columns (``imo`` is kept) with ``mmsi`` corrected where a
        replacement exists.
    """
    imo_is_valid = F.col("imo").between(1000000, 9999999)
    mmsi_is_valid = F.col("mmsi").between(100000000, 999999999)
    # Null MMSIs evaluate to null here and are therefore not selected.
    mmsi_is_invalid = ~(mmsi_is_valid | F.col("mmsi").isin(0, 1193046))

    # Number of records per (IMO, MMSI) pair where both values are valid.
    pair_counts = (
        df_data_spark
        .filter(imo_is_valid & mmsi_is_valid)
        .groupBy("imo", "mmsi")
        .count()
    )

    # Most frequent MMSI per IMO; secondary key keeps ties deterministic.
    most_frequent_first = Window.partitionBy("imo").orderBy(F.desc("count"), F.asc("mmsi"))
    replacement_by_imo = (
        pair_counts
        .withColumn("_rank", F.row_number().over(most_frequent_first))
        .filter(F.col("_rank") == 1)
        .select(
            F.col("imo").alias("_lookup_imo"),
            F.col("mmsi").alias("mmsi_replacement"),
        )
    )

    with_replacement = df_data_spark.join(
        replacement_by_imo,
        on=F.col("imo") == F.col("_lookup_imo"),
        how="left",
    )
    fixed_mmsi = F.when(
        mmsi_is_invalid & F.col("mmsi_replacement").isNotNull(),
        F.col("mmsi_replacement"),
    ).otherwise(F.col("mmsi"))

    # Drop only the helper columns introduced by the join.
    return with_replacement.withColumn("mmsi", fixed_mmsi).drop("_lookup_imo", "mmsi_replacement")

In [None]:
# Run the invalid-MMSI handling.
# NOTE(review): the input is data_2endmonth, not data_mmsid_hand, so the
# default-MMSI fixes are not carried over — confirm whether the steps should be chained.
data_mmsii_hand = process_invalid_mmsi(data_2endmonth)

In [None]:
# Re-check after handling: records whose MMSI is still neither a 9-digit
# number nor one of the two default values (0, 1193046), sorted by MMSI.
filtered_data = (
    data_mmsii_hand
    .where(~(col('mmsi').between(100000000, 999999999) | col('mmsi').isin(0, 1193046)))
    .sort("mmsi")
)
filtered_data.show()

## IMO

### Default & Missing

In [None]:
# Pre-check: records whose IMO is missing (null) or equal to the default value 0.
imo_default_miss_value = data_2endmonth.where(F.col('imo').isNull() | (F.col('imo') == 0))

# Print all matching rows without truncating values.
imo_default_miss_value.show(n=imo_default_miss_value.count(), truncate=False)

In [None]:
# Fungsi handling IMO Default dan Missing

def process_default_miss_imo(df_data_spark):
    """Fill default (0) and missing (null) IMO values using the record's MMSI.

    For every valid 9-digit MMSI, the valid 7-digit IMO that appears on the
    most records with that MMSI is the replacement (ties go to the smallest
    IMO). A record is updated only when its IMO is 0 or null and its MMSI has
    a replacement.

    The previous row-by-row version compared ``imo`` with ``None`` using
    ``==``, which never matches in Spark, so missing IMOs were never filled;
    it could also pick 0 as the "replacement". Both are avoided here.

    Parameters
    ----------
    df_data_spark : pyspark.sql.DataFrame
        Must contain the columns ``mmsi`` and ``imo``.

    Returns
    -------
    pyspark.sql.DataFrame
        The input columns (``mmsi`` is kept) with ``imo`` filled where a
        replacement exists.
    """
    mmsi_is_valid = F.col("mmsi").between(100000000, 999999999)
    imo_is_valid = F.col("imo").between(1000000, 9999999)
    imo_needs_fill = F.col("imo").isNull() | (F.col("imo") == 0)

    # Number of records per (MMSI, IMO) pair where both values are valid.
    pair_counts = (
        df_data_spark
        .filter(mmsi_is_valid & imo_is_valid)
        .groupBy("mmsi", "imo")
        .count()
    )

    # Most frequent IMO per MMSI; secondary key keeps ties deterministic.
    most_frequent_first = Window.partitionBy("mmsi").orderBy(F.desc("count"), F.asc("imo"))
    replacement_by_mmsi = (
        pair_counts
        .withColumn("_rank", F.row_number().over(most_frequent_first))
        .filter(F.col("_rank") == 1)
        .select(
            F.col("mmsi").alias("_lookup_mmsi"),
            F.col("imo").alias("imo_replacement"),
        )
    )

    with_replacement = df_data_spark.join(
        replacement_by_mmsi,
        on=F.col("mmsi") == F.col("_lookup_mmsi"),
        how="left",
    )
    fixed_imo = F.when(
        imo_needs_fill & F.col("imo_replacement").isNotNull(),
        F.col("imo_replacement"),
    ).otherwise(F.col("imo"))

    # Drop only the helper columns introduced by the join.
    return with_replacement.withColumn("imo", fixed_imo).drop("_lookup_mmsi", "imo_replacement")

In [None]:
# Run the default/missing-IMO handling.
# NOTE(review): the input is data_2endmonth, so earlier MMSI fixes are not carried over — confirm.

data_imodm_hand = process_default_miss_imo(data_2endmonth)

In [None]:
# Re-check after handling: records whose IMO is still missing (null) or 0.
imo_default_miss_value = data_imodm_hand.where(F.col('imo').isNull() | (F.col('imo') == 0))

# Print all remaining matches without truncating values.
imo_default_miss_value.show(n=imo_default_miss_value.count(), truncate=False)

### Invalid

In [None]:
# Pre-check: records whose IMO is present, non-zero and not a 7-digit number,
# sorted by IMO.
filtered_data = (
    data_2endmonth
    .where(
        ~(
            col('imo').between(1000000, 9999999)
            | (col('imo') == 0)
            | col('imo').isNull()
        )
    )
    .sort("imo")
)
filtered_data.show()

In [None]:
# Fungsi handling IMO Invalid

def process_invalid_imo(df_data_spark):
    """Replace invalid IMO values using the record's MMSI.

    An IMO is treated as invalid when it is present, non-zero and outside
    1000000–9999999; null and 0 are left to ``process_default_miss_imo``.
    For every valid 9-digit MMSI, the valid IMO that appears on the most
    records with that MMSI is the replacement (ties go to the smallest IMO).
    Records without a replacement are left untouched.

    Compared with the previous row-by-row version this no longer iterates over
    the DataFrame object itself, no longer passes ``'count'`` as the
    ``ignorenulls`` argument of ``first``, and no longer fails when an MMSI
    has no candidate IMO.

    Parameters
    ----------
    df_data_spark : pyspark.sql.DataFrame
        Must contain the columns ``mmsi`` and ``imo``.

    Returns
    -------
    pyspark.sql.DataFrame
        The input columns (``mmsi`` is kept) with ``imo`` corrected where a
        replacement exists.
    """
    mmsi_is_valid = F.col("mmsi").between(100000000, 999999999)
    imo_is_valid = F.col("imo").between(1000000, 9999999)
    # isNull() is never null itself, so null IMOs evaluate to False here.
    imo_is_invalid = ~(imo_is_valid | (F.col("imo") == 0) | F.col("imo").isNull())

    # Number of records per (MMSI, IMO) pair where both values are valid.
    pair_counts = (
        df_data_spark
        .filter(mmsi_is_valid & imo_is_valid)
        .groupBy("mmsi", "imo")
        .count()
    )

    # Most frequent IMO per MMSI; secondary key keeps ties deterministic.
    most_frequent_first = Window.partitionBy("mmsi").orderBy(F.desc("count"), F.asc("imo"))
    replacement_by_mmsi = (
        pair_counts
        .withColumn("_rank", F.row_number().over(most_frequent_first))
        .filter(F.col("_rank") == 1)
        .select(
            F.col("mmsi").alias("_lookup_mmsi"),
            F.col("imo").alias("imo_replacement"),
        )
    )

    with_replacement = df_data_spark.join(
        replacement_by_mmsi,
        on=F.col("mmsi") == F.col("_lookup_mmsi"),
        how="left",
    )
    fixed_imo = F.when(
        imo_is_invalid & F.col("imo_replacement").isNotNull(),
        F.col("imo_replacement"),
    ).otherwise(F.col("imo"))

    # Drop only the helper columns introduced by the join.
    return with_replacement.withColumn("imo", fixed_imo).drop("_lookup_mmsi", "imo_replacement")

In [None]:
# Run the invalid-IMO handling.
# NOTE(review): the input is data_2endmonth, not data_imodm_hand, so the default/missing-IMO fixes are not carried over — confirm.

data_imoi_hand = process_invalid_imo(data_2endmonth)

In [None]:
# Re-check after handling: records whose IMO is still present, non-zero and
# not a 7-digit number, sorted by IMO.
filtered_data = (
    data_imoi_hand
    .where(
        ~(
            col('imo').between(1000000, 9999999)
            | (col('imo') == 0)
            | col('imo').isNull()
        )
    )
    .sort("imo")
)
filtered_data.show()

## Tipe Kapal

In [None]:
# Pre-check: records whose vessel_type_code lies outside 1–255 (this already
# includes the default value 0), sorted by vessel_type_code.
filtered_data = (
    data_2endmonth
    .where(~col('vessel_type_code').between(1, 255))
    .sort("vessel_type_code")
)
filtered_data.show()

In [None]:
# Fungsi handling Vess_Type Default dan Invalid

def process_default_invalid_vessel_type(df_data_spark):
    """Replace default/invalid ``vessel_type_code`` values using the record's MMSI.

    A code is treated as default/invalid when it lies outside 1–255 (which
    includes 0); null codes are not selected, matching the check cells. For
    every valid 9-digit MMSI, the valid code that appears on the most records
    with that MMSI is the replacement (ties go to the smallest code). Records
    without a replacement are left untouched.

    The previous row-by-row version referenced an undefined name (``imo``) in
    its replacement condition and therefore failed as soon as a replacement
    was found.

    Parameters
    ----------
    df_data_spark : pyspark.sql.DataFrame
        Must contain the columns ``mmsi`` and ``vessel_type_code``.

    Returns
    -------
    pyspark.sql.DataFrame
        The input columns (``mmsi`` is kept) with ``vessel_type_code``
        corrected where a replacement exists.
    """
    mmsi_is_valid = F.col("mmsi").between(100000000, 999999999)
    type_is_valid = F.col("vessel_type_code").between(1, 255)
    # Equivalent to (code < 1) | (code > 255); null codes stay null -> not selected.
    type_needs_fix = ~type_is_valid

    # Number of records per (MMSI, code) pair where both values are valid.
    pair_counts = (
        df_data_spark
        .filter(mmsi_is_valid & type_is_valid)
        .groupBy("mmsi", "vessel_type_code")
        .count()
    )

    # Most frequent code per MMSI; secondary key keeps ties deterministic.
    most_frequent_first = Window.partitionBy("mmsi").orderBy(
        F.desc("count"), F.asc("vessel_type_code")
    )
    replacement_by_mmsi = (
        pair_counts
        .withColumn("_rank", F.row_number().over(most_frequent_first))
        .filter(F.col("_rank") == 1)
        .select(
            F.col("mmsi").alias("_lookup_mmsi"),
            F.col("vessel_type_code").alias("vessel_type_code_replacement"),
        )
    )

    with_replacement = df_data_spark.join(
        replacement_by_mmsi,
        on=F.col("mmsi") == F.col("_lookup_mmsi"),
        how="left",
    )
    fixed_type = F.when(
        type_needs_fix & F.col("vessel_type_code_replacement").isNotNull(),
        F.col("vessel_type_code_replacement"),
    ).otherwise(F.col("vessel_type_code"))

    # Drop only the helper columns introduced by the join.
    return (
        with_replacement
        .withColumn("vessel_type_code", fixed_type)
        .drop("_lookup_mmsi", "vessel_type_code_replacement")
    )

In [None]:
# Run the default/invalid vessel-type handling.
# NOTE(review): the input is data_2endmonth, so earlier MMSI/IMO fixes are not carried over — confirm.

data_vesstypedi_hand = process_default_invalid_vessel_type(data_2endmonth)

In [None]:
# Re-check after handling: records whose vessel_type_code still lies outside
# 1–255 (this already includes the default value 0), sorted by vessel_type_code.
filtered_data = (
    data_vesstypedi_hand
    .where(~col('vessel_type_code').between(1, 255))
    .sort("vessel_type_code")
)
filtered_data.show()

## Negara Kapal

In [None]:
# Pre-check: records with a missing (null) flag_code, sorted by MMSI.
filtered_data = data_2endmonth.where(col('flag_code').isNull()).sort("mmsi")
filtered_data.show()

In [None]:
# Fungsi handling Missing Flag_Code

def process_missing_flag_code(df_data_spark):
    """Fill missing ``flag_code`` values from the first three digits of the MMSI.

    A record is updated only when ``flag_code`` is null and the MMSI is a
    valid 9-digit number; all other records are left untouched.

    The rule depends only on each record's own values, so it is applied once
    as a single column expression. The previous version collected every
    null-flag row to the driver and re-applied this same expression once per
    row, which only lengthened the query plan.

    NOTE(review): presumably the three-digit MMSI prefix uses the same coding
    as the existing ``flag_code`` values — confirm against the data.

    Parameters
    ----------
    df_data_spark : pyspark.sql.DataFrame
        Must contain the columns ``mmsi`` and ``flag_code``.

    Returns
    -------
    pyspark.sql.DataFrame
        The input columns with ``flag_code`` filled where possible.
    """
    mmsi_is_valid = F.col("mmsi").between(100000000, 999999999)

    # Cast the prefix to the column's existing type so that filling values
    # never changes the schema of `flag_code` (a no-op for string columns).
    flag_code_type = df_data_spark.schema["flag_code"].dataType
    mmsi_prefix = F.substring(F.col("mmsi").cast("string"), 1, 3).cast(flag_code_type)

    return df_data_spark.withColumn(
        "flag_code",
        F.when(F.col("flag_code").isNull() & mmsi_is_valid, mmsi_prefix)
        .otherwise(F.col("flag_code")),
    )

In [None]:
# Run the missing-flag_code handling.
# NOTE(review): the input is data_2endmonth, so earlier MMSI/IMO/vessel-type fixes are not carried over — confirm.

data_flagcodem_hand = process_missing_flag_code(data_2endmonth)

In [None]:
# Re-check after handling: records whose flag_code is still missing (null),
# sorted by MMSI.
filtered_data = data_flagcodem_hand.where(col('flag_code').isNull()).sort("mmsi")
filtered_data.show()

# Save Data

In [None]:
# Save the handled data as Parquet under path_unique (overwrites existing output).
# NOTE(review): `sampledd_data` is not assigned in any cell visible in this
# notebook; on a fresh kernel this line would raise NameError unless it is
# defined elsewhere — confirm which cleaned DataFrame should be written.
# NOTE(review): "header" is a CSV-style option; presumably it has no effect on
# Parquet output — confirm.
sampledd_data.write.option("header", True).mode("overwrite").parquet(path_unique + "data-ais-1persen-dimhandling-dunia-2022.parquet")