# Initialization

In [1]:
import pandas as pd
from pyspark.sql import functions as F

# For 3.3.1
# Register the Sedona functions with the Spark session.
# NOTE(review): `spark` is not created in any cell visible here; presumably the
# notebook environment injects it — confirm before running on a fresh kernel.
from sedona.register import SedonaRegistrator
SedonaRegistrator.registerAll(spark)


#For 3.3.2
from shapely.geometry import Point, Polygon, mapping
import h3.api.numpy_int as h3int 

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, count, countDistinct, when, expr
import calendar
from pyspark.sql.types import IntegerType
from pyspark.sql.window import Window

In [3]:
import pandas as pd
from IPython.display import HTML
import base64
from pyspark.sql.functions import year, month, dayofmonth, hour, minute, second
from pyspark.sql.functions import monotonically_increasing_id

# Eksplorasi Data

## Data AIS

In [4]:
# Storage locations: shared temp prefix, then this notebook's own sub-folder.
save_path = "s3a://ungp-ais-data-historical-backup/user_temp/"
path_unique = f"{save_path}222011349/"

## Read Data

In [5]:
# Load the AIS dataset from Parquet.
# `header` is a CSV reader option and has no meaning for Parquet (the schema is
# stored in the files themselves), so it is no longer passed.
data_ais = spark.read.parquet(path_unique + "data-ais-indonesia-by-mmsi-th-2022.parquet")

# Hapus Duplikat

In [6]:
# Keep one copy of every fully identical row (no column subset given).
data_sampel = data_ais.dropDuplicates()

In [None]:
# Persist the de-duplicated data as Parquet, replacing any previous copy.
# The CSV-only `header` option was dropped: it has no meaning for a Parquet writer.
data_sampel.write.mode("overwrite").parquet(
    path_unique + "data-ais-indonesia-by-mmsi-th-2022-unik.parquet"
)

In [5]:
# Reload the de-duplicated dataset from Parquet.
# `header` is a CSV reader option and has no meaning for Parquet, so it is not passed.
data_sampel = spark.read.parquet(path_unique + "data-ais-indonesia-by-mmsi-th-2022-unik.parquet")

# Quality Assurance

## Nilai Valid

### MMSI Valid

In [None]:
# Inclusive [lower, upper] bounds of the MMSI values counted as valid (nine-digit range).
nilai_valid = [100000000, 999999999]

# Per month name (derived from dt_pos_utc), count rows whose mmsi lies inside the bounds.
jumlah_mmsi_valid_per_bulan = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .where(F.col("mmsi").between(*nilai_valid))
    .groupBy("months")
    .agg(F.count("mmsi").alias("mmsi_valid"))
)

# Expose the result under the shorter name as well.
mmsi_valid = jumlah_mmsi_valid_per_bulan

# Print the per-month counts.
mmsi_valid.show()

+---------+----------+
|   months|mmsi_valid|
+---------+----------+
|     July| 155700247|
| November| 119213501|
| February| 138697160|
|  January| 141179948|
|    March| 149788640|
|  October| 155982352|
|      May| 149841097|
|   August| 155661342|
|    April| 148231231|
|     June| 147191969|
| December| 154116337|
|September| 151249370|
+---------+----------+



### IMO Valid

In [7]:
# Inclusive [lower, upper] bounds of the IMO values counted as valid (seven-digit range).
nilai_valid = [1000000, 9999999]

# Per month name (derived from dt_pos_utc), count rows whose imo lies inside the bounds.
jumlah_imo_valid_per_bulan = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .where(F.col("imo").between(*nilai_valid))
    .groupBy("months")
    .agg(F.count("imo").alias("imo_valid"))
)

# Expose the result under the shorter name as well.
imo_valid = jumlah_imo_valid_per_bulan

# Print the per-month counts.
imo_valid.show()

+---------+---------+
|   months|imo_valid|
+---------+---------+
|     July|147391757|
| November|111469663|
| February|130053652|
|  January|131903146|
|    March|140647822|
|  October|147180567|
|      May|141063315|
|   August|147576290|
|    April|139115054|
|     June|138357982|
| December|144900472|
|September|142540565|
+---------+---------+



### Status Navigasi Valid

In [None]:
# Inclusive [lower, upper] bounds of the nav_status_code values counted as valid.
nilai_valid = [0, 14]

# Per month name (derived from dt_pos_utc), count rows whose nav_status_code lies inside the bounds.
jumlah_nav_status_code_valid_per_bulan = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .where(F.col("nav_status_code").between(*nilai_valid))
    .groupBy("months")
    .agg(F.count("nav_status_code").alias("nav_status_code_valid"))
)

# Expose the result under the shorter name as well.
nav_status_code_valid = jumlah_nav_status_code_valid_per_bulan

# Print the per-month counts.
nav_status_code_valid.show()

+---------+---------------------+
|   months|nav_status_code_valid|
+---------+---------------------+
|     July|            148415283|
| November|            112425580|
| February|            131156936|
|  January|            133065154|
|    March|            141923034|
|  October|            147731976|
|      May|            142263960|
|   August|            148223798|
|    April|            140262977|
|     June|            139421676|
| December|            145610063|
|September|            143063930|
+---------+---------------------+



### Tipe Kapal Valid

In [None]:
# Inclusive [lower, upper] bounds of the vessel_type_code values counted as valid.
nilai_valid = [1, 255]

# Per month name (derived from dt_pos_utc), count rows whose vessel_type_code lies inside the bounds.
jumlah_vessel_type_code_valid_per_bulan = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .where(F.col("vessel_type_code").between(*nilai_valid))
    .groupBy("months")
    .agg(F.count("vessel_type_code").alias("vessel_type_code_valid"))
)

# Expose the result under the shorter name as well.
vessel_type_code_valid = jumlah_vessel_type_code_valid_per_bulan

# Print the per-month counts.
vessel_type_code_valid.show()

+---------+----------------------+
|   months|vessel_type_code_valid|
+---------+----------------------+
|     July|             155658110|
| November|             119202801|
| February|             138650896|
|  January|             141138992|
|    March|             149748827|
|  October|             155941907|
|      May|             149811456|
|   August|             155629165|
|    April|             148197720|
|     June|             147151041|
| December|             154067527|
|September|             151212698|
+---------+----------------------+



### Negara Kapal Valid

In [None]:
# Inclusive [lower, upper] bounds of the flag_code values counted as valid.
nilai_valid = [201, 775]

# Per month name (derived from dt_pos_utc), count rows whose flag_code lies inside the bounds
# (this cell counts the VALID rows).
jumlah_flag_country_code_valid_per_bulan = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .where(F.col("flag_code").between(*nilai_valid))
    .groupBy("months")
    .agg(F.count("flag_code").alias("flag_country_code_valid"))
)

# Expose the result under the shorter name as well.
flag_country_code_valid = jumlah_flag_country_code_valid_per_bulan

# Print the per-month counts.
flag_country_code_valid.show()

+---------+-----------------------+
|   months|flag_country_code_valid|
+---------+-----------------------+
|     July|              155700247|
| November|               96930139|
| February|              138697160|
|  January|              141179948|
|    March|              149788640|
|  October|              155982352|
|      May|              147715918|
|   August|              155661342|
|    April|              148231231|
|     June|              147191969|
| December|              154116337|
|September|              151249370|
+---------+-----------------------+



### Latitude & Longitude Valid

In [None]:
# Inclusive [lower, upper] bounds of the latitude values counted as valid.
nilai_valid = [-90, 90]

# Per month name (derived from dt_pos_utc), count rows whose latitude lies inside the bounds
# (this cell counts the VALID rows).
jumlah_latitude_valid_per_bulan = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .where(F.col("latitude").between(*nilai_valid))
    .groupBy("months")
    .agg(F.count("latitude").alias("latitude_valid"))
)

# Expose the result under the shorter name as well.
latitude_valid = jumlah_latitude_valid_per_bulan

# Print the per-month counts.
latitude_valid.show()

+---------+--------------+
|   months|latitude_valid|
+---------+--------------+
|     July|     155700247|
| November|     119253847|
| February|     138697160|
|  January|     141179948|
|    March|     149788640|
|  October|     155982352|
|      May|     149845329|
|   August|     155661342|
|    April|     148231231|
|     June|     147191969|
| December|     154116337|
|September|     151249370|
+---------+--------------+



In [None]:
# Inclusive [lower, upper] bounds of the longitude values counted as valid.
nilai_valid = [-180, 180]

# Per month name (derived from dt_pos_utc), count rows whose longitude lies inside the bounds
# (this cell counts the VALID rows).
jumlah_longitude_valid_per_bulan = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .where(F.col("longitude").between(*nilai_valid))
    .groupBy("months")
    .agg(F.count("longitude").alias("longitude_valid"))
)

# Expose the result under the shorter name as well.
longitude_valid = jumlah_longitude_valid_per_bulan

# Print the per-month counts.
longitude_valid.show()

+---------+---------------+
|   months|longitude_valid|
+---------+---------------+
|     July|      155700247|
| November|      119253847|
| February|      138697160|
|  January|      141179948|
|    March|      149788640|
|  October|      155982352|
|      May|      149845329|
|   August|      155661342|
|    April|      148231231|
|     June|      147191969|
| December|      154116337|
|September|      151249370|
+---------+---------------+



### dt_pos_utc Valid

In [None]:
# Regular expression for a well-formed 'dt_pos_utc' value of the form
# "<year>-<month>-<day> <hour>:<minute>:<second>".
# Every alternation is wrapped in its own group. Without the groups, `|` splits the
# WHOLE pattern into independent alternatives (e.g. a bare "0[1-9]"), so almost any
# string would be reported as valid.
pattern = (
    r"^[1-9][0-9]{0,3}"
    r"-([1-9]|0[1-9]|1[0-2])"
    r"-([1-9]|0[1-9]|[1-2][0-9]|3[0-1])"
    r" ([0-9]|0[0-9]|1[0-9]|2[0-3])"
    r":[0-5][0-9]:[0-5][0-9]$"
)

# Per month name, count the rows whose 'dt_pos_utc' text matches the pattern (VALID rows).
# The column is cast to string explicitly so the regex is applied to its textual form.
# NOTE(review): values rendered with fractional seconds would not match — confirm
# that this is the intended definition of "valid".
dt_pos_utc_valid_per_bulan = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .filter(F.col("dt_pos_utc").cast("string").rlike(pattern))
    .groupBy("months")
    .agg(F.count("dt_pos_utc").alias("dt_pos_utc_valid"))
)

# Expose the result under the shorter name as well.
dt_pos_utc_valid = dt_pos_utc_valid_per_bulan

# Print the per-month counts.
dt_pos_utc_valid.show()

+---------+----------------+
|   months|dt_pos_utc_valid|
+---------+----------------+
|     July|       155700247|
| November|       119253847|
| February|       138697160|
|  January|       141179948|
|    March|       149788640|
|  October|       155982352|
|      May|       149845329|
|   August|       155661342|
|    April|       148231231|
|     June|       147191969|
| December|       154116337|
|September|       151249370|
+---------+----------------+



## Nilai Default

### MMSI dengan Nilai Default per Bulan

In [None]:
# The two MMSI values treated as defaults/placeholders.
default_value_1 = 0
default_value_2 = 1193046

# Count rows carrying either default MMSI, broken down by month name and by the MMSI itself.
mmsi_default_per_month = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .where(
        (F.col("mmsi") == default_value_1)
        | (F.col("mmsi") == default_value_2)
    )
    .groupBy("months", "mmsi")
    .agg(F.count("mmsi").alias("mmsi_count_default"))
)

# Print the resulting Spark DataFrame.
mmsi_default_per_month.show()

+--------+-------+------------------+
|  months|   mmsi|mmsi_count_default|
+--------+-------+------------------+
|November|      0|              5528|
|November|1193046|              1360|
|     May|      0|               475|
|     May|1193046|                91|
+--------+-------+------------------+



### IMO dengan Nilai Default per Bulan

In [None]:
# IMO value treated as the default/placeholder; change it to the value of interest.
default_value = 0

# Per month name, count the rows whose imo equals the default value.
imo_default_per_month = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .where(F.col("imo") == default_value)
    .groupBy("months")
    .agg(F.count("imo").alias("imo_count_default"))
)

# Print the resulting Spark DataFrame.
imo_default_per_month.show()

+------+-----------------+
|months|imo_count_default|
+------+-----------------+
+------+-----------------+



### Status Navigasi dengan Nilai Default per Bulan

In [None]:
# nav_status label treated as the default/placeholder; change it to the value of interest.
default_value = "Not Defined"

# Per month name, count the rows whose nav_status equals the default label.
nav_status_default_per_month = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .where(F.col("nav_status") == default_value)
    .groupBy("months")
    .agg(F.count("nav_status").alias("nav_status_count_default"))
)

# Print the resulting Spark DataFrame.
nav_status_default_per_month.show()

+---------+------------------------+
|   months|nav_status_count_default|
+---------+------------------------+
|     July|                 2161999|
| November|                 2025493|
| February|                 2356450|
|  January|                 2378087|
|    March|                 2451983|
|  October|                 2397638|
|      May|                 2547225|
|   August|                 2214161|
|    April|                 2453652|
|     June|                 2355284|
| December|                 2590777|
|September|                 2300887|
+---------+------------------------+



### Tipe Kapal dengan Nilai Default per Bulan

In [None]:
# vessel_type label treated as the default/placeholder; change it to the value of interest.
default_value = "Not Available"

# Per month name, count the rows whose vessel_type equals the default label.
vessel_type_default_per_month = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .where(F.col("vessel_type") == default_value)
    .groupBy("months")
    .agg(F.count("vessel_type").alias("vessel_type_count_default"))
)

# Print the resulting Spark DataFrame.
vessel_type_default_per_month.show()

+---------+-------------------------+
|   months|vessel_type_count_default|
+---------+-------------------------+
|     July|                    42137|
| November|                    51046|
| February|                    46264|
|  January|                    40956|
|    March|                    39813|
|  October|                    40445|
|      May|                    33873|
|   August|                    32177|
|    April|                    33511|
|     June|                    40928|
| December|                    48810|
|September|                    36672|
+---------+-------------------------+



### Negara Kapal dengan Nilai Default per Bulan

In [None]:
# Working assumption: the default flag code is 0.
# Change the value below to the one of interest.
default_value = 0

# Per month name, count the rows whose flag_code equals the default value.
flag_country_default_per_month = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .where(F.col("flag_code") == default_value)
    .groupBy("months")
    .agg(F.count("flag_code").alias("flag_country_count_default"))
)

# Print the resulting Spark DataFrame.
flag_country_default_per_month.show()

+------+--------------------------+
|months|flag_country_count_default|
+------+--------------------------+
+------+--------------------------+



### Latitude & Longitude dengan Nilai Default per Bulan

In [None]:
# Latitude value treated as the default/placeholder; change it to the value of interest.
default_value = 91

# Per month name, count the rows whose latitude equals the default value.
latitude_default_per_month = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .where(F.col("latitude") == default_value)
    .groupBy("months")
    .agg(F.count("latitude").alias("latitude_count_default"))
)

# Print the resulting Spark DataFrame.
latitude_default_per_month.show()

+------+----------------------+
|months|latitude_count_default|
+------+----------------------+
+------+----------------------+



In [None]:
# Longitude value treated as the default/placeholder; change it to the value of interest.
default_value = 181

# Per month name, count the rows whose longitude equals the default value.
longitude_default_per_month = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .where(F.col("longitude") == default_value)
    .groupBy("months")
    .agg(F.count("longitude").alias("longitude_count_default"))
)

# Print the resulting Spark DataFrame.
longitude_default_per_month.show()

+------+-----------------------+
|months|longitude_count_default|
+------+-----------------------+
+------+-----------------------+



### dt_pos_utc dengan Nilai Default per Bulan

In [None]:
# Text treated as the default/placeholder timestamp.
default_value = "0-0-0 24:60:60"

# Per month name, count the rows whose 'dt_pos_utc' equals the default text.
# NOTE(review): if 'dt_pos_utc' is a timestamp column, comparing it with this
# unparseable string presumably never evaluates to true — confirm the column type.
# NOTE(review): the output column name below looks misspelled
# ("dt_pos_ut_count_dafault"); it is kept unchanged in case other code reads it.
dt_pos_utc_default_per_month = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .where(F.col("dt_pos_utc") == default_value)
    .groupBy("months")
    .agg(F.count("dt_pos_utc").alias("dt_pos_ut_count_dafault"))
)

# Print the resulting Spark DataFrame.
dt_pos_utc_default_per_month.show()

+------+-----------------------+
|months|dt_pos_ut_count_dafault|
+------+-----------------------+
+------+-----------------------+



## Tidak Valid

### MMSI Tidak Valid

In [None]:
# Inclusive [lower, upper] bounds of the MMSI values counted as valid (nine-digit range).
nilai_valid = [100000000, 999999999]

# Per month name, count rows whose mmsi is outside the valid bounds, is not one of the
# two default values (0 and 1193046), and is not null.
jumlah_mmsi_tidak_valid_per_bulan = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .where(
        ~F.col("mmsi").between(*nilai_valid)
        & (F.col("mmsi") != 0)
        & (F.col("mmsi") != 1193046)
        & F.col("mmsi").isNotNull()
    )
    .groupBy("months")
    .agg(F.count("mmsi").alias("mmsi_tidak_valid"))
)

# Expose the result under the shorter name as well.
mmsi_invalid = jumlah_mmsi_tidak_valid_per_bulan

# Print the per-month counts.
mmsi_invalid.show()


+--------+----------------+
|  months|mmsi_tidak_valid|
+--------+----------------+
|November|           33458|
|     May|            3666|
+--------+----------------+



### IMO Tidak Valid

In [None]:
# Inclusive [lower, upper] bounds of the IMO values counted as valid (seven-digit range).
nilai_valid = [1000000, 9999999]

# Per month name, count rows whose imo is outside the valid bounds, is not the
# default value 0, and is not null.
jumlah_imo_tidak_valid_per_bulan = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .where(
        ~F.col("imo").between(*nilai_valid)
        & (F.col("imo") != 0)
        & F.col("imo").isNotNull()
    )
    .groupBy("months")
    .agg(F.count("imo").alias("imo_tidak_valid"))
)

# Expose the result under the shorter name as well.
imo_invalid = jumlah_imo_tidak_valid_per_bulan

# Print the per-month counts.
imo_invalid.show()

+---------+---------------+
|   months|imo_tidak_valid|
+---------+---------------+
|     July|         357533|
| November|         319696|
| February|         335901|
|  January|         379319|
|    March|         380841|
|  October|         341021|
|      May|         457276|
|   August|         360863|
|    April|         422751|
|     June|         404061|
| December|         348589|
|September|         331227|
+---------+---------------+



### Status Navigasi Tidak Valid

In [None]:
# Inclusive [lower, upper] bounds of the nav_status_code values counted as valid.
nilai_valid = [0, 14]

# Per month name, count rows whose nav_status_code is outside the valid bounds,
# is not 15 (excluded here as the default code), and is not null.
jumlah_nav_status_code_tidak_valid_per_bulan = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .where(
        ~F.col("nav_status_code").between(*nilai_valid)
        & (F.col("nav_status_code") != 15)
        & F.col("nav_status_code").isNotNull()
    )
    .groupBy("months")
    .agg(F.count("nav_status_code").alias("nav_status_code_tidak_valid"))
)

# Expose the result under the shorter name as well.
nav_status_code_invalid = jumlah_nav_status_code_tidak_valid_per_bulan

# Print the per-month counts.
nav_status_code_invalid.show()

+---------+---------------------------+
|   months|nav_status_code_tidak_valid|
+---------+---------------------------+
|     July|                    5122965|
| November|                    4802774|
| February|                    5183774|
|  January|                    5736707|
|    March|                    5413623|
|  October|                    5852738|
|      May|                    5034144|
|   August|                    5223383|
|    April|                    5514602|
|     June|                    5415009|
| December|                    5915497|
|September|                    5884553|
+---------+---------------------------+



### Tipe Kapal Tidak Valid

In [None]:
# Inclusive [lower, upper] bounds of the vessel_type_code values counted as valid.
nilai_valid = [1, 255]

# Per month name, count rows whose vessel_type_code is outside the valid bounds,
# is not the default value 0, and is not null.
jumlah_vessel_type_code_tidak_valid_per_bulan = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .where(
        ~F.col("vessel_type_code").between(*nilai_valid)
        & (F.col("vessel_type_code") != 0)
        & F.col("vessel_type_code").isNotNull()
    )
    .groupBy("months")
    .agg(F.count("vessel_type_code").alias("vessel_type_code_tidak_valid"))
)

# Expose the result under the shorter name as well.
vessel_type_code_invalid = jumlah_vessel_type_code_tidak_valid_per_bulan

# Print the per-month counts.
vessel_type_code_invalid.show()

+------+----------------------------+
|months|vessel_type_code_tidak_valid|
+------+----------------------------+
+------+----------------------------+



### Negara Kapal Tidak Valid

In [None]:
# Inclusive [lower, upper] bounds of the flag_code values counted as valid.
nilai_valid = [201, 775]

# Per month name, count rows whose flag_code is outside the valid bounds,
# is not the default value 0, and is not null.
jumlah_flag_country_code_tidak_valid_per_bulan = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .where(
        ~F.col("flag_code").between(*nilai_valid)
        & (F.col("flag_code") != 0)
        & F.col("flag_code").isNotNull()
    )
    .groupBy("months")
    .agg(F.count("flag_code").alias("flag_country_code_tidak_valid"))
)

# Expose the result under the shorter name as well.
flag_country_code_invalid = jumlah_flag_country_code_tidak_valid_per_bulan

# Print the per-month counts.
flag_country_code_invalid.show()

+------+-----------------------------+
|months|flag_country_code_tidak_valid|
+------+-----------------------------+
+------+-----------------------------+



### Latitude & Longitude Tidak Valid

In [None]:
# Inclusive [lower, upper] bounds of the latitude values counted as valid.
nilai_valid = [-90, 90]

# Per month name, count rows whose latitude is outside the valid bounds,
# is not the default value 91, and is not null.
jumlah_latitude_tidak_valid_per_bulan = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .where(
        ~F.col("latitude").between(*nilai_valid)
        & (F.col("latitude") != 91)
        & F.col("latitude").isNotNull()
    )
    .groupBy("months")
    .agg(F.count("latitude").alias("latitude_tidak_valid"))
)

# Expose the result under the shorter name as well.
latitude_invalid = jumlah_latitude_tidak_valid_per_bulan

# Print the per-month counts.
latitude_invalid.show()

+------+--------------------+
|months|latitude_tidak_valid|
+------+--------------------+
+------+--------------------+



In [None]:
# Inclusive [lower, upper] bounds of the longitude values counted as valid.
nilai_valid = [-180, 180]

# Per month name, count rows whose longitude is outside the valid bounds,
# is not the default value 181, and is not null.
jumlah_longitude_tidak_valid_per_bulan = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .where(
        ~F.col("longitude").between(*nilai_valid)
        & (F.col("longitude") != 181)
        & F.col("longitude").isNotNull()
    )
    .groupBy("months")
    .agg(F.count("longitude").alias("longitude_tidak_valid"))
)

# Expose the result under the shorter name as well.
longitude_invalid = jumlah_longitude_tidak_valid_per_bulan

# Print the per-month counts.
longitude_invalid.show()

+------+---------------------+
|months|longitude_tidak_valid|
+------+---------------------+
+------+---------------------+



### dt_pos_utc Tidak Valid

In [None]:
# Regular expression for a well-formed 'dt_pos_utc' value of the form
# "<year>-<month>-<day> <hour>:<minute>:<second>".
# Every alternation is wrapped in its own group. Without the groups, `|` splits the
# WHOLE pattern into independent alternatives (e.g. a bare "0[1-9]"), so almost any
# string would match and nothing could ever be reported as invalid.
pattern = (
    r"^[1-9][0-9]{0,3}"
    r"-([1-9]|0[1-9]|1[0-2])"
    r"-([1-9]|0[1-9]|[1-2][0-9]|3[0-1])"
    r" ([0-9]|0[0-9]|1[0-9]|2[0-3])"
    r":[0-5][0-9]:[0-5][0-9]$"
)

# Text treated as the default/placeholder timestamp; such rows are not counted as invalid.
default_value = "0-0-0 24:60:60"

# Per month name, count rows whose 'dt_pos_utc' text does not match the pattern,
# is not the default text, and is not null.
# The comparisons run on the string form of the column. Comparing a timestamp column
# directly with the unparseable default text presumably yields NULL, which would
# silently drop every row from the result.
# NOTE(review): values rendered with fractional seconds will be counted as invalid —
# confirm that this is the intended definition.
dt_pos_utc_tidak_valid_per_bulan = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .filter(
        ~F.col("dt_pos_utc").cast("string").rlike(pattern)
        & (F.col("dt_pos_utc").cast("string") != default_value)
        & F.col("dt_pos_utc").isNotNull()
    )
    .groupBy("months")
    .agg(F.count("dt_pos_utc").alias("dt_pos_utc_tidak_valid"))
)

# Expose the result under the shorter name as well.
dt_pos_utc_invalid = dt_pos_utc_tidak_valid_per_bulan

# Print the per-month counts.
dt_pos_utc_invalid.show()

+------+----------------------+
|months|dt_pos_utc_tidak_valid|
+------+----------------------+
+------+----------------------+



## Missing Value

### MS MMSI

In [None]:
# Per month name, count the rows whose "mmsi" is null
# (null flag cast to 0/1 and summed).
missing_values_per_month_mmsi = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .groupBy("months")
    .agg(F.sum(F.isnull(F.col("mmsi")).cast("int")).alias("mmsi_missing"))
)

# Print the resulting Spark DataFrame.
missing_values_per_month_mmsi.show()

+---------+------------+
|   months|mmsi_missing|
+---------+------------+
|     July|           0|
| November|           0|
| February|           0|
|  January|           0|
|    March|           0|
|  October|           0|
|      May|           0|
|   August|           0|
|    April|           0|
|     June|           0|
| December|           0|
|September|           0|
+---------+------------+



### MS IMO

In [None]:
# Per month name, count the rows whose "imo" is null
# (null flag cast to 0/1 and summed).
missing_values_per_month_imo = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .groupBy("months")
    .agg(F.sum(F.isnull(F.col("imo")).cast("int")).alias("imo_missing"))
)

# Print the resulting Spark DataFrame.
missing_values_per_month_imo.show()

+---------+-----------+
|   months|imo_missing|
+---------+-----------+
|     July|    7950957|
| November|    7464488|
| February|    8307607|
|  January|    8897483|
|    March|    8759977|
|  October|    8460764|
|      May|    8324738|
|   August|    7724189|
|    April|    8693426|
|     June|    8429926|
| December|    8867276|
|September|    8377578|
+---------+-----------+



### MS Status Navigasi

In [None]:
# Per month name, count the rows whose "nav_status" is null
# (null flag cast to 0/1 and summed).
missing_values_per_month_nav_status = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .groupBy("months")
    .agg(F.sum(F.isnull(F.col("nav_status")).cast("int")).alias("nav_status_missing"))
)

# Print the resulting Spark DataFrame.
missing_values_per_month_nav_status.show()

+---------+------------------+
|   months|nav_status_missing|
+---------+------------------+
|     July|                 0|
| November|                 0|
| February|                 0|
|  January|                 0|
|    March|                 0|
|  October|                 0|
|      May|                 0|
|   August|                 0|
|    April|                 0|
|     June|                 0|
| December|                 0|
|September|                 0|
+---------+------------------+



### MS Tipe Kapal

In [None]:
# Per month name, count the rows whose "vessel_type" is null
# (null flag cast to 0/1 and summed).
missing_values_per_month_vessel_type = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .groupBy("months")
    .agg(F.sum(F.isnull(F.col("vessel_type")).cast("int")).alias("vessel_type_missing"))
)

# Print the resulting Spark DataFrame.
missing_values_per_month_vessel_type.show()

+---------+-------------------+
|   months|vessel_type_missing|
+---------+-------------------+
|     July|                  0|
| November|                  0|
| February|                  0|
|  January|                  0|
|    March|                  0|
|  October|                  0|
|      May|                  0|
|   August|                  0|
|    April|                  0|
|     June|                  0|
| December|                  0|
|September|                  0|
+---------+-------------------+



### MS Negara Kapal

In [None]:
# Per month name, count the rows whose "flag_code" is null
# (null flag cast to 0/1 and summed).
missing_values_per_month_flag_code = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .groupBy("months")
    .agg(F.sum(F.isnull(F.col("flag_code")).cast("int")).alias("flag_code_missing"))
)

# Print the resulting Spark DataFrame.
missing_values_per_month_flag_code.show()

+---------+-----------------+
|   months|flag_code_missing|
+---------+-----------------+
|     July|                0|
| November|         22323708|
| February|                0|
|  January|                0|
|    March|                0|
|  October|                0|
|      May|          2129411|
|   August|                0|
|    April|                0|
|     June|                0|
| December|                0|
|September|                0|
+---------+-----------------+



### MS Latitude & Longitude

In [None]:
# Per month name, count the rows whose "latitude" is null
# (null flag cast to 0/1 and summed).
missing_values_per_month_latitude = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .groupBy("months")
    .agg(F.sum(F.isnull(F.col("latitude")).cast("int")).alias("latitude_missing"))
)

# Print the resulting Spark DataFrame.
missing_values_per_month_latitude.show()

+---------+----------------+
|   months|latitude_missing|
+---------+----------------+
|     July|               0|
| November|               0|
| February|               0|
|  January|               0|
|    March|               0|
|  October|               0|
|      May|               0|
|   August|               0|
|    April|               0|
|     June|               0|
| December|               0|
|September|               0|
+---------+----------------+



In [None]:
# Per month name, count the rows whose "longitude" is null
# (null flag cast to 0/1 and summed).
missing_values_per_month_longitude = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .groupBy("months")
    .agg(F.sum(F.isnull(F.col("longitude")).cast("int")).alias("longitude_missing"))
)

# Print the resulting Spark DataFrame.
missing_values_per_month_longitude.show()

+---------+-----------------+
|   months|longitude_missing|
+---------+-----------------+
|     July|                0|
| November|                0|
| February|                0|
|  January|                0|
|    March|                0|
|  October|                0|
|      May|                0|
|   August|                0|
|    April|                0|
|     June|                0|
| December|                0|
|September|                0|
+---------+-----------------+



### MS dt_pos_utc

In [None]:
# Per month name, count the rows whose "dt_pos_utc" is null
# (null flag cast to 0/1 and summed).
missing_values_per_month_dt_pos_utc = (
    data_sampel
    .withColumn("months", F.date_format("dt_pos_utc", "MMMM"))
    .groupBy("months")
    .agg(F.sum(F.isnull(F.col("dt_pos_utc")).cast("int")).alias("dt_pos_utc_missing"))
)

# Print the resulting Spark DataFrame.
missing_values_per_month_dt_pos_utc.show()

+---------+------------------+
|   months|dt_pos_utc_missing|
+---------+------------------+
|     July|                 0|
| November|                 0|
| February|                 0|
|  January|                 0|
|    March|                 0|
|  October|                 0|
|      May|                 0|
|   August|                 0|
|    April|                 0|
|     June|                 0|
| December|                 0|
|September|                 0|
+---------+------------------+

