# Inilization

In [1]:
import pandas as pd
from pyspark.sql import functions as F

#For 3.3.1
#Register Sedona Functions to Spark
from sedona.register import SedonaRegistrator
SedonaRegistrator.registerAll(spark)


#For 3.3.2
from shapely.geometry import Point, Polygon, mapping
import h3.api.numpy_int as h3int 

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, count, countDistinct, when, expr
import calendar
from pyspark.sql.types import IntegerType
from pyspark.sql.window import Window

In [3]:
import pandas as pd
from IPython.display import HTML
import base64
from pyspark.sql.functions import year, month, dayofmonth, hour, minute, second
from pyspark.sql.functions import monotonically_increasing_id

# Eksplorasi Data

## Data AIS

In [4]:
# Path
save_path = "s3a://ungp-ais-data-historical-backup/user_temp/"
path_unique = save_path + "222011349/"

In [5]:
# Read Data
df_data = spark.read.parquet(path_unique + "data-ais-indonesia-by-mmsi-2022.parquet", header=True)

In [8]:
df_data.count()

1769532126

## Filter Data 2022

In [6]:
# Ekstrak tahun dari kolom yang berisi tanggal atau waktu
df_data = df_data.withColumn("tahun", date_format("dt_pos_utc", "yyyy"))

# Filter data untuk tahun 2022
data_sampel = df_data.filter(df_data["tahun"] == 2022)

In [11]:
data_sampel.count()

1769530772

## Save Data

In [7]:
# Save
data_sampel.write.option("header",True).mode("overwrite").parquet(path_unique + "data-ais-indonesia-by-mmsi-th-2022.parquet")

## Read Data

In [5]:
# Read Data
data_sampel = spark.read.parquet(path_unique + "data-ais-indonesia-by-mmsi-th-2022.parquet", header=True)

## Record per Bulan

In [12]:
jumlah_record_per_bulan = data_sampel.withColumn("months", date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(count("*").alias("jumlah_record_per_bulan"))

In [13]:
jumlah_record_per_bulan.show(12)

+---------+-----------------------+
|   months|jumlah_record_per_bulan|
+---------+-----------------------+
| February|              139182981|
|      May|              149845329|
|     July|              155700247|
|     June|              147191969|
|   August|              155661342|
|  January|              143327127|
|    March|              149788640|
|  October|              155982352|
|September|              151249370|
|    April|              148231231|
| November|              119253847|
| December|              154116337|
+---------+-----------------------+



## Statistical Summary Data AIS

In [14]:
# MMSI, IMO, Status Navigasi, Tipe Kapal, Negara Kapal

# Variabel yang ingin dihitung statistik deskriptif
variables_to_describe = ["mmsi", "imo", "nav_status_code", "vessel_type_code", "flag_code", "sog"]

# Ambil statistik deskriptif tertentu untuk kolom-kolom tersebut
stats_df = data_sampel.select(variables_to_describe).describe()

# Pilih statistik tertentu
selected_stats = ["count", "mean", "stddev", "min", "25%", "50%", "75%", "max"]
stats_df = stats_df.select(["summary"] + [col(var).alias(var) for var in variables_to_describe]).filter(col("summary").isin(selected_stats))

# Tampilkan statistik deskriptif
stats_df.show()

+-------+--------------------+--------------------+------------------+------------------+------------------+-----------------+
|summary|                mmsi|                 imo|   nav_status_code|  vessel_type_code|         flag_code|              sog|
+-------+--------------------+--------------------+------------------+------------------+------------------+-----------------+
|  count|          1769530772|          1669108335|        1769530772|        1769530772|        1745077653|       1769530772|
|   mean| 4.567868926220443E8|1.0113834411052296E7|1.8508174278870353| 71.35050869519436|456.44845294858635|7.069534056787124|
| stddev|1.2776518044793731E8|2.1608514883694395E7|3.8250097516904678|12.393169645436048|127.90227998171707|6.224082691685074|
|    min|                   0|                   1|                 0|                 0|               201|              0.0|
|    max|          1073675264|          1073741823|                16|               255|               750|   

In [15]:
# Latitude, Longitude, dt_pos_utc

# Pemisahan komponen waktu dari kolom dt_pos_utc
data_sampel = data_sampel.withColumn("year", year("dt_pos_utc"))
data_sampel = data_sampel.withColumn("month", month("dt_pos_utc"))
data_sampel = data_sampel.withColumn("day", dayofmonth("dt_pos_utc"))
data_sampel = data_sampel.withColumn("hour", hour("dt_pos_utc"))
data_sampel = data_sampel.withColumn("minute", minute("dt_pos_utc"))
data_sampel = data_sampel.withColumn("second", second("dt_pos_utc"))

# Variabel yang ingin dihitung statistik deskriptif
variables_to_describe_2 = ["latitude", "longitude", "year", "month", "day", "hour", "minute", "second"]

# Ambil statistik deskriptif tertentu untuk kolom-kolom tersebut
stats_df_2 = data_sampel.select(variables_to_describe_2).describe()

# Pilih statistik tertentu
selected_stats = ["count", "mean", "stddev", "min", "25%", "50%", "75%", "max"]
stats_df_2 = stats_df_2.select(["summary"] + [col(var).alias(var) for var in variables_to_describe_2]).filter(col("summary").isin(selected_stats))

# Tampilkan statistik deskriptif
stats_df_2.show()

+-------+------------------+-----------------+--------------------+------------------+------------------+------------------+------------------+------------------+
|summary|          latitude|        longitude|                year|             month|               day|              hour|            minute|            second|
+-------+------------------+-----------------+--------------------+------------------+------------------+------------------+------------------+------------------+
|  count|        1769530772|       1769530772|          1769530772|        1769530772|        1769530772|        1769530772|        1769530772|        1769530772|
|   mean| 5.825686426227572|51.06176116047818|              2022.0| 6.506702935143984|15.787211795941573|11.614630207741875|29.542165805862574|29.252984066784006|
| stddev|25.344814248930494|86.71479105811035|2.266168932666207...|3.4129164839731945| 8.870998362726208|6.9375646116925225|17.306239493617095|17.389617016617862|
|    min|             

## Nilai Unik

### MMSI Unik per Bulan

In [19]:
# Hitung jumlah 'mmsi' yang unik per bulan
unique_mmsi_per_month = data_sampel.withColumn("months", date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(countDistinct("mmsi").alias("unique_mmsi_count"))

# Tampilkan DataFrame Spark
unique_mmsi_per_month.show()

+---------+-----------------+
|   months|unique_mmsi_count|
+---------+-----------------+
|     July|            35014|
| November|            38339|
| February|            34180|
|  January|            34026|
|    March|            34714|
|  October|            35986|
|      May|            35243|
|   August|            35340|
|    April|            34785|
|     June|            34605|
| December|            35766|
|September|            35823|
+---------+-----------------+



### IMO Unik per Bulan

In [20]:
# Hitung jumlah 'imo' yang unik per bulan
unique_imo_per_month = data_sampel.withColumn("months", date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(countDistinct("imo").alias("unique_imo_count"))

# Tampilkan DataFrame Spark
unique_imo_per_month.show()

+---------+----------------+
|   months|unique_imo_count|
+---------+----------------+
|     July|           24705|
| November|           24991|
| February|           24068|
|  January|           24161|
|    March|           24472|
|  October|           24904|
|      May|           24518|
|   August|           24764|
|    April|           24375|
|     June|           24590|
| December|           24912|
|September|           24839|
+---------+----------------+



### Status Navigasi Setahun

In [None]:
# Hitung jumlah 'mmsi' per masing-masing 'nav_status' dengan DataFrame API
mmsi_count_per_nav_status = data_sampel.groupBy("nav_status").agg(F.count("mmsi").alias("mmsi_count"))

# Tampilkan DataFrame Spark
mmsi_count_per_nav_status.show()

+--------------------+----------+
|          nav_status|mmsi_count|
+--------------------+----------+
|              Moored| 223746854|
|Restricted Manoeu...|  13688975|
|             Aground|    410920|
|         Not Defined|  28277882|
|   Not Under Command|  13572253|
|  Engaged In Fishing|   4678671|
|    Underway Sailing|  40991570|
|             Unknown|  65204319|
|           At Anchor| 257546580|
|Under Way Using E...|1121412748|
+--------------------+----------+



### Tipe Kapal Setahun

In [None]:
# Hitung jumlah 'mmsi' per masing-masing 'vessel_type'
mmsi_count_per_vessel_type = data_sampel.groupBy("vessel_type").agg(count("mmsi").alias("mmsi_count"))

# Tampilkan DataFrame Spark
mmsi_count_per_vessel_type.show(mmsi_count_per_vessel_type.count(), truncate = False)

+------------------------------------+----------+
|vessel_type                         |mmsi_count|
+------------------------------------+----------+
|Sailing                             |5086627   |
|Tanker                              |408580328 |
|Ships Not Party to Armed Conflict   |803767    |
|Military                            |1985202   |
|Towing                              |8762577   |
|Reserved                            |3762261   |
|SAR                                 |726339    |
|Unknown                             |28610799  |
|Other                               |46753351  |
|UNAVAILABLE                         |41906     |
|Tug                                 |69560838  |
|Law Enforcement                     |2340110   |
|Pleasure Craft                      |13586891  |
|Passenger                           |26935753  |
|Diving                              |792132    |
|Fishing                             |43863196  |
|Port Tender                         |1490724   |


### Negara Kapal Setahun

In [None]:
# Hitung jumlah 'mmsi' per masing-masing 'flag_country'
mmsi_count_per_flag_country = data_sampel.groupBy("flag_country").agg(count("mmsi").alias("mmsi_count"))

# Tampilkan DataFrame Spark
mmsi_count_per_flag_country.show(mmsi_count_per_flag_country.count(), truncate = False)

+--------------------------------+----------+
|flag_country                    |mmsi_count|
+--------------------------------+----------+
|Sweden                          |1879939   |
|Kiribati                        |804788    |
|Guyana                          |88167     |
|Philippines                     |7624661   |
|Eritrea                         |9650      |
|Djibouti                        |427976    |
|Singapore                       |175202393 |
|Malaysia                        |31869594  |
|Fiji                            |58597     |
|Turkey                          |1938494   |
|Germany                         |2599150   |
|Comoros                         |359528    |
|Cambodia                        |16829     |
|Maldives                        |89640     |
|Jordan                          |69884     |
|Yemen                           |3654      |
|Iraq                            |3574      |
|Crozet Archipelago              |1237      |
|Malawi                          |

# Quality Assurance

## MMSI Unik per

### Status Navigasi

In [21]:
# Hitung jumlah 'mmsi' yang unik berdasarkan 'nav_status'
unique_mmsi_per_nav_status = data_sampel.groupBy("nav_status").agg(countDistinct("mmsi").alias("unique_mmsi_per_nav_status_count"))

# Tampilkan DataFrame Spark
unique_mmsi_per_nav_status.show()

+--------------------+--------------------------------+
|          nav_status|unique_mmsi_per_nav_status_count|
+--------------------+--------------------------------+
|              Moored|                           23599|
|Restricted Manoeu...|                            5081|
|             Aground|                             836|
|         Not Defined|                            6331|
|   Not Under Command|                           15299|
|  Engaged In Fishing|                             493|
|    Underway Sailing|                           11410|
|             Unknown|                           25555|
|           At Anchor|                           24233|
|Under Way Using E...|                           28009|
+--------------------+--------------------------------+



### Tipe Kapal

In [22]:
# Hitung jumlah 'mmsi' yang unik berdasarkan 'vessel_type'
unique_mmsi_per_vessel_type = data_sampel.groupBy("vessel_type").agg(countDistinct("mmsi").alias("unique_mmsi_per_vessel_type_count"))

# Tampilkan DataFrame Spark
unique_mmsi_per_vessel_type.show(unique_mmsi_per_vessel_type.count(), truncate=False)

+------------------------------------+---------------------------------+
|vessel_type                         |unique_mmsi_per_vessel_type_count|
+------------------------------------+---------------------------------+
|Sailing                             |849                              |
|Tanker                              |8555                             |
|Ships Not Party to Armed Conflict   |64                               |
|Military                            |265                              |
|Towing                              |1413                             |
|Reserved                            |1153                             |
|SAR                                 |108                              |
|Unknown                             |20671                            |
|Other                               |2113                             |
|UNAVAILABLE                         |299                              |
|Tug                                 |6351         

### Negara Kapal

In [23]:
# Filter out rows where 'flag_country' is not null
df_filtered = data_sampel.filter(col("flag_country").isNotNull())

# Hitung jumlah 'mmsi' yang unik berdasarkan 'flag_country'
unique_mmsi_per_flag_country = df_filtered.groupBy("flag_country").agg(countDistinct("mmsi").alias("unique_mmsi_per_flag_country_count"))

# Tampilkan DataFrame Spark
unique_mmsi_per_flag_country.show(unique_mmsi_per_flag_country.count(), truncate=False) 

+--------------------------------+----------------------------------+
|flag_country                    |unique_mmsi_per_flag_country_count|
+--------------------------------+----------------------------------+
|Chad                            |1                                 |
|Kiribati                        |45                                |
|Guyana                          |8                                 |
|Philippines                     |163                               |
|Singapore                       |3587                              |
|Malaysia                        |1030                              |
|Germany                         |59                                |
|Palau                           |79                                |
|France                          |84                                |
|Greece                          |345                               |
|Taiwan                          |351                               |
|British Virgin Isla

## IMO Unik per

### Status Navigasi

In [24]:
# Hitung jumlah 'mmsi' yang unik berdasarkan 'nav_status'
unique_imo_per_nav_status = data_sampel.select("imo","nav_status").groupBy("nav_status").agg(countDistinct("imo").alias("unique_imo_per_nav_status_count"))

# Tampilkan DataFrame Spark
unique_imo_per_nav_status.show(unique_imo_per_nav_status.count(), truncate=False)

+--------------------------+-------------------------------+
|nav_status                |unique_imo_per_nav_status_count|
+--------------------------+-------------------------------+
|Moored                    |24559                          |
|Aground                   |804                            |
|Restricted Manoeuvrability|4991                           |
|Not Defined               |5809                           |
|Not Under Command         |14958                          |
|Engaged In Fishing        |436                            |
|Underway Sailing          |11046                          |
|Unknown                   |7145                           |
|At Anchor                 |24061                          |
|Under Way Using Engine    |27442                          |
+--------------------------+-------------------------------+



### Tipe Kapal

In [25]:
# Hitung jumlah 'mmsi' yang unik berdasarkan 'vessel_type'
unique_imo_per_vessel_type = data_sampel.select("imo","vessel_type").groupBy("vessel_type").agg(countDistinct("imo").alias("unique_imo_per_vessel_type_count"))

# Tampilkan DataFrame Spark
unique_imo_per_vessel_type.show(unique_imo_per_vessel_type.count(), truncate=False)

+------------------------------------+--------------------------------+
|vessel_type                         |unique_imo_per_vessel_type_count|
+------------------------------------+--------------------------------+
|Sailing                             |215                             |
|Tanker                              |9154                            |
|Military                            |167                             |
|Towing                              |586                             |
|Reserved                            |520                             |
|Unknown                             |5580                            |
|Other                               |1644                            |
|UNAVAILABLE                         |90                              |
|Tug                                 |1792                            |
|Law Enforcement                     |118                             |
|Pleasure Craft                      |958                       

### Negara Kapal

In [26]:
# Hitung jumlah 'mmsi' yang unik berdasarkan 'vessel_type'
unique_imo_per_flag_country = data_sampel.select("imo","flag_country").groupBy("flag_country").agg(countDistinct("imo").alias("unique_imo_per_flag_country_count"))

# Tampilkan DataFrame Spark
unique_imo_per_flag_country.show(unique_imo_per_flag_country.count(), truncate=False)

+--------------------------------+---------------------------------+
|flag_country                    |unique_imo_per_flag_country_count|
+--------------------------------+---------------------------------+
|Kiribati                        |29                               |
|Philippines                     |141                              |
|Djibouti                        |20                               |
|Singapore                       |2903                             |
|Malaysia                        |629                              |
|Germany                         |57                               |
|Palau                           |92                               |
|France                          |71                               |
|Greece                          |470                              |
|Sri Lanka                       |10                               |
|Taiwan                          |204                              |
|British Virgin Islands          |

## Nilai Valid

### MMSI Valid

In [27]:
# Definisikan nilai mmsi yang valid
nilai_valid = [100000000, 999999999]

# Jumlah mmsi yang valid
jumlah_mmsi_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(data_sampel['mmsi'].between(nilai_valid[0], nilai_valid[1])) \
    .groupBy("months").agg(F.count("mmsi").alias("mmsi_valid"))

# Buat DataFrame hasil
mmsi_valid = jumlah_mmsi_valid_per_bulan

# Tampilkan hasil
mmsi_valid.show()

+---------+----------+
|   months|mmsi_valid|
+---------+----------+
| November| 119213501|
| February| 139182981|
|  January| 143327127|
|    April| 148231231|
| December| 154116337|
|   August| 155661342|
|     June| 147191969|
|September| 151249370|
|  October| 155982352|
|     July| 155700247|
|    March| 149788640|
|      May| 149841097|
+---------+----------+



### IMO Valid

In [28]:
# Definisikan nilai imo yang valid
nilai_valid = [1000000, 9999999]

# Hitung jumlah imo valid per bulan
jumlah_imo_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(data_sampel['imo'].between(nilai_valid[0], nilai_valid[1])) \
    .groupBy("months").agg(F.count("imo").alias("imo_valid"))

# Buat DataFrame hasil
imo_valid = jumlah_imo_valid_per_bulan

# Tampilkan hasil
imo_valid.show() 

+---------+---------+
|   months|imo_valid|
+---------+---------+
|  January|133909316|
| February|130509544|
|     July|147391757|
|  October|147180567|
|   August|147576290|
|September|142540565|
|     June|138357982|
|      May|141063315|
|    March|140647822|
|    April|139115054|
| November|111469663|
| December|144900472|
+---------+---------+



### Status Navigasi Valid

In [29]:
# Definisikan nilai nav_status_code yang valid
nilai_valid = [0, 14]

# Hitung jumlah nav_status_code  valid per bulan
jumlah_nav_status_code_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(data_sampel['nav_status_code'].between(nilai_valid[0], nilai_valid[1])) \
    .groupBy("months").agg(F.count("nav_status_code").alias("nav_status_code_valid"))

# Buat DataFrame hasil
nav_status_code_valid = jumlah_nav_status_code_valid_per_bulan

# Tampilkan hasil
nav_status_code_valid.show() 

+---------+---------------------+
|   months|nav_status_code_valid|
+---------+---------------------+
| February|            131616922|
|  January|            135089372|
|     July|            148415283|
|    March|            141923034|
|September|            143063930|
|     June|            139421676|
|      May|            142263960|
|   August|            148223798|
|  October|            147731976|
|    April|            140262977|
| November|            112425580|
| December|            145610063|
+---------+---------------------+



### Tipe Kapal Valid

In [30]:
# Definisikan nilai vessel_type_code yang valid
nilai_valid = [1, 255]

# Hitung jumlah vessel_type_code  valid per bulan
jumlah_vessel_type_code_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(data_sampel['vessel_type_code'].between(nilai_valid[0], nilai_valid[1])) \
    .groupBy("months").agg(F.count("vessel_type_code").alias("vessel_type_code_valid"))

# Buat DataFrame hasil
vessel_type_code_valid = jumlah_vessel_type_code_valid_per_bulan

# Tampilkan hasil
vessel_type_code_valid.show() 

+---------+----------------------+
|   months|vessel_type_code_valid|
+---------+----------------------+
| February|             139136575|
|  January|             143285547|
|     July|             155658110|
|September|             151212698|
|     June|             147151041|
|   August|             155629165|
|  October|             155941907|
|    April|             148197720|
|      May|             149811456|
|    March|             149748827|
| November|             119202801|
| December|             154067527|
+---------+----------------------+



### Negara Kapal Valid

In [31]:
# Definisikan nilai flag_country_code yang valid
nilai_valid = [201, 775]

# Hitung jumlah flag_country_code tidak valid per bulan
jumlah_flag_country_code_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(data_sampel['flag_code'].between(nilai_valid[0], nilai_valid[1])) \
    .groupBy("months").agg(F.count("flag_code").alias("flag_country_code_valid"))

# Buat DataFrame hasil
flag_country_code_valid = jumlah_flag_country_code_valid_per_bulan

# Tampilkan hasil
flag_country_code_valid.show() 

+---------+-----------------------+
|   months|flag_country_code_valid|
+---------+-----------------------+
| February|              139182981|
|  January|              143327127|
|     July|              155700247|
|     June|              147191969|
|  October|              155982352|
|   August|              155661342|
|    March|              149788640|
|September|              151249370|
|    April|              148231231|
|      May|              147715918|
| November|               96930139|
| December|              154116337|
+---------+-----------------------+



### Latitude & Longitude Valid

In [32]:
# Definisikan nilai latitude yang valid
nilai_valid = [-90, 90]

# Hitung jumlah latitude tidak valid per bulan
jumlah_latitude_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(data_sampel['latitude'].between(nilai_valid[0], nilai_valid[1])) \
    .groupBy("months").agg(F.count("latitude").alias("latitude_valid"))

# Buat DataFrame hasil
latitude_valid = jumlah_latitude_valid_per_bulan

# Tampilkan hasil
latitude_valid.show() 

+---------+--------------+
|   months|latitude_valid|
+---------+--------------+
| February|     139182981|
|  January|     143327127|
|     July|     155700247|
|  October|     155982352|
|   August|     155661342|
|     June|     147191969|
|September|     151249370|
|      May|     149845329|
|    March|     149788640|
|    April|     148231231|
| November|     119253847|
| December|     154116337|
+---------+--------------+



In [33]:
# Definisikan nilai longitude yang valid
nilai_valid = [-180, 180]

# Hitung jumlah longitude tidak valid per bulan
jumlah_longitude_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(data_sampel['longitude'].between(nilai_valid[0], nilai_valid[1])) \
    .groupBy("months").agg(F.count("longitude").alias("longitude_valid"))

# Buat DataFrame hasil
longitude_valid = jumlah_longitude_valid_per_bulan

# Tampilkan hasil
longitude_valid.show()

+---------+---------------+
|   months|longitude_valid|
+---------+---------------+
| February|      139182981|
|  January|      143327127|
|     July|      155700247|
|September|      151249370|
|   August|      155661342|
|     June|      147191969|
|    April|      148231231|
|    March|      149788640|
|  October|      155982352|
|      May|      149845329|
| November|      119253847|
| December|      154116337|
+---------+---------------+



### dt_pos_utc Valid

In [34]:
# Definisikan pola regex untuk nilai 'dt_pos_utc' yang valid
pattern = "^[1-9][0-9]{0,3}-[1-9]|0[1-9]|1[0-2]-[1-9]|0[1-9]|[1-2][0-9]|3[0-1] [0-9]|0[0-9]|1[0-9]|2[0-3]:[0-5][0-9]:[0-5][0-9]$"

# Filter data untuk mendapatkan hanya nilai 'dt_pos_utc' yang tidak valid
dt_pos_utc_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(F.col("dt_pos_utc").rlike(pattern)) \
    .groupBy("months").agg(F.count("dt_pos_utc").alias("dt_pos_utc_valid"))

# Buat DataFrame hasil
dt_pos_utc_valid = dt_pos_utc_valid_per_bulan

# Tampilkan hasil
dt_pos_utc_valid.show()  

+---------+----------------+
|   months|dt_pos_utc_valid|
+---------+----------------+
| February|       139182981|
|  January|       143327127|
|  October|       155982352|
|     July|       155700247|
|     June|       147191969|
|September|       151249370|
|    April|       148231231|
|   August|       155661342|
|      May|       149845329|
|    March|       149788640|
| November|       119253847|
| December|       154116337|
+---------+----------------+



## Nilai Default

### MMSI dengan Nilai Default per Bulan

In [35]:
# Tentukan nilai yang ingin dihitung
default_value_1 = 0
default_value_2 = 1193046

# Hitung jumlah 'mmsi' dengan nilai default per bulan
mmsi_default_per_month = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter((F.col("mmsi") == default_value_1) | (F.col("mmsi") == default_value_2)) \
    .groupBy("months", "mmsi").agg(F.count("mmsi").alias("mmsi_count_default"))

# Tampilkan DataFrame Spark hasil akhir
mmsi_default_per_month.show()

+--------+-------+------------------+
|  months|   mmsi|mmsi_count_default|
+--------+-------+------------------+
|November|      0|              5528|
|November|1193046|              1360|
|     May|1193046|                91|
|     May|      0|               475|
+--------+-------+------------------+



### IMO dengan Nilai Default per Bulan

In [36]:
# Tentukan nilai yang ingin dihitung
default_value = 0  # Ganti dengan nilai yang diinginkan

# Hitung jumlah 'imo' dengan nilai default per bulan
imo_default_per_month = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(F.col("imo") == default_value) \
    .groupBy("months").agg(F.count("imo").alias("imo_count_default"))

# Tampilkan DataFrame Spark hasil akhir
imo_default_per_month.show() 

+------+-----------------+
|months|imo_count_default|
+------+-----------------+
+------+-----------------+



### Status Navigasi dengan Nilai Default per Bulan

In [37]:
# Tentukan nilai yang ingin dihitung
default_value = "Not Defined"  # Ganti dengan nilai yang diinginkan

# Hitung jumlah 'nav_status' dengan nilai default per bulan
nav_status_default_per_month = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(F.col("nav_status") == default_value) \
    .groupBy("months").agg(F.count("nav_status").alias("nav_status_count_default"))

# Tampilkan DataFrame Spark hasil akhir
nav_status_default_per_month.show() 

+---------+------------------------+
|   months|nav_status_count_default|
+---------+------------------------+
|  January|                 2414121|
|  October|                 2397638|
| February|                 2364662|
|    March|                 2451983|
|   August|                 2214161|
|     July|                 2161999|
|     June|                 2355284|
|      May|                 2547225|
|September|                 2300887|
|    April|                 2453652|
| November|                 2025493|
| December|                 2590777|
+---------+------------------------+



### Tipe Kapal dengan Nilai Default per Bulan

In [38]:
# Tentukan nilai yang ingin dihitung
default_value = "Not Available"  # Ganti dengan nilai yang diinginkan

# Hitung jumlah 'vessel_type' dengan nilai default per bulan
vessel_type_default_per_month = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(F.col("vessel_type") == default_value) \
    .groupBy("months").agg(F.count("vessel_type").alias("vessel_type_count_default"))

# Tampilkan DataFrame Spark hasil akhir
vessel_type_default_per_month.show() 

+---------+-------------------------+
|   months|vessel_type_count_default|
+---------+-------------------------+
| February|                    46406|
|  January|                    41580|
|  October|                    40445|
|     July|                    42137|
|     June|                    40928|
|    March|                    39813|
|   August|                    32177|
|      May|                    33873|
|September|                    36672|
|    April|                    33511|
| December|                    48810|
| November|                    51046|
+---------+-------------------------+



### Negara Kapal dengan Nilai Default per Bulan

In [39]:
# MISAL DEFAULT = 0
# Tentukan nilai yang ingin dihitung
default_value = 0  # Ganti dengan nilai yang diinginkan

# Hitung jumlah 'flag_country' dengan nilai default per bulan
flag_country_default_per_month = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(F.col("flag_code") == default_value) \
    .groupBy("months").agg(F.count("flag_code").alias("flag_country_count_default"))

# Tampilkan DataFrame Spark hasil akhir
flag_country_default_per_month.show()  

+------+--------------------------+
|months|flag_country_count_default|
+------+--------------------------+
+------+--------------------------+



### Latitude & Longitude dengan Nilai Default per Bulan

In [40]:
# Tentukan nilai yang ingin dihitung
default_value = 91  # Ganti dengan nilai yang diinginkan

# Hitung jumlah 'latitude' dengan nilai default per bulan
latitude_default_per_month = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(F.col("latitude") == default_value) \
    .groupBy("months").agg(F.count("latitude").alias("latitude_count_default"))

# Tampilkan DataFrame Spark hasil akhir
latitude_default_per_month.show() 

+------+----------------------+
|months|latitude_count_default|
+------+----------------------+
+------+----------------------+



In [41]:
# Tentukan nilai yang ingin dihitung
default_value = 181  # Ganti dengan nilai yang diinginkan

# Hitung jumlah 'longitude' dengan nilai default per bulan
longitude_default_per_month = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(F.col("longitude") == default_value) \
    .groupBy("months").agg(F.count("longitude").alias("longitude_count_default"))

# Tampilkan DataFrame Spark hasil akhir
longitude_default_per_month.show() 

+------+-----------------------+
|months|longitude_count_default|
+------+-----------------------+
+------+-----------------------+



### dt_pos_utc dengan Nilai Default per Bulan

In [42]:
# Tentukan nilai yang ingin dihitung
default_value = "0-0-0 24:60:60"

# Hitung jumlah nilai default pada kolom 'dt_pos_utc' per bulan
dt_pos_utc_default_per_month = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(F.col("dt_pos_utc") == default_value) \
    .groupBy("months").agg(F.count("dt_pos_utc").alias("dt_pos_ut_count_dafault"))

# Tampilkan DataFrame Spark hasil akhir
dt_pos_utc_default_per_month.show()  

+------+-----------------------+
|months|dt_pos_ut_count_dafault|
+------+-----------------------+
+------+-----------------------+



## Tidak Valid

### MMSI Tidak Valid

In [43]:
# Definisikan nilai mmsi yang valid
nilai_valid = [100000000, 999999999]

# Hitung jumlah mmsi tidak valid per bulan
jumlah_mmsi_tidak_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter((~data_sampel['mmsi'].between(nilai_valid[0], nilai_valid[1])) & 
            (data_sampel['mmsi'] != 0) & (data_sampel['mmsi'] != 1193046) &
            (data_sampel['mmsi'].isNotNull())
           ) \
    .groupBy("months").agg(F.count("mmsi").alias("mmsi_tidak_valid"))

# Buat DataFrame hasil
mmsi_invalid = jumlah_mmsi_tidak_valid_per_bulan

# Tampilkan hasil
mmsi_invalid.show()


+--------+----------------+
|  months|mmsi_tidak_valid|
+--------+----------------+
|     May|            3666|
|November|           33458|
+--------+----------------+



### IMO Tidak Valid

In [44]:
# Definisikan nilai imo yang valid
nilai_valid = [1000000, 9999999]

# Hitung jumlah imo tidak valid per bulan
jumlah_imo_tidak_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter((~data_sampel['imo'].between(nilai_valid[0], nilai_valid[1])) & 
            (data_sampel['imo'] != 0) &
            (data_sampel['imo'].isNotNull()) 
           )\
    .groupBy("months").agg(F.count("imo").alias("imo_tidak_valid"))

# Buat DataFrame hasil
imo_invalid = jumlah_imo_tidak_valid_per_bulan

# Tampilkan hasil
imo_invalid.show() 

+---------+---------------+
|   months|imo_tidak_valid|
+---------+---------------+
|  January|         385042|
|  October|         341021|
| February|         337088|
|    March|         380841|
|   August|         360863|
|     July|         357533|
|September|         331227|
|     June|         404061|
|    April|         422751|
|      May|         457276|
| December|         348589|
| November|         319696|
+---------+---------------+



### Status Navigasi Tidak Valid

In [45]:
# Definisikan nilai nav_status_code yang valid
nilai_valid = [0, 14]

# Hitung jumlah nav_status_code tidak valid per bulan
jumlah_nav_status_code_tidak_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter((~data_sampel['nav_status_code'].between(nilai_valid[0], nilai_valid[1])) & 
            (data_sampel['nav_status_code'] != 15) & 
            (data_sampel['nav_status_code'].isNotNull())
           ) \
    .groupBy("months").agg(F.count("nav_status_code").alias("nav_status_code_tidak_valid"))

# Buat DataFrame hasil
nav_status_code_invalid = jumlah_nav_status_code_tidak_valid_per_bulan

# Tampilkan hasil
nav_status_code_invalid.show() 

+---------+---------------------------+
|   months|nav_status_code_tidak_valid|
+---------+---------------------------+
|  January|                    5823634|
| February|                    5201397|
|     July|                    5122965|
|    March|                    5413623|
|     June|                    5415009|
|   August|                    5223383|
|September|                    5884553|
|      May|                    5034144|
|  October|                    5852738|
|    April|                    5514602|
| November|                    4802774|
| December|                    5915497|
+---------+---------------------------+



### Tipe Kapal Tidak Valid

In [46]:
# Definisikan nilai vessel_type_code yang valid
nilai_valid = [1, 255]

# Hitung jumlah vessel_type_code tidak valid per bulan
jumlah_vessel_type_code_tidak_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter((~data_sampel['vessel_type_code'].between(nilai_valid[0], nilai_valid[1])) & 
            (data_sampel['vessel_type_code'] != 0) &
            (data_sampel['vessel_type_code'].isNotNull())
           ) \
    .groupBy("months").agg(F.count("vessel_type_code").alias("vessel_type_code_tidak_valid"))

# Buat DataFrame hasil
vessel_type_code_invalid = jumlah_vessel_type_code_tidak_valid_per_bulan

# Tampilkan hasil
vessel_type_code_invalid.show() 

+------+----------------------------+
|months|vessel_type_code_tidak_valid|
+------+----------------------------+
+------+----------------------------+



### Negara Kapal Tidak Valid

In [47]:
# Definisikan nilai flag_country_code yang valid
nilai_valid = [201, 775]

# Hitung jumlah flag_country_code tidak valid per bulan
jumlah_flag_country_code_tidak_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter((~data_sampel['flag_code'].between(nilai_valid[0], nilai_valid[1])) & 
            (data_sampel['flag_code'] != 0) &
            (data_sampel['flag_code'].isNotNull())
           ) \
    .groupBy("months").agg(F.count("flag_code").alias("flag_country_code_tidak_valid"))

# Buat DataFrame hasil
flag_country_code_invalid = jumlah_flag_country_code_tidak_valid_per_bulan

# Tampilkan hasil
flag_country_code_invalid.show() 

+------+-----------------------------+
|months|flag_country_code_tidak_valid|
+------+-----------------------------+
+------+-----------------------------+



### Latitude & Longitude Tidak Valid

In [48]:
# Definisikan nilai latitude yang valid
nilai_valid = [-90, 90]

# Hitung jumlah latitude tidak valid per bulan
jumlah_latitude_tidak_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter((~data_sampel['latitude'].between(nilai_valid[0], nilai_valid[1])) & 
            (data_sampel['latitude'] != 91) &
            (data_sampel['latitude'].isNotNull())
           ) \
    .groupBy("months").agg(F.count("latitude").alias("latitude_tidak_valid"))

# Buat DataFrame hasil
latitude_invalid = jumlah_latitude_tidak_valid_per_bulan

# Tampilkan hasil
latitude_invalid.show() 

+------+--------------------+
|months|latitude_tidak_valid|
+------+--------------------+
+------+--------------------+



In [49]:
# Definisikan nilai longitude yang valid
nilai_valid = [-180, 180]

# Hitung jumlah longitude tidak valid per bulan
jumlah_longitude_tidak_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter((~data_sampel['longitude'].between(nilai_valid[0], nilai_valid[1])) & 
            (data_sampel['longitude'] != 181) &
            (data_sampel['longitude'].isNotNull())
           ) \
    .groupBy("months").agg(F.count("longitude").alias("longitude_tidak_valid"))

# Buat DataFrame hasil
longitude_invalid = jumlah_longitude_tidak_valid_per_bulan

# Tampilkan hasil
longitude_invalid.show()

+------+---------------------+
|months|longitude_tidak_valid|
+------+---------------------+
+------+---------------------+



### dt_pos_utc Tidak Valid

In [50]:
# Definisikan pola regex untuk nilai 'dt_pos_utc' yang valid
pattern = "^[1-9][0-9]{0,3}-[1-9]|0[1-9]|1[0-2]-[1-9]|0[1-9]|[1-2][0-9]|3[0-1] [0-9]|0[0-9]|1[0-9]|2[0-3]:[0-5][0-9]:[0-5][0-9]$"

default_value = "0-0-0 24:60:60"

# Filter data untuk mendapatkan hanya nilai 'dt_pos_utc' yang tidak valid
dt_pos_utc_tidak_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter((~F.col("dt_pos_utc").rlike(pattern)) & 
            (data_sampel['dt_pos_utc'] != default_value) &
            (data_sampel['dt_pos_utc'].isNotNull())
           ) \
    .groupBy("months").agg(F.count("dt_pos_utc").alias("dt_pos_utc_tidak_valid"))

# Buat DataFrame hasil
dt_pos_utc_invalid = dt_pos_utc_tidak_valid_per_bulan

# Tampilkan hasil
dt_pos_utc_invalid.show()  

+------+----------------------+
|months|dt_pos_utc_tidak_valid|
+------+----------------------+
+------+----------------------+



## Missing Value

### MS MMSI

In [51]:
# Hitung jumlah nilai yang hilang per bulan untuk kolom "mmsi"
missing_values_per_month_mmsi = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(
        F.sum(F.col("mmsi").isNull().cast(IntegerType())).alias("mmsi_missing")
    )

# Tampilkan DataFrame Spark
missing_values_per_month_mmsi.show() 

+---------+------------+
|   months|mmsi_missing|
+---------+------------+
|  January|           0|
|     July|           0|
|    March|           0|
|     June|           0|
|   August|           0|
|September|           0|
|      May|           0|
| February|           0|
|    April|           0|
|  October|           0|
| December|           0|
| November|           0|
+---------+------------+



### MS IMO

In [52]:
# Hitung jumlah nilai yang hilang per bulan untuk kolom "imo"
missing_values_per_month_imo = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(
        F.sum(F.col("imo").isNull().cast(IntegerType())).alias("imo_missing")
    )

# Tampilkan DataFrame Spark
missing_values_per_month_imo.show()  

+---------+-----------+
|   months|imo_missing|
+---------+-----------+
| February|    8336349|
|  January|    9032769|
|     July|    7950957|
|   August|    7724189|
|  October|    8460764|
|     June|    8429926|
|      May|    8324738|
|September|    8377578|
|    April|    8693426|
|    March|    8759977|
| November|    7464488|
| December|    8867276|
+---------+-----------+



### MS Status Navigasi

In [53]:
# Hitung jumlah nilai yang hilang per bulan untuk kolom "nav_status"
missing_values_per_month_nav_status = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(
        F.sum(F.col("nav_status").isNull().cast(IntegerType())).alias("nav_status_missing")
    )

# Tampilkan DataFrame Spark
missing_values_per_month_nav_status.show() 

+---------+------------------+
|   months|nav_status_missing|
+---------+------------------+
| February|                 0|
|  January|                 0|
|  October|                 0|
|     July|                 0|
|    March|                 0|
|     June|                 0|
|   August|                 0|
|September|                 0|
|      May|                 0|
|    April|                 0|
| December|                 0|
| November|                 0|
+---------+------------------+



### MS Tipe Kapal

In [54]:
# Hitung jumlah nilai yang hilang per bulan untuk kolom "vessel_type"
missing_values_per_month_vessel_type = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(
        F.sum(F.col("vessel_type").isNull().cast(IntegerType())).alias("vessel_type_missing")
    )

# Tampilkan DataFrame Spark
missing_values_per_month_vessel_type.show() 

+---------+-------------------+
|   months|vessel_type_missing|
+---------+-------------------+
| February|                  0|
|  January|                  0|
|     July|                  0|
|September|                  0|
|   August|                  0|
|      May|                  0|
|  October|                  0|
|     June|                  0|
|    March|                  0|
|    April|                  0|
| December|                  0|
| November|                  0|
+---------+-------------------+



### MS Negara Kapal

In [55]:
# Hitung jumlah nilai yang hilang per bulan untuk kolom "flag_country"
missing_values_per_month_flag_code = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(
        F.sum(F.col("flag_code").isNull().cast(IntegerType())).alias("flag_code_missing")
    )

# Tampilkan DataFrame Spark
missing_values_per_month_flag_code.show() 

+---------+-----------------+
|   months|flag_code_missing|
+---------+-----------------+
| February|                0|
|  January|                0|
|  October|                0|
|     July|                0|
|   August|                0|
|      May|          2129411|
|    March|                0|
|September|                0|
|    April|                0|
|     June|                0|
| December|                0|
| November|         22323708|
+---------+-----------------+



### MS Latitude & Longitude

In [56]:
# Hitung jumlah nilai yang hilang per bulan untuk kolom "latitude"
missing_values_per_month_latitude = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(
        F.sum(F.col("latitude").isNull().cast(IntegerType())).alias("latitude_missing")
    )

# Tampilkan DataFrame Spark
missing_values_per_month_latitude.show() 

+---------+----------------+
|   months|latitude_missing|
+---------+----------------+
|  January|               0|
|  October|               0|
|     July|               0|
|   August|               0|
|      May|               0|
|September|               0|
| February|               0|
|     June|               0|
|    March|               0|
|    April|               0|
| November|               0|
| December|               0|
+---------+----------------+



In [57]:
# Hitung jumlah nilai yang hilang per bulan untuk kolom "longitude"
missing_values_per_month_longitude = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(
        F.sum(F.col("longitude").isNull().cast(IntegerType())).alias("longitude_missing")
    )

# Tampilkan DataFrame Spark
missing_values_per_month_longitude.show() 

+---------+-----------------+
|   months|longitude_missing|
+---------+-----------------+
|  January|                0|
|     July|                0|
|     June|                0|
|  October|                0|
|      May|                0|
| February|                0|
|September|                0|
|    March|                0|
|   August|                0|
|    April|                0|
| November|                0|
| December|                0|
+---------+-----------------+



### MS dt_pos_utc

In [58]:
# Hitung jumlah nilai yang hilang per bulan untuk kolom "dt_pos_utc"
missing_values_per_month_dt_pos_utc = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(
        F.sum(F.col("dt_pos_utc").isNull().cast(IntegerType())).alias("dt_pos_utc_missing")
    )

# Tampilkan DataFrame Spark
missing_values_per_month_dt_pos_utc.show() 

+---------+------------------+
|   months|dt_pos_utc_missing|
+---------+------------------+
|  January|                 0|
|  October|                 0|
|     July|                 0|
|    March|                 0|
|     June|                 0|
|   August|                 0|
|September|                 0|
|      May|                 0|
| February|                 0|
|    April|                 0|
| November|                 0|
| December|                 0|
+---------+------------------+



## Pergerakan Anomali

In [36]:
# plot jarak tempuh (y) sama waktu tempuh (x)
# Dari plotnya keliatan mana yang jarak tempuhnya besar tapi waktu tempuhnya kecil (atau sebaliknya?)

### Status Navigasi & SOG

In [6]:
# Tentukan persentil yang diinginkan
percentiles = [0.5, 0.75, 0.9, 0.95, 0.99, 0.999]

# Loop melalui setiap persentil dan hitung nilai kuantil untuk sog
quantile_columns = [expr(f"percentile_approx(sog, {p})").alias(f"sog_{int(p * 100)}") for p in percentiles]

# Kelompokkan berdasarkan nav_status dan hitung kuantilnya
quantiles_per_nav_status = data_sampel.select("nav_status","sog").groupBy("nav_status").agg(*quantile_columns)

# Tampilkan hasil
quantiles_per_nav_status.show()

+--------------------+------+------+------+------+------+------+
|          nav_status|sog_50|sog_75|sog_90|sog_95|sog_99|sog_99|
+--------------------+------+------+------+------+------+------+
|              Moored|   0.0|   0.0|   0.0|   0.1|   8.0|  16.2|
|Restricted Manoeu...|   0.1|   2.0|   5.6|   8.0|  13.3|  17.6|
|             Aground|   0.0|   0.3|   5.1|   9.0|  21.0|  23.4|
|         Not Defined|   0.0|   3.0|  10.0|  12.0|  16.8|  24.0|
|   Not Under Command|   1.0|   1.9|   3.3|   8.0|  13.0|  17.9|
|  Engaged In Fishing|   3.9|   8.4|  10.9|  11.8|  13.7|  15.0|
|    Underway Sailing|   0.2|   8.0|  12.0|  13.2|  17.7|  26.8|
|             Unknown|   0.2|   4.3|   8.1|  10.4|  15.1|  23.9|
|           At Anchor|   0.0|   0.1|   0.2|   0.3|   3.5|  12.4|
|Under Way Using E...|  11.5|  13.0|  15.7|  17.5|  19.8|  22.0|
+--------------------+------+------+------+------+------+------+



### Validasi Status Navigasi dengan SOG

In [7]:
# Memeriksa kecocokan nav_status dan sog

# Ekstrak nama bulan dari timestamp
data_sampel = data_sampel.withColumn("bulan", date_format("dt_pos_utc", "MMMM"))

# Tentukan kriteria pergerakan tidak normal
kriteria_pergerakan_tidak_normal = (
    ((col("nav_status").isin(['At Anchor', 'Moored'])) &
    (col("sog") > 1)) | 
    ((col("nav_status").isin(['Not Under Command', 'Underway Sailing', 'Under Way Using Engine', 'Engaged In Fishing', 'Restricted Manoeuvrability', 'Aground'])) &
    (col("sog") < 1))
)

# Hitung pergerakan tidak normal per bulan
pergerakan_tidak_normal_per_bulan = data_sampel.withColumn(
    "pergerakan_tidak_normal",
    when(kriteria_pergerakan_tidak_normal, 1).otherwise(0)
).groupBy("bulan").agg({"pergerakan_tidak_normal": "sum"}).withColumnRenamed("SUM(pergerakan_tidak_normal)", "total_pergerakan_tidak_normal")

# Tampilkan hasil
pergerakan_tidak_normal_per_bulan.show()

+---------+-----------------------------+
|    bulan|total_pergerakan_tidak_normal|
+---------+-----------------------------+
|  January|                     11848390|
|    March|                     12156876|
|  October|                     12847935|
|     June|                     11555304|
|   August|                     12334589|
| February|                     11032325|
|      May|                     11756787|
|September|                     12385670|
|    April|                     11958960|
|     July|                     12261002|
| November|                     10670926|
| December|                     12619793|
+---------+-----------------------------+



## Record Duplicate

In [None]:
# Ekstrak kolom bulan dari kolom tanggal atau waktu di DataFrame
data_sampel = data_sampel.withColumn("bulan", date_format("dt_pos_utc", "MMMM"))

# Tentukan record-record duplikat dalam DataFrame dengan membandingkan nilai semua variabel
duplikat = data_sampel.groupBy(*data_sampel.columns).count().where("count > 1")

# Kelompokkan record-record duplikat berdasarkan bulan dan hitung jumlahnya
duplikat_per_bulan = duplikat.groupBy("bulan").count()

# Tampilkan hasilnya
duplikat_per_bulan.show()

+--------+-------+
|   bulan|  count|
+--------+-------+
|February| 485821|
| January|2147179|
+--------+-------+



# Cek Data AIS

## Cek 1 IMO 1 MMSI

In [7]:
# Grouping data by "imo"
grouped_data = data_sampel.select("imo", "mmsi") \
    .groupBy("imo") \
    .agg(countDistinct("mmsi").alias("mmsi_count"))

# Urutkan
grouped_data = grouped_data.orderBy(col("mmsi_count").desc())

# Tampilkan hasil
grouped_data.show(grouped_data.count(), truncate = False)

+----------+----------+
|imo       |mmsi_count|
+----------+----------+
|null      |23475     |
|9789317   |512       |
|1400704   |114       |
|9789359   |87        |
|9789315   |84        |
|9789328   |84        |
|9789337   |83        |
|123456789 |83        |
|9789370   |82        |
|9789367   |81        |
|9789327   |80        |
|9789341   |80        |
|9789313   |80        |
|9789333   |80        |
|9789351   |79        |
|9789318   |78        |
|9789373   |78        |
|9789312   |77        |
|9789365   |76        |
|9789335   |76        |
|9789374   |76        |
|9789343   |76        |
|9789348   |74        |
|9789358   |73        |
|9789338   |73        |
|9789353   |73        |
|9789321   |72        |
|9789336   |72        |
|9789350   |72        |
|9789342   |72        |
|9789320   |72        |
|9789360   |71        |
|9789369   |71        |
|9789326   |71        |
|9789362   |71        |
|9789355   |70        |
|9789349   |70        |
|9789316   |70        |
|9789361   |70  

## Cek Jumlah Record

In [6]:
# Menghitung jumlah record per MMSI
mmsi_counts = data_sampel.select("mmsi").groupBy("mmsi").agg(count("*").alias("count"))

# Filter record kurang dari 10
mmsi_with_record_less10 = mmsi_counts.filter(col("count") < 10)

In [None]:
# Urutkan
mmsi_with_record_less10 = mmsi_with_record_less10.orderBy(col("count"))

# Tampilkan hasil
mmsi_with_record_less10.show(mmsi_with_record_less10.count(), truncate = False)

+----------+-----+
|mmsi      |count|
+----------+-----+
|994071970 |1    |
|525200569 |1    |
|525019457 |1    |
|525487200 |1    |
|994063101 |1    |
|710240258 |1    |
|525101741 |1    |
|994071835 |1    |
|538002366 |1    |
|525101277 |1    |
|506740997 |1    |
|636015674 |1    |
|525200772 |1    |
|525100456 |1    |
|477995736 |1    |
|525800908 |1    |
|525019589 |1    |
|98351604  |1    |
|431401408 |1    |
|412426960 |1    |
|525101530 |1    |
|525200128 |1    |
|994037779 |1    |
|574002191 |1    |
|525005254 |1    |
|525201734 |1    |
|525004122 |1    |
|994080364 |1    |
|59065791  |1    |
|525600320 |1    |
|525012395 |1    |
|994037127 |1    |
|525600179 |1    |
|538002812 |1    |
|108080304 |1    |
|525201171 |1    |
|525500655 |1    |
|600013290 |1    |
|150010538 |1    |
|525015368 |1    |
|525700073 |1    |
|525200701 |1    |
|994164231 |1    |
|525104042 |1    |
|564552632 |1    |
|525400812 |1    |
|161000027 |1    |
|525321401 |1    |
|525400391 |1    |
|98351625  |

In [None]:
# Mengelompokkan MMSI berdasarkan jumlah record
count_groups = mmsi_with_record_less10.groupBy("count").agg(count("mmsi").alias("mmsi_count"))

# Urutkan
count_groups = count_groups.orderBy(col("mmsi_count"))

# Tampilkan hasil
count_groups.show(count_groups.count(), truncate = False)

+-----+----------+
|count|mmsi_count|
+-----+----------+
|9    |160       |
|8    |183       |
|6    |203       |
|7    |204       |
|5    |248       |
|4    |272       |
|3    |318       |
|2    |395       |
|1    |577       |
+-----+----------+



## Cek SOG > 3

In [None]:
# Menghitung jumlah record per MMSI dengan SOG > 3
mmsi_with_sog_greater3 = data_sampel.select("mmsi", "sog").groupBy("mmsi").agg(
    count(when(col("sog") > 3, True)).alias("count_SOG_greater_than_3")
)

# Filter MMSI dg SOG > 3 kurang dari 20
mmsi_with_sog_greater3_less20 = mmsi_with_sog_greater3.filter(col("count_SOG_greater_than_3") < 20)

In [None]:
# Urutkan
mmsi_with_sog_greater3_less20 = mmsi_with_sog_greater3_less20.orderBy(col("count_SOG_greater_than_3"))

# Tampilkan hasil
mmsi_with_sog_greater3_less20.show(mmsi_with_sog_greater3_less20.count(), truncate = False)

+----------+------------------------+
|mmsi      |count_SOG_greater_than_3|
+----------+------------------------+
|525201070 |0                       |
|566895000 |0                       |
|123033234 |0                       |
|98353262  |0                       |
|525401588 |0                       |
|525145147 |0                       |
|457417000 |0                       |
|994037906 |0                       |
|525015556 |0                       |
|98351684  |0                       |
|525101603 |0                       |
|525007206 |0                       |
|525803528 |0                       |
|525727222 |0                       |
|166000093 |0                       |
|150000013 |0                       |
|150000477 |0                       |
|353001795 |0                       |
|260001459 |0                       |
|98351152  |0                       |
|525000951 |0                       |
|994037517 |0                       |
|566280000 |0                       |
|171000044 |

In [None]:
# Mengelompokkan MMSI berdasarkan jumlah record
count_groups = mmsi_with_sog_greater3_less20.groupBy("count_SOG_greater_than_3").agg(count("mmsi").alias("mmsi_count"))

# Urutkan
count_groups = count_groups.orderBy(col("mmsi_count"))

# Tampilkan hasil
count_groups.show(count_groups.count(), truncate = False)

+------------------------+----------+
|count_SOG_greater_than_3|mmsi_count|
+------------------------+----------+
|18                      |89        |
|19                      |96        |
|17                      |98        |
|16                      |110       |
|13                      |123       |
|15                      |123       |
|14                      |128       |
|10                      |129       |
|12                      |131       |
|11                      |136       |
|9                       |169       |
|8                       |191       |
|7                       |200       |
|6                       |238       |
|5                       |274       |
|4                       |319       |
|3                       |408       |
|2                       |621       |
|1                       |1027      |
|0                       |1976      |
+------------------------+----------+



# Download

## Fungsi

In [7]:
def create_download_link(df, title, filename):
    csv = df.to_csv(index = False)
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload, title=title, filename=filename)
    return HTML(html)

## Download Data

In [19]:
# Jumlah Records (All)

# DataFrame Spark
rekaman_per_bulan = spark.createDataFrame(rekaman_per_bulan)

# Export ke Pandas DataFrame
rekaman_per_bulan = rekaman_per_bulan.toPandas()

# Download Data
create_download_link(rekaman_per_bulan, title="rekaman_per_bulan", filename="rekaman_per_bulan.csv")

  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


In [10]:
# Records (1%)

# DataFrame Spark
#sampled_data = spark.createDataFrame(sampled_data)

# Export ke Pandas DataFrame
#sampled_data = sampled_data.toPandas()

# Download Data
#create_download_link(sampled_data, title=sampled_data, filename="sampled_data.csv")

In [None]:
# Partisi Records (1%)

# DataFrame Spark
#sampledd_data_1 = spark.createDataFrame(sampledd_data_1)
#sampledd_data_2 = spark.createDataFrame(sampledd_data_2)
#sampledd_data_3 = spark.createDataFrame(sampledd_data_3)
#sampledd_data_4 = spark.createDataFrame(sampledd_data_4)
#sampledd_data_5 = spark.createDataFrame(sampledd_data_5)
#sampledd_data_6 = spark.createDataFrame(sampledd_data_6)
#sampledd_data_7 = spark.createDataFrame(sampledd_data_7)
#sampledd_data_8 = spark.createDataFrame(sampledd_data_8)
#sampledd_data_9 = spark.createDataFrame(sampledd_data_9)
#sampledd_data_10 = spark.createDataFrame(sampledd_data_10)

# Export ke Pandas DataFrame
#sampledd_data_1 = sampledd_data_1.toPandas()
#sampledd_data_2 = sampledd_data_2.toPandas()
#sampledd_data_3 = sampledd_data_3.toPandas()
#sampledd_data_4 = sampledd_data_4.toPandas()
#sampledd_data_5 = sampledd_data_5.toPandas()
#sampledd_data_6 = sampledd_data_6.toPandas()
#sampledd_data_7 = sampledd_data_7.toPandas()
#sampledd_data_8 = sampledd_data_8.toPandas()
#sampledd_data_9 = sampledd_data_9.toPandas()
#sampledd_data_10 = sampledd_data_10.toPandas()

# Download Data
#create_download_link(sampledd_data_1, title=sampledd_data_1, filename="sampledd_data_1.csv")
#create_download_link(sampledd_data_2, title=sampledd_data_2, filename="sampledd_data_2.csv")
#create_download_link(sampledd_data_3, title=sampledd_data_3, filename="sampledd_data_3.csv")
#create_download_link(sampledd_data_4, title=sampledd_data_4, filename="sampledd_data_4.csv")
#create_download_link(sampledd_data_5, title=sampledd_data_5, filename="sampledd_data_5.csv")
#create_download_link(sampledd_data_6, title=sampledd_data_6, filename="sampledd_data_6.csv")
#create_download_link(sampledd_data_7, title=sampledd_data_7, filename="sampledd_data_7.csv")
#create_download_link(sampledd_data_8, title=sampledd_data_8, filename="sampledd_data_8.csv")
#create_download_link(sampledd_data_9, title=sampledd_data_9, filename="sampledd_data_9.csv")
#create_download_link(sampledd_data_10, title=sampledd_data_10, filename="sampledd_data_10.csv")

In [17]:
# Jumlah Records (1%)

# DataFrame Spark
#rekaman_per_bulan_1pers = spark.createDataFrame(rekaman_per_bulan_1pers)

# Export ke Pandas DataFrame
rekaman_per_bulan_1pers = rekaman_per_bulan_1pers.toPandas()

# Download Data
create_download_link(rekaman_per_bulan_1pers, title=rekaman_per_bulan_1pers, filename="rekaman_per_bulan_1pers.csv")

In [9]:
# Statistical Summary

# DataFrame Spark
#stats_df = spark.createDataFrame(stats_df)
#stats_df_2 = spark.createDataFrame(stats_df_2)

# Export ke Pandas DataFrame
# stats_df = stats_df.toPandas()
# stats_df_2 = stats_df_2.toPandas()
# quantiles_per_nav_status = quantiles_per_nav_status.toPandas() 

# Download Data
# create_download_link(stats_df, title=stats_df, filename="stats_df.csv")
# create_download_link(stats_df_2, title=stats_df_2, filename="stats_df_2.csv")
# create_download_link(quantiles_per_nav_status, title=quantiles_per_nav_status, filename="quantiles_per_nav_status.csv")

In [19]:
# (MMSI, IMO, Status Navigasi, Tipe Kapal, Negara Kapal) Unik

# DataFrame Spark
#df_unique_mmsi_spark = spark.createDataFrame(df_unique_mmsi_spark)
#df_unique_imo_spark = spark.createDataFrame(df_unique_imo_spark)
#mmsi_count_per_nav_status = spark.createDataFrame(mmsi_count_per_nav_status)
#mmsi_count_per_vessel_type = spark.createDataFrame(mmsi_count_per_vessel_type)
#mmsi_count_per_flag_country = spark.createDataFrame(mmsi_count_per_flag_country)

# Export ke Pandas DataFrame
#df_unique_mmsi_spark = df_unique_mmsi_spark.toPandas()
#df_unique_imo_spark = df_unique_imo_spark.toPandas()
mmsi_count_per_nav_status = mmsi_count_per_nav_status.toPandas()
#mmsi_count_per_vessel_type = mmsi_count_per_vessel_type.toPandas()
#mmsi_count_per_flag_country = mmsi_count_per_flag_country.toPandas()
   

# Download Data
#create_download_link(df_unique_mmsi_spark, title=df_unique_mmsi_spark, filename="df_unique_mmsi_spark.csv")
#create_download_link(df_unique_imo_spark, title=df_unique_imo_spark, filename="df_unique_imo_spark.csv")
create_download_link(mmsi_count_per_nav_status, title=mmsi_count_per_nav_status, filename="mmsi_count_per_nav_status.csv")
#create_download_link(mmsi_count_per_vessel_type, title=mmsi_count_per_vessel_type, filename="mmsi_count_per_vessel_type.csv")
#create_download_link(mmsi_count_per_flag_country, title=mmsi_count_per_flag_country, filename="mmsi_count_per_flag_country.csv")

In [65]:
# MMSI Unik per (Status Navigasi, Tipe Kapal, Negara Kapal) 

# DataFrame Spark
#unique_mmsi_per_nav_status = spark.createDataFrame(unique_mmsi_per_nav_status)
#unique_mmsi_per_vessel_type = spark.createDataFrame(unique_mmsi_per_vessel_type)
#unique_mmsi_per_flag_country = spark.createDataFrame(unique_mmsi_per_flag_country)

# Export ke Pandas DataFrame
#unique_mmsi_per_nav_status = unique_mmsi_per_nav_status.toPandas()
unique_mmsi_per_vessel_type = unique_mmsi_per_vessel_type.toPandas()
# unique_mmsi_per_flag_country = unique_mmsi_per_flag_country.toPandas()
  
# Download Data
#create_download_link(unique_mmsi_per_nav_status, title=unique_mmsi_per_nav_status, filename="unique_mmsi_per_nav_status.csv")
create_download_link(unique_mmsi_per_vessel_type, title=unique_mmsi_per_vessel_type, filename="unique_mmsi_per_vessel_type.csv")
# create_download_link(unique_mmsi_per_flag_country, title=unique_mmsi_per_flag_country, filename="unique_mmsi_per_flag_country.csv")

Error while receiving.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
RuntimeError: reentrant call inside <_io.BufferedReader name=60>
Closing down clientserver connection
Closing down clientserver connection


ERROR: Exception while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
RuntimeError: reentrant call inside <_io.BufferedReader name=60>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR: KeyboardInterrupt while

KeyboardInterrupt: 

In [None]:
# IMO Unik per (Status Navigasi, Tipe Kapal, Negara Kapal) 

# DataFrame Spark
#unique_imo_per_nav_status = spark.createDataFrame(unique_imo_per_nav_status)
#unique_imo_per_vessel_type = spark.createDataFrame(unique_imo_per_vessel_type)
#unique_imo_per_flag_country = spark.createDataFrame(unique_imo_per_flag_country)

# Export ke Pandas DataFrame
#unique_imo_per_nav_status = unique_imo_per_nav_status.toPandas()
unique_imo_per_vessel_type = unique_imo_per_vessel_type.toPandas()
# unique_imo_per_flag_country = unique_imo_per_flag_country.toPandas()
  
# Download Data
#create_download_link(unique_imo_per_nav_status, title=unique_imo_per_nav_status, filename="unique_imo_per_nav_status.csv")
create_download_link(unique_imo_per_vessel_type, title=unique_imo_per_vessel_type, filename="unique_imo_per_vessel_type.csv")
# create_download_link(unique_imo_per_flag_country, title=unique_imo_per_flag_country, filename="unique_imo_per_flag_country.csv")

In [19]:
# Nilai Default

# DataFrame Spark
#mmsi_default_per_month = spark.createDataFrame(mmsi_default_per_month)
#imo_default_per_month = spark.createDataFrame(imo_default_per_month)
#nav_status_default_per_month = spark.createDataFrame(nav_status_default_per_month)
#vessel_type_default_per_month = spark.createDataFrame(vessel_type_default_per_month)
#flag_country_default_per_month = spark.createDataFrame(flag_country_default_per_month)
#latitude_default_per_month = spark.createDataFrame(latitude_default_per_month)
#longitude_default_per_month = spark.createDataFrame(longitude_default_per_month)
#dt_pos_utc_default_per_month = spark.createDataFrame(dt_pos_utc_default_per_month)

# Export ke Pandas DataFrame
#mmsi_default_per_month = mmsi_default_per_month.toPandas()
#imo_default_per_month = imo_default_per_month.toPandas()
#nav_status_default_per_month = nav_status_default_per_month.toPandas()
#vessel_type_default_per_month = vessel_type_default_per_month.toPandas()
#flag_country_default_per_month = flag_country_default_per_month.toPandas()
#latitude_default_per_month = latitude_default_per_month.toPandas()
#longitude_default_per_month = longitude_default_per_month.toPandas()
#dt_pos_utc_default_per_month = dt_pos_utc_default_per_month.toPandas()
             

# Download Data
#create_download_link(mmsi_default_per_month, title=mmsi_default_per_month, filename="mmsi_default_per_month.csv")
#create_download_link(imo_default_per_month, title=imo_default_per_month, filename="imo_default_per_month.csv")
#create_download_link(nav_status_default_per_month, title=nav_status_default_per_month, filename="nav_status_default_per_month.csv")
#create_download_link(vessel_type_default_per_month, title=vessel_type_default_per_month, filename="vessel_type_default_per_month.csv")
#create_download_link(flag_country_default_per_month, title=flag_country_default_per_month, filename="flag_country_default_per_month.csv")
#create_download_link(latitude_default_per_month, title=latitude_default_per_month, filename="latitude_default_per_month.csv")
#create_download_link(longitude_default_per_month, title=longitude_default_per_month, filename="longitude_default_per_month.csv")
#create_download_link(dt_pos_utc_default_per_month, title=dt_pos_utc_default_per_month, filename="dt_pos_utc_default_per_month.csv")

In [None]:
# Invalid Value

# DataFrame Spark
mmsi_invalid = spark.createDataFrame(mmsi_invalid)
#imo_invalid = spark.createDataFrame(imo_invalid)
#nav_status_code_invalid = spark.createDataFrame(nav_status_code_invalid)
#vessel_type_code_invalid = spark.createDataFrame(vessel_type_code_invalid)
#flag_country_code_invalid = spark.createDataFrame(flag_country_code_invalid)
#latitude_invalid = spark.createDataFrame(latitude_invalid)
#longitude_invalid = spark.createDataFrame(longitude_invalid)
#dt_pos_utc_invalid = spark.createDataFrame(dt_pos_utc_invalid)

# Export ke Pandas DataFrame
mmsi_invalid = mmsi_invalid.toPandas()
#imo_invalid = imo_invalid.toPandas()
#nav_status_code_invalid = nav_status_code_invalid.toPandas()
#vessel_type_code_invalid = vessel_type_code_invalid.toPandas()
#flag_country_code_invalid = flag_country_code_invalid.toPandas()
#latitude_invalid = latitude_invalid.toPandas()
#longitude_invalid = longitude_invalid.toPandas()
#dt_pos_utc_invalid = dt_pos_utc_invalid.toPandas()
       

# Download Data
create_download_link(mmsi_invalid, title=mmsi_invalid, filename="mmsi_invalid.csv")
#create_download_link(imo_invalid, title=imo_invalid, filename="imo_invalid.csv")
#create_download_link(nav_status_code_invalid, title=nav_status_code_invalid, filename="nav_status_code_invalid.csv")
#create_download_link(vessel_type_code_invalid, title=vessel_type_code_invalid, filename="vessel_type_code_invalid.csv")
#create_download_link(flag_country_code_invalid, title=flag_country_code_invalid, filename="flag_country_code_invalid.csv")
#create_download_link(latitude_invalid, title=latitude_invalid, filename="latitude_invalid.csv")
#create_download_link(longitude_invalid, title=longitude_invalid, filename="longitude_invalid.csv")
#create_download_link(dt_pos_utc_invalid, title=dt_pos_utc_invalid, filename="dt_pos_utc_invalid.csv")

In [None]:
# Missing Value

# DataFrame Spark
missing_values_per_month_mmsi = spark.createDataFrame(missing_values_per_month_mmsi)
#missing_values_per_month_imo = spark.createDataFrame(missing_values_per_month_imo)
#missing_values_per_month_nav_status = spark.createDataFrame(missing_values_per_month_nav_status)
#missing_values_per_month_vessel_type = spark.createDataFrame(missing_values_per_month_vessel_type)
#missing_values_per_month_flag_country = spark.createDataFrame(missing_values_per_month_flag_country)
#missing_values_per_month_latitude = spark.createDataFrame(missing_values_per_month_latitude)
#missing_values_per_month_longitude = spark.createDataFrame(missing_values_per_month_longitude)
#missing_values_per_month_dt_pos_utc = spark.createDataFrame(missing_values_per_month_dt_pos_utc)

# Export ke Pandas DataFrame
missing_values_per_month_mmsi = missing_values_per_month_mmsi.toPandas()
#missing_values_per_month_imo = missing_values_per_month_imo.toPandas()
#missing_values_per_month_nav_status = missing_values_per_month_nav_status.toPandas()
#missing_values_per_month_vessel_type = missing_values_per_month_vessel_type.toPandas()
#missing_values_per_month_flag_country = missing_values_per_month_flag_country.toPandas()
#missing_values_per_month_latitude = missing_values_per_month_latitude.toPandas()
#missing_values_per_month_longitude = missing_values_per_month_longitude.toPandas()
#missing_values_per_month_dt_pos_utc = missing_values_per_month_dt_pos_utc.toPandas()
        

# Download Data
create_download_link(missing_values_per_month_mmsi, title=missing_values_per_month_mmsi, filename="missing_values_per_month_mmsi.csv")
#create_download_link(missing_values_per_month_imo, title=missing_values_per_month_imo, filename="missing_values_per_month_imo.csv")
#create_download_link(missing_values_per_month_nav_status, title=missing_values_per_month_nav_status, filename="missing_values_per_month_nav_status.csv")
#create_download_link(missing_values_per_month_vessel_type, title=missing_values_per_month_vessel_type, filename="missing_values_per_month_vessel_type.csv")
#create_download_link(missing_values_per_month_flag_country, title=missing_values_per_month_flag_country, filename="missing_values_per_month_flag_country.csv")
#create_download_link(missing_values_per_month_latitude, title=missing_values_per_month_latitude, filename="missing_values_per_month_latitude.csv")
#create_download_link(missing_values_per_month_longitude, title=missing_values_per_month_longitude, filename="missing_values_per_month_longitude.csv")
#create_download_link(missing_values_per_month_dt_pos_utc, title=missing_values_per_month_dt_pos_utc, filename="missing_values_per_month_dt_pos_utc.csv")

In [None]:
# Filter (MMSI, IMO, Status Navigasi, Tipe Kapal, Negara Kapal Default, Latitude, Longitude, dt_pos_utc)

# DataFrame Spark
unique_filtered_mmsi_per_month = spark.createDataFrame(unique_filtered_mmsi_per_month)
#unique_filtered_imo_per_month = spark.createDataFrame(unique_filtered_imo_per_month)
#mmsi_count_per_filtered_nav_status = spark.createDataFrame(mmsi_count_per_filtered_nav_status)
#unique_mmsi_per_filtered_nav_status = spark.createDataFrame(unique_mmsi_per_filtered_nav_status)
#mmsi_count_per_filtered_vessel_type = spark.createDataFrame(mmsi_count_per_filtered_vessel_type)
#unique_mmsi_per_filtered_vessel_type = spark.createDataFrame(unique_mmsi_per_filtered_vessel_type)
#mmsi_count_per_filtered_flag_country = spark.createDataFrame(mmsi_count_per_filtered_flag_country)
#unique_mmsi_per_filtered_flag_country = spark.createDataFrame(unique_mmsi_per_filtered_flag_country)

# Export ke Pandas DataFrame
unique_filtered_mmsi_per_month = unique_filtered_mmsi_per_month.toPandas()
#df_unique_filtered_imo_spark = df_unique_filtered_imo_spark.toPandas()
#mmsi_count_per_filtered_nav_status = mmsi_count_per_filtered_nav_status.toPandas()
#unique_mmsi_per_filtered_nav_status = unique_mmsi_per_filtered_nav_status.toPandas()
#mmsi_count_per_filtered_vessel_type = mmsi_count_per_filtered_vessel_type.toPandas()
#unique_mmsi_per_filtered_vessel_type = unique_mmsi_per_filtered_vessel_type.toPandas()
#mmsi_count_per_filtered_flag_country = mmsi_count_per_filtered_flag_country.toPandas()
#unique_mmsi_per_filtered_flag_country = unique_mmsi_per_filtered_flag_country.toPandas()


# Download Data
create_download_link(unique_filtered_mmsi_per_month, title=unique_filtered_mmsi_per_month, filename="unique_filtered_mmsi_per_month.csv")
#create_download_link(unique_filtered_imo_per_month, title=unique_filtered_imo_per_month, filename="unique_filtered_imo_per_month.csv")
#create_download_link(mmsi_count_per_filtered_nav_status, title=mmsi_count_per_filtered_nav_status, filename="mmsi_count_per_filtered_nav_status.csv")
#create_download_link(unique_mmsi_per_filtered_nav_status, title=unique_mmsi_per_filtered_nav_status, filename="unique_mmsi_per_filtered_nav_status.csv")
#create_download_link(mmsi_count_per_filtered_vessel_type, title=mmsi_count_per_filtered_vessel_type, filename="mmsi_count_per_filtered_vessel_type.csv")
#create_download_link(unique_mmsi_per_filtered_vessel_type, title=unique_mmsi_per_filtered_vessel_type, filename="unique_mmsi_per_filtered_vessel_type.csv")
#create_download_link(mmsi_count_per_filtered_flag_country, title=mmsi_count_per_filtered_flag_country, filename="mmsi_count_per_filtered_flag_country.csv")
#create_download_link(unique_mmsi_per_filtered_flag_country, title=unique_mmsi_per_filtered_flag_country, filename="unique_mmsi_per_filtered_flag_country.csv")

In [None]:
# Filter yg melakukan pelayaran/lintasan anomali(?)

# DataFrame Spark
rekaman_per_bulan_1pers = spark.createDataFrame(rekaman_per_bulan_1pers)
rekaman_per_bulan_1pers = spark.createDataFrame(rekaman_per_bulan_1pers)

# Export ke Pandas DataFrame
rekaman_per_bulan_1pers = rekaman_per_bulan_1pers.toPandas()

# Download Data
create_download_link(rekaman_per_bulan_1pers, filename="rekaman_per_bulan_1pers.csv")