# Inilization

In [2]:
import pandas as pd
from pyspark.sql import functions as F

#For 3.3.1
#Register Sedona Functions to Spark
from sedona.register import SedonaRegistrator
SedonaRegistrator.registerAll(spark)


#For 3.3.2
from shapely.geometry import Point, Polygon, mapping
import h3.api.numpy_int as h3int 

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, count, countDistinct, when, expr
import calendar
from pyspark.sql.types import IntegerType
from pyspark.sql.window import Window

In [4]:
import pandas as pd
from IPython.display import HTML
import base64
from pyspark.sql.functions import year, month, dayofmonth, hour, minute, second
from pyspark.sql.functions import monotonically_increasing_id

# Eksplorasi Data

## Data AIS

In [5]:
basepath = "s3a://ungp-ais-data-historical-backup/exact-earth-data/transformed/prod/"

In [6]:
#Read Data January 2022
#df_data = spark.read.parquet(basepath+ "year=2022/month=01/day=01")

#Read Data 2022
df_data = spark.read.parquet(basepath+ "year=2022")

In [7]:
df_data.createOrReplaceTempView("temp_df")

## Cek Data

In [7]:
#Check-out available fields
df_data.printSchema()

root
 |-- mmsi: integer (nullable = true)
 |-- imo: integer (nullable = true)
 |-- vessel_name: string (nullable = true)
 |-- callsign: string (nullable = true)
 |-- vessel_type: string (nullable = true)
 |-- vessel_type_code: integer (nullable = true)
 |-- vessel_type_cargo: string (nullable = true)
 |-- vessel_class: string (nullable = true)
 |-- length: double (nullable = true)
 |-- width: double (nullable = true)
 |-- flag_country: string (nullable = true)
 |-- flag_code: integer (nullable = true)
 |-- destination: string (nullable = true)
 |-- eta: integer (nullable = true)
 |-- draught: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- sog: double (nullable = true)
 |-- cog: double (nullable = true)
 |-- rot: double (nullable = true)
 |-- heading: double (nullable = true)
 |-- nav_status: string (nullable = true)
 |-- nav_status_code: integer (nullable = true)
 |-- source: string (nullable = true)
 |-- dt_pos_utc: times

In [8]:
# Show sample 5 data
# df.show(n=1, vertical=True)
df_data.show(n=5, vertical=True)

-RECORD 0---------------------------------
 mmsi              | 273295870            
 imo               | 9167758              
 vessel_name       | ARKA-33              
 callsign          | UBPW7                
 vessel_type       | Fishing              
 vessel_type_code  | 30                   
 vessel_type_cargo | null                 
 vessel_class      | A                    
 length            | 0.0                  
 width             | 0.0                  
 flag_country      | Russian Federation   
 flag_code         | 273                  
 destination       | BARENTS SEA          
 eta               | 8112200              
 draught           | 5.0                  
 longitude         | 49.25605             
 latitude          | 76.70656667          
 sog               | 0.8                  
 cog               | 50.6                 
 rot               | 0.0                  
 heading           | 6.0                  
 nav_status        | Under Way Using E... 
 nav_status

## Jumlah Records

In [9]:
total_record = df_data.count()
total_record

8898338622

## 1% Records

In [8]:
def create_sampled_data_for_each_month(df_data_dengan_bulan, unique_months):
    # Inisialisasi DataFrame untuk menyimpan hasil
    sampled_data = None
    
    # Loop melalui setiap bulan dan ambil sekitar 1% data
    for month in unique_months:
        # Filter data berdasarkan bulan
        filtered_data = df_data_dengan_bulan.filter(col("bulan") == month)
        
        # Ambil sekitar 1% dari data untuk bulan tersebut
        sampled_data_month = filtered_data.sample(fraction=0.01, seed=42)
        
        # Gabungkan hasil untuk setiap bulan menjadi satu DataFrame
        if sampled_data is None:
            sampled_data = sampled_data_month
        else:
            sampled_data = sampled_data.union(sampled_data_month)

    return sampled_data

In [9]:
# Ekstrak nama bulan dari timestamp
df_data_dengan_bulan = df_data.withColumn("bulan", date_format("dt_pos_utc", "MMMM"))

# Hitung jumlah unik bulan
unique_months = df_data_dengan_bulan.select("bulan").distinct()

# Eksekusi fungsi
sampledd_data = create_sampled_data_for_each_month(df_data_dengan_bulan, unique_months)

### Save 1% Data

In [5]:
# Save Path
save_path = "s3a://ungp-ais-data-historical-backup/user_temp/"
save_path_unique = save_path + "222011349/"

In [6]:
# Save Data
#sampledd_data.write.option("header", True).mode("overwrite").parquet(save_path_unique + "data-ais-1persen-dunia-2022.parquet")

### Read 1% Data

In [6]:
# Read Data
data_sampel = spark.read.parquet(save_path_unique + "data-ais-1persen-dunia-2022.parquet", header=True)

### Record per Bulan

In [8]:
jumlah_record_per_bulan = data_sampel.withColumn("months", date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(count("*").alias("jumlah_record_per_bulan"))

In [9]:
jumlah_record_per_bulan.show(12)

+---------+-----------------------+
|   months|jumlah_record_per_bulan|
+---------+-----------------------+
| November|                6152686|
| December|                7332324|
|     July|                8180319|
|  October|                7692290|
|   August|                7956939|
|September|                7664096|
|    March|                7495299|
|    April|                7515300|
|     June|                7702391|
|      May|                7688979|
| February|                6523538|
|  January|                7092365|
+---------+-----------------------+



## Statistical Summary Data AIS

In [7]:
# MMSI, IMO, Status Navigasi, Tipe Kapal, Negara Kapal

# Variabel yang ingin dihitung statistik deskriptif
variables_to_describe = ["mmsi", "imo", "nav_status_code", "vessel_type_code", "flag_code"]

# Ambil statistik deskriptif tertentu untuk kolom-kolom tersebut
stats_df = data_sampel.select(variables_to_describe).describe()

# Pilih statistik tertentu
selected_stats = ["count", "mean", "stddev", "min", "25%", "50%", "75%", "max"]
stats_df = stats_df.select(["summary"] + [col(var).alias(var) for var in variables_to_describe]).filter(col("summary").isin(selected_stats))

# Tampilkan statistik deskriptif
stats_df.show()

+-------+--------------------+--------------------+------------------+------------------+------------------+
|summary|                mmsi|                 imo|   nav_status_code|  vessel_type_code|         flag_code|
+-------+--------------------+--------------------+------------------+------------------+------------------+
|  count|            88996526|            49396979|          88996526|          88996526|          87373801|
|   mean| 3.955718593709703E8|1.4585489263007926E7|5.4672777114917945| 62.17126814590493|395.20465823616854|
| stddev|1.3375405787626667E8|   6.7175274756498E7| 6.600339462843637|20.758994913218373|133.48844301584683|
|    min|                   0|                   1|                 0|                 0|               201|
|    max|          1073722367|          1073741823|                16|               255|               775|
+-------+--------------------+--------------------+------------------+------------------+------------------+



In [8]:
# Latitude, Longitude, dt_pos_utc

# Pemisahan komponen waktu dari kolom dt_pos_utc
data_sampel = data_sampel.withColumn("year", year("dt_pos_utc"))
data_sampel = data_sampel.withColumn("month", month("dt_pos_utc"))
data_sampel = data_sampel.withColumn("day", dayofmonth("dt_pos_utc"))
data_sampel = data_sampel.withColumn("hour", hour("dt_pos_utc"))
data_sampel = data_sampel.withColumn("minute", minute("dt_pos_utc"))
data_sampel = data_sampel.withColumn("second", second("dt_pos_utc"))

# Variabel yang ingin dihitung statistik deskriptif
variables_to_describe_2 = ["latitude", "longitude", "year", "month", "day", "hour", "minute", "second"]

# Ambil statistik deskriptif tertentu untuk kolom-kolom tersebut
stats_df_2 = data_sampel.select(variables_to_describe_2).describe()

# Pilih statistik tertentu
selected_stats = ["count", "mean", "stddev", "min", "25%", "50%", "75%", "max"]
stats_df_2 = stats_df_2.select(["summary"] + [col(var).alias(var) for var in variables_to_describe_2]).filter(col("summary").isin(selected_stats))

# Tampilkan statistik deskriptif
stats_df_2.show()

+-------+------------------+------------------+--------------------+-----------------+------------------+------------------+------------------+-----------------+
|summary|          latitude|         longitude|                year|            month|               day|              hour|            minute|           second|
+-------+------------------+------------------+--------------------+-----------------+------------------+------------------+------------------+-----------------+
|  count|          88996526|          88996526|            88996526|         88996526|          88996526|          88996526|          88996526|         88996526|
|   mean|24.839926569895294|27.691537992568033|   2021.999999404471|6.515206245241528|15.765656245952792|11.537739832676165| 29.53726906149123|29.34361451367214|
| stddev|  26.5039724583369| 87.63593397212726|7.717049408373274E-4|3.373979315200634| 8.846388808618096| 6.923254355883742|17.307282607379648|17.31154116556796|
|    min|             -90.0|

## Nilai Unik

### MMSI Unik per Bulan

In [9]:
# Hitung jumlah 'mmsi' yang unik per bulan
unique_mmsi_per_month = data_sampel.withColumn("months", date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(countDistinct("mmsi").alias("unique_mmsi_count"))

# Tampilkan DataFrame Spark
unique_mmsi_per_month.show()

+---------+-----------------+
|   months|unique_mmsi_count|
+---------+-----------------+
|     July|           310532|
| November|           281458|
| February|           236903|
|  January|           253983|
|    March|           268823|
|  October|           289747|
|      May|           291265|
|   August|           314024|
|    April|           289684|
|     June|           295255|
| December|           257860|
|September|           305440|
+---------+-----------------+



### IMO Unik per Bulan

In [10]:
# Hitung jumlah 'imo' yang unik per bulan
unique_imo_per_month = data_sampel.withColumn("months", date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(countDistinct("imo").alias("unique_imo_count"))

# Tampilkan DataFrame Spark
unique_imo_per_month.show()

+---------+----------------+
|   months|unique_imo_count|
+---------+----------------+
|     July|           71362|
| November|           70912|
| February|           68606|
|  January|           70475|
|    March|           69970|
|  October|           71333|
|      May|           71172|
|   August|           71549|
|    April|           70502|
|     June|           71212|
| December|           70225|
|September|           71452|
+---------+----------------+



### Status Navigasi Setahun

In [None]:
# Hitung jumlah 'mmsi' per masing-masing 'nav_status' dengan DataFrame API
mmsi_count_per_nav_status = data_sampel.groupBy("nav_status").agg(F.count("mmsi").alias("mmsi_count"))

# Tampilkan DataFrame Spark
mmsi_count_per_nav_status.show()

### Tipe Kapal Setahun

In [None]:
# Hitung jumlah 'mmsi' per masing-masing 'vessel_type'
mmsi_count_per_vessel_type = data_sampel.groupBy("vessel_type").agg(count("mmsi").alias("mmsi_count"))

# Tampilkan DataFrame Spark
mmsi_count_per_vessel_type.show()

### Negara Kapal Setahun

In [None]:
# Hitung jumlah 'mmsi' per masing-masing 'flag_country'
mmsi_count_per_flag_country = data_sampel.groupBy("flag_country").agg(count("mmsi").alias("mmsi_count"))

# Tampilkan DataFrame Spark
mmsi_count_per_flag_country.show()

# Quality Assurance

## MMSI Unik per

### Status Navigasi

In [11]:
# Hitung jumlah 'mmsi' yang unik berdasarkan 'nav_status'
unique_mmsi_per_nav_status = data_sampel.groupBy("nav_status").agg(countDistinct("mmsi").alias("unique_mmsi_per_nav_status_count"))

# Tampilkan DataFrame Spark
unique_mmsi_per_nav_status.show()

+--------------------+--------------------------------+
|          nav_status|unique_mmsi_per_nav_status_count|
+--------------------+--------------------------------+
|              Moored|                          103317|
|Restricted Manoeu...|                           13669|
|             Aground|                            1981|
|         Not Defined|                           36212|
|   Not Under Command|                           18338|
|  Engaged In Fishing|                            7059|
|    Underway Sailing|                           25453|
|             Unknown|                          411470|
|           At Anchor|                           85960|
|Under Way Using E...|                          172355|
+--------------------+--------------------------------+



### Tipe Kapal

In [12]:
# Hitung jumlah 'mmsi' yang unik berdasarkan 'vessel_type'
unique_mmsi_per_vessel_type = data_sampel.groupBy("vessel_type").agg(countDistinct("mmsi").alias("unique_mmsi_per_vessel_type_count"))

# Tampilkan DataFrame Spark
unique_mmsi_per_vessel_type.show()

+--------------------+---------------------------------+
|         vessel_type|unique_mmsi_per_vessel_type_count|
+--------------------+---------------------------------+
|             Sailing|                            67649|
|              Tanker|                            25376|
|Ships Not Party t...|                              318|
|            Military|                             2152|
|              Towing|                             6938|
|            Reserved|                             3165|
|                 SAR|                             4329|
|             Unknown|                           110671|
|               Other|                            19461|
|         UNAVAILABLE|                             5139|
|                 Tug|                            22275|
|     Law Enforcement|                             2650|
|      Pleasure Craft|                            72881|
|           Passenger|                            17351|
|              Diving|         

### Negara Kapal

In [13]:
# Filter out rows where 'flag_country' is not null
df_filtered = data_sampel.filter(col("flag_country").isNotNull())

# Hitung jumlah 'mmsi' yang unik berdasarkan 'flag_country'
unique_mmsi_per_flag_country = df_filtered.groupBy("flag_country").agg(countDistinct("mmsi").alias("unique_mmsi_per_flag_country_count"))

# Tampilkan DataFrame Spark
unique_mmsi_per_flag_country.show() 

+------------------+----------------------------------+
|      flag_country|unique_mmsi_per_flag_country_count|
+------------------+----------------------------------+
|          Paraguay|                               220|
|          Anguilla|                                99|
|             Macao|                                79|
|             Yemen|                                45|
|           Senegal|                                92|
|            Sweden|                              8263|
|          Kiribati|                               110|
|            Guyana|                               110|
|       Philippines|                               974|
|           Eritrea|                                44|
|          Djibouti|                                75|
|         Singapore|                              5654|
|          Malaysia|                              2664|
|              Fiji|                               139|
|            Turkey|                            

## IMO Unik per

### Status Navigasi

In [14]:
# Hitung jumlah 'mmsi' yang unik berdasarkan 'nav_status'
unique_imo_per_nav_status = data_sampel.select("imo","nav_status").groupBy("nav_status").agg(countDistinct("imo").alias("unique_imo_per_nav_status_count"))

# Tampilkan DataFrame Spark
unique_imo_per_nav_status.show(unique_imo_per_nav_status.count(), truncate=False)

+--------------------------+-------------------------------+
|nav_status                |unique_imo_per_nav_status_count|
+--------------------------+-------------------------------+
|Moored                    |62323                          |
|Restricted Manoeuvrability|10961                          |
|Aground                   |969                            |
|Not Defined               |9227                           |
|Not Under Command         |16623                          |
|Engaged In Fishing        |3304                           |
|Underway Sailing          |16780                          |
|Unknown                   |12126                          |
|At Anchor                 |54210                          |
|Under Way Using Engine    |72170                          |
+--------------------------+-------------------------------+



### Tipe Kapal

In [15]:
# Hitung jumlah 'mmsi' yang unik berdasarkan 'vessel_type'
unique_imo_per_vessel_type = data_sampel.select("imo","vessel_type").groupBy("vessel_type").agg(countDistinct("imo").alias("unique_imo_per_vessel_type_count"))

# Tampilkan DataFrame Spark
unique_imo_per_vessel_type.show(unique_imo_per_vessel_type.count(), truncate=False)

+------------------------------------+--------------------------------+
|vessel_type                         |unique_imo_per_vessel_type_count|
+------------------------------------+--------------------------------+
|Sailing                             |870                             |
|Tanker                              |15786                           |
|Ships Not Party to Armed Conflict   |127                             |
|Military                            |528                             |
|Towing                              |2133                            |
|Reserved                            |1047                            |
|SAR                                 |527                             |
|Unknown                             |7109                            |
|Other                               |7326                            |
|UNAVAILABLE                         |97                              |
|Tug                                 |8750                      

### Negara Kapal

In [16]:
# Hitung jumlah 'mmsi' yang unik berdasarkan 'vessel_type'
unique_imo_per_flag_country = data_sampel.select("imo","flag_country").groupBy("flag_country").agg(countDistinct("imo").alias("unique_imo_per_flag_country_count"))

# Tampilkan DataFrame Spark
unique_imo_per_flag_country.show(unique_imo_per_flag_country.count(), truncate=False)

+--------------------------------+---------------------------------+
|flag_country                    |unique_imo_per_flag_country_count|
+--------------------------------+---------------------------------+
|Paraguay                        |64                               |
|Senegal                         |36                               |
|Sweden                          |341                              |
|Kiribati                        |52                               |
|Guyana                          |70                               |
|Philippines                     |620                              |
|Tonga                           |8                                |
|Singapore                       |3349                             |
|Malaysia                        |1014                             |
|Fiji                            |33                               |
|Turkey                          |781                              |
|Germany                         |

## Nilai Valid

### MMSI Valid

In [12]:
# Definisikan nilai mmsi yang valid
nilai_valid = [100000000, 999999999]

# Jumlah mmsi yang valid
jumlah_mmsi_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(data_sampel['mmsi'].between(nilai_valid[0], nilai_valid[1])) \
    .groupBy("months").agg(F.count("mmsi").alias("mmsi_valid"))

# Buat DataFrame hasil
mmsi_valid = jumlah_mmsi_valid_per_bulan

# Tampilkan hasil
mmsi_valid.show()

+---------+----------+
|   months|mmsi_valid|
+---------+----------+
|     July|   8180319|
|  January|   7092365|
|  October|   7692290|
|      May|   7687858|
|   August|   7956939|
|    April|   7515300|
|     June|   7702391|
|September|   7664096|
| February|   6523538|
| December|   7332324|
|    March|   7495299|
| November|   6142451|
+---------+----------+



### IMO Valid

In [13]:
# Definisikan nilai imo yang valid
nilai_valid = [1000000, 9999999]

# Hitung jumlah imo valid per bulan
jumlah_imo_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(data_sampel['imo'].between(nilai_valid[0], nilai_valid[1])) \
    .groupBy("months").agg(F.count("imo").alias("imo_valid"))

# Buat DataFrame hasil
imo_valid = jumlah_imo_valid_per_bulan

# Tampilkan hasil
imo_valid.show() 

+---------+---------+
|   months|imo_valid|
+---------+---------+
|     July|  4264089|
| November|  3303518|
|    March|  4052453|
|  October|  4232146|
|   August|  4276330|
|    April|  4012972|
|     June|  4005177|
| December|  4125890|
|September|  4114450|
|      May|  4081545|
|  January|  3892366|
| February|  3710542|
+---------+---------+



### Status Navigasi Valid

In [14]:
# Definisikan nilai nav_status_code yang valid
nilai_valid = [0, 14]

# Hitung jumlah nav_status_code  valid per bulan
jumlah_nav_status_code_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(data_sampel['nav_status_code'].between(nilai_valid[0], nilai_valid[1])) \
    .groupBy("months").agg(F.count("nav_status_code").alias("nav_status_code_valid"))

# Buat DataFrame hasil
nav_status_code_valid = jumlah_nav_status_code_valid_per_bulan

# Tampilkan hasil
nav_status_code_valid.show() 

+---------+---------------------+
|   months|nav_status_code_valid|
+---------+---------------------+
| November|              4578294|
|    March|              5492513|
|  October|              5663560|
|   August|              5754012|
| December|              5513175|
|September|              5550344|
|    April|              5422589|
|     June|              5508218|
|     July|              5833525|
|      May|              5565943|
|  January|              5295222|
| February|              4972545|
+---------+---------------------+



### Tipe Kapal Valid

In [15]:
# Definisikan nilai vessel_type_code yang valid
nilai_valid = [1, 255]

# Hitung jumlah vessel_type_code  valid per bulan
jumlah_vessel_type_code_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(data_sampel['vessel_type_code'].between(nilai_valid[0], nilai_valid[1])) \
    .groupBy("months").agg(F.count("vessel_type_code").alias("vessel_type_code_valid"))

# Buat DataFrame hasil
vessel_type_code_valid = jumlah_vessel_type_code_valid_per_bulan

# Tampilkan hasil
vessel_type_code_valid.show() 

+---------+----------------------+
|   months|vessel_type_code_valid|
+---------+----------------------+
| November|               6143471|
|    March|               7479988|
|  October|               7684138|
|   August|               7948482|
| December|               7323787|
|September|               7655915|
|     July|               8166596|
|    April|               7497828|
|     June|               7689837|
|      May|               7675549|
|  January|               7076010|
| February|               6510712|
+---------+----------------------+



### Negara Kapal Valid

In [16]:
# Definisikan nilai flag_country_code yang valid
nilai_valid = [201, 775]

# Hitung jumlah flag_country_code tidak valid per bulan
jumlah_flag_country_code_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(data_sampel['flag_code'].between(nilai_valid[0], nilai_valid[1])) \
    .groupBy("months").agg(F.count("flag_code").alias("flag_country_code_valid"))

# Buat DataFrame hasil
flag_country_code_valid = jumlah_flag_country_code_valid_per_bulan

# Tampilkan hasil
flag_country_code_valid.show() 

+---------+-----------------------+
|   months|flag_country_code_valid|
+---------+-----------------------+
| November|                4670991|
|    March|                7495299|
|  October|                7692290|
|   August|                7956939|
| December|                7332324|
|September|                7664096|
|     July|                8180319|
|      May|                7547949|
|    April|                7515300|
|     June|                7702391|
|  January|                7092365|
| February|                6523538|
+---------+-----------------------+



### Latitude & Longitude Valid

In [17]:
# Definisikan nilai latitude yang valid
nilai_valid = [-90, 90]

# Hitung jumlah latitude tidak valid per bulan
jumlah_latitude_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(data_sampel['latitude'].between(nilai_valid[0], nilai_valid[1])) \
    .groupBy("months").agg(F.count("latitude").alias("latitude_valid"))

# Buat DataFrame hasil
latitude_valid = jumlah_latitude_valid_per_bulan

# Tampilkan hasil
latitude_valid.show() 

+---------+--------------+
|   months|latitude_valid|
+---------+--------------+
| November|       6152686|
|    March|       7495299|
|  October|       7692290|
|   August|       7956939|
| December|       7332324|
|September|       7664096|
|     July|       8180319|
|    April|       7515300|
|     June|       7702391|
|      May|       7688979|
| February|       6523538|
|  January|       7092365|
+---------+--------------+



In [18]:
# Definisikan nilai longitude yang valid
nilai_valid = [-180, 180]

# Hitung jumlah longitude tidak valid per bulan
jumlah_longitude_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(data_sampel['longitude'].between(nilai_valid[0], nilai_valid[1])) \
    .groupBy("months").agg(F.count("longitude").alias("longitude_valid"))

# Buat DataFrame hasil
longitude_valid = jumlah_longitude_valid_per_bulan

# Tampilkan hasil
longitude_valid.show()

+---------+---------------+
|   months|longitude_valid|
+---------+---------------+
| December|        7332324|
|     July|        8180319|
| November|        6152686|
|    March|        7495299|
|  October|        7692290|
|   August|        7956939|
|    April|        7515300|
|     June|        7702391|
|September|        7664096|
|      May|        7688979|
| February|        6523538|
|  January|        7092365|
+---------+---------------+



### dt_pos_utc Valid

In [19]:
# Definisikan pola regex untuk nilai 'dt_pos_utc' yang valid
pattern = "^[1-9][0-9]{0,3}-[1-9]|0[1-9]|1[0-2]-[1-9]|0[1-9]|[1-2][0-9]|3[0-1] [0-9]|0[0-9]|1[0-9]|2[0-3]:[0-5][0-9]:[0-5][0-9]$"

# Filter data untuk mendapatkan hanya nilai 'dt_pos_utc' yang tidak valid
dt_pos_utc_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(F.col("dt_pos_utc").rlike(pattern)) \
    .groupBy("months").agg(F.count("dt_pos_utc").alias("dt_pos_utc_valid"))

# Buat DataFrame hasil
dt_pos_utc_valid = dt_pos_utc_valid_per_bulan

# Tampilkan hasil
dt_pos_utc_valid.show()  

+---------+----------------+
|   months|dt_pos_utc_valid|
+---------+----------------+
| November|         6152686|
|    March|         7495299|
|  October|         7692290|
|   August|         7956939|
| December|         7332324|
|September|         7664096|
|     July|         8180319|
|    April|         7515300|
|     June|         7702391|
|      May|         7688979|
| February|         6523538|
|  January|         7092365|
+---------+----------------+



## Nilai Default

### MMSI dengan Nilai Default per Bulan

In [20]:
# Tentukan nilai yang ingin dihitung
default_value_1 = 0
default_value_2 = 1193046

# Hitung jumlah 'mmsi' dengan nilai default per bulan
mmsi_default_per_month = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter((F.col("mmsi") == default_value_1) | (F.col("mmsi") == default_value_2)) \
    .groupBy("months", "mmsi").agg(F.count("mmsi").alias("mmsi_count_default"))

# Tampilkan DataFrame Spark hasil akhir
mmsi_default_per_month.show()

+--------+-------+------------------+
|  months|   mmsi|mmsi_count_default|
+--------+-------+------------------+
|November|      0|                59|
|November|1193046|                19|
|     May|      0|                 1|
+--------+-------+------------------+



### IMO dengan Nilai Default per Bulan

In [7]:
# Tentukan nilai yang ingin dihitung
default_value = 0  # Ganti dengan nilai yang diinginkan

# Hitung jumlah 'imo' dengan nilai default per bulan
imo_default_per_month = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(F.col("imo") == default_value) \
    .groupBy("months").agg(F.count("imo").alias("imo_count_default"))

# Tampilkan DataFrame Spark hasil akhir
imo_default_per_month.show() 

+------+-----------------+
|months|imo_count_default|
+------+-----------------+
+------+-----------------+



### Status Navigasi dengan Nilai Default per Bulan

In [8]:
# Tentukan nilai yang ingin dihitung
default_value = "Not Defined"  # Ganti dengan nilai yang diinginkan

# Hitung jumlah 'nav_status' dengan nilai default per bulan
nav_status_default_per_month = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(F.col("nav_status") == default_value) \
    .groupBy("months").agg(F.count("nav_status").alias("nav_status_count_default"))

# Tampilkan DataFrame Spark hasil akhir
nav_status_default_per_month.show() 

+---------+------------------------+
|   months|nav_status_count_default|
+---------+------------------------+
| December|                  371464|
| November|                  329642|
|    March|                  388469|
|  October|                  399015|
|   August|                  416208|
|September|                  397973|
|     July|                  430255|
|    April|                  384620|
|      May|                  414301|
|     June|                  405164|
| February|                  357029|
|  January|                  386246|
+---------+------------------------+



### Tipe Kapal dengan Nilai Default per Bulan

In [9]:
# Tentukan nilai yang ingin dihitung
default_value = "Not Available"  # Ganti dengan nilai yang diinginkan

# Hitung jumlah 'vessel_type' dengan nilai default per bulan
vessel_type_default_per_month = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(F.col("vessel_type") == default_value) \
    .groupBy("months").agg(F.count("vessel_type").alias("vessel_type_count_default"))

# Tampilkan DataFrame Spark hasil akhir
vessel_type_default_per_month.show() 

+---------+-------------------------+
|   months|vessel_type_count_default|
+---------+-------------------------+
| November|                     9215|
|  October|                     8152|
|   August|                     8457|
| December|                     8537|
|September|                     8181|
|     July|                    13723|
|    March|                    15311|
|    April|                    17472|
|      May|                    13430|
|     June|                    12554|
|  January|                    16355|
| February|                    12826|
+---------+-------------------------+



### Negara Kapal dengan Nilai Default per Bulan

In [10]:
# MISAL DEFAULT = 0
# Tentukan nilai yang ingin dihitung
default_value = 0  # Ganti dengan nilai yang diinginkan

# Hitung jumlah 'flag_country' dengan nilai default per bulan
flag_country_default_per_month = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(F.col("flag_code") == default_value) \
    .groupBy("months").agg(F.count("flag_code").alias("flag_country_count_default"))

# Tampilkan DataFrame Spark hasil akhir
flag_country_default_per_month.show()  

+------+--------------------------+
|months|flag_country_count_default|
+------+--------------------------+
+------+--------------------------+



### Latitude & Longitude dengan Nilai Default per Bulan

In [11]:
# Tentukan nilai yang ingin dihitung
default_value = 91  # Ganti dengan nilai yang diinginkan

# Hitung jumlah 'latitude' dengan nilai default per bulan
latitude_default_per_month = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(F.col("latitude") == default_value) \
    .groupBy("months").agg(F.count("latitude").alias("latitude_count_default"))

# Tampilkan DataFrame Spark hasil akhir
latitude_default_per_month.show() 

+------+----------------------+
|months|latitude_count_default|
+------+----------------------+
+------+----------------------+



In [12]:
# Tentukan nilai yang ingin dihitung
default_value = 181  # Ganti dengan nilai yang diinginkan

# Hitung jumlah 'longitude' dengan nilai default per bulan
longitude_default_per_month = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(F.col("longitude") == default_value) \
    .groupBy("months").agg(F.count("longitude").alias("longitude_count_default"))

# Tampilkan DataFrame Spark hasil akhir
longitude_default_per_month.show() 

+------+-----------------------+
|months|longitude_count_default|
+------+-----------------------+
+------+-----------------------+



### dt_pos_utc dengan Nilai Default per Bulan

In [13]:
# Tentukan nilai yang ingin dihitung
default_value = "0-0-0 24:60:60"

# Hitung jumlah nilai default pada kolom 'dt_pos_utc' per bulan
dt_pos_utc_default_per_month = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(F.col("dt_pos_utc") == default_value) \
    .groupBy("months").agg(F.count("dt_pos_utc").alias("dt_pos_ut_count_dafault"))

# Tampilkan DataFrame Spark hasil akhir
dt_pos_utc_default_per_month.show()  

+------+-----------------------+
|months|dt_pos_ut_count_dafault|
+------+-----------------------+
+------+-----------------------+



## Tidak Valid

### MMSI Tidak Valid

In [17]:
# Definisikan nilai mmsi yang valid
nilai_valid = [100000000, 999999999]

# Hitung jumlah mmsi tidak valid per bulan
jumlah_mmsi_tidak_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(~(data_sampel['mmsi'].between(nilai_valid[0], nilai_valid[1])) & 
            ((data_sampel['mmsi'] != 0) | (data_sampel['mmsi'] != 1193046)) &
            (data_sampel['mmsi'].isNotNull())
           ) \
    .groupBy("months").agg(F.count("mmsi").alias("mmsi_tidak_valid"))

# Buat DataFrame hasil
mmsi_invalid = jumlah_mmsi_tidak_valid_per_bulan

# Tampilkan hasil
mmsi_invalid.show()


+--------+----------------+
|  months|mmsi_tidak_valid|
+--------+----------------+
|November|           10235|
|     May|            1121|
+--------+----------------+



### IMO Tidak Valid

In [20]:
# Definisikan nilai imo yang valid
nilai_valid = [1000000, 9999999]

# Hitung jumlah imo tidak valid per bulan
jumlah_imo_tidak_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter((~data_sampel['imo'].between(nilai_valid[0], nilai_valid[1])) & 
            (data_sampel['imo'] != 0) &
            (data_sampel['imo'].isNotNull()) 
           )\
    .groupBy("months").agg(F.count("imo").alias("imo_tidak_valid"))

# Buat DataFrame hasil
imo_invalid = jumlah_imo_tidak_valid_per_bulan

# Tampilkan hasil
imo_invalid.show() 

+---------+---------------+
|   months|imo_tidak_valid|
+---------+---------------+
|     July|         118819|
| November|          92215|
|  January|         118295|
|    March|         111760|
|  October|         111154|
|      May|         122504|
| December|         102670|
|     June|         116427|
|September|         112087|
|    April|         115899|
|   August|         120266|
| February|          83405|
+---------+---------------+



### Status Navigasi Tidak Valid

In [21]:
# Definisikan nilai nav_status_code yang valid
nilai_valid = [0, 14]

# Hitung jumlah nav_status_code tidak valid per bulan
jumlah_nav_status_code_tidak_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter((~data_sampel['nav_status_code'].between(nilai_valid[0], nilai_valid[1])) & 
            (data_sampel['nav_status_code'] != 15) & 
            (data_sampel['nav_status_code'].isNotNull())
           ) \
    .groupBy("months").agg(F.count("nav_status_code").alias("nav_status_code_tidak_valid"))

# Buat DataFrame hasil
nav_status_code_invalid = jumlah_nav_status_code_tidak_valid_per_bulan

# Tampilkan hasil
nav_status_code_invalid.show() 

+---------+---------------------------+
|   months|nav_status_code_tidak_valid|
+---------+---------------------------+
| November|                    1244750|
|    March|                    1614317|
|  October|                    1629715|
|   August|                    1786719|
| December|                    1447685|
|September|                    1715779|
|    April|                    1708091|
|     June|                    1789009|
|     July|                    1916539|
|      May|                    1708735|
| February|                    1193964|
|  January|                    1410897|
+---------+---------------------------+



### Tipe Kapal Tidak Valid

In [22]:
# Definisikan nilai vessel_type_code yang valid
nilai_valid = [1, 255]

# Hitung jumlah vessel_type_code tidak valid per bulan
jumlah_vessel_type_code_tidak_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter((~data_sampel['vessel_type_code'].between(nilai_valid[0], nilai_valid[1])) & 
            (data_sampel['vessel_type_code'] != 0) &
            (data_sampel['vessel_type_code'].isNotNull())
           ) \
    .groupBy("months").agg(F.count("vessel_type_code").alias("vessel_type_code_tidak_valid"))

# Buat DataFrame hasil
vessel_type_code_invalid = jumlah_vessel_type_code_tidak_valid_per_bulan

# Tampilkan hasil
vessel_type_code_invalid.show() 

+------+----------------------------+
|months|vessel_type_code_tidak_valid|
+------+----------------------------+
+------+----------------------------+



### Negara Kapal Tidak Valid

In [23]:
# Definisikan nilai flag_country_code yang valid
nilai_valid = [201, 775]

# Hitung jumlah flag_country_code tidak valid per bulan
jumlah_flag_country_code_tidak_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter((~data_sampel['flag_code'].between(nilai_valid[0], nilai_valid[1])) & 
            (data_sampel['flag_code'] != 0) &
            (data_sampel['flag_code'].isNotNull())
           ) \
    .groupBy("months").agg(F.count("flag_code").alias("flag_country_code_tidak_valid"))

# Buat DataFrame hasil
flag_country_code_invalid = jumlah_flag_country_code_tidak_valid_per_bulan

# Tampilkan hasil
flag_country_code_invalid.show() 

+------+-----------------------------+
|months|flag_country_code_tidak_valid|
+------+-----------------------------+
+------+-----------------------------+



### Latitude & Longitude Tidak Valid

In [24]:
# Definisikan nilai latitude yang valid
nilai_valid = [-90, 90]

# Hitung jumlah latitude tidak valid per bulan
jumlah_latitude_tidak_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter((~data_sampel['latitude'].between(nilai_valid[0], nilai_valid[1])) & 
            (data_sampel['latitude'] != 91) &
            (data_sampel['latitude'].isNotNull())
           ) \
    .groupBy("months").agg(F.count("latitude").alias("latitude_tidak_valid"))

# Buat DataFrame hasil
latitude_invalid = jumlah_latitude_tidak_valid_per_bulan

# Tampilkan hasil
latitude_invalid.show() 

+------+--------------------+
|months|latitude_tidak_valid|
+------+--------------------+
+------+--------------------+



In [25]:
# Definisikan nilai longitude yang valid
nilai_valid = [-180, 180]

# Hitung jumlah longitude tidak valid per bulan
jumlah_longitude_tidak_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter((~data_sampel['longitude'].between(nilai_valid[0], nilai_valid[1])) & 
            (data_sampel['longitude'] != 181) &
            (data_sampel['longitude'].isNotNull())
           ) \
    .groupBy("months").agg(F.count("longitude").alias("longitude_tidak_valid"))

# Buat DataFrame hasil
longitude_invalid = jumlah_longitude_tidak_valid_per_bulan

# Tampilkan hasil
longitude_invalid.show()

+------+---------------------+
|months|longitude_tidak_valid|
+------+---------------------+
+------+---------------------+



### dt_pos_utc Tidak Valid

In [26]:
# Definisikan pola regex untuk nilai 'dt_pos_utc' yang valid
pattern = "^[1-9][0-9]{0,3}-[1-9]|0[1-9]|1[0-2]-[1-9]|0[1-9]|[1-2][0-9]|3[0-1] [0-9]|0[0-9]|1[0-9]|2[0-3]:[0-5][0-9]:[0-5][0-9]$"

default_value = "0-0-0 24:60:60"

# Filter data untuk mendapatkan hanya nilai 'dt_pos_utc' yang tidak valid
dt_pos_utc_tidak_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter((~F.col("dt_pos_utc").rlike(pattern)) & 
            (data_sampel['dt_pos_utc'] != default_value) &
            (data_sampel['dt_pos_utc'].isNotNull())
           ) \
    .groupBy("months").agg(F.count("dt_pos_utc").alias("dt_pos_utc_tidak_valid"))

# Buat DataFrame hasil
dt_pos_utc_invalid = dt_pos_utc_tidak_valid_per_bulan

# Tampilkan hasil
dt_pos_utc_invalid.show()  

+------+----------------------+
|months|dt_pos_utc_tidak_valid|
+------+----------------------+
+------+----------------------+



## Missing Value

### MS MMSI

In [27]:
# Hitung jumlah nilai yang hilang per bulan untuk kolom "mmsi"
missing_values_per_month_mmsi = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(
        F.sum(F.col("mmsi").isNull().cast(IntegerType())).alias("mmsi_missing")
    )

# Tampilkan DataFrame Spark
missing_values_per_month_mmsi.show() 

+---------+------------+
|   months|mmsi_missing|
+---------+------------+
|     July|           0|
| November|           0|
|    March|           0|
|  October|           0|
|   August|           0|
|    April|           0|
|     June|           0|
| December|           0|
|September|           0|
|      May|           0|
| February|           0|
|  January|           0|
+---------+------------+



### MS IMO

In [28]:
# Hitung jumlah nilai yang hilang per bulan untuk kolom "imo"
missing_values_per_month_imo = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(
        F.sum(F.col("imo").isNull().cast(IntegerType())).alias("imo_missing")
    )

# Tampilkan DataFrame Spark
missing_values_per_month_imo.show()  

+---------+-----------+
|   months|imo_missing|
+---------+-----------+
| December|    3103764|
| November|    2756953|
|    March|    3331086|
|  October|    3348990|
|   August|    3560343|
|September|    3437559|
|    April|    3386429|
|      May|    3484930|
|     June|    3580787|
|     July|    3797411|
| February|    2729591|
|  January|    3081704|
+---------+-----------+



### MS Status Navigasi

In [30]:
# Hitung jumlah nilai yang hilang per bulan untuk kolom "nav_status"
missing_values_per_month_nav_status = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(
        F.sum(F.col("nav_status").isNull().cast(IntegerType())).alias("nav_status_missing")
    )

# Tampilkan DataFrame Spark
missing_values_per_month_nav_status.show() 

+---------+------------------+
|   months|nav_status_missing|
+---------+------------------+
| November|                 0|
|  October|                 0|
|      May|                 0|
|   August|                 0|
|    April|                 0|
|     June|                 0|
|September|                 0|
|     July|                 0|
|    March|                 0|
| December|                 0|
| February|                 0|
|  January|                 0|
+---------+------------------+



### MS Tipe Kapal

In [31]:
# Hitung jumlah nilai yang hilang per bulan untuk kolom "vessel_type"
missing_values_per_month_vessel_type = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(
        F.sum(F.col("vessel_type").isNull().cast(IntegerType())).alias("vessel_type_missing")
    )

# Tampilkan DataFrame Spark
missing_values_per_month_vessel_type.show() 

+---------+-------------------+
|   months|vessel_type_missing|
+---------+-------------------+
| December|                  0|
| November|                  0|
|    March|                  0|
|  October|                  0|
|   August|                  0|
|     July|                  0|
|    April|                  0|
|September|                  0|
|     June|                  0|
|      May|                  0|
|  January|                  0|
| February|                  0|
+---------+-------------------+



### MS Negara Kapal

In [32]:
# Hitung jumlah nilai yang hilang per bulan untuk kolom "flag_country"
missing_values_per_month_flag_country = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(
        F.sum(F.col("flag_country").isNull().cast(IntegerType())).alias("flag_country_missing")
    )

# Tampilkan DataFrame Spark
missing_values_per_month_flag_country.show() 

+---------+--------------------+
|   months|flag_country_missing|
+---------+--------------------+
| November|               46730|
|    March|                   0|
|  October|                   0|
|   August|                   0|
| December|                   0|
|    April|                   0|
|September|                   0|
|     July|                   0|
|      May|                4334|
|     June|                   0|
|  January|                   0|
| February|                   0|
+---------+--------------------+



### MS Latitude & Longitude

In [33]:
# Hitung jumlah nilai yang hilang per bulan untuk kolom "latitude"
missing_values_per_month_latitude = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(
        F.sum(F.col("latitude").isNull().cast(IntegerType())).alias("latitude_missing")
    )

# Tampilkan DataFrame Spark
missing_values_per_month_latitude.show() 

+---------+----------------+
|   months|latitude_missing|
+---------+----------------+
| November|               0|
|    March|               0|
|  October|               0|
|   August|               0|
| December|               0|
|September|               0|
|    April|               0|
|     June|               0|
|     July|               0|
|      May|               0|
|  January|               0|
| February|               0|
+---------+----------------+



In [34]:
# Hitung jumlah nilai yang hilang per bulan untuk kolom "longitude"
missing_values_per_month_longitude = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(
        F.sum(F.col("longitude").isNull().cast(IntegerType())).alias("longitude_missing")
    )

# Tampilkan DataFrame Spark
missing_values_per_month_longitude.show() 

+---------+-----------------+
|   months|longitude_missing|
+---------+-----------------+
| November|                0|
|    March|                0|
|  October|                0|
|   August|                0|
| December|                0|
|September|                0|
|    April|                0|
|     June|                0|
|     July|                0|
|      May|                0|
| February|                0|
|  January|                0|
+---------+-----------------+



### MS dt_pos_utc

In [35]:
# Hitung jumlah nilai yang hilang per bulan untuk kolom "dt_pos_utc"
missing_values_per_month_dt_pos_utc = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(
        F.sum(F.col("dt_pos_utc").isNull().cast(IntegerType())).alias("dt_pos_utc_missing")
    )

# Tampilkan DataFrame Spark
missing_values_per_month_dt_pos_utc.show() 

+---------+------------------+
|   months|dt_pos_utc_missing|
+---------+------------------+
| November|                 0|
|    March|                 0|
|  October|                 0|
|   August|                 0|
| December|                 0|
|     July|                 0|
|    April|                 0|
|September|                 0|
|     June|                 0|
|      May|                 0|
|  January|                 0|
| February|                 0|
+---------+------------------+



## Pergerakan Anomali

In [36]:
# plot jarak tempuh (y) sama waktu tempuh (x)
# Dari plotnya keliatan mana yang jarak tempuhnya besar tapi waktu tempuhnya kecil (atau sebaliknya?)

### Status Navigasi & SOG

In [37]:
# Tentukan persentil yang diinginkan
percentiles = [0.5, 0.75, 0.9, 0.95, 0.99, 0.999]

# Loop melalui setiap persentil dan hitung nilai kuantil untuk sog
quantile_columns = [expr(f"percentile_approx(sog, {p})").alias(f"sog_{int(p * 100)}") for p in percentiles]

# Kelompokkan berdasarkan nav_status dan hitung kuantilnya
quantiles_per_nav_status = data_sampel.select("nav_status","sog").groupBy("nav_status").agg(*quantile_columns)

# Tampilkan hasil
quantiles_per_nav_status.show()

+--------------------+------+------+------+------+------+------+
|          nav_status|sog_50|sog_75|sog_90|sog_95|sog_99|sog_99|
+--------------------+------+------+------+------+------+------+
|              Moored|   0.0|   0.0|   0.0|   0.2|   9.2|  17.5|
|Restricted Manoeu...|   0.0|   1.0|   4.6|   7.0|  11.3|  16.2|
|             Aground|   0.0|   0.1|   6.0|   9.0|  15.8|  25.0|
|         Not Defined|   0.0|   1.0|   7.0|   9.1|  15.0|  26.3|
|   Not Under Command|   0.5|   1.5|   4.5|   8.3|  13.2|  20.9|
|  Engaged In Fishing|   1.3|   4.2|   8.3|   9.8|  12.0|  14.0|
|    Underway Sailing|   0.0|   4.5|   9.5|  11.7|  15.7|  26.7|
|             Unknown|   0.0|   2.2|   6.2|   7.9|  11.3|  28.4|
|           At Anchor|   0.0|   0.1|   0.2|   0.5|   7.7|  12.6|
|Under Way Using E...|   7.9|  11.9|  14.0|  16.6|  20.0|  28.0|
+--------------------+------+------+------+------+------+------+



### Validasi Status Navigasi dengan SOG

In [39]:
# Memeriksa kecocokan nav_status dan sog

# Ekstrak nama bulan dari timestamp
data_sampel = data_sampel.withColumn("bulan", date_format("dt_pos_utc", "MMMM"))

# Tentukan kriteria pergerakan tidak normal
kriteria_pergerakan_tidak_normal = (
    ((col("nav_status").isin(['At Anchor', 'Moored'])) &
    (col("sog") > 1)) | 
    ((col("nav_status").isin(['Not Under Command', 'Underway Sailing', 'Under Way Using Engine', 'Engaged In Fishing', 'Restricted Manoeuvrability', 'Aground'])) &
    (col("sog") < 1))
)

# Hitung pergerakan tidak normal per bulan
pergerakan_tidak_normal_per_bulan = data_sampel.withColumn(
    "pergerakan_tidak_normal",
    when(kriteria_pergerakan_tidak_normal, 1).otherwise(0)
).groupBy("bulan").agg({"pergerakan_tidak_normal": "sum"}).withColumnRenamed("SUM(pergerakan_tidak_normal)", "total_pergerakan_tidak_normal")

# Tampilkan hasil
pergerakan_tidak_normal_per_bulan.show()

+---------+-----------------------------+
|    bulan|total_pergerakan_tidak_normal|
+---------+-----------------------------+
|     July|                      1573546|
| November|                      1306845|
|    March|                      1476077|
|  October|                      1516312|
|      May|                      1500083|
|   August|                      1539682|
|    April|                      1454747|
|     June|                      1491183|
| December|                      1462994|
|September|                      1491274|
|  January|                      1460853|
| February|                      1333742|
+---------+-----------------------------+



# Filter

## Filter MMSI

In [15]:
# Filter data untuk mmsi di dalam rentang tertentu
filtered_mmsi = sampledd_data.filter(col("mmsi").between(100000000, 999999999))

In [9]:
#filtered_mmsi.count()

In [None]:
# Hitung jumlah 'mmsi' yang unik per bulan
unique_filtered_mmsi_per_month = filtered_mmsi.withColumn("months", date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(countDistinct("mmsi").alias("unique_mmsi_count"))

# Tampilkan DataFrame Spark
unique_filtered_mmsi_per_month.show()

## Filter IMO

In [None]:
# Filter data untuk imo di dalam rentang tertentu
filtered_imo = filtered_mmsi.filter(col("imo").between(1000000, 9999999))

In [None]:
filtered_imo.count()

In [None]:
# Hitung jumlah 'imo' yang unik per bulan
unique_filtered_imo_per_month = filtered_imo.withColumn("months", date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(countDistinct("imo").alias("unique_imo_count"))

# Tampilkan DataFrame Spark
unique_filtered_imo_per_month.show() 

## Filter Status Navigasi

In [None]:
# Filter data untuk nav_status_code di dalam rentang tertentu
filtered_nav_status_code = filtered_imo.filter(col("nav_status_code").between(0, 14))

In [None]:
# Nilai yang akan di-filter
nav_status_values = ['Under Way Using Engine', 'At Anchor', 'Restricted Manoeuvrability', 'Moored', 'Engaged In Fishing', 'Underway Sailing']

# Filter data berdasarkan nilai 'nav_status'
filtered_nav_status = filtered_nav_status_code.filter(col("nav_status").isin(nav_status_values))

In [None]:
filtered_nav_status.count()

In [None]:
# Hitung jumlah 'mmsi' per masing-masing 'nav_status'
mmsi_count_per_filtered_nav_status = filtered_nav_status.groupBy("nav_status").agg(count("mmsi").alias("mmsi_count"))

# Tampilkan DataFrame Spark
mmsi_count_per_filtered_nav_status.show() 

In [None]:
# Hitung jumlah 'mmsi' yang unik berdasarkan 'nav_status'
unique_mmsi_per_filtered_nav_status = filtered_nav_status.groupBy("nav_status").agg(countDistinct("mmsi").alias("unique_mmsi_per_filtered_nav_status_count"))

# Tampilkan DataFrame Spark
unique_mmsi_per_filtered_nav_status.show() 

## Filter Tipe Kapal

In [None]:
# Filter data untuk vessel_type_code di dalam rentang tertentu
filtered_vessel_type_code = filtered_nav_status.filter(col("vessel_type_code").between(1, 99))

In [None]:
# Nilai yang akan di-filter
vessel_type_values = ['Sailing', 'Tanker', 'Other', 'Pleasure Craft', 'Passenger', 'Fishing', 'Port Tender', 'Dredging', 'Cargo']

# Filter data berdasarkan nilai 'vessel_type'
filtered_vessel_type = filtered_vessel_type_code.filter(col("vessel_type").isin(vessel_type_values))

In [None]:
filtered_vessel_type.count()

In [None]:
# Hitung jumlah 'mmsi' per masing-masing 'vessel_type'
mmsi_count_per_filtered_vessel_type = filtered_vessel_type.groupBy("vessel_type").agg(count("mmsi").alias("mmsi_count"))

# Tampilkan DataFrame Spark
mmsi_count_per_filtered_vessel_type.show() 

In [None]:
# Hitung jumlah 'mmsi' yang unik berdasarkan 'vessel_type'
unique_mmsi_per_filtered_vessel_type = filtered_vessel_type.groupBy("vessel_type").agg(countDistinct("mmsi").alias("unique_mmsi_per_filtered_vessel_type_count"))

# Tampilkan DataFrame Spark
unique_mmsi_per_filtered_vessel_type.show()  

## Filter Negara Kapal

In [None]:
# Filter data untuk flag_country_code di dalam rentang tertentu
filtered_flag_country_code = filtered_vessel_type.filter(col("flag_code").between(201, 775))

In [None]:
# Nilai yang akan di-filter
# bagi jadi dua, Indonesia dan negara asing

# Filter data berdasarkan nilai 'flag_country'

In [None]:
filtered_flag_country_code.count()

In [None]:
# Hitung jumlah 'mmsi' per masing-masing 'flag_country'
mmsi_count_per_filtered_flag_country = filtered_flag_country.groupBy("flag_country").agg(count("mmsi").alias("mmsi_count"))

# Tampilkan DataFrame Spark
mmsi_count_per_filtered_flag_country.show() 

In [None]:
# Filter out rows where 'flag_country' is not null
df_filtered = filtered_flag_country.filter(col("flag_country").isNotNull())

# Hitung jumlah 'mmsi' yang unik berdasarkan 'flag_country'
unique_mmsi_per_filtered_flag_country = df_filtered.groupBy("flag_country").agg(countDistinct("mmsi").alias("unique_mmsi_per_flag_country_count"))

# Tampilkan DataFrame Spark
unique_mmsi_per_filtered_flag_country.show() 

## Filter Lat, Long

In [None]:
# Filter data untuk latitude & longitude di dalam rentang tertentu
filtered_lat = filtered_flag_country_code.filter(col("latitude").between(-90, 90))
filtered_lat_long = filtered_lat.filter(col("longitude").between(-180, 180))

In [None]:
filtered_lat_long.count()

## Filter dt_pos_utc

In [None]:
# Filter data untuk dt_pos_utc di dalam rentang tertentu
pattern = "^[1-9][0-9]{0,3}-[1-9]|0[1-9]|1[0-2]-[1-9]|0[1-9]|[1-2][0-9]|3[0-1] [0-9]|0[0-9]|1[0-9]|2[0-3]:[0-5][0-9]:[0-5][0-9]$"

# Filter data untuk mendapatkan hanya nilai 'dt_pos_utc' yang tidak valid
filtered_dt_pos_utc = filtered_lat_long.filter(F.col("dt_pos_utc").rlike(pattern))

In [None]:
filtered_dt_pos_utc.count()

## Filter Pelayaran/Pergerakan Anomali

In [None]:
# plot jarak tempuh (y) sama waktu tempuh (x)
# Dari plotnya keliatan mana yang jarak tempuhnya besar tapi waktu tempuhnya kecil (atau sebaliknya?)

In [None]:
# Tentukan kriteria pergerakan tidak normal
kriteria_pergerakan_tidak_normal = (
    ((col("nav_status").isin(['At Anchor', 'Restricted Manoeuvrability', 'Moored', 'Aground'])) &
    (col("sog") > 1)) | 
    ((col("nav_status").isin(['Not Under Command', 'Underway Sailing', 'Under Way Using Engine', 'Engaged In Fishing'])) &
    (col("sog") < 1))
)

# kriteria pergerakan normal
kriteria_pergerakan_normal = ~kriteria_pergerakan_tidak_normal

# Filter pergerakan normal
filtered_pergerakan_normal = filtered_dt_pos_utc.filter(kriteria_pergerakan_normal)

In [None]:
filtered_pergerakan_normal.count()

## Filter Kapal di Pelabuhan

## Filter Pergerakan Anomali

# Setelah Difilter

## Record

In [None]:
# Kelompokkan berdasarkan bulan dan hitung jumlah rekaman
rekaman_per_bulan_filter = filtered_pergerakan_normal.groupBy("months").agg(count("*").alias("jumlah_record_per_bulan"))

# Tampilkan DataFrame Spark yang telah diubah dan diurutkan
rekaman_per_bulan_filter.show()

# Download

## Fungsi

In [11]:
def create_download_link(df, title, filename):
    csv = df.to_csv(index = False)
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload, title=title, filename=filename)
    return HTML(html)

## Download Data

In [19]:
# Jumlah Records (All)

# DataFrame Spark
rekaman_per_bulan = spark.createDataFrame(rekaman_per_bulan)

# Export ke Pandas DataFrame
rekaman_per_bulan = rekaman_per_bulan.toPandas()

# Download Data
create_download_link(rekaman_per_bulan, title="rekaman_per_bulan", filename="rekaman_per_bulan.csv")

  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


In [10]:
# Records (1%)

# DataFrame Spark
#sampled_data = spark.createDataFrame(sampled_data)

# Export ke Pandas DataFrame
#sampled_data = sampled_data.toPandas()

# Download Data
#create_download_link(sampled_data, title=sampled_data, filename="sampled_data.csv")

In [None]:
# Partisi Records (1%)

# DataFrame Spark
#sampledd_data_1 = spark.createDataFrame(sampledd_data_1)
#sampledd_data_2 = spark.createDataFrame(sampledd_data_2)
#sampledd_data_3 = spark.createDataFrame(sampledd_data_3)
#sampledd_data_4 = spark.createDataFrame(sampledd_data_4)
#sampledd_data_5 = spark.createDataFrame(sampledd_data_5)
#sampledd_data_6 = spark.createDataFrame(sampledd_data_6)
#sampledd_data_7 = spark.createDataFrame(sampledd_data_7)
#sampledd_data_8 = spark.createDataFrame(sampledd_data_8)
#sampledd_data_9 = spark.createDataFrame(sampledd_data_9)
#sampledd_data_10 = spark.createDataFrame(sampledd_data_10)

# Export ke Pandas DataFrame
#sampledd_data_1 = sampledd_data_1.toPandas()
#sampledd_data_2 = sampledd_data_2.toPandas()
#sampledd_data_3 = sampledd_data_3.toPandas()
#sampledd_data_4 = sampledd_data_4.toPandas()
#sampledd_data_5 = sampledd_data_5.toPandas()
#sampledd_data_6 = sampledd_data_6.toPandas()
#sampledd_data_7 = sampledd_data_7.toPandas()
#sampledd_data_8 = sampledd_data_8.toPandas()
#sampledd_data_9 = sampledd_data_9.toPandas()
#sampledd_data_10 = sampledd_data_10.toPandas()

# Download Data
#create_download_link(sampledd_data_1, title=sampledd_data_1, filename="sampledd_data_1.csv")
#create_download_link(sampledd_data_2, title=sampledd_data_2, filename="sampledd_data_2.csv")
#create_download_link(sampledd_data_3, title=sampledd_data_3, filename="sampledd_data_3.csv")
#create_download_link(sampledd_data_4, title=sampledd_data_4, filename="sampledd_data_4.csv")
#create_download_link(sampledd_data_5, title=sampledd_data_5, filename="sampledd_data_5.csv")
#create_download_link(sampledd_data_6, title=sampledd_data_6, filename="sampledd_data_6.csv")
#create_download_link(sampledd_data_7, title=sampledd_data_7, filename="sampledd_data_7.csv")
#create_download_link(sampledd_data_8, title=sampledd_data_8, filename="sampledd_data_8.csv")
#create_download_link(sampledd_data_9, title=sampledd_data_9, filename="sampledd_data_9.csv")
#create_download_link(sampledd_data_10, title=sampledd_data_10, filename="sampledd_data_10.csv")

In [17]:
# Jumlah Records (1%)

# DataFrame Spark
#rekaman_per_bulan_1pers = spark.createDataFrame(rekaman_per_bulan_1pers)

# Export ke Pandas DataFrame
rekaman_per_bulan_1pers = rekaman_per_bulan_1pers.toPandas()

# Download Data
create_download_link(rekaman_per_bulan_1pers, title=rekaman_per_bulan_1pers, filename="rekaman_per_bulan_1pers.csv")

In [25]:
# Statistical Summary

# DataFrame Spark
stats_df = spark.createDataFrame(stats_df)
#stats_df_2 = spark.createDataFrame(stats_df_2)

# Export ke Pandas DataFrame
stats_df = stats_df.toPandas()
#stats_df_2 = stats_df_2.toPandas()

# Download Data
create_download_link(stats_df, title=stats_df, filename="stats_df.csv")
#create_download_link(stats_df_2, title=stats_df_2, filename="stats_df_2.csv")

  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


In [19]:
# (MMSI, IMO, Status Navigasi, Tipe Kapal, Negara Kapal) Unik

# DataFrame Spark
#df_unique_mmsi_spark = spark.createDataFrame(df_unique_mmsi_spark)
#df_unique_imo_spark = spark.createDataFrame(df_unique_imo_spark)
#mmsi_count_per_nav_status = spark.createDataFrame(mmsi_count_per_nav_status)
#mmsi_count_per_vessel_type = spark.createDataFrame(mmsi_count_per_vessel_type)
#mmsi_count_per_flag_country = spark.createDataFrame(mmsi_count_per_flag_country)

# Export ke Pandas DataFrame
#df_unique_mmsi_spark = df_unique_mmsi_spark.toPandas()
#df_unique_imo_spark = df_unique_imo_spark.toPandas()
mmsi_count_per_nav_status = mmsi_count_per_nav_status.toPandas()
#mmsi_count_per_vessel_type = mmsi_count_per_vessel_type.toPandas()
#mmsi_count_per_flag_country = mmsi_count_per_flag_country.toPandas()
   

# Download Data
#create_download_link(df_unique_mmsi_spark, title=df_unique_mmsi_spark, filename="df_unique_mmsi_spark.csv")
#create_download_link(df_unique_imo_spark, title=df_unique_imo_spark, filename="df_unique_imo_spark.csv")
create_download_link(mmsi_count_per_nav_status, title=mmsi_count_per_nav_status, filename="mmsi_count_per_nav_status.csv")
#create_download_link(mmsi_count_per_vessel_type, title=mmsi_count_per_vessel_type, filename="mmsi_count_per_vessel_type.csv")
#create_download_link(mmsi_count_per_flag_country, title=mmsi_count_per_flag_country, filename="mmsi_count_per_flag_country.csv")

In [15]:
# MMSI Unik per (Status Navigasi, Tipe Kapal, Negara Kapal) 

# DataFrame Spark
#unique_mmsi_per_nav_status = spark.createDataFrame(unique_mmsi_per_nav_status)
#unique_mmsi_per_vessel_type = spark.createDataFrame(unique_mmsi_per_vessel_type)
#unique_mmsi_per_flag_country = spark.createDataFrame(unique_mmsi_per_flag_country)

# Export ke Pandas DataFrame
#unique_mmsi_per_nav_status = unique_mmsi_per_nav_status.toPandas()
#unique_mmsi_per_vessel_type = unique_mmsi_per_vessel_type.toPandas()
unique_mmsi_per_flag_country = unique_mmsi_per_flag_country.toPandas()
  
# Download Data
#create_download_link(unique_mmsi_per_nav_status, title=unique_mmsi_per_nav_status, filename="unique_mmsi_per_nav_status.csv")
#create_download_link(unique_mmsi_per_vessel_type, title=unique_mmsi_per_vessel_type, filename="unique_mmsi_per_vessel_type.csv")
create_download_link(unique_mmsi_per_flag_country, title=unique_mmsi_per_flag_country, filename="unique_mmsi_per_flag_country.csv")

In [19]:
# Nilai Default

# DataFrame Spark
#mmsi_default_per_month = spark.createDataFrame(mmsi_default_per_month)
#imo_default_per_month = spark.createDataFrame(imo_default_per_month)
#nav_status_default_per_month = spark.createDataFrame(nav_status_default_per_month)
#vessel_type_default_per_month = spark.createDataFrame(vessel_type_default_per_month)
#flag_country_default_per_month = spark.createDataFrame(flag_country_default_per_month)
#latitude_default_per_month = spark.createDataFrame(latitude_default_per_month)
#longitude_default_per_month = spark.createDataFrame(longitude_default_per_month)
#dt_pos_utc_default_per_month = spark.createDataFrame(dt_pos_utc_default_per_month)

# Export ke Pandas DataFrame
#mmsi_default_per_month = mmsi_default_per_month.toPandas()
#imo_default_per_month = imo_default_per_month.toPandas()
#nav_status_default_per_month = nav_status_default_per_month.toPandas()
#vessel_type_default_per_month = vessel_type_default_per_month.toPandas()
#flag_country_default_per_month = flag_country_default_per_month.toPandas()
#latitude_default_per_month = latitude_default_per_month.toPandas()
#longitude_default_per_month = longitude_default_per_month.toPandas()
#dt_pos_utc_default_per_month = dt_pos_utc_default_per_month.toPandas()
             

# Download Data
#create_download_link(mmsi_default_per_month, title=mmsi_default_per_month, filename="mmsi_default_per_month.csv")
#create_download_link(imo_default_per_month, title=imo_default_per_month, filename="imo_default_per_month.csv")
#create_download_link(nav_status_default_per_month, title=nav_status_default_per_month, filename="nav_status_default_per_month.csv")
#create_download_link(vessel_type_default_per_month, title=vessel_type_default_per_month, filename="vessel_type_default_per_month.csv")
#create_download_link(flag_country_default_per_month, title=flag_country_default_per_month, filename="flag_country_default_per_month.csv")
#create_download_link(latitude_default_per_month, title=latitude_default_per_month, filename="latitude_default_per_month.csv")
#create_download_link(longitude_default_per_month, title=longitude_default_per_month, filename="longitude_default_per_month.csv")
#create_download_link(dt_pos_utc_default_per_month, title=dt_pos_utc_default_per_month, filename="dt_pos_utc_default_per_month.csv")

In [None]:
# Invalid Value

# DataFrame Spark
mmsi_invalid = spark.createDataFrame(mmsi_invalid)
#imo_invalid = spark.createDataFrame(imo_invalid)
#nav_status_code_invalid = spark.createDataFrame(nav_status_code_invalid)
#vessel_type_code_invalid = spark.createDataFrame(vessel_type_code_invalid)
#flag_country_code_invalid = spark.createDataFrame(flag_country_code_invalid)
#latitude_invalid = spark.createDataFrame(latitude_invalid)
#longitude_invalid = spark.createDataFrame(longitude_invalid)
#dt_pos_utc_invalid = spark.createDataFrame(dt_pos_utc_invalid)

# Export ke Pandas DataFrame
mmsi_invalid = mmsi_invalid.toPandas()
#imo_invalid = imo_invalid.toPandas()
#nav_status_code_invalid = nav_status_code_invalid.toPandas()
#vessel_type_code_invalid = vessel_type_code_invalid.toPandas()
#flag_country_code_invalid = flag_country_code_invalid.toPandas()
#latitude_invalid = latitude_invalid.toPandas()
#longitude_invalid = longitude_invalid.toPandas()
#dt_pos_utc_invalid = dt_pos_utc_invalid.toPandas()
       

# Download Data
create_download_link(mmsi_invalid, title=mmsi_invalid, filename="mmsi_invalid.csv")
#create_download_link(imo_invalid, title=imo_invalid, filename="imo_invalid.csv")
#create_download_link(nav_status_code_invalid, title=nav_status_code_invalid, filename="nav_status_code_invalid.csv")
#create_download_link(vessel_type_code_invalid, title=vessel_type_code_invalid, filename="vessel_type_code_invalid.csv")
#create_download_link(flag_country_code_invalid, title=flag_country_code_invalid, filename="flag_country_code_invalid.csv")
#create_download_link(latitude_invalid, title=latitude_invalid, filename="latitude_invalid.csv")
#create_download_link(longitude_invalid, title=longitude_invalid, filename="longitude_invalid.csv")
#create_download_link(dt_pos_utc_invalid, title=dt_pos_utc_invalid, filename="dt_pos_utc_invalid.csv")

In [None]:
# Missing Value

# DataFrame Spark
missing_values_per_month_mmsi = spark.createDataFrame(missing_values_per_month_mmsi)
#missing_values_per_month_imo = spark.createDataFrame(missing_values_per_month_imo)
#missing_values_per_month_nav_status = spark.createDataFrame(missing_values_per_month_nav_status)
#missing_values_per_month_vessel_type = spark.createDataFrame(missing_values_per_month_vessel_type)
#missing_values_per_month_flag_country = spark.createDataFrame(missing_values_per_month_flag_country)
#missing_values_per_month_latitude = spark.createDataFrame(missing_values_per_month_latitude)
#missing_values_per_month_longitude = spark.createDataFrame(missing_values_per_month_longitude)
#missing_values_per_month_dt_pos_utc = spark.createDataFrame(missing_values_per_month_dt_pos_utc)

# Export ke Pandas DataFrame
missing_values_per_month_mmsi = missing_values_per_month_mmsi.toPandas()
#missing_values_per_month_imo = missing_values_per_month_imo.toPandas()
#missing_values_per_month_nav_status = missing_values_per_month_nav_status.toPandas()
#missing_values_per_month_vessel_type = missing_values_per_month_vessel_type.toPandas()
#missing_values_per_month_flag_country = missing_values_per_month_flag_country.toPandas()
#missing_values_per_month_latitude = missing_values_per_month_latitude.toPandas()
#missing_values_per_month_longitude = missing_values_per_month_longitude.toPandas()
#missing_values_per_month_dt_pos_utc = missing_values_per_month_dt_pos_utc.toPandas()
        

# Download Data
create_download_link(missing_values_per_month_mmsi, title=missing_values_per_month_mmsi, filename="missing_values_per_month_mmsi.csv")
#create_download_link(missing_values_per_month_imo, title=missing_values_per_month_imo, filename="missing_values_per_month_imo.csv")
#create_download_link(missing_values_per_month_nav_status, title=missing_values_per_month_nav_status, filename="missing_values_per_month_nav_status.csv")
#create_download_link(missing_values_per_month_vessel_type, title=missing_values_per_month_vessel_type, filename="missing_values_per_month_vessel_type.csv")
#create_download_link(missing_values_per_month_flag_country, title=missing_values_per_month_flag_country, filename="missing_values_per_month_flag_country.csv")
#create_download_link(missing_values_per_month_latitude, title=missing_values_per_month_latitude, filename="missing_values_per_month_latitude.csv")
#create_download_link(missing_values_per_month_longitude, title=missing_values_per_month_longitude, filename="missing_values_per_month_longitude.csv")
#create_download_link(missing_values_per_month_dt_pos_utc, title=missing_values_per_month_dt_pos_utc, filename="missing_values_per_month_dt_pos_utc.csv")

In [None]:
# Filter (MMSI, IMO, Status Navigasi, Tipe Kapal, Negara Kapal Default, Latitude, Longitude, dt_pos_utc)

# DataFrame Spark
unique_filtered_mmsi_per_month = spark.createDataFrame(unique_filtered_mmsi_per_month)
#unique_filtered_imo_per_month = spark.createDataFrame(unique_filtered_imo_per_month)
#mmsi_count_per_filtered_nav_status = spark.createDataFrame(mmsi_count_per_filtered_nav_status)
#unique_mmsi_per_filtered_nav_status = spark.createDataFrame(unique_mmsi_per_filtered_nav_status)
#mmsi_count_per_filtered_vessel_type = spark.createDataFrame(mmsi_count_per_filtered_vessel_type)
#unique_mmsi_per_filtered_vessel_type = spark.createDataFrame(unique_mmsi_per_filtered_vessel_type)
#mmsi_count_per_filtered_flag_country = spark.createDataFrame(mmsi_count_per_filtered_flag_country)
#unique_mmsi_per_filtered_flag_country = spark.createDataFrame(unique_mmsi_per_filtered_flag_country)

# Export ke Pandas DataFrame
unique_filtered_mmsi_per_month = unique_filtered_mmsi_per_month.toPandas()
#df_unique_filtered_imo_spark = df_unique_filtered_imo_spark.toPandas()
#mmsi_count_per_filtered_nav_status = mmsi_count_per_filtered_nav_status.toPandas()
#unique_mmsi_per_filtered_nav_status = unique_mmsi_per_filtered_nav_status.toPandas()
#mmsi_count_per_filtered_vessel_type = mmsi_count_per_filtered_vessel_type.toPandas()
#unique_mmsi_per_filtered_vessel_type = unique_mmsi_per_filtered_vessel_type.toPandas()
#mmsi_count_per_filtered_flag_country = mmsi_count_per_filtered_flag_country.toPandas()
#unique_mmsi_per_filtered_flag_country = unique_mmsi_per_filtered_flag_country.toPandas()


# Download Data
create_download_link(unique_filtered_mmsi_per_month, title=unique_filtered_mmsi_per_month, filename="unique_filtered_mmsi_per_month.csv")
#create_download_link(unique_filtered_imo_per_month, title=unique_filtered_imo_per_month, filename="unique_filtered_imo_per_month.csv")
#create_download_link(mmsi_count_per_filtered_nav_status, title=mmsi_count_per_filtered_nav_status, filename="mmsi_count_per_filtered_nav_status.csv")
#create_download_link(unique_mmsi_per_filtered_nav_status, title=unique_mmsi_per_filtered_nav_status, filename="unique_mmsi_per_filtered_nav_status.csv")
#create_download_link(mmsi_count_per_filtered_vessel_type, title=mmsi_count_per_filtered_vessel_type, filename="mmsi_count_per_filtered_vessel_type.csv")
#create_download_link(unique_mmsi_per_filtered_vessel_type, title=unique_mmsi_per_filtered_vessel_type, filename="unique_mmsi_per_filtered_vessel_type.csv")
#create_download_link(mmsi_count_per_filtered_flag_country, title=mmsi_count_per_filtered_flag_country, filename="mmsi_count_per_filtered_flag_country.csv")
#create_download_link(unique_mmsi_per_filtered_flag_country, title=unique_mmsi_per_filtered_flag_country, filename="unique_mmsi_per_filtered_flag_country.csv")

In [None]:
# Filter yg melakukan pelayaran/lintasan anomali(?)

# DataFrame Spark
rekaman_per_bulan_1pers = spark.createDataFrame(rekaman_per_bulan_1pers)
rekaman_per_bulan_1pers = spark.createDataFrame(rekaman_per_bulan_1pers)

# Export ke Pandas DataFrame
rekaman_per_bulan_1pers = rekaman_per_bulan_1pers.toPandas()

# Download Data
create_download_link(rekaman_per_bulan_1pers, filename="rekaman_per_bulan_1pers.csv")