# Inilization

In [1]:
import pandas as pd
from pyspark.sql import functions as F

#For 3.3.1
#Register Sedona Functions to Spark
from sedona.register import SedonaRegistrator
SedonaRegistrator.registerAll(spark)


#For 3.3.2
from shapely.geometry import Point, Polygon, mapping
import h3.api.numpy_int as h3int 

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, count, countDistinct, when, expr
from pyspark.sql.functions import collect_list, collect_set, explode
import calendar
from pyspark.sql.types import IntegerType
from pyspark.sql.window import Window

In [3]:
import pandas as pd
from IPython.display import HTML
import base64
from pyspark.sql.functions import year, month, dayofmonth, hour, minute, second
from pyspark.sql.functions import monotonically_increasing_id

# Eksplorasi Data

## Data AIS

In [4]:
# Path
save_path = "s3a://ungp-ais-data-historical-backup/user_temp/"
path_unique = save_path + "222011349/"

In [5]:
# Read Data
df_data = spark.read.parquet(path_unique + "data-ais-ihs-indonesia-by-mmsi-2022.parquet", header=True)

In [6]:
df_data.printSchema()

root
 |-- message_type: integer (nullable = true)
 |-- mmsi: integer (nullable = true)
 |-- imo: integer (nullable = true)
 |-- vessel_name: string (nullable = true)
 |-- callsign: string (nullable = true)
 |-- vessel_type: string (nullable = true)
 |-- vessel_type_code: integer (nullable = true)
 |-- vessel_type_cargo: string (nullable = true)
 |-- vessel_class: string (nullable = true)
 |-- length: double (nullable = true)
 |-- width: double (nullable = true)
 |-- flag_country: string (nullable = true)
 |-- flag_code: integer (nullable = true)
 |-- destination: string (nullable = true)
 |-- eta: integer (nullable = true)
 |-- draught: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- sog: double (nullable = true)
 |-- cog: double (nullable = true)
 |-- rot: double (nullable = true)
 |-- heading: double (nullable = true)
 |-- nav_status: string (nullable = true)
 |-- nav_status_code: integer (nullable = true)
 |-- source: st

In [7]:
# Show sample 1 data
df_data.show(n=1, vertical=True)

-RECORD 0---------------------------------------------------------
 message_type                              | 1                    
 mmsi                                      | 457900560            
 imo                                       | 7821960              
 vessel_name                               | MARKUL               
 callsign                                  | JVUW7                
 vessel_type                               | Fishing              
 vessel_type_code                          | 30                   
 vessel_type_cargo                         | null                 
 vessel_class                              | A                    
 length                                    | 50.0                 
 width                                     | 9.0                  
 flag_country                              | Mongolia             
 flag_code                                 | 457                  
 destination                               | DALIAN           

In [8]:
df_data.count()

1696875705

## Filter Data 2022

In [6]:
# Ekstrak tahun dari kolom yang berisi tanggal atau waktu
df_data = df_data.withColumn("tahun", date_format("dt_pos_utc", "yyyy"))

# Filter data untuk tahun 2022
data_sampel = df_data.filter(df_data["tahun"] == 2022)

In [10]:
data_sampel.count()

1696875705

## Record per Bulan

In [11]:
jumlah_record_per_bulan = data_sampel.withColumn("months", date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(count("*").alias("jumlah_record_per_bulan"))

In [12]:
jumlah_record_per_bulan.show(12)

+---------+-----------------------+
|   months|jumlah_record_per_bulan|
+---------+-----------------------+
|     July|              150053089|
| November|              113844313|
| February|              133040874|
|  January|              135081095|
|    March|              143776424|
|  October|              149953299|
|      May|              144231675|
|   August|              150143025|
|    April|              142280420|
|     June|              141269252|
| December|              147888048|
|September|              145314191|
+---------+-----------------------+



## Statistical Summary Data AIS

In [13]:
# MMSI, IMO, Status Navigasi, Tipe Kapal, Negara Kapal

# Variabel yang ingin dihitung statistik deskriptif
variables_to_describe = ["mmsi", "imo", "nav_status_code", "vessel_type_code", "flag_code", "sog"]

# Ambil statistik deskriptif tertentu untuk kolom-kolom tersebut
stats_df = data_sampel.select(variables_to_describe).describe()

# Pilih statistik tertentu
selected_stats = ["count", "mean", "stddev", "min", "25%", "50%", "75%", "max"]
stats_df = stats_df.select(["summary"] + [col(var).alias(var) for var in variables_to_describe]).filter(col("summary").isin(selected_stats))

# Tampilkan statistik deskriptif
stats_df.show()

+-------+--------------------+------------------+------------------+------------------+------------------+-----------------+
|summary|                mmsi|               imo|   nav_status_code|  vessel_type_code|         flag_code|              sog|
+-------+--------------------+------------------+------------------+------------------+------------------+-----------------+
|  count|          1696875705|        1696875705|        1696875705|        1696875705|        1674034613|       1696875705|
|   mean| 4.556537864275378E8| 9514841.348555109|1.4822160324347387| 72.01712981093097|455.31703518665535|7.263064470708549|
| stddev|1.2895218742964801E8|440266.44330385805|3.1302462246293947|11.239916888590978| 129.1180024317068|  6.2150425632018|
|    min|                   0|           1000215|                 0|                 0|               205|              0.0|
|    max|           777777772|           9999761|                16|               255|               750|            102.1|


In [14]:
# Latitude, Longitude, dt_pos_utc

# Pemisahan komponen waktu dari kolom dt_pos_utc
data_sampel = data_sampel.withColumn("year", year("dt_pos_utc"))
data_sampel = data_sampel.withColumn("month", month("dt_pos_utc"))
data_sampel = data_sampel.withColumn("day", dayofmonth("dt_pos_utc"))
data_sampel = data_sampel.withColumn("hour", hour("dt_pos_utc"))
data_sampel = data_sampel.withColumn("minute", minute("dt_pos_utc"))
data_sampel = data_sampel.withColumn("second", second("dt_pos_utc"))

# Variabel yang ingin dihitung statistik deskriptif
variables_to_describe_2 = ["latitude", "longitude", "year", "month", "day", "hour", "minute", "second"]

# Ambil statistik deskriptif tertentu untuk kolom-kolom tersebut
stats_df_2 = data_sampel.select(variables_to_describe_2).describe()

# Pilih statistik tertentu
selected_stats = ["count", "mean", "stddev", "min", "25%", "50%", "75%", "max"]
stats_df_2 = stats_df_2.select(["summary"] + [col(var).alias(var) for var in variables_to_describe_2]).filter(col("summary").isin(selected_stats))

# Tampilkan statistik deskriptif
stats_df_2.show()

+-------+------------------+-----------------+----------+------------------+------------------+-----------------+------------------+------------------+
|summary|          latitude|        longitude|      year|             month|               day|             hour|            minute|            second|
+-------+------------------+-----------------+----------+------------------+------------------+-----------------+------------------+------------------+
|  count|        1696875705|       1696875705|1696875705|        1696875705|        1696875705|       1696875705|        1696875705|        1696875705|
|   mean| 5.996610825747308|49.58689457964619|    2022.0| 6.515626454207499|15.791300276763643|11.62339027359697| 29.54237599270714|29.250482842524992|
| stddev|25.573815227415427| 87.1345944321897|       0.0|3.4063875153628596| 8.872540646463074| 6.93672545861264|17.306280002185158|17.391718411567684|
|    min|             -90.0|           -180.0|      2022|                 1|            

## Nilai Unik

### MMSI Unik per Bulan

In [18]:
# Hitung jumlah 'mmsi' yang unik per bulan
unique_mmsi_per_month = data_sampel.withColumn("months", date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(countDistinct("mmsi").alias("unique_mmsi_count"))

# Tampilkan DataFrame Spark
unique_mmsi_per_month.show()

+---------+-----------------+
|   months|unique_mmsi_count|
+---------+-----------------+
|     July|            26616|
| November|            26904|
| February|            25887|
|  January|            25893|
|    March|            26148|
|  October|            26958|
|      May|            26260|
|   August|            26684|
|    April|            26208|
|     June|            26396|
| December|            26816|
|September|            26874|
+---------+-----------------+



### IMO Unik per Bulan

In [19]:
# Hitung jumlah 'imo' yang unik per bulan
unique_imo_per_month = data_sampel.withColumn("months", date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(countDistinct("imo").alias("unique_imo_count"))

# Tampilkan DataFrame Spark
unique_imo_per_month.show()

+---------+----------------+
|   months|unique_imo_count|
+---------+----------------+
|     July|           26514|
| November|           26836|
| February|           25875|
|  January|           25882|
|    March|           26123|
|  October|           26860|
|      May|           26192|
|   August|           26590|
|    April|           26169|
|     June|           26311|
| December|           26743|
|September|           26793|
+---------+----------------+



### Status Navigasi Setahun

In [9]:
# Hitung jumlah 'mmsi' per masing-masing 'nav_status' dengan DataFrame API
mmsi_count_per_nav_status = data_sampel.groupBy("nav_status").agg(F.count("mmsi").alias("mmsi_count"))

# Tampilkan DataFrame Spark
mmsi_count_per_nav_status.show()

+--------------------+----------+
|          nav_status|mmsi_count|
+--------------------+----------+
|              Moored| 220232423|
|Restricted Manoeu...|  13251565|
|         Not Defined|  21562668|
|   Not Under Command|  13263397|
|    Underway Sailing|  38769069|
|             Unknown|  26878143|
|           At Anchor| 253902986|
|Under Way Using E...|1104661789|
|             Aground|    328730|
|  Engaged In Fishing|   4024935|
+--------------------+----------+



### Tipe Kapal Setahun

In [8]:
# Hitung jumlah 'mmsi' per masing-masing 'vessel_type'
mmsi_count_per_vessel_type = data_sampel.groupBy("vessel_type").agg(count("mmsi").alias("mmsi_count"))

# Tampilkan DataFrame Spark
mmsi_count_per_vessel_type.show(mmsi_count_per_vessel_type.count(), truncate = False)

+------------------------------------+----------+
|vessel_type                         |mmsi_count|
+------------------------------------+----------+
|Tanker                              |405830768 |
|Reserved                            |3371596   |
|Unknown                             |22896834  |
|Tug                                 |58101850  |
|Passenger                           |23545367  |
|WIG                                 |2227214   |
|Cargo                               |1079673972|
|Military                            |1253561   |
|Other                               |41605675  |
|Fishing                             |32350006  |
|Not Available                       |145353    |
|Sailing                             |1821814   |
|Pleasure Craft                      |5932659   |
|Diving                              |583533    |
|Towing                              |6914268   |
|Law Enforcement                     |1212456   |
|Spare                               |239261    |


### Negara Kapal Setahun

In [10]:
# Hitung jumlah 'mmsi' per masing-masing 'flag_country'
mmsi_count_per_flag_country = data_sampel.groupBy("flag_country").agg(count("mmsi").alias("mmsi_count"))

# Tampilkan DataFrame Spark
mmsi_count_per_flag_country.show(mmsi_count_per_flag_country.count(), truncate = False)

+--------------------------------+----------+
|flag_country                    |mmsi_count|
+--------------------------------+----------+
|Singapore                       |161978946 |
|Bahamas                         |46086637  |
|Marshall Islands                |222347422 |
|Cayman Islands                  |9666163   |
|Panama                          |287570391 |
|Hong Kong                       |142471987 |
|Cyprus                          |27802416  |
|Indonesia                       |118156227 |
|Liberia                         |222763125 |
|Malaysia                        |29036743  |
|Palau                           |1552797   |
|China                           |50117886  |
|Malta                           |68472949  |
|Denmark                         |15708239  |
|Thailand                        |6016078   |
|Vietnam                         |11117616  |
|Switzerland                     |959616    |
|Bangladesh                      |3195427   |
|South Korea                     |

# Quality Assurance

## MMSI Unik per

### Status Navigasi

In [20]:
# Hitung jumlah 'mmsi' yang unik berdasarkan 'nav_status'
unique_mmsi_per_nav_status = data_sampel.groupBy("nav_status").agg(countDistinct("mmsi").alias("unique_mmsi_per_nav_status_count"))

# Tampilkan DataFrame Spark
unique_mmsi_per_nav_status.show()

+--------------------+--------------------------------+
|          nav_status|unique_mmsi_per_nav_status_count|
+--------------------+--------------------------------+
|              Moored|                           23229|
|Restricted Manoeu...|                            4955|
|             Aground|                             780|
|         Not Defined|                            5666|
|   Not Under Command|                           15136|
|  Engaged In Fishing|                             407|
|    Underway Sailing|                           11016|
|             Unknown|                            9963|
|           At Anchor|                           23770|
|Under Way Using E...|                           25702|
+--------------------+--------------------------------+



### Tipe Kapal

In [21]:
# Hitung jumlah 'mmsi' yang unik berdasarkan 'vessel_type'
unique_mmsi_per_vessel_type = data_sampel.groupBy("vessel_type").agg(countDistinct("mmsi").alias("unique_mmsi_per_vessel_type_count"))

# Tampilkan DataFrame Spark
unique_mmsi_per_vessel_type.show(unique_mmsi_per_vessel_type.count(), truncate=False)

+------------------------------------+---------------------------------+
|vessel_type                         |unique_mmsi_per_vessel_type_count|
+------------------------------------+---------------------------------+
|Sailing                             |216                              |
|Tanker                              |8046                             |
|Ships Not Party to Armed Conflict   |34                               |
|Military                            |135                              |
|Towing                              |781                              |
|Reserved                            |460                              |
|SAR                                 |54                               |
|Unknown                             |7811                             |
|Other                               |1517                             |
|UNAVAILABLE                         |69                               |
|Tug                                 |3305         

### Negara Kapal

In [22]:
# Filter out rows where 'flag_country' is not null
df_filtered = data_sampel.filter(col("flag_country").isNotNull())

# Hitung jumlah 'mmsi' yang unik berdasarkan 'flag_country'
unique_mmsi_per_flag_country = df_filtered.groupBy("flag_country").agg(countDistinct("mmsi").alias("unique_mmsi_per_flag_country_count"))

# Tampilkan DataFrame Spark
unique_mmsi_per_flag_country.show(unique_mmsi_per_flag_country.count(), truncate=False) 

+--------------------------------+----------------------------------+
|flag_country                    |unique_mmsi_per_flag_country_count|
+--------------------------------+----------------------------------+
|Sweden                          |15                                |
|Kiribati                        |26                                |
|Guyana                          |8                                 |
|Philippines                     |117                               |
|Djibouti                        |13                                |
|Malaysia                        |661                               |
|Singapore                       |2413                              |
|Fiji                            |4                                 |
|Turkey                          |38                                |
|Germany                         |32                                |
|Comoros                         |25                                |
|Cambodia           

## IMO Unik per

### Status Navigasi

In [23]:
# Hitung jumlah 'mmsi' yang unik berdasarkan 'nav_status'
unique_imo_per_nav_status = data_sampel.select("imo","nav_status").groupBy("nav_status").agg(countDistinct("imo").alias("unique_imo_per_nav_status_count"))

# Tampilkan DataFrame Spark
unique_imo_per_nav_status.show(unique_imo_per_nav_status.count(), truncate=False)

+--------------------------+-------------------------------+
|nav_status                |unique_imo_per_nav_status_count|
+--------------------------+-------------------------------+
|Moored                    |22420                          |
|Restricted Manoeuvrability|4925                           |
|Aground                   |779                            |
|Not Defined               |5690                           |
|Not Under Command         |14858                          |
|Engaged In Fishing        |412                            |
|Underway Sailing          |10921                          |
|Unknown                   |9949                           |
|At Anchor                 |22739                          |
|Under Way Using Engine    |24735                          |
+--------------------------+-------------------------------+



### Tipe Kapal

In [24]:
# Hitung jumlah 'mmsi' yang unik berdasarkan 'vessel_type'
unique_imo_per_vessel_type = data_sampel.select("imo","vessel_type").groupBy("vessel_type").agg(countDistinct("imo").alias("unique_imo_per_vessel_type_count"))

# Tampilkan DataFrame Spark
unique_imo_per_vessel_type.show(unique_imo_per_vessel_type.count(), truncate=False)

+------------------------------------+--------------------------------+
|vessel_type                         |unique_imo_per_vessel_type_count|
+------------------------------------+--------------------------------+
|Sailing                             |216                             |
|Tanker                              |7625                            |
|Ships Not Party to Armed Conflict   |33                              |
|Military                            |137                             |
|Towing                              |795                             |
|Reserved                            |462                             |
|SAR                                 |54                              |
|Unknown                             |7801                            |
|Other                               |1488                            |
|UNAVAILABLE                         |69                              |
|Tug                                 |3314                      

### Negara Kapal

In [25]:
# Hitung jumlah 'mmsi' yang unik berdasarkan 'vessel_type'
unique_imo_per_flag_country = data_sampel.select("imo","flag_country").groupBy("flag_country").agg(countDistinct("imo").alias("unique_imo_per_flag_country_count"))

# Tampilkan DataFrame Spark
unique_imo_per_flag_country.show(unique_imo_per_flag_country.count(), truncate=False)

+--------------------------------+---------------------------------+
|flag_country                    |unique_imo_per_flag_country_count|
+--------------------------------+---------------------------------+
|Sweden                          |15                               |
|Kiribati                        |26                               |
|Guyana                          |8                                |
|Philippines                     |118                              |
|Djibouti                        |14                               |
|Singapore                       |2449                             |
|Malaysia                        |662                              |
|Turkey                          |39                               |
|Germany                         |36                               |
|Comoros                         |26                               |
|Cambodia                        |4                                |
|Palau                           |

## Nilai Valid

### MMSI Valid

In [26]:
# Definisikan nilai mmsi yang valid
nilai_valid = [100000000, 999999999]

# Jumlah mmsi yang valid
jumlah_mmsi_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(data_sampel['mmsi'].between(nilai_valid[0], nilai_valid[1])) \
    .groupBy("months").agg(F.count("mmsi").alias("mmsi_valid"))

# Buat DataFrame hasil
mmsi_valid = jumlah_mmsi_valid_per_bulan

# Tampilkan hasil
mmsi_valid.show()

+---------+----------+
|   months|mmsi_valid|
+---------+----------+
|     July| 150053089|
| November| 113839707|
| February| 133040874|
|  January| 135081095|
|    March| 143776424|
|  October| 149953299|
|      May| 144231509|
|   August| 150143025|
|    April| 142280420|
|     June| 141269252|
| December| 147888048|
|September| 145314191|
+---------+----------+



### IMO Valid

In [27]:
# Definisikan nilai imo yang valid
nilai_valid = [1000000, 9999999]

# Hitung jumlah imo valid per bulan
jumlah_imo_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(data_sampel['imo'].between(nilai_valid[0], nilai_valid[1])) \
    .groupBy("months").agg(F.count("imo").alias("imo_valid"))

# Buat DataFrame hasil
imo_valid = jumlah_imo_valid_per_bulan

# Tampilkan hasil
imo_valid.show() 

+---------+---------+
|   months|imo_valid|
+---------+---------+
|     July|150053089|
| November|113844313|
| February|133040874|
|  January|135081095|
|    March|143776424|
|  October|149953299|
|      May|144231675|
|   August|150143025|
|    April|142280420|
|     June|141269252|
| December|147888048|
|September|145314191|
+---------+---------+



### Status Navigasi Valid

In [28]:
# Definisikan nilai nav_status_code yang valid
nilai_valid = [0, 14]

# Hitung jumlah nav_status_code  valid per bulan
jumlah_nav_status_code_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(data_sampel['nav_status_code'].between(nilai_valid[0], nilai_valid[1])) \
    .groupBy("months").agg(F.count("nav_status_code").alias("nav_status_code_valid"))

# Buat DataFrame hasil
nav_status_code_valid = jumlah_nav_status_code_valid_per_bulan

# Tampilkan hasil
nav_status_code_valid.show() 

+---------+---------------------+
|   months|nav_status_code_valid|
+---------+---------------------+
|     July|            146246043|
| November|            110371157|
| February|            129207060|
|  January|            130981193|
|    March|            139697802|
|  October|            145706600|
|      May|            140103671|
|   August|            146224006|
|    April|            138129016|
|     June|            137249666|
| December|            143370807|
|September|            141147873|
+---------+---------------------+



### Tipe Kapal Valid

In [29]:
# Definisikan nilai vessel_type_code yang valid
nilai_valid = [1, 255]

# Hitung jumlah vessel_type_code  valid per bulan
jumlah_vessel_type_code_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(data_sampel['vessel_type_code'].between(nilai_valid[0], nilai_valid[1])) \
    .groupBy("months").agg(F.count("vessel_type_code").alias("vessel_type_code_valid"))

# Buat DataFrame hasil
vessel_type_code_valid = jumlah_vessel_type_code_valid_per_bulan

# Tampilkan hasil
vessel_type_code_valid.show() 

+---------+----------------------+
|   months|vessel_type_code_valid|
+---------+----------------------+
|     July|             150040844|
| November|             113829194|
| February|             133027700|
|  January|             135068716|
|    March|             143763090|
|  October|             149941944|
|      May|             144223045|
|   August|             150134006|
|    April|             142271278|
|     June|             141259475|
| December|             147867230|
|September|             145303830|
+---------+----------------------+



### Negara Kapal Valid

In [30]:
# Definisikan nilai flag_country_code yang valid
nilai_valid = [201, 775]

# Hitung jumlah flag_country_code tidak valid per bulan
jumlah_flag_country_code_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(data_sampel['flag_code'].between(nilai_valid[0], nilai_valid[1])) \
    .groupBy("months").agg(F.count("flag_code").alias("flag_country_code_valid"))

# Buat DataFrame hasil
flag_country_code_valid = jumlah_flag_country_code_valid_per_bulan

# Tampilkan hasil
flag_country_code_valid.show() 

+---------+-----------------------+
|   months|flag_country_code_valid|
+---------+-----------------------+
|     July|              150053089|
| November|               93008592|
| February|              133040874|
|  January|              135081095|
|    March|              143776424|
|  October|              149953299|
|      May|              142226304|
|   August|              150143025|
|    April|              142280420|
|     June|              141269252|
| December|              147888048|
|September|              145314191|
+---------+-----------------------+



### Latitude & Longitude Valid

In [31]:
# Definisikan nilai latitude yang valid
nilai_valid = [-90, 90]

# Hitung jumlah latitude tidak valid per bulan
jumlah_latitude_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(data_sampel['latitude'].between(nilai_valid[0], nilai_valid[1])) \
    .groupBy("months").agg(F.count("latitude").alias("latitude_valid"))

# Buat DataFrame hasil
latitude_valid = jumlah_latitude_valid_per_bulan

# Tampilkan hasil
latitude_valid.show() 

+---------+--------------+
|   months|latitude_valid|
+---------+--------------+
|     July|     150053089|
| November|     113844313|
| February|     133040874|
|  January|     135081095|
|    March|     143776424|
|  October|     149953299|
|      May|     144231675|
|   August|     150143025|
|    April|     142280420|
|     June|     141269252|
| December|     147888048|
|September|     145314191|
+---------+--------------+



In [32]:
# Definisikan nilai longitude yang valid
nilai_valid = [-180, 180]

# Hitung jumlah longitude tidak valid per bulan
jumlah_longitude_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(data_sampel['longitude'].between(nilai_valid[0], nilai_valid[1])) \
    .groupBy("months").agg(F.count("longitude").alias("longitude_valid"))

# Buat DataFrame hasil
longitude_valid = jumlah_longitude_valid_per_bulan

# Tampilkan hasil
longitude_valid.show()

+---------+---------------+
|   months|longitude_valid|
+---------+---------------+
|     July|      150053089|
| November|      113844313|
| February|      133040874|
|  January|      135081095|
|    March|      143776424|
|  October|      149953299|
|      May|      144231675|
|   August|      150143025|
|    April|      142280420|
|     June|      141269252|
| December|      147888048|
|September|      145314191|
+---------+---------------+



### dt_pos_utc Valid

In [33]:
# Definisikan pola regex untuk nilai 'dt_pos_utc' yang valid
pattern = "^[1-9][0-9]{0,3}-[1-9]|0[1-9]|1[0-2]-[1-9]|0[1-9]|[1-2][0-9]|3[0-1] [0-9]|0[0-9]|1[0-9]|2[0-3]:[0-5][0-9]:[0-5][0-9]$"

# Filter data untuk mendapatkan hanya nilai 'dt_pos_utc' yang tidak valid
dt_pos_utc_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(F.col("dt_pos_utc").rlike(pattern)) \
    .groupBy("months").agg(F.count("dt_pos_utc").alias("dt_pos_utc_valid"))

# Buat DataFrame hasil
dt_pos_utc_valid = dt_pos_utc_valid_per_bulan

# Tampilkan hasil
dt_pos_utc_valid.show()  

+---------+----------------+
|   months|dt_pos_utc_valid|
+---------+----------------+
|     July|       150053089|
| November|       113844313|
| February|       133040874|
|  January|       135081095|
|    March|       143776424|
|  October|       149953299|
|      May|       144231675|
|   August|       150143025|
|    April|       142280420|
|     June|       141269252|
| December|       147888048|
|September|       145314191|
+---------+----------------+



## Nilai Default

### MMSI dengan Nilai Default per Bulan

In [34]:
# Tentukan nilai yang ingin dihitung
default_value_1 = 0
default_value_2 = 1193046

# Hitung jumlah 'mmsi' dengan nilai default per bulan
mmsi_default_per_month = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter((F.col("mmsi") == default_value_1) | (F.col("mmsi") == default_value_2)) \
    .groupBy("months", "mmsi").agg(F.count("mmsi").alias("mmsi_count_default"))

# Tampilkan DataFrame Spark hasil akhir
mmsi_default_per_month.show()

+--------+----+------------------+
|  months|mmsi|mmsi_count_default|
+--------+----+------------------+
|November|   0|              1969|
|     May|   0|               128|
+--------+----+------------------+



### IMO dengan Nilai Default per Bulan

In [35]:
# Tentukan nilai yang ingin dihitung
default_value = 0  # Ganti dengan nilai yang diinginkan

# Hitung jumlah 'imo' dengan nilai default per bulan
imo_default_per_month = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(F.col("imo") == default_value) \
    .groupBy("months").agg(F.count("imo").alias("imo_count_default"))

# Tampilkan DataFrame Spark hasil akhir
imo_default_per_month.show() 

+------+-----------------+
|months|imo_count_default|
+------+-----------------+
+------+-----------------+



### Status Navigasi dengan Nilai Default per Bulan

In [36]:
# Tentukan nilai yang ingin dihitung
default_value = "Not Defined"  # Ganti dengan nilai yang diinginkan

# Hitung jumlah 'nav_status' dengan nilai default per bulan
nav_status_default_per_month = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(F.col("nav_status") == default_value) \
    .groupBy("months").agg(F.count("nav_status").alias("nav_status_count_default"))

# Tampilkan DataFrame Spark hasil akhir
nav_status_default_per_month.show() 

+---------+------------------------+
|   months|nav_status_count_default|
+---------+------------------------+
| February|                 1864900|
|  October|                 1820684|
|      May|                 1980389|
|    April|                 1904636|
|   August|                 1621945|
| December|                 1966814|
|September|                 1722734|
| November|                 1532015|
|  January|                 1874220|
|     June|                 1766628|
|    March|                 1926137|
|     July|                 1581566|
+---------+------------------------+



### Tipe Kapal dengan Nilai Default per Bulan

In [37]:
# Tentukan nilai yang ingin dihitung
default_value = "Not Available"  # Ganti dengan nilai yang diinginkan

# Hitung jumlah 'vessel_type' dengan nilai default per bulan
vessel_type_default_per_month = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(F.col("vessel_type") == default_value) \
    .groupBy("months").agg(F.count("vessel_type").alias("vessel_type_count_default"))

# Tampilkan DataFrame Spark hasil akhir
vessel_type_default_per_month.show() 

+---------+-------------------------+
|   months|vessel_type_count_default|
+---------+-------------------------+
|     July|                    12245|
|    March|                    13334|
| December|                    20818|
| November|                    15119|
|   August|                     9019|
|    April|                     9142|
|September|                    10361|
|  October|                    11355|
|     June|                     9777|
|      May|                     8630|
| February|                    13174|
|  January|                    12379|
+---------+-------------------------+



### Negara Kapal dengan Nilai Default per Bulan

In [38]:
# MISAL DEFAULT = 0
# Tentukan nilai yang ingin dihitung
default_value = 0  # Ganti dengan nilai yang diinginkan

# Hitung jumlah 'flag_country' dengan nilai default per bulan
flag_country_default_per_month = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(F.col("flag_code") == default_value) \
    .groupBy("months").agg(F.count("flag_code").alias("flag_country_count_default"))

# Tampilkan DataFrame Spark hasil akhir
flag_country_default_per_month.show()  

+------+--------------------------+
|months|flag_country_count_default|
+------+--------------------------+
+------+--------------------------+



### Latitude & Longitude dengan Nilai Default per Bulan

In [39]:
# Tentukan nilai yang ingin dihitung
default_value = 91  # Ganti dengan nilai yang diinginkan

# Hitung jumlah 'latitude' dengan nilai default per bulan
latitude_default_per_month = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(F.col("latitude") == default_value) \
    .groupBy("months").agg(F.count("latitude").alias("latitude_count_default"))

# Tampilkan DataFrame Spark hasil akhir
latitude_default_per_month.show() 

+------+----------------------+
|months|latitude_count_default|
+------+----------------------+
+------+----------------------+



In [40]:
# Tentukan nilai yang ingin dihitung
default_value = 181  # Ganti dengan nilai yang diinginkan

# Hitung jumlah 'longitude' dengan nilai default per bulan
longitude_default_per_month = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(F.col("longitude") == default_value) \
    .groupBy("months").agg(F.count("longitude").alias("longitude_count_default"))

# Tampilkan DataFrame Spark hasil akhir
longitude_default_per_month.show() 

+------+-----------------------+
|months|longitude_count_default|
+------+-----------------------+
+------+-----------------------+



### dt_pos_utc dengan Nilai Default per Bulan

In [41]:
# Tentukan nilai yang ingin dihitung
default_value = "0-0-0 24:60:60"

# Hitung jumlah nilai default pada kolom 'dt_pos_utc' per bulan
dt_pos_utc_default_per_month = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter(F.col("dt_pos_utc") == default_value) \
    .groupBy("months").agg(F.count("dt_pos_utc").alias("dt_pos_ut_count_dafault"))

# Tampilkan DataFrame Spark hasil akhir
dt_pos_utc_default_per_month.show()  

+------+-----------------------+
|months|dt_pos_ut_count_dafault|
+------+-----------------------+
+------+-----------------------+



## Tidak Valid

### MMSI Tidak Valid

In [42]:
# Definisikan nilai mmsi yang valid
nilai_valid = [100000000, 999999999]

# Hitung jumlah mmsi tidak valid per bulan
jumlah_mmsi_tidak_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter((~data_sampel['mmsi'].between(nilai_valid[0], nilai_valid[1])) & 
            (data_sampel['mmsi'] != 0) & (data_sampel['mmsi'] != 1193046) &
            (data_sampel['mmsi'].isNotNull())
           ) \
    .groupBy("months").agg(F.count("mmsi").alias("mmsi_tidak_valid"))

# Buat DataFrame hasil
mmsi_invalid = jumlah_mmsi_tidak_valid_per_bulan

# Tampilkan hasil
mmsi_invalid.show()


+--------+----------------+
|  months|mmsi_tidak_valid|
+--------+----------------+
|November|            2637|
|     May|              38|
+--------+----------------+



### IMO Tidak Valid

In [43]:
# Definisikan nilai imo yang valid
nilai_valid = [1000000, 9999999]

# Hitung jumlah imo tidak valid per bulan
jumlah_imo_tidak_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter((~data_sampel['imo'].between(nilai_valid[0], nilai_valid[1])) & 
            (data_sampel['imo'] != 0) &
            (data_sampel['imo'].isNotNull()) 
           )\
    .groupBy("months").agg(F.count("imo").alias("imo_tidak_valid"))

# Buat DataFrame hasil
imo_invalid = jumlah_imo_tidak_valid_per_bulan

# Tampilkan hasil
imo_invalid.show() 

+------+---------------+
|months|imo_tidak_valid|
+------+---------------+
+------+---------------+



### Status Navigasi Tidak Valid

In [44]:
# Definisikan nilai nav_status_code yang valid
nilai_valid = [0, 14]

# Hitung jumlah nav_status_code tidak valid per bulan
jumlah_nav_status_code_tidak_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter((~data_sampel['nav_status_code'].between(nilai_valid[0], nilai_valid[1])) & 
            (data_sampel['nav_status_code'] != 15) & 
            (data_sampel['nav_status_code'].isNotNull())
           ) \
    .groupBy("months").agg(F.count("nav_status_code").alias("nav_status_code_tidak_valid"))

# Buat DataFrame hasil
nav_status_code_invalid = jumlah_nav_status_code_tidak_valid_per_bulan

# Tampilkan hasil
nav_status_code_invalid.show() 

+---------+---------------------------+
|   months|nav_status_code_tidak_valid|
+---------+---------------------------+
| November|                    1941141|
| February|                    1968914|
|  January|                    2225682|
|    March|                    2152485|
|  October|                    2426015|
|      May|                    2147615|
|   August|                    2297074|
|     June|                    2252958|
| December|                    2550427|
|     July|                    2225480|
|    April|                    2246768|
|September|                    2443584|
+---------+---------------------------+



### Tipe Kapal Tidak Valid

In [45]:
# Definisikan nilai vessel_type_code yang valid
nilai_valid = [1, 255]

# Hitung jumlah vessel_type_code tidak valid per bulan
jumlah_vessel_type_code_tidak_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter((~data_sampel['vessel_type_code'].between(nilai_valid[0], nilai_valid[1])) & 
            (data_sampel['vessel_type_code'] != 0) &
            (data_sampel['vessel_type_code'].isNotNull())
           ) \
    .groupBy("months").agg(F.count("vessel_type_code").alias("vessel_type_code_tidak_valid"))

# Buat DataFrame hasil
vessel_type_code_invalid = jumlah_vessel_type_code_tidak_valid_per_bulan

# Tampilkan hasil
vessel_type_code_invalid.show() 

+------+----------------------------+
|months|vessel_type_code_tidak_valid|
+------+----------------------------+
+------+----------------------------+



### Negara Kapal Tidak Valid

In [46]:
# Definisikan nilai flag_country_code yang valid
nilai_valid = [201, 775]

# Hitung jumlah flag_country_code tidak valid per bulan
jumlah_flag_country_code_tidak_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter((~data_sampel['flag_code'].between(nilai_valid[0], nilai_valid[1])) & 
            (data_sampel['flag_code'] != 0) &
            (data_sampel['flag_code'].isNotNull())
           ) \
    .groupBy("months").agg(F.count("flag_code").alias("flag_country_code_tidak_valid"))

# Buat DataFrame hasil
flag_country_code_invalid = jumlah_flag_country_code_tidak_valid_per_bulan

# Tampilkan hasil
flag_country_code_invalid.show() 

+------+-----------------------------+
|months|flag_country_code_tidak_valid|
+------+-----------------------------+
+------+-----------------------------+



### Latitude & Longitude Tidak Valid

In [47]:
# Definisikan nilai latitude yang valid
nilai_valid = [-90, 90]

# Hitung jumlah latitude tidak valid per bulan
jumlah_latitude_tidak_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter((~data_sampel['latitude'].between(nilai_valid[0], nilai_valid[1])) & 
            (data_sampel['latitude'] != 91) &
            (data_sampel['latitude'].isNotNull())
           ) \
    .groupBy("months").agg(F.count("latitude").alias("latitude_tidak_valid"))

# Buat DataFrame hasil
latitude_invalid = jumlah_latitude_tidak_valid_per_bulan

# Tampilkan hasil
latitude_invalid.show() 

+------+--------------------+
|months|latitude_tidak_valid|
+------+--------------------+
+------+--------------------+



In [48]:
# Definisikan nilai longitude yang valid
nilai_valid = [-180, 180]

# Hitung jumlah longitude tidak valid per bulan
jumlah_longitude_tidak_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter((~data_sampel['longitude'].between(nilai_valid[0], nilai_valid[1])) & 
            (data_sampel['longitude'] != 181) &
            (data_sampel['longitude'].isNotNull())
           ) \
    .groupBy("months").agg(F.count("longitude").alias("longitude_tidak_valid"))

# Buat DataFrame hasil
longitude_invalid = jumlah_longitude_tidak_valid_per_bulan

# Tampilkan hasil
longitude_invalid.show()

+------+---------------------+
|months|longitude_tidak_valid|
+------+---------------------+
+------+---------------------+



### dt_pos_utc Tidak Valid

In [49]:
# Definisikan pola regex untuk nilai 'dt_pos_utc' yang valid
pattern = "^[1-9][0-9]{0,3}-[1-9]|0[1-9]|1[0-2]-[1-9]|0[1-9]|[1-2][0-9]|3[0-1] [0-9]|0[0-9]|1[0-9]|2[0-3]:[0-5][0-9]:[0-5][0-9]$"

default_value = "0-0-0 24:60:60"

# Filter data untuk mendapatkan hanya nilai 'dt_pos_utc' yang tidak valid
dt_pos_utc_tidak_valid_per_bulan = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .filter((~F.col("dt_pos_utc").rlike(pattern)) & 
            (data_sampel['dt_pos_utc'] != default_value) &
            (data_sampel['dt_pos_utc'].isNotNull())
           ) \
    .groupBy("months").agg(F.count("dt_pos_utc").alias("dt_pos_utc_tidak_valid"))

# Buat DataFrame hasil
dt_pos_utc_invalid = dt_pos_utc_tidak_valid_per_bulan

# Tampilkan hasil
dt_pos_utc_invalid.show()  

+------+----------------------+
|months|dt_pos_utc_tidak_valid|
+------+----------------------+
+------+----------------------+



## Missing Value

### MS MMSI

In [50]:
# Hitung jumlah nilai yang hilang per bulan untuk kolom "mmsi"
missing_values_per_month_mmsi = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(
        F.sum(F.col("mmsi").isNull().cast(IntegerType())).alias("mmsi_missing")
    )

# Tampilkan DataFrame Spark
missing_values_per_month_mmsi.show() 

+---------+------------+
|   months|mmsi_missing|
+---------+------------+
|     July|           0|
| November|           0|
| February|           0|
|  January|           0|
|    March|           0|
|  October|           0|
|      May|           0|
|   August|           0|
|    April|           0|
|     June|           0|
| December|           0|
|September|           0|
+---------+------------+



### MS IMO

In [51]:
# Hitung jumlah nilai yang hilang per bulan untuk kolom "imo"
missing_values_per_month_imo = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(
        F.sum(F.col("imo").isNull().cast(IntegerType())).alias("imo_missing")
    )

# Tampilkan DataFrame Spark
missing_values_per_month_imo.show()  

+---------+-----------+
|   months|imo_missing|
+---------+-----------+
|     July|          0|
| November|          0|
| February|          0|
|  January|          0|
|    March|          0|
|  October|          0|
|      May|          0|
|   August|          0|
|    April|          0|
|     June|          0|
| December|          0|
|September|          0|
+---------+-----------+



### MS Status Navigasi

In [52]:
# Hitung jumlah nilai yang hilang per bulan untuk kolom "nav_status"
missing_values_per_month_nav_status = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(
        F.sum(F.col("nav_status").isNull().cast(IntegerType())).alias("nav_status_missing")
    )

# Tampilkan DataFrame Spark
missing_values_per_month_nav_status.show() 

+---------+------------------+
|   months|nav_status_missing|
+---------+------------------+
|     July|                 0|
| November|                 0|
| February|                 0|
|  January|                 0|
|    March|                 0|
|  October|                 0|
|      May|                 0|
|   August|                 0|
|    April|                 0|
|     June|                 0|
| December|                 0|
|September|                 0|
+---------+------------------+



### MS Tipe Kapal

In [53]:
# Hitung jumlah nilai yang hilang per bulan untuk kolom "vessel_type"
missing_values_per_month_vessel_type = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(
        F.sum(F.col("vessel_type").isNull().cast(IntegerType())).alias("vessel_type_missing")
    )

# Tampilkan DataFrame Spark
missing_values_per_month_vessel_type.show() 

+---------+-------------------+
|   months|vessel_type_missing|
+---------+-------------------+
|     July|                  0|
| November|                  0|
| February|                  0|
|  January|                  0|
|    March|                  0|
|  October|                  0|
|      May|                  0|
|   August|                  0|
|    April|                  0|
|     June|                  0|
| December|                  0|
|September|                  0|
+---------+-------------------+



### MS Negara Kapal

In [54]:
# Hitung jumlah nilai yang hilang per bulan untuk kolom "flag_country"
missing_values_per_month_flag_code = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(
        F.sum(F.col("flag_code").isNull().cast(IntegerType())).alias("flag_code_missing")
    )

# Tampilkan DataFrame Spark
missing_values_per_month_flag_code.show() 

+---------+-----------------+
|   months|flag_code_missing|
+---------+-----------------+
|     July|                0|
| November|         20835721|
| February|                0|
|  January|                0|
|    March|                0|
|  October|                0|
|      May|          2005371|
|   August|                0|
|    April|                0|
|     June|                0|
| December|                0|
|September|                0|
+---------+-----------------+



### MS Latitude & Longitude

In [55]:
# Hitung jumlah nilai yang hilang per bulan untuk kolom "latitude"
missing_values_per_month_latitude = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(
        F.sum(F.col("latitude").isNull().cast(IntegerType())).alias("latitude_missing")
    )

# Tampilkan DataFrame Spark
missing_values_per_month_latitude.show() 

+---------+----------------+
|   months|latitude_missing|
+---------+----------------+
|     July|               0|
| November|               0|
| February|               0|
|  January|               0|
|    March|               0|
|  October|               0|
|      May|               0|
|   August|               0|
|    April|               0|
|     June|               0|
| December|               0|
|September|               0|
+---------+----------------+



In [56]:
# Hitung jumlah nilai yang hilang per bulan untuk kolom "longitude"
missing_values_per_month_longitude = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(
        F.sum(F.col("longitude").isNull().cast(IntegerType())).alias("longitude_missing")
    )

# Tampilkan DataFrame Spark
missing_values_per_month_longitude.show() 

+---------+-----------------+
|   months|longitude_missing|
+---------+-----------------+
|     July|                0|
| November|                0|
| February|                0|
|  January|                0|
|    March|                0|
|  October|                0|
|      May|                0|
|   August|                0|
|    April|                0|
|     June|                0|
| December|                0|
|September|                0|
+---------+-----------------+



### MS dt_pos_utc

In [57]:
# Hitung jumlah nilai yang hilang per bulan untuk kolom "dt_pos_utc"
missing_values_per_month_dt_pos_utc = data_sampel.withColumn("months", F.date_format("dt_pos_utc", "MMMM")) \
    .groupBy("months").agg(
        F.sum(F.col("dt_pos_utc").isNull().cast(IntegerType())).alias("dt_pos_utc_missing")
    )

# Tampilkan DataFrame Spark
missing_values_per_month_dt_pos_utc.show() 

+---------+------------------+
|   months|dt_pos_utc_missing|
+---------+------------------+
|     July|                 0|
| November|                 0|
| February|                 0|
|  January|                 0|
|    March|                 0|
|  October|                 0|
|      May|                 0|
|   August|                 0|
|    April|                 0|
|     June|                 0|
| December|                 0|
|September|                 0|
+---------+------------------+



## Pergerakan Anomali

In [36]:
# plot jarak tempuh (y) sama waktu tempuh (x)
# Dari plotnya keliatan mana yang jarak tempuhnya besar tapi waktu tempuhnya kecil (atau sebaliknya?)

### Status Navigasi & SOG

In [8]:
# Tentukan persentil yang diinginkan
percentiles = [0.5, 0.75, 0.9, 0.95, 0.99, 0.999]

# Loop melalui setiap persentil dan hitung nilai kuantil untuk sog
quantile_columns = [expr(f"percentile_approx(sog, {p})").alias(f"sog_{int(p * 100)}") for p in percentiles]

# Kelompokkan berdasarkan nav_status dan hitung kuantilnya
quantiles_per_nav_status = data_sampel.select("nav_status","sog").groupBy("nav_status").agg(*quantile_columns)

# Tampilkan hasil
quantiles_per_nav_status.show()

Error while receiving.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
RuntimeError: reentrant call inside <_io.BufferedReader name=60>
Closing down clientserver connection
Closing down clientserver connection


ERROR: Exception while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
RuntimeError: reentrant call inside <_io.BufferedReader name=60>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR: KeyboardInterrupt while

### Validasi Status Navigasi dengan SOG

In [58]:
# Memeriksa kecocokan nav_status dan sog

# Ekstrak nama bulan dari timestamp
data_sampel = data_sampel.withColumn("bulan", date_format("dt_pos_utc", "MMMM"))

# Tentukan kriteria pergerakan tidak normal
kriteria_pergerakan_tidak_normal = (
    ((col("nav_status").isin(['At Anchor', 'Moored'])) &
    (col("sog") > 1)) | 
    ((col("nav_status").isin(['Not Under Command', 'Underway Sailing', 'Under Way Using Engine', 'Engaged In Fishing', 'Restricted Manoeuvrability', 'Aground'])) &
    (col("sog") < 1))
)

# Hitung pergerakan tidak normal per bulan
pergerakan_tidak_normal_per_bulan = data_sampel.withColumn(
    "pergerakan_tidak_normal",
    when(kriteria_pergerakan_tidak_normal, 1).otherwise(0)
).groupBy("bulan").agg({"pergerakan_tidak_normal": "sum"}).withColumnRenamed("SUM(pergerakan_tidak_normal)", "total_pergerakan_tidak_normal")

# Tampilkan hasil
pergerakan_tidak_normal_per_bulan.show()

+---------+-----------------------------+
|    bulan|total_pergerakan_tidak_normal|
+---------+-----------------------------+
|     July|                     11391984|
| November|                      9700593|
| February|                     10108051|
|  January|                     10719582|
|    March|                     11153300|
|  October|                     11891305|
|      May|                     10899656|
|   August|                     11418597|
|    April|                     11034173|
|     June|                     10705654|
| December|                     11620082|
|September|                     11484971|
+---------+-----------------------------+



## Record Duplicate

In [59]:
# Ekstrak kolom bulan dari kolom tanggal atau waktu di DataFrame
data_sampel = data_sampel.withColumn("bulan", date_format("dt_pos_utc", "MMMM"))

# Tentukan record-record duplikat dalam DataFrame dengan membandingkan nilai semua variabel
duplikat = data_sampel.groupBy(*data_sampel.columns).count().where("count > 1")

# Kelompokkan record-record duplikat berdasarkan bulan dan hitung jumlahnya
duplikat_per_bulan = duplikat.groupBy("bulan").count()

# Tampilkan hasilnya
duplikat_per_bulan.show()

+-----+-----+
|bulan|count|
+-----+-----+
+-----+-----+



# Cek Data AIS

## Cek 1 MMSI 1 IMO

In [26]:
# Menghitung jumlah IMO unik
jumlah_imo_unik = data_sampel.select("imo").distinct().count()

# Menampilkan hasil
print("Jumlah IMO unik:", jumlah_imo_unik)

Jumlah IMO unik: 29002


In [27]:
# Menghitung jumlah MMSI unik
jumlah_mmsi_unik = data_sampel.select("mmsi").distinct().count()

# Menampilkan hasil
print("Jumlah MMSI unik:", jumlah_mmsi_unik)

Jumlah MMSI unik: 29944


In [8]:
# Grouping data by "imo"
grouped_data = data_sampel.select("imo", "mmsi") \
    .groupBy("imo") \
    .agg(countDistinct("mmsi").alias("mmsi_count"))

# Urutkan
grouped_data = grouped_data.orderBy(col("mmsi_count").desc())

# Tampilkan hasil
grouped_data.show(grouped_data.count(), truncate = False)

+-------+----------+
|imo    |mmsi_count|
+-------+----------+
|9789312|77        |
|9789374|76        |
|9789348|74        |
|9789350|72        |
|9789336|72        |
|9789362|71        |
|9789324|60        |
|9665619|4         |
|9206035|4         |
|9876543|4         |
|9231509|4         |
|9331696|4         |
|9174220|4         |
|9159098|3         |
|9347475|3         |
|9338694|3         |
|9230880|3         |
|9387267|3         |
|9554585|3         |
|9110054|3         |
|9133587|3         |
|9244403|3         |
|9683520|3         |
|9657014|3         |
|9526992|3         |
|9784659|3         |
|9322827|3         |
|9754068|3         |
|9838759|3         |
|9381184|3         |
|9259733|3         |
|9444120|3         |
|9392860|3         |
|9805685|3         |
|9150925|3         |
|9269362|3         |
|9718868|3         |
|9410959|3         |
|9141338|3         |
|9292981|3         |
|9872470|3         |
|9469675|3         |
|9700500|3         |
|9182291|3         |
|9791767|3   

In [10]:
# Grouping data by "imo" and deduplicating "mmsi" within each group
grouped_data = data_sampel.select("imo", "mmsi") \
    .groupBy("imo") \
    .agg(F.collect_set("mmsi").alias("unique_mmsi_set"))

# Explode the set of unique MMSIs into separate rows
grouped_data_exploded = grouped_data.withColumn("mmsi", explode("unique_mmsi_set"))

# Grouping by "imo" again to collect the unique MMSIs into a list and count them
grouped_data_unique_with_count = grouped_data_exploded.groupBy("imo").agg(
    F.collect_list("mmsi").alias("mmsi_list"),
    F.count("mmsi").alias("mmsi_count")
)

In [11]:
# Menampilkan hasil
grouped_data_unique_with_count.show(20)

+-------+-----------+----------+
|    imo|  mmsi_list|mmsi_count|
+-------+-----------+----------+
|1000215|[538071332]|         1|
|1000253|[300000000]|         1|
|1000423|[319229500]|         1|
|1000899|[215606000]|         1|
|1002378|[538071262]|         1|
|1003451|[636018760]|         1|
|1004792|[319382000]|         1|
|1005411|[533131151]|         1|
|1005552|[319330000]|         1|
|1006128|[319086200]|         1|
|1006702|[339267000]|         1|
|1006910|[229169000]|         1|
|1006946|[319646000]|         1|
|1006984|[319518000]|         1|
|1007055|[215606000]|         1|
|1007380|[525300648]|         1|
|1007524|[229169000]|         1|
|1007914|[319075000]|         1|
|1007952|[538071061]|         1|
|1008217|[319369000]|         1|
+-------+-----------+----------+
only showing top 20 rows



In [12]:
# 1696875705
grouped_data_unique_with_count.count()

29002

In [13]:
# Filter record imo dg mmsi 1
imo_with_1mmsi = grouped_data_unique_with_count.filter(col("mmsi_count") == 1)

# Urutkan
imo_with_1mmsi = imo_with_1mmsi.orderBy(col("imo"))

# Tampilkan hasil
imo_with_1mmsi.show(imo_with_1mmsi.count(), truncate = False)

+-------+-----------+----------+
|imo    |mmsi_list  |mmsi_count|
+-------+-----------+----------+
|1000215|[538071332]|1         |
|1000253|[300000000]|1         |
|1000423|[319229500]|1         |
|1000899|[215606000]|1         |
|1002378|[538071262]|1         |
|1003451|[636018760]|1         |
|1004792|[319382000]|1         |
|1005411|[533131151]|1         |
|1005552|[319330000]|1         |
|1006128|[319086200]|1         |
|1006702|[339267000]|1         |
|1006910|[229169000]|1         |
|1006946|[319646000]|1         |
|1006984|[319518000]|1         |
|1007055|[215606000]|1         |
|1007380|[525300648]|1         |
|1007524|[229169000]|1         |
|1007914|[319075000]|1         |
|1007952|[538071061]|1         |
|1008217|[319369000]|1         |
|1008401|[235118775]|1         |
|1009742|[319084500]|1         |
|1010301|[319023100]|1         |
|1010698|[319032500]|1         |
|1010947|[319806000]|1         |
|1012050|[319053400]|1         |
|1012153|[518998580]|1         |
|1012763|[

In [17]:
imo_with_1mmsi.count()

27320

In [18]:
# Filter record imo dg mmsi 1
imo_with_great1mmsi = grouped_data_unique_with_count.filter(col("mmsi_count") > 1)

# Urutkan
imo_with_great1mmsi = imo_with_great1mmsi.orderBy(col("mmsi_count").desc())

# Tampilkan hasil
imo_with_great1mmsi.show(20)

+-------+--------------------+----------+
|    imo|           mmsi_list|mmsi_count|
+-------+--------------------+----------+
|9789312|[477271600, 63602...|        77|
|9789374|[477857000, 41472...|        76|
|9789348|[372895000, 53800...|        74|
|9789336|[352001248, 37091...|        72|
|9789350|[414755000, 24479...|        72|
|9789362|[372777000, 35124...|        71|
|9789324|[563166800, 37064...|        60|
|9876543|[413000000, 52530...|         4|
|9174220|[210129000, 35200...|         4|
|9206035|[621819060, 66811...|         4|
|9331696|[538003756, 53800...|         4|
|9665619|[219361000, 56457...|         4|
|9231509|[626199000, 63602...|         4|
|8775687|[413000000, 41233...|         3|
|8659651|[667002020, 52540...|         3|
|8818207|[352001340, 35200...|         3|
|9066473|[312360000, 45741...|         3|
|8515790|[525009272, 53300...|         3|
|9167631|[525401090, 52998...|         3|
|9182291|[620810000, 61388...|         3|
+-------+--------------------+----

In [19]:
imo_with_great1mmsi.count()

1682

In [20]:
# Memfilter DataFrame imo_with_great1mmsi dimana nilai kolom "imo" adalah 9789312
imo_with_great1mmsi_9789312 = imo_with_great1mmsi.filter(col("imo") == 9789312)

# Menampilkan DataFrame hasil filter secara vertikal
imo_with_great1mmsi_9789312.show(vertical=True)

-RECORD 0--------------------------
 imo        | 9789312              
 mmsi_list  | [477271600, 63602... 
 mmsi_count | 77                   



In [23]:
# Menampilkan kolom "mmsi_list" dari DataFrame imo_with_great1mmsi_9789312
imo_with_great1mmsi_9789312.select("mmsi_list").show(vertical=True)

-RECORD 0-------------------------
 mmsi_list | [477271600, 63602... 



In [24]:
# Menampilkan kolom "mmsi_list" dari DataFrame imo_with_great1mmsi_9789312
imo_with_great1mmsi_9789312.select("mmsi_list").show(truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|mmsi_list                                                                                                                                            

## Cek Jumlah Record

In [9]:
# Menghitung jumlah record per MMSI
mmsi_counts = data_sampel.select("mmsi").groupBy("mmsi").agg(count("*").alias("count"))

# Urutkan
mmsi_counts = mmsi_counts.orderBy(col("count"))

# Tampilkan hasil
mmsi_counts.show(mmsi_counts.count(), truncate = False)

+---------+------+
|mmsi     |count |
+---------+------+
|525200128|1     |
|351759000|1     |
|538004156|1     |
|636018881|1     |
|525022377|1     |
|56380200 |1     |
|525023335|1     |
|538004848|1     |
|457417000|1     |
|525021348|1     |
|538005722|1     |
|525010274|1     |
|525015471|1     |
|636092964|1     |
|525019567|1     |
|525024265|1     |
|525016269|1     |
|525300361|1     |
|525005120|1     |
|564776480|1     |
|525111008|1     |
|564883000|1     |
|538004637|1     |
|525015624|1     |
|525100330|1     |
|636015065|1     |
|525011093|1     |
|538003083|1     |
|525022679|1     |
|477700100|1     |
|525022074|1     |
|525900710|1     |
|538014311|1     |
|525800683|1     |
|525019589|1     |
|538004055|1     |
|525024384|1     |
|525300122|1     |
|525016672|1     |
|636092731|1     |
|525011047|1     |
|525003174|1     |
|457900547|1     |
|525018121|1     |
|525800702|1     |
|525300542|1     |
|538002021|1     |
|525015816|1     |
|533130415|1     |
|511899000|1

In [12]:
# Filter record kurang dari 10
mmsi_with_record_less10 = mmsi_counts.filter(col("count") < 10)

# Urutkan
mmsi_with_record_less10 = mmsi_with_record_less10.orderBy(col("count"))

# Tampilkan hasil
mmsi_with_record_less10.show(mmsi_with_record_less10.count(), truncate = False)

+---------+-----+
|mmsi     |count|
+---------+-----+
|538014311|1    |
|525019567|1    |
|525024281|1    |
|525022281|1    |
|525015624|1    |
|273342500|1    |
|525015816|1    |
|525900710|1    |
|525800702|1    |
|525002126|1    |
|636011499|1    |
|525005120|1    |
|525110000|1    |
|636017879|1    |
|525023335|1    |
|525024265|1    |
|511899000|1    |
|538004055|1    |
|636092731|1    |
|538001984|1    |
|457900547|1    |
|538004637|1    |
|351759000|1    |
|525100330|1    |
|525016672|1    |
|538002366|1    |
|525003174|1    |
|525800683|1    |
|525016524|1    |
|520093744|1    |
|525024384|1    |
|525005254|1    |
|373846000|1    |
|525018121|1    |
|636018881|1    |
|311000484|1    |
|525300542|1    |
|525022679|1    |
|538005722|1    |
|538004848|1    |
|525100025|1    |
|538003083|1    |
|525019589|1    |
|525019457|1    |
|525016047|1    |
|457417000|1    |
|525021348|1    |
|56380200 |1    |
|564776480|1    |
|525001147|1    |
|525024011|1    |
|564883000|1    |
|538002021

In [21]:
mmsi_with_record_less10.count()

245

## Cek SOG > 3

In [22]:
# Menghitung jumlah record per MMSI dengan SOG > 3
mmsi_with_sog_greater3 = data_sampel.select("mmsi", "sog").groupBy("mmsi").agg(
    count(when(col("sog") > 3, True)).alias("count_SOG_greater_than_3")
)

# Urutkan
mmsi_with_sog_greater3 = mmsi_with_sog_greater3.orderBy(col("count_SOG_greater_than_3"))

# Tampilkan hasil
mmsi_with_sog_greater3.show(mmsi_with_sog_greater3.count(), truncate = False)

+---------+------------------------+
|mmsi     |count_SOG_greater_than_3|
+---------+------------------------+
|533130918|0                       |
|576140000|0                       |
|525100184|0                       |
|667001856|0                       |
|566280000|0                       |
|56380200 |0                       |
|525015307|0                       |
|525024155|0                       |
|525330063|0                       |
|525018054|0                       |
|525001100|0                       |
|525015624|0                       |
|564776480|0                       |
|525112200|0                       |
|525119071|0                       |
|525019077|0                       |
|525700988|0                       |
|525024326|0                       |
|525900392|0                       |
|312688000|0                       |
|525303330|0                       |
|525100130|0                       |
|525501051|0                       |
|525100630|0                       |
|

In [23]:
# Filter MMSI dg SOG > 3 kurang dari 20
mmsi_with_sog_greater3_less20 = mmsi_with_sog_greater3.filter(col("count_SOG_greater_than_3") < 20)

# Urutkan
mmsi_with_sog_greater3_less20 = mmsi_with_sog_greater3_less20.orderBy(col("count_SOG_greater_than_3"))

# Tampilkan hasil
mmsi_with_sog_greater3_less20.show(mmsi_with_sog_greater3_less20.count(), truncate = False)

+---------+------------------------+
|mmsi     |count_SOG_greater_than_3|
+---------+------------------------+
|525016536|0                       |
|525200297|0                       |
|538004156|0                       |
|525100535|0                       |
|525100130|0                       |
|525109011|0                       |
|525019667|0                       |
|525008023|0                       |
|355090000|0                       |
|212870000|0                       |
|525006081|0                       |
|533996000|0                       |
|525600555|0                       |
|354333000|0                       |
|525020301|0                       |
|518000001|0                       |
|377414000|0                       |
|525022377|0                       |
|525400670|0                       |
|525018113|0                       |
|525015152|0                       |
|538005722|0                       |
|525016028|0                       |
|525003174|0                       |
|

In [24]:
mmsi_with_sog_greater3_less20.count()

835

## Cek Bendera

In [8]:
data_flag_filter = data_sampel.select("flag_country", "RegisteredOwnerCountryOfRegistration", "RegisteredOwnerCountryofDomicile",
                              "ShipManagerCountryOfRegistration", "ShipManagerCountryofDomicileName", 
                               "OperatorCountryOfRegistration", "OperatorCountryofDomicileName")\
                    .filter(
                            (col("flag_country") == col("RegisteredOwnerCountryOfRegistration")) &
                            (col("flag_country") == col("RegisteredOwnerCountryofDomicile")) &
                            (col("flag_country") == col("ShipManagerCountryOfRegistration")) &
                            (col("flag_country") == col("ShipManagerCountryofDomicileName")) &
                            (col("flag_country") == col("OperatorCountryOfRegistration")) &
                            (col("flag_country") == col("OperatorCountryofDomicileName"))
                    )

In [9]:
# Menghitung jumlah record yang memenuhi kondisi
count_same_values = data_flag_filter.count()

# Menampilkan hasil
print("Jumlah record dengan nilai kolom yang sama:", count_same_values)

Jumlah record dengan nilai kolom yang sama: 334534185


In [None]:
data_flag_filter = data_sampel.select("flag_country", "RegisteredOwnerCountryOfRegistration", "RegisteredOwnerCountryofDomicile",
                              "ShipManagerCountryOfRegistration", "ShipManagerCountryofDomicileName", 
                               "OperatorCountryOfRegistration", "OperatorCountryofDomicileName")\
                    .filter(
                            (col("flag_country") == col("RegisteredOwnerCountryOfRegistration")) 
#                             & (col("flag_country") == col("RegisteredOwnerCountryofDomicile")) 
                            & (col("flag_country") == col("ShipManagerCountryOfRegistration")) 
#                             & (col("flag_country") == col("ShipManagerCountryofDomicileName")) 
                            & (col("flag_country") == col("OperatorCountryOfRegistration")) 
#                             & (col("flag_country") == col("OperatorCountryofDomicileName"))
                    )

# Menghitung jumlah record yang memenuhi kondisi
count_same_values = data_flag_filter.count()

# Menampilkan hasil
print("Jumlah record dengan nilai kolom yang sama:", count_same_values)

Jumlah record dengan nilai kolom yang sama: 397821785


In [None]:
data_flag_filter = data_sampel.select("flag_country", "RegisteredOwnerCountryOfRegistration", "RegisteredOwnerCountryofDomicile",
                              "ShipManagerCountryOfRegistration", "ShipManagerCountryofDomicileName", 
                               "OperatorCountryOfRegistration", "OperatorCountryofDomicileName")\
                    .filter(
                            (col("flag_country") == col("RegisteredOwnerCountryOfRegistration")) 
#                             & (col("flag_country") == col("RegisteredOwnerCountryofDomicile")) 
#                             & (col("flag_country") == col("ShipManagerCountryOfRegistration")) 
#                             & (col("flag_country") == col("ShipManagerCountryofDomicileName")) 
#                             & (col("flag_country") == col("OperatorCountryOfRegistration")) 
#                             & (col("flag_country") == col("OperatorCountryofDomicileName"))
                    )

# Menghitung jumlah record yang memenuhi kondisi
count_same_values = data_flag_filter.count()

# Menampilkan hasil
print("Jumlah record dengan nilai kolom yang sama:", count_same_values)

Jumlah record dengan nilai kolom yang sama: 821958427


In [10]:
data_flag_filter = data_sampel.select("flag_country", "RegisteredOwnerCountryOfRegistration", "RegisteredOwnerCountryofDomicile",
                              "ShipManagerCountryOfRegistration", "ShipManagerCountryofDomicileName", 
                               "OperatorCountryOfRegistration", "OperatorCountryofDomicileName")\
                    .filter(
#                             (col("flag_country") == col("RegisteredOwnerCountryOfRegistration")) 
#                             & (col("flag_country") == col("RegisteredOwnerCountryofDomicile")) 
                            (col("flag_country") == col("ShipManagerCountryOfRegistration")) 
#                             & (col("flag_country") == col("ShipManagerCountryofDomicileName")) 
#                             & (col("flag_country") == col("OperatorCountryOfRegistration")) 
#                             & (col("flag_country") == col("OperatorCountryofDomicileName"))
                    )

# Menghitung jumlah record yang memenuhi kondisi
count_same_values = data_flag_filter.count()

# Menampilkan hasil
print("Jumlah record dengan nilai kolom yang sama:", count_same_values)

Jumlah record dengan nilai kolom yang sama: 485660459


In [8]:
data_flag_filter = data_sampel.select("flag_country", "RegisteredOwnerCountryOfRegistration", "RegisteredOwnerCountryofDomicile",
                              "ShipManagerCountryOfRegistration", "ShipManagerCountryofDomicileName", 
                               "OperatorCountryOfRegistration", "OperatorCountryofDomicileName")\
                    .filter(
#                             (col("flag_country") == col("RegisteredOwnerCountryOfRegistration")) 
#                             & (col("flag_country") == col("RegisteredOwnerCountryofDomicile")) 
#                             & (col("flag_country") == col("ShipManagerCountryOfRegistration")) 
#                             & (col("flag_country") == col("ShipManagerCountryofDomicileName")) 
                            (col("flag_country") == col("OperatorCountryOfRegistration")) 
#                             & (col("flag_country") == col("OperatorCountryofDomicileName"))
                    )

# Menghitung jumlah record yang memenuhi kondisi
count_same_values = data_flag_filter.count()

# Menampilkan hasil
print("Jumlah record dengan nilai kolom yang sama:", count_same_values)

Jumlah record dengan nilai kolom yang sama: 455495430


In [11]:
data_flag_filter = data_sampel.select("flag_country", "RegisteredOwnerCountryOfRegistration", "RegisteredOwnerCountryofDomicile",
                              "ShipManagerCountryOfRegistration", "ShipManagerCountryofDomicileName", 
                               "OperatorCountryOfRegistration", "OperatorCountryofDomicileName")\
                    .filter(
#                             (col("flag_country") == col("RegisteredOwnerCountryOfRegistration")) 
                            (col("flag_country") == col("RegisteredOwnerCountryofDomicile")) 
#                             & (col("flag_country") == col("ShipManagerCountryOfRegistration")) 
                            & (col("flag_country") == col("ShipManagerCountryofDomicileName")) 
#                             & (col("flag_country") == col("OperatorCountryOfRegistration")) 
                            & (col("flag_country") == col("OperatorCountryofDomicileName"))
                    )

# Menghitung jumlah record yang memenuhi kondisi
count_same_values = data_flag_filter.count()

# Menampilkan hasil
print("Jumlah record dengan nilai kolom yang sama:", count_same_values)

Jumlah record dengan nilai kolom yang sama: 344740203


In [12]:
data_flag_filter = data_sampel.select("flag_country", "RegisteredOwnerCountryOfRegistration", "RegisteredOwnerCountryofDomicile",
                              "ShipManagerCountryOfRegistration", "ShipManagerCountryofDomicileName", 
                               "OperatorCountryOfRegistration", "OperatorCountryofDomicileName")\
                    .filter(
#                             (col("flag_country") == col("RegisteredOwnerCountryOfRegistration")) 
                            (col("flag_country") == col("RegisteredOwnerCountryofDomicile")) 
#                             & (col("flag_country") == col("ShipManagerCountryOfRegistration")) 
#                             & (col("flag_country") == col("ShipManagerCountryofDomicileName")) 
#                             & (col("flag_country") == col("OperatorCountryOfRegistration")) 
#                             & (col("flag_country") == col("OperatorCountryofDomicileName"))
                    )

# Menghitung jumlah record yang memenuhi kondisi
count_same_values = data_flag_filter.count()

# Menampilkan hasil
print("Jumlah record dengan nilai kolom yang sama:", count_same_values)

Jumlah record dengan nilai kolom yang sama: 608364099


In [13]:
data_flag_filter = data_sampel.select("flag_country", "RegisteredOwnerCountryOfRegistration", "RegisteredOwnerCountryofDomicile",
                              "ShipManagerCountryOfRegistration", "ShipManagerCountryofDomicileName", 
                               "OperatorCountryOfRegistration", "OperatorCountryofDomicileName")\
                    .filter(
#                             (col("flag_country") == col("RegisteredOwnerCountryOfRegistration")) 
#                             & (col("flag_country") == col("RegisteredOwnerCountryofDomicile")) 
#                             & (col("flag_country") == col("ShipManagerCountryOfRegistration")) 
                            (col("flag_country") == col("ShipManagerCountryofDomicileName")) 
#                             & (col("flag_country") == col("OperatorCountryOfRegistration")) 
#                             & (col("flag_country") == col("OperatorCountryofDomicileName"))
                    )

# Menghitung jumlah record yang memenuhi kondisi
count_same_values = data_flag_filter.count()

# Menampilkan hasil
print("Jumlah record dengan nilai kolom yang sama:", count_same_values)

Jumlah record dengan nilai kolom yang sama: 412993542


In [14]:
data_flag_filter = data_sampel.select("flag_country", "RegisteredOwnerCountryOfRegistration", "RegisteredOwnerCountryofDomicile",
                              "ShipManagerCountryOfRegistration", "ShipManagerCountryofDomicileName", 
                               "OperatorCountryOfRegistration", "OperatorCountryofDomicileName")\
                    .filter(
#                             (col("flag_country") == col("RegisteredOwnerCountryOfRegistration")) 
#                             & (col("flag_country") == col("RegisteredOwnerCountryofDomicile")) 
#                             & (col("flag_country") == col("ShipManagerCountryOfRegistration")) 
#                             & (col("flag_country") == col("ShipManagerCountryofDomicileName")) 
#                             & (col("flag_country") == col("OperatorCountryOfRegistration")) 
                            (col("flag_country") == col("OperatorCountryofDomicileName"))
                    )

# Menghitung jumlah record yang memenuhi kondisi
count_same_values = data_flag_filter.count()

# Menampilkan hasil
print("Jumlah record dengan nilai kolom yang sama:", count_same_values)

Jumlah record dengan nilai kolom yang sama: 382772649


# Download

## Fungsi

In [15]:
def create_download_link(df, title, filename):
    csv = df.to_csv(index = False)
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload, title=title, filename=filename)
    return HTML(html)

## Download Data

In [19]:
# Jumlah Records (All)

# DataFrame Spark
rekaman_per_bulan = spark.createDataFrame(rekaman_per_bulan)

# Export ke Pandas DataFrame
rekaman_per_bulan = rekaman_per_bulan.toPandas()

# Download Data
create_download_link(rekaman_per_bulan, title="rekaman_per_bulan", filename="rekaman_per_bulan.csv")

  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


In [10]:
# Records (1%)

# DataFrame Spark
#sampled_data = spark.createDataFrame(sampled_data)

# Export ke Pandas DataFrame
#sampled_data = sampled_data.toPandas()

# Download Data
#create_download_link(sampled_data, title=sampled_data, filename="sampled_data.csv")

In [None]:
# Partisi Records (1%)

# DataFrame Spark
#sampledd_data_1 = spark.createDataFrame(sampledd_data_1)
#sampledd_data_2 = spark.createDataFrame(sampledd_data_2)
#sampledd_data_3 = spark.createDataFrame(sampledd_data_3)
#sampledd_data_4 = spark.createDataFrame(sampledd_data_4)
#sampledd_data_5 = spark.createDataFrame(sampledd_data_5)
#sampledd_data_6 = spark.createDataFrame(sampledd_data_6)
#sampledd_data_7 = spark.createDataFrame(sampledd_data_7)
#sampledd_data_8 = spark.createDataFrame(sampledd_data_8)
#sampledd_data_9 = spark.createDataFrame(sampledd_data_9)
#sampledd_data_10 = spark.createDataFrame(sampledd_data_10)

# Export ke Pandas DataFrame
#sampledd_data_1 = sampledd_data_1.toPandas()
#sampledd_data_2 = sampledd_data_2.toPandas()
#sampledd_data_3 = sampledd_data_3.toPandas()
#sampledd_data_4 = sampledd_data_4.toPandas()
#sampledd_data_5 = sampledd_data_5.toPandas()
#sampledd_data_6 = sampledd_data_6.toPandas()
#sampledd_data_7 = sampledd_data_7.toPandas()
#sampledd_data_8 = sampledd_data_8.toPandas()
#sampledd_data_9 = sampledd_data_9.toPandas()
#sampledd_data_10 = sampledd_data_10.toPandas()

# Download Data
#create_download_link(sampledd_data_1, title=sampledd_data_1, filename="sampledd_data_1.csv")
#create_download_link(sampledd_data_2, title=sampledd_data_2, filename="sampledd_data_2.csv")
#create_download_link(sampledd_data_3, title=sampledd_data_3, filename="sampledd_data_3.csv")
#create_download_link(sampledd_data_4, title=sampledd_data_4, filename="sampledd_data_4.csv")
#create_download_link(sampledd_data_5, title=sampledd_data_5, filename="sampledd_data_5.csv")
#create_download_link(sampledd_data_6, title=sampledd_data_6, filename="sampledd_data_6.csv")
#create_download_link(sampledd_data_7, title=sampledd_data_7, filename="sampledd_data_7.csv")
#create_download_link(sampledd_data_8, title=sampledd_data_8, filename="sampledd_data_8.csv")
#create_download_link(sampledd_data_9, title=sampledd_data_9, filename="sampledd_data_9.csv")
#create_download_link(sampledd_data_10, title=sampledd_data_10, filename="sampledd_data_10.csv")

In [17]:
# Jumlah Records (1%)

# DataFrame Spark
#rekaman_per_bulan_1pers = spark.createDataFrame(rekaman_per_bulan_1pers)

# Export ke Pandas DataFrame
rekaman_per_bulan_1pers = rekaman_per_bulan_1pers.toPandas()

# Download Data
create_download_link(rekaman_per_bulan_1pers, title=rekaman_per_bulan_1pers, filename="rekaman_per_bulan_1pers.csv")

In [17]:
# Statistical Summary

# DataFrame Spark
#stats_df = spark.createDataFrame(stats_df)
#stats_df_2 = spark.createDataFrame(stats_df_2)

# Export ke Pandas DataFrame
# stats_df = stats_df.toPandas()
stats_df_2 = stats_df_2.toPandas()
# quantiles_per_nav_status = quantiles_per_nav_status.toPandas() 

# Download Data
# create_download_link(stats_df, title=stats_df, filename="stats_df.csv")
create_download_link(stats_df_2, title=stats_df_2, filename="stats_df_2.csv")
# create_download_link(quantiles_per_nav_status, title=quantiles_per_nav_status, filename="quantiles_per_nav_status.csv")

In [19]:
# (MMSI, IMO, Status Navigasi, Tipe Kapal, Negara Kapal) Unik

# DataFrame Spark
#df_unique_mmsi_spark = spark.createDataFrame(df_unique_mmsi_spark)
#df_unique_imo_spark = spark.createDataFrame(df_unique_imo_spark)
#mmsi_count_per_nav_status = spark.createDataFrame(mmsi_count_per_nav_status)
#mmsi_count_per_vessel_type = spark.createDataFrame(mmsi_count_per_vessel_type)
#mmsi_count_per_flag_country = spark.createDataFrame(mmsi_count_per_flag_country)

# Export ke Pandas DataFrame
#df_unique_mmsi_spark = df_unique_mmsi_spark.toPandas()
#df_unique_imo_spark = df_unique_imo_spark.toPandas()
mmsi_count_per_nav_status = mmsi_count_per_nav_status.toPandas()
#mmsi_count_per_vessel_type = mmsi_count_per_vessel_type.toPandas()
#mmsi_count_per_flag_country = mmsi_count_per_flag_country.toPandas()
   

# Download Data
#create_download_link(df_unique_mmsi_spark, title=df_unique_mmsi_spark, filename="df_unique_mmsi_spark.csv")
#create_download_link(df_unique_imo_spark, title=df_unique_imo_spark, filename="df_unique_imo_spark.csv")
create_download_link(mmsi_count_per_nav_status, title=mmsi_count_per_nav_status, filename="mmsi_count_per_nav_status.csv")
#create_download_link(mmsi_count_per_vessel_type, title=mmsi_count_per_vessel_type, filename="mmsi_count_per_vessel_type.csv")
#create_download_link(mmsi_count_per_flag_country, title=mmsi_count_per_flag_country, filename="mmsi_count_per_flag_country.csv")

In [65]:
# MMSI Unik per (Status Navigasi, Tipe Kapal, Negara Kapal) 

# DataFrame Spark
#unique_mmsi_per_nav_status = spark.createDataFrame(unique_mmsi_per_nav_status)
#unique_mmsi_per_vessel_type = spark.createDataFrame(unique_mmsi_per_vessel_type)
#unique_mmsi_per_flag_country = spark.createDataFrame(unique_mmsi_per_flag_country)

# Export ke Pandas DataFrame
#unique_mmsi_per_nav_status = unique_mmsi_per_nav_status.toPandas()
unique_mmsi_per_vessel_type = unique_mmsi_per_vessel_type.toPandas()
# unique_mmsi_per_flag_country = unique_mmsi_per_flag_country.toPandas()
  
# Download Data
#create_download_link(unique_mmsi_per_nav_status, title=unique_mmsi_per_nav_status, filename="unique_mmsi_per_nav_status.csv")
create_download_link(unique_mmsi_per_vessel_type, title=unique_mmsi_per_vessel_type, filename="unique_mmsi_per_vessel_type.csv")
# create_download_link(unique_mmsi_per_flag_country, title=unique_mmsi_per_flag_country, filename="unique_mmsi_per_flag_country.csv")

Error while receiving.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
RuntimeError: reentrant call inside <_io.BufferedReader name=60>
Closing down clientserver connection
Closing down clientserver connection


ERROR: Exception while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
RuntimeError: reentrant call inside <_io.BufferedReader name=60>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR: KeyboardInterrupt while

KeyboardInterrupt: 

In [None]:
# IMO Unik per (Status Navigasi, Tipe Kapal, Negara Kapal) 

# DataFrame Spark
#unique_imo_per_nav_status = spark.createDataFrame(unique_imo_per_nav_status)
#unique_imo_per_vessel_type = spark.createDataFrame(unique_imo_per_vessel_type)
#unique_imo_per_flag_country = spark.createDataFrame(unique_imo_per_flag_country)

# Export ke Pandas DataFrame
#unique_imo_per_nav_status = unique_imo_per_nav_status.toPandas()
unique_imo_per_vessel_type = unique_imo_per_vessel_type.toPandas()
# unique_imo_per_flag_country = unique_imo_per_flag_country.toPandas()
  
# Download Data
#create_download_link(unique_imo_per_nav_status, title=unique_imo_per_nav_status, filename="unique_imo_per_nav_status.csv")
create_download_link(unique_imo_per_vessel_type, title=unique_imo_per_vessel_type, filename="unique_imo_per_vessel_type.csv")
# create_download_link(unique_imo_per_flag_country, title=unique_imo_per_flag_country, filename="unique_imo_per_flag_country.csv")

In [19]:
# Nilai Default

# DataFrame Spark
#mmsi_default_per_month = spark.createDataFrame(mmsi_default_per_month)
#imo_default_per_month = spark.createDataFrame(imo_default_per_month)
#nav_status_default_per_month = spark.createDataFrame(nav_status_default_per_month)
#vessel_type_default_per_month = spark.createDataFrame(vessel_type_default_per_month)
#flag_country_default_per_month = spark.createDataFrame(flag_country_default_per_month)
#latitude_default_per_month = spark.createDataFrame(latitude_default_per_month)
#longitude_default_per_month = spark.createDataFrame(longitude_default_per_month)
#dt_pos_utc_default_per_month = spark.createDataFrame(dt_pos_utc_default_per_month)

# Export ke Pandas DataFrame
#mmsi_default_per_month = mmsi_default_per_month.toPandas()
#imo_default_per_month = imo_default_per_month.toPandas()
#nav_status_default_per_month = nav_status_default_per_month.toPandas()
#vessel_type_default_per_month = vessel_type_default_per_month.toPandas()
#flag_country_default_per_month = flag_country_default_per_month.toPandas()
#latitude_default_per_month = latitude_default_per_month.toPandas()
#longitude_default_per_month = longitude_default_per_month.toPandas()
#dt_pos_utc_default_per_month = dt_pos_utc_default_per_month.toPandas()
             

# Download Data
#create_download_link(mmsi_default_per_month, title=mmsi_default_per_month, filename="mmsi_default_per_month.csv")
#create_download_link(imo_default_per_month, title=imo_default_per_month, filename="imo_default_per_month.csv")
#create_download_link(nav_status_default_per_month, title=nav_status_default_per_month, filename="nav_status_default_per_month.csv")
#create_download_link(vessel_type_default_per_month, title=vessel_type_default_per_month, filename="vessel_type_default_per_month.csv")
#create_download_link(flag_country_default_per_month, title=flag_country_default_per_month, filename="flag_country_default_per_month.csv")
#create_download_link(latitude_default_per_month, title=latitude_default_per_month, filename="latitude_default_per_month.csv")
#create_download_link(longitude_default_per_month, title=longitude_default_per_month, filename="longitude_default_per_month.csv")
#create_download_link(dt_pos_utc_default_per_month, title=dt_pos_utc_default_per_month, filename="dt_pos_utc_default_per_month.csv")

In [None]:
# Invalid Value

# DataFrame Spark
mmsi_invalid = spark.createDataFrame(mmsi_invalid)
#imo_invalid = spark.createDataFrame(imo_invalid)
#nav_status_code_invalid = spark.createDataFrame(nav_status_code_invalid)
#vessel_type_code_invalid = spark.createDataFrame(vessel_type_code_invalid)
#flag_country_code_invalid = spark.createDataFrame(flag_country_code_invalid)
#latitude_invalid = spark.createDataFrame(latitude_invalid)
#longitude_invalid = spark.createDataFrame(longitude_invalid)
#dt_pos_utc_invalid = spark.createDataFrame(dt_pos_utc_invalid)

# Export ke Pandas DataFrame
mmsi_invalid = mmsi_invalid.toPandas()
#imo_invalid = imo_invalid.toPandas()
#nav_status_code_invalid = nav_status_code_invalid.toPandas()
#vessel_type_code_invalid = vessel_type_code_invalid.toPandas()
#flag_country_code_invalid = flag_country_code_invalid.toPandas()
#latitude_invalid = latitude_invalid.toPandas()
#longitude_invalid = longitude_invalid.toPandas()
#dt_pos_utc_invalid = dt_pos_utc_invalid.toPandas()
       

# Download Data
create_download_link(mmsi_invalid, title=mmsi_invalid, filename="mmsi_invalid.csv")
#create_download_link(imo_invalid, title=imo_invalid, filename="imo_invalid.csv")
#create_download_link(nav_status_code_invalid, title=nav_status_code_invalid, filename="nav_status_code_invalid.csv")
#create_download_link(vessel_type_code_invalid, title=vessel_type_code_invalid, filename="vessel_type_code_invalid.csv")
#create_download_link(flag_country_code_invalid, title=flag_country_code_invalid, filename="flag_country_code_invalid.csv")
#create_download_link(latitude_invalid, title=latitude_invalid, filename="latitude_invalid.csv")
#create_download_link(longitude_invalid, title=longitude_invalid, filename="longitude_invalid.csv")
#create_download_link(dt_pos_utc_invalid, title=dt_pos_utc_invalid, filename="dt_pos_utc_invalid.csv")

In [None]:
# Missing Value

# DataFrame Spark
missing_values_per_month_mmsi = spark.createDataFrame(missing_values_per_month_mmsi)
#missing_values_per_month_imo = spark.createDataFrame(missing_values_per_month_imo)
#missing_values_per_month_nav_status = spark.createDataFrame(missing_values_per_month_nav_status)
#missing_values_per_month_vessel_type = spark.createDataFrame(missing_values_per_month_vessel_type)
#missing_values_per_month_flag_country = spark.createDataFrame(missing_values_per_month_flag_country)
#missing_values_per_month_latitude = spark.createDataFrame(missing_values_per_month_latitude)
#missing_values_per_month_longitude = spark.createDataFrame(missing_values_per_month_longitude)
#missing_values_per_month_dt_pos_utc = spark.createDataFrame(missing_values_per_month_dt_pos_utc)

# Export ke Pandas DataFrame
missing_values_per_month_mmsi = missing_values_per_month_mmsi.toPandas()
#missing_values_per_month_imo = missing_values_per_month_imo.toPandas()
#missing_values_per_month_nav_status = missing_values_per_month_nav_status.toPandas()
#missing_values_per_month_vessel_type = missing_values_per_month_vessel_type.toPandas()
#missing_values_per_month_flag_country = missing_values_per_month_flag_country.toPandas()
#missing_values_per_month_latitude = missing_values_per_month_latitude.toPandas()
#missing_values_per_month_longitude = missing_values_per_month_longitude.toPandas()
#missing_values_per_month_dt_pos_utc = missing_values_per_month_dt_pos_utc.toPandas()
        

# Download Data
create_download_link(missing_values_per_month_mmsi, title=missing_values_per_month_mmsi, filename="missing_values_per_month_mmsi.csv")
#create_download_link(missing_values_per_month_imo, title=missing_values_per_month_imo, filename="missing_values_per_month_imo.csv")
#create_download_link(missing_values_per_month_nav_status, title=missing_values_per_month_nav_status, filename="missing_values_per_month_nav_status.csv")
#create_download_link(missing_values_per_month_vessel_type, title=missing_values_per_month_vessel_type, filename="missing_values_per_month_vessel_type.csv")
#create_download_link(missing_values_per_month_flag_country, title=missing_values_per_month_flag_country, filename="missing_values_per_month_flag_country.csv")
#create_download_link(missing_values_per_month_latitude, title=missing_values_per_month_latitude, filename="missing_values_per_month_latitude.csv")
#create_download_link(missing_values_per_month_longitude, title=missing_values_per_month_longitude, filename="missing_values_per_month_longitude.csv")
#create_download_link(missing_values_per_month_dt_pos_utc, title=missing_values_per_month_dt_pos_utc, filename="missing_values_per_month_dt_pos_utc.csv")

In [None]:
# Filter (MMSI, IMO, Status Navigasi, Tipe Kapal, Negara Kapal Default, Latitude, Longitude, dt_pos_utc)

# DataFrame Spark
unique_filtered_mmsi_per_month = spark.createDataFrame(unique_filtered_mmsi_per_month)
#unique_filtered_imo_per_month = spark.createDataFrame(unique_filtered_imo_per_month)
#mmsi_count_per_filtered_nav_status = spark.createDataFrame(mmsi_count_per_filtered_nav_status)
#unique_mmsi_per_filtered_nav_status = spark.createDataFrame(unique_mmsi_per_filtered_nav_status)
#mmsi_count_per_filtered_vessel_type = spark.createDataFrame(mmsi_count_per_filtered_vessel_type)
#unique_mmsi_per_filtered_vessel_type = spark.createDataFrame(unique_mmsi_per_filtered_vessel_type)
#mmsi_count_per_filtered_flag_country = spark.createDataFrame(mmsi_count_per_filtered_flag_country)
#unique_mmsi_per_filtered_flag_country = spark.createDataFrame(unique_mmsi_per_filtered_flag_country)

# Export ke Pandas DataFrame
unique_filtered_mmsi_per_month = unique_filtered_mmsi_per_month.toPandas()
#df_unique_filtered_imo_spark = df_unique_filtered_imo_spark.toPandas()
#mmsi_count_per_filtered_nav_status = mmsi_count_per_filtered_nav_status.toPandas()
#unique_mmsi_per_filtered_nav_status = unique_mmsi_per_filtered_nav_status.toPandas()
#mmsi_count_per_filtered_vessel_type = mmsi_count_per_filtered_vessel_type.toPandas()
#unique_mmsi_per_filtered_vessel_type = unique_mmsi_per_filtered_vessel_type.toPandas()
#mmsi_count_per_filtered_flag_country = mmsi_count_per_filtered_flag_country.toPandas()
#unique_mmsi_per_filtered_flag_country = unique_mmsi_per_filtered_flag_country.toPandas()


# Download Data
create_download_link(unique_filtered_mmsi_per_month, title=unique_filtered_mmsi_per_month, filename="unique_filtered_mmsi_per_month.csv")
#create_download_link(unique_filtered_imo_per_month, title=unique_filtered_imo_per_month, filename="unique_filtered_imo_per_month.csv")
#create_download_link(mmsi_count_per_filtered_nav_status, title=mmsi_count_per_filtered_nav_status, filename="mmsi_count_per_filtered_nav_status.csv")
#create_download_link(unique_mmsi_per_filtered_nav_status, title=unique_mmsi_per_filtered_nav_status, filename="unique_mmsi_per_filtered_nav_status.csv")
#create_download_link(mmsi_count_per_filtered_vessel_type, title=mmsi_count_per_filtered_vessel_type, filename="mmsi_count_per_filtered_vessel_type.csv")
#create_download_link(unique_mmsi_per_filtered_vessel_type, title=unique_mmsi_per_filtered_vessel_type, filename="unique_mmsi_per_filtered_vessel_type.csv")
#create_download_link(mmsi_count_per_filtered_flag_country, title=mmsi_count_per_filtered_flag_country, filename="mmsi_count_per_filtered_flag_country.csv")
#create_download_link(unique_mmsi_per_filtered_flag_country, title=unique_mmsi_per_filtered_flag_country, filename="unique_mmsi_per_filtered_flag_country.csv")

In [None]:
# Filter yg melakukan pelayaran/lintasan anomali(?)

# DataFrame Spark
rekaman_per_bulan_1pers = spark.createDataFrame(rekaman_per_bulan_1pers)
rekaman_per_bulan_1pers = spark.createDataFrame(rekaman_per_bulan_1pers)

# Export ke Pandas DataFrame
rekaman_per_bulan_1pers = rekaman_per_bulan_1pers.toPandas()

# Download Data
create_download_link(rekaman_per_bulan_1pers, filename="rekaman_per_bulan_1pers.csv")