# Fetch specific mmsi data

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Initialize Spark session
spark = SparkSession.builder \
    .appName("AIS Data Processing - Fetch Specific MMSI") \
    .config("spark.mongodb.input.uri", "mongodb://localhost:27017/ais_training_data.ais_data") \
    .config("spark.mongodb.output.uri", "mongodb://localhost:27017/ais_training_data.specific_mmsi_data") \
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1") \
    .config("spark.mongodb.input.batchSize", "100000") \
    .config("spark.mongodb.output.batchSize", "10000") \
    .config("spark.mongodb.output.bulk.enabled", "true") \
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY") \
    .getOrCreate()

# Read data from MongoDB
df = spark.read.format("mongo").option("bulkRead", "true").load()

# Filter for the specific MMSI
specific_mmsi = "265610860"
mmsi_df = df.filter(col("MMSI") == specific_mmsi)

# Persist the DataFrame in memory
mmsi_df = mmsi_df.persist()

# Save to MongoDB
output_collection = f"mmsi_{specific_mmsi}_data"
mmsi_df.write.format("mongo") \
    .mode("overwrite") \
    .option("database", "ais_training_data") \
    .option("collection", output_collection) \
    .save()

print(f"Data for MMSI {specific_mmsi} has been saved to MongoDB collection: {output_collection}")

# Unpersist the DataFrame to free up memory
mmsi_df.unpersist()

# Stop Spark session
spark.stop()

# predict ports 

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, when
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, BooleanType, ArrayType
import pandas as pd
import numpy as np
from math import radians, sin, cos, sqrt, atan2
from pyspark.sql.functions import col, expr
# Create Spark session
spark = SparkSession.builder \
    .appName("AIS Data Processing - Fetch Specific MMSI") \
    .config("spark.mongodb.input.uri", "mongodb://localhost:27017/ais_training_data.ais_data") \
    .config("spark.mongodb.output.uri", "mongodb://localhost:27017/ais_training_data.specific_mmsi_data") \
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1") \
    .config("spark.driver.memory", "8g") \
    .config("spark.mongodb.input.batchSize", "100000") \
    .config("spark.mongodb.output.batchSize", "10000") \
    .config("spark.mongodb.output.bulk.enabled", "true") \
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY") \
    .getOrCreate()

# Function to calculate Haversine distance
def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371  # Earth's radius in kilometers
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    return R * c

# Read port data from CSV
ports_df = spark.read.csv("ports_output.csv", header=True, inferSchema=True)
ports_df = ports_df.select("Port Name", "Latitude", "Longitude")

# Collect ports data to driver for broadcasting
ports_data = ports_df.collect()

# Broadcast ports data
broadcast_ports = spark.sparkContext.broadcast(ports_data)

# UDF to find nearest port
@udf(StructType([
    StructField("NearestPort", StringType()),
    StructField("DistanceToPort", DoubleType())
]))
def find_nearest_port(lat, lon):
    min_distance = float('inf')
    nearest_port = None
    for port in broadcast_ports.value:
        distance = haversine_distance(lat, lon, port['Latitude'], port['Longitude'])
        if distance < min_distance:
            min_distance = distance
            nearest_port = port['Port Name']
    return (nearest_port, min_distance)

# Read data from MongoDB
df = spark.read.format("mongo").option("bulkRead", "true").load()

processed_df = df.withColumn("LastLocation", expr("Locations[size(Locations) - 1]"))  \
    .withColumn("NearestPortInfo", find_nearest_port(col("LastLocation.Latitude"), col("LastLocation.Longitude"))) \
    .withColumn("NearestPort", col("NearestPortInfo.NearestPort")) \
    .withColumn("DistanceToPort", col("NearestPortInfo.DistanceToPort")) \
    .withColumn("IsPortStop", when(col("DistanceToPort") < 10, True).otherwise(False)) \
    .withColumn("Destination", when(col("IsPortStop"), col("NearestPort")).otherwise(None)) \
    .drop("LastLocation", "NearestPortInfo")
# Write processed data back to MongoDB
processed_df.write.format("mongo").mode("overwrite").option("bulkWrite", "true").save()

print("Port stop processing completed.")

# Stop Spark session
spark.stop()

24/10/10 03:21:28 WARN Utils: Your hostname, talal resolves to a loopback address: 127.0.1.1; using 192.168.187.27 instead (on interface wlo1)
24/10/10 03:21:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /home/talal/.ivy2/cache
The jars for the packages stored in: /home/talal/.ivy2/jars
org.mongodb.spark#mongo-spark-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-bde1ae09-1989-48b0-b14d-626124eee7eb;1.0
	confs: [default]


:: loading settings :: url = jar:file:/home/talal/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.mongodb.spark#mongo-spark-connector_2.12;3.0.1 in central
	found org.mongodb#mongodb-driver-sync;4.0.5 in central
	found org.mongodb#bson;4.0.5 in central
	found org.mongodb#mongodb-driver-core;4.0.5 in central
:: resolution report :: resolve 129ms :: artifacts dl 4ms
	:: modules in use:
	org.mongodb#bson;4.0.5 from central in [default]
	org.mongodb#mongodb-driver-core;4.0.5 from central in [default]
	org.mongodb#mongodb-driver-sync;4.0.5 from central in [default]
	org.mongodb.spark#mongo-spark-connector_2.12;3.0.1 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   4   |   0   |   0   |   0   ||   4   |   0   |
	---------------------------------------------------------------------
:: retrieving ::

Port stop processing completed.


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, lit, desc
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Initialize Spark session
spark = SparkSession.builder \
    .appName("AIS Data Processing") \
    .config("spark.mongodb.input.uri", "mongodb://localhost:27017/ais_training_data.specific_mmsi_data") \
    .config("spark.mongodb.output.uri", "mongodb://localhost:27017/ais_training_data.ais_data_summary") \
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1") \
    .config("spark.mongodb.input.batchSize", "100000") \
    .config("spark.mongodb.output.batchSize", "10000") \
    .config("spark.driver.memory", "8g") \
    .config("spark.mongodb.output.bulk.enabled", "true") \
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY") \
    .getOrCreate()

# Read data from MongoDB
df = spark.read.format("mongo").option("bulkRead", "true").load()

# Filter out rows with undefined ship type and empty destination

# Add IsPortStop filter
port_stop_df = df.filter(col("IsPortStop") == True).filter(col("Destination").isNotNull())

# Group by Date, Ship_Type, and Destination, count occurrences, and sort in descending order
grouped_df = port_stop_df.groupBy("Date", "Ship_Type", "Destination") \
    .agg(count("*").alias("Count")) \
    .orderBy(desc("Count"))

grouped_df.show()
input()

# Write the sorted result to a new MongoDB collection
grouped_df.write.format("mongo") \
    .mode("overwrite") \
    .option("database", "ais_training_data") \
    .option("collection", "ais_data_summary") \
    .save()

print("Data processing completed and results stored in MongoDB.")

# Stop Spark session
spark.stop()

24/10/11 16:27:22 WARN Utils: Your hostname, talal resolves to a loopback address: 127.0.1.1; using 192.168.10.112 instead (on interface wlo1)
24/10/11 16:27:22 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /home/talal/.ivy2/cache
The jars for the packages stored in: /home/talal/.ivy2/jars
org.mongodb.spark#mongo-spark-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-4b186e34-6f39-4e6e-bbe8-89c76b7d6df0;1.0
	confs: [default]


:: loading settings :: url = jar:file:/home/talal/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.mongodb.spark#mongo-spark-connector_2.12;3.0.1 in central
	found org.mongodb#mongodb-driver-sync;4.0.5 in central
	found org.mongodb#bson;4.0.5 in central
	found org.mongodb#mongodb-driver-core;4.0.5 in central
:: resolution report :: resolve 143ms :: artifacts dl 6ms
	:: modules in use:
	org.mongodb#bson;4.0.5 from central in [default]
	org.mongodb#mongodb-driver-core;4.0.5 from central in [default]
	org.mongodb#mongodb-driver-sync;4.0.5 from central in [default]
	org.mongodb.spark#mongo-spark-connector_2.12;3.0.1 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   4   |   0   |   0   |   0   ||   4   |   0   |
	---------------------------------------------------------------------
:: retrieving ::

+----------+---------+---------------+-----+
|      Date|Ship_Type|    Destination|Count|
+----------+---------+---------------+-----+
|2023-09-10|Undefined|        Hamburg|  250|
|2023-09-07|Undefined|        Hamburg|  214|
|2023-06-08|Undefined|      Amsterdam|  210|
|2023-06-07|Undefined|        Hamburg|  204|
|2023-06-09|Undefined|        Hamburg|  201|
|2023-09-11|Undefined|        Hamburg|  194|
|2023-07-09|Undefined|          Donso|  179|
|2023-06-08|Undefined|      Dordrecht|  179|
|2023-06-08|Undefined|Hook of Holland|  178|
|2023-07-09|Undefined|          Laboe|  158|
|2023-06-20|Undefined|        Hamburg|  156|
|2023-06-17|Undefined|          Donso|  155|
|2023-06-07|Undefined|      Dordrecht|  155|
|2023-06-11|Undefined|          Donso|  154|
|2023-06-10|Undefined|          Donso|  148|
|2023-06-08|Undefined|      Rotterdam|  147|
|2023-06-06|Undefined|        Hamburg|  147|
|2023-06-05|Undefined|          Donso|  145|
|2023-09-17|Undefined|          Laboe|  144|
|2023-06-1

24/10/11 16:51:24 WARN IndexShuffleBlockResolver: Error deleting data /tmp/blockmgr-59437d75-345c-42d6-b1e2-88eb52f42e5e/12/shuffle_1_2579_0.data


Data processing completed and results stored in MongoDB.


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, month, year, concat, lpad, desc

# Initialize Spark session
spark = SparkSession.builder \
    .appName("AIS Data Summary Processing") \
    .config("spark.mongodb.input.uri", "mongodb://localhost:27017/ais_training_data.ais_data_summary") \
    .config("spark.mongodb.output.uri", "mongodb://localhost:27017/ais_training_data.ais_data_monthly_summary") \
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1") \
    .getOrCreate()

# Read data from MongoDB
df = spark.read.format("mongo").load()

# Extract year and month from the Date field and create a YYYY-MM format
df_with_month = df.withColumn("Year", year(col("Date"))) \
                  .withColumn("Month", lpad(month(col("Date")), 2, "0")) \
                  .withColumn("YearMonth", concat(col("Year"), lit("-"), col("Month")))

# Group by YearMonth, Ship_Type, and Destination, sum the Count
monthly_summary = df_with_month.groupBy("YearMonth", "Ship_Type", "Destination") \
    .agg(count("*").alias("TotalCount")) \
    .orderBy(desc("YearMonth"), desc("TotalCount"))

# Show the results
monthly_summary.show(n=20, truncate=False)

# Write the results to a new MongoDB collection
monthly_summary.write.format("mongo") \
    .mode("overwrite") \
    .option("database", "ais_training_data") \
    .option("collection", "ais_data_monthly_summary") \
    .save()

print("Monthly summary processed and saved to MongoDB.")

# Stop Spark session
spark.stop()

                                                                                

+---------+-----------+-------------------+----------+
|YearMonth|Ship_Type  |Destination        |TotalCount|
+---------+-----------+-------------------+----------+
|2023-12  |Sailing    |Frederikshavn      |31        |
|2023-12  |Tug        |Aalborg            |31        |
|2023-12  |Port tender|Orehoved           |31        |
|2023-12  |Fishing    |Hanstholm          |31        |
|2023-12  |Undefined  |Rodbyhavn          |31        |
|2023-12  |Sailing    |Brondby            |31        |
|2023-12  |Cargo      |Frederiksberg      |31        |
|2023-12  |Military   |Kyndby             |31        |
|2023-12  |Undefined  |Laboe              |31        |
|2023-12  |Pleasure   |Allinge            |31        |
|2023-12  |Reserved   |Asnaesvaerkets Havn|31        |
|2023-12  |Tug        |Rodbyhavn          |31        |
|2023-12  |Military   |Nykobing           |31        |
|2023-12  |Passenger  |Laesohavn          |31        |
|2023-12  |Tug        |Masnedsund         |31        |
|2023-12  

                                                                                

Monthly summary processed and saved to MongoDB.


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import countDistinct, length, sum

# Initialize Spark session
spark = SparkSession.builder \
    .appName("AIS Unique MMSI Count and Length") \
    .config("spark.mongodb.input.uri", "mongodb://localhost:27017/ais_training_data.specific_mmsi_data") \
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1") \
    .getOrCreate()

# Read data from MongoDB
df = spark.read.format("mongo").load()

# Count unique MMSI values and calculate total length of unique MMSI
result = df.select("MMSI").distinct() \
    .select(
        countDistinct("MMSI").alias("UniqueMMSICount"),
        sum(length("MMSI").cast("long")).alias("TotalLengthOfUniqueMMSI")
    )

# Show the results
result.show()


# Stop Spark session
spark.stop()

                                                                                

+---------------+-----------------------+
|UniqueMMSICount|TotalLengthOfUniqueMMSI|
+---------------+-----------------------+
|          53929|                 483865|
+---------------+-----------------------+



                                                                                

Number of unique MMSI: 53929
Total length of unique MMSI: 483865
