In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp

In [3]:
spark = SparkSession.builder \
    .appName("pyspark_notebook") \
    .master("local[*]") \
    .config("spark.executor.memory", "1g") \
    .config("spark.mongodb.input.uri", "mongodb://localhost:27017") \
    .config("spark.mongodb.output.uri", "mongodb://localhost:27017") \
    .config("spark.mongodb.input.database", "MyVehiclesData") \
    .config("spark.mongodb.input.collection", "RawVehiclesData") \
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1") \
    .getOrCreate()

:: loading settings :: url = jar:file:/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/georgiamparakou/.ivy2/cache
The jars for the packages stored in: /Users/georgiamparakou/.ivy2/jars
org.mongodb.spark#mongo-spark-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-0d35eb53-e13b-4f9a-b983-da78d0c79fba;1.0
	confs: [default]
	found org.mongodb.spark#mongo-spark-connector_2.12;3.0.1 in central
	found org.mongodb#mongodb-driver-sync;4.0.5 in central
	found org.mongodb#bson;4.0.5 in central
	found org.mongodb#mongodb-driver-core;4.0.5 in central
:: resolution report :: resolve 98ms :: artifacts dl 4ms
	:: modules in use:
	org.mongodb#bson;4.0.5 from central in [default]
	org.mongodb#mongodb-driver-core;4.0.5 from central in [default]
	org.mongodb#mongodb-driver-sync;4.0.5 from central in [default]
	org.mongodb.spark#mongo-spark-connector_2.12;3.0.1 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules        

In [4]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType

In [5]:
custom_schema = StructType([
    StructField("_id", StructType([StructField("oid", StringType(), nullable=True)]), nullable=True),
    StructField("name", StringType(), nullable=True),
    StructField("orig", StringType(), nullable=True),
    StructField("dest", StringType(), nullable=True),
    StructField("link", StringType(), nullable=True),
    StructField("speed", DoubleType(), nullable=True),
    StructField("time", TimestampType(), nullable=True)
])

In [6]:
df = spark.read.format("com.mongodb.spark.sql.DefaultSource") \
            .option("uri", "mongodb://127.0.0.1:27017/MyVehiclesData.RawVehiclesData") \
            .load()


In [7]:
df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- dest: string (nullable = true)
 |-- link: string (nullable = true)
 |-- name: string (nullable = true)
 |-- orig: string (nullable = true)
 |-- speed: double (nullable = true)
 |-- time: string (nullable = true)



In [8]:
from pyspark.sql.functions import col, to_timestamp

In [9]:
df = df.withColumn("time", to_timestamp(col("time"), "yyyy-MM-dd HH:mm:ss"))

In [10]:
df_sorted = df.orderBy(col("time"))

In [11]:
df_sorted.select("name", "orig", "dest", "link", "time", "speed") \
         .dropDuplicates(["name", "orig", "dest", "link", "time", "speed"]) \
         .show(100, truncate=False)

[Stage 1:>                                                          (0 + 1) / 1]

+----+----+----+----+-------------------+-----+
|name|orig|dest|link|time               |speed|
+----+----+----+----+-------------------+-----+
|0   |N1  |S1  |N1I1|2024-06-28 18:25:44|20.0 |
|0   |N1  |S1  |N1I1|2024-06-28 18:27:22|30.0 |
|0   |N1  |S1  |N1I1|2024-06-28 18:30:54|20.0 |
|0   |N1  |S1  |N1I1|2024-06-28 18:25:39|30.0 |
|0   |N1  |S1  |N1I1|2024-06-28 18:27:27|20.0 |
|0   |N1  |S1  |N1I1|2024-06-28 18:25:49|30.0 |
|0   |N1  |S1  |N1I1|2024-06-28 18:30:59|30.0 |
|0   |N1  |S1  |N1I1|2024-06-28 18:30:49|30.0 |
+----+----+----+----+-------------------+-----+



                                                                                