In [38]:
import findspark
findspark.init()

In [39]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp

In [40]:
spark = SparkSession.builder \
    .appName("pyspark_notebook") \
    .master("local[*]") \
    .config("spark.executor.memory", "1g") \
    .config("spark.mongodb.input.uri", "mongodb://localhost:27017") \
    .config("spark.mongodb.output.uri", "mongodb://localhost:27017") \
    .config("spark.mongodb.input.database", "MyVehiclesData") \
    .config("spark.mongodb.input.collection", "RawVehiclesData") \
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1") \
    .getOrCreate()

In [41]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType

In [42]:
custom_schema = StructType([
    StructField("_id", StructType([StructField("oid", StringType(), nullable=True)]), nullable=True),
    StructField("name", StringType(), nullable=True),
    StructField("orig", StringType(), nullable=True),
    StructField("dest", StringType(), nullable=True),
    StructField("link", StringType(), nullable=True),
    StructField("speed", DoubleType(), nullable=True),
    StructField("time", TimestampType(), nullable=True)
])

In [43]:
df = spark.read.format("com.mongodb.spark.sql.DefaultSource") \
            .option("uri", "mongodb://127.0.0.1:27017/MyVehiclesData.RawVehiclesData") \
            .load()


In [44]:
df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- dest: string (nullable = true)
 |-- link: string (nullable = true)
 |-- name: string (nullable = true)
 |-- orig: string (nullable = true)
 |-- speed: double (nullable = true)
 |-- time: string (nullable = true)



In [45]:
from pyspark.sql.functions import col, to_timestamp

In [46]:
df = df.withColumn("time", to_timestamp(col("time"), "yyyy-MM-dd HH:mm:ss"))

In [47]:
df_sorted = df.orderBy(col("time"))

In [48]:
df_sorted.select("_id.oid", "name", "orig", "dest", "link", "time", "speed") \
         .dropDuplicates(["oid", "name", "orig", "dest", "link", "time", "speed"]) \
         .show(100, truncate=False)

+------------------------+----+----+----+--------+-------------------+-----+
|oid                     |name|orig|dest|link    |time               |speed|
+------------------------+----+----+----+--------+-------------------+-----+
|667ec94ae654e4743b5215f9|0   |N1  |S1  |I1S1    |2024-06-28 17:31:52|30.0 |
|667ec939e654e4743b5215e6|0   |N1  |S1  |N1I1    |2024-06-28 17:31:37|30.0 |
|667ecadbf7a91f6f61214158|0   |N1  |S1  |N1I1    |2024-06-28 17:31:37|30.0 |
|667ecadbf7a91f6f61214160|0   |N1  |S1  |I1S1    |2024-06-28 17:31:42|15.0 |
|667ecadbf7a91f6f61214162|1   |S2  |N2  |S2I2    |2024-06-28 17:31:27|25.0 |
|667ecadbf7a91f6f61214165|0   |N1  |S1  |N1I1    |2024-06-28 17:38:27|20.0 |
|667ec939e654e4743b5215e7|0   |N1  |S1  |I1S1    |2024-06-28 17:31:42|15.0 |
|667ecadbf7a91f6f61214164|1   |S2  |N2  |S2I2    |2024-06-28 17:31:22|30.0 |
|667ecadff7a91f6f6121416e|0   |N1  |S1  |N1I1    |2024-06-28 17:38:37|30.0 |
|667ec93fe654e4743b5215ed|0   |N1  |S1  |I1S1    |2024-06-28 17:31:47|30.0 |