In [34]:
import findspark
# Runs before the `pyspark` import in the next cell.
# NOTE(review): presumably this is what makes the local Spark installation
# importable in this kernel — confirm SPARK_HOME (or equivalent) is set.
findspark.init()

In [35]:
from pyspark.sql import SparkSession

In [25]:
# Create (or reuse) the notebook's SparkSession on a local master that uses
# all available cores, and register the MongoDB Spark connector package
# together with the connector's input/output URIs and the default input
# database and collection.
# NOTE(review): the master is local[*]; confirm that "spark.executor.memory"
# has the intended effect in this mode.
spark = (
    SparkSession.builder
    .appName("pyspark_notebook")
    .master("local[*]")
    .config("spark.executor.memory", "1g")
    # MongoDB endpoints used by the connector.
    .config("spark.mongodb.input.uri", "mongodb://localhost:27017")
    .config("spark.mongodb.output.uri", "mongodb://localhost:27017")
    # Default source database / collection for reads.
    .config("spark.mongodb.input.database", "MyVehiclesData")
    .config("spark.mongodb.input.collection", "RawVehiclesData")
    # Connector artifact passed to Spark as a package coordinate.
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")
    .getOrCreate()
)

In [26]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType

In [27]:
# Explicit schema for the vehicle documents: a nested `_id.oid` string, four
# string attributes, a double `speed`, and a timestamp `time`. Every field is
# nullable.
# NOTE(review): no visible cell passes this schema to the reader; the load
# below infers its own schema, which reports `time` as a string rather than a
# timestamp. Confirm whether this definition is still needed.
custom_schema = (
    StructType()
    .add("_id", StructType().add("oid", StringType(), True), True)
    .add("name", StringType(), True)
    .add("orig", StringType(), True)
    .add("dest", StringType(), True)
    .add("link", StringType(), True)
    .add("speed", DoubleType(), True)
    .add("time", TimestampType(), True)
)

In [28]:
# Load MyVehiclesData.RawVehiclesData from the local MongoDB instance through
# the MongoDB Spark data source. No schema is supplied here, so the column
# types are whatever the data source reports.
df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("uri", "mongodb://127.0.0.1:27017/MyVehiclesData.RawVehiclesData")
    .load()
)


In [29]:
# Print the schema of the freshly loaded frame to check column names and types
# before any transformation is applied.
df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- dest: string (nullable = true)
 |-- link: string (nullable = true)
 |-- name: string (nullable = true)
 |-- orig: string (nullable = true)
 |-- speed: double (nullable = true)
 |-- time: string (nullable = true)



In [30]:
from pyspark.sql.functions import col, to_timestamp

In [31]:
# Replace the `time` column with a timestamp parsed from its
# "yyyy-MM-dd HH:mm:ss" text form; `df` is rebound to the result.
df = df.withColumn(
    "time",
    to_timestamp("time", format="yyyy-MM-dd HH:mm:ss"),
)

In [32]:
# Flatten `_id.oid` into a top-level `oid` column and fix the column order.
# NOTE: this rebinds `df`; afterwards `_id` is no longer a column, so running
# this cell a second time against the already-flattened frame cannot resolve
# "_id.oid". Re-run from the load cell instead.
df = df.select(
    "_id.oid",
    "name",
    "orig",
    "dest",
    "link",
    "speed",
    "time",
)

In [33]:
# Preview the first 20 rows, truncating long cell values (the defaults,
# spelled out).
df.show(n=20, truncate=True)

+--------------------+----+----+----+----+-----+-------------------+
|                 oid|name|orig|dest|link|speed|               time|
+--------------------+----+----+----+----+-----+-------------------+
|663c97107418a0332...|   0|  N1|  S1|N1I1| 30.0|2024-05-09 12:27:55|
|663c97107418a0332...|   0|  N1|  S1|N1I1| 20.0|2024-05-09 12:28:00|
|663c9725c4f83968b...|   0|  N1|  S1|N1I1| 30.0|2024-05-09 12:27:55|
|663c9725c4f83968b...|   0|  N1|  S1|N1I1| 20.0|2024-05-09 12:28:00|
|663c9725c4f83968b...|   0|  N1|  S1|N1I1| 30.0|2024-05-09 12:28:05|
|663c9725c4f83968b...|   0|  N1|  S1|N1I1| 30.0|2024-05-09 12:28:14|
|663c9727c4f83968b...|   0|  N1|  S1|N1I1| 20.0|2024-05-09 12:28:19|
|663c972ac4f83968b...|   0|  N1|  S1|N1I1| 30.0|2024-05-09 12:28:24|
|663c972fc4f83968b...|   0|  N1|  S1|N1I1| 30.0|2024-05-09 12:28:29|
|663c9734c4f83968b...|   0|  N1|  S1|I1S1| 20.0|2024-05-09 12:28:34|
|663c9739c4f83968b...|   0|  N1|  S1|I1S1| 30.0|2024-05-09 12:28:39|
|663c973ec4f83968b...|   0|  N1|  