In [12]:
# Run findspark.init() before the first pyspark import in this notebook;
# per the findspark docs it locates the Spark installation and makes
# pyspark importable from this kernel.
import findspark
findspark.init()

In [13]:
from pyspark.sql import SparkSession

In [14]:
# Create (or reuse, via getOrCreate) the SparkSession used by every cell below.
spark = (
    SparkSession.builder
    .appName("pyspark_notebook")
    .master("local[*]")
    .config("spark.executor.memory", "1g")
    # MongoDB connector: default input/output URIs and the default
    # database/collection used for reads.
    .config("spark.mongodb.input.uri", "mongodb://localhost:27017")
    .config("spark.mongodb.output.uri", "mongodb://localhost:27017")
    .config("spark.mongodb.input.database", "MyVehiclesData")
    .config("spark.mongodb.input.collection", "RawVehiclesData")
    # Maven coordinates of the MongoDB Spark connector package.
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")
    .getOrCreate()
)

In [15]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType

In [16]:
# Explicit schema for the vehicle records: a nested `_id.oid` string,
# four plain string columns, a double `speed` and a timestamp `time`.
# Every field is nullable.
# NOTE(review): no visible cell passes this schema to the reader - confirm
# whether it is meant to be applied with `.schema(custom_schema)`.
custom_schema = StructType(
    [StructField("_id", StructType([StructField("oid", StringType(), True)]), True)]
    + [
        StructField(string_column, StringType(), True)
        for string_column in ("name", "orig", "dest", "link")
    ]
    + [
        StructField("speed", DoubleType(), True),
        StructField("time", TimestampType(), True),
    ]
)

In [17]:
# Read the MyVehiclesData.RawVehiclesData collection through the MongoDB
# Spark data source; no schema is passed to the reader here.
df = (
    spark.read.format("com.mongodb.spark.sql.DefaultSource")
    .option("uri", "mongodb://127.0.0.1:27017/MyVehiclesData.RawVehiclesData")
    .load()
)


In [18]:
# Print the schema of the freshly loaded DataFrame to check column names and types.
df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- link: string (nullable = true)
 |-- name: string (nullable = true)
 |-- speed: double (nullable = true)
 |-- time: string (nullable = true)



In [19]:
from pyspark.sql.functions import col, to_timestamp

In [20]:
# Replace the string-typed "time" column with a parsed Spark timestamp.
df = df.withColumn(
    "time",
    to_timestamp(col("time"), format="yyyy-MM-dd HH:mm:ss"),
)

In [21]:
# Flatten the nested ObjectId (`_id.oid` is selected as `oid`) and keep the
# analysis columns in a fixed order.
# The comma between "dest" and "link" is required: without it Python
# concatenates the two adjacent string literals into one column name,
# "destlink", which drops `link` and makes the select fail.
# NOTE(review): the schema printed above does not list `orig` / `dest` -
# confirm the collection actually contains them, otherwise this select
# still raises an AnalysisException.
df = df.select("_id.oid", "name", "orig", "dest", "link", "speed", "time")

In [22]:
# Preview the first 20 rows of the flattened DataFrame.
df.show(n=20)

+--------------------+----+--------+-----+-------------------+
|                 oid|name|    link|speed|               time|
+--------------------+----+--------+-----+-------------------+
|663bb977999a29396...|   0|    I1S1| 30.0|2024-05-08 20:42:05|
|663bb977999a29396...|   0|    I1S1| 30.0|2024-05-08 20:42:10|
|663bb977999a29396...|   0|    I1S1| 30.0|2024-05-08 20:42:10|
|663bb977999a29396...|   0|trip_end| -1.0|2024-05-08 20:42:10|
|663bb977999a29396...|   0|    N1I1| 30.0|2024-05-08 20:41:40|
|663bb977999a29396...|   1|    S2I2| 30.0|2024-05-08 20:41:40|
|663bb977999a29396...|   0|    N1I1| 25.0|2024-05-08 20:41:45|
|663bb977999a29396...|   1|    S2I2| 25.0|2024-05-08 20:41:45|
|663bb977999a29396...|   0|    N1I1| 30.0|2024-05-08 20:41:50|
|663bb977999a29396...|   1|    S2I2| 30.0|2024-05-08 20:41:50|
|663bb977999a29396...|   0|    N1I1| 30.0|2024-05-08 20:41:55|
|663bb977999a29396...|   1|    S2I2| 30.0|2024-05-08 20:41:55|
|663bb97a999a29396...|   0|    I1S1| 15.0|2024-05-08 20