In [33]:
import findspark
# Run before any pyspark import: findspark locates the local Spark install
# and puts its pyspark package on sys.path.
findspark.init()

In [34]:
from pyspark.sql import SparkSession

In [35]:
# Create (or reuse) the SparkSession for this notebook: a local master using
# all cores, with the MongoDB Spark connector pulled in via spark.jars.packages
# and default MongoDB input/output locations set on the session.
#
# NOTE(review): with a local[*] master the executors presumably run inside the
# driver JVM, so "spark.executor.memory" may have no effect here - confirm
# whether "spark.driver.memory" was intended.
# NOTE(review): the read cell further down passes its own "uri" option, so the
# input.* defaults below look redundant for that read - verify.
spark = (
    SparkSession.builder
    .appName("pyspark_notebook")
    .master("local[*]")
    .config("spark.executor.memory", "1g")
    # Default MongoDB endpoints for reads and writes.
    .config("spark.mongodb.input.uri", "mongodb://localhost:27017")
    .config("spark.mongodb.output.uri", "mongodb://localhost:27017")
    # Default source database / collection for reads.
    .config("spark.mongodb.input.database", "MyVehiclesData")
    .config("spark.mongodb.input.collection", "RawVehiclesData")
    # Connector artifact resolved when the session starts.
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")
    .getOrCreate()
)

In [36]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType

In [37]:
# Explicit schema for a vehicle record: a nested "_id.oid" string, four string
# attributes, a double "speed" and a timestamp "time". Every field is nullable.
#
# NOTE(review): the reader cell visible in this notebook does not pass this
# schema to spark.read, and the inferred schema printed below reports "time"
# as string rather than timestamp - confirm whether this object is still needed.
custom_schema = (
    StructType()
    .add("_id", StructType().add("oid", StringType(), True), True)
    .add("name", StringType(), True)
    .add("orig", StringType(), True)
    .add("dest", StringType(), True)
    .add("link", StringType(), True)
    .add("speed", DoubleType(), True)
    .add("time", TimestampType(), True)
)

In [38]:
# Load the RawVehiclesData collection of the MyVehiclesData database through
# the MongoDB Spark connector. No explicit schema is supplied, so the column
# types are whatever the connector infers.
df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("uri", "mongodb://127.0.0.1:27017/MyVehiclesData.RawVehiclesData")
    .load()
)


In [39]:
# Show the schema of the freshly loaded frame; no schema was passed to the
# reader, so this is the inferred one (note "time" is reported as string).
df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- dest: string (nullable = true)
 |-- link: string (nullable = true)
 |-- name: string (nullable = true)
 |-- orig: string (nullable = true)
 |-- speed: double (nullable = true)
 |-- time: string (nullable = true)



In [40]:
from pyspark.sql.functions import col, to_timestamp

In [41]:
# Replace the string "time" column with a parsed timestamp column of the same
# name, using the "yyyy-MM-dd HH:mm:ss" pattern.
df = df.withColumn(
    "time",
    to_timestamp(col("time"), format="yyyy-MM-dd HH:mm:ss"),
)

In [42]:
# Flatten and reorder the columns: the nested "_id.oid" field is pulled up to
# a top-level column (shown as "oid" in the output below), followed by the
# remaining attributes.
# NOTE(review): this rebinds df to a frame without "_id", so re-running this
# cell on its own result would presumably fail to resolve "_id.oid".
df = df.select(
    "_id.oid",
    "name",
    "orig",
    "dest",
    "link",
    "speed",
    "time",
)

In [43]:
# Preview the first 20 rows of the flattened frame.
df.show(n=20)

+--------------------+----+----+----+--------+-----+-------------------+
|                 oid|name|orig|dest|    link|speed|               time|
+--------------------+----+----+----+--------+-----+-------------------+
|663bbc5d556f48163...|   0|  E1|  W1|    E1I4| 50.0|2024-05-08 20:54:41|
|663bbc5d556f48163...|   0|  E1|  W1|    E1I4| 50.0|2024-05-08 20:54:46|
|663bbc5d556f48163...|   0|  E1|  W1|    E1I4| 50.0|2024-05-08 20:54:51|
|663bbc5d556f48163...|   0|  E1|  W1|    E1I4|  0.0|2024-05-08 20:54:56|
|663bbc60556f48163...|   0|  E1|  W1|    E1I4|  0.0|2024-05-08 20:55:01|
|663bbc64556f48163...|   0|  E1|  W1|    I4I3|  0.0|2024-05-08 20:55:06|
|663bbc67556f48163...|   0|  E1|  W1|    I3I2| 50.0|2024-05-08 20:55:11|
|663bbc6c556f48163...|   0|  E1|  W1|    I3I2| 50.0|2024-05-08 20:55:16|
|663bbc71556f48163...|   0|  E1|  W1|    I2I1|  0.0|2024-05-08 20:55:21|
|663bbc76556f48163...|   0|  E1|  W1|    I2I1| 50.0|2024-05-08 20:55:26|
|663bbc7b556f48163...|   0|  E1|  W1|    I2I1|  0.0