In [24]:
# findspark is initialised here, before the first `pyspark` import in the
# following cell.
import findspark
findspark.init()

In [25]:
from pyspark.sql import SparkSession

In [26]:
# Build (or reuse) the local Spark session used by the rest of the notebook.
# The MongoDB Spark connector is requested through spark.jars.packages, and
# the spark.mongodb.* entries provide the default input/output URIs plus the
# default input database and collection.
spark = (
    SparkSession.builder
    .appName("pyspark_notebook1")
    .master("local[*]")
    .config("spark.executor.memory", "1g")
    .config("spark.mongodb.input.uri", "mongodb://localhost:27017")
    .config("spark.mongodb.output.uri", "mongodb://localhost:27017")
    .config("spark.mongodb.input.database", "MyVehiclesData")
    .config("spark.mongodb.input.collection", "ProcessedVehiclesData")
    .config(
        "spark.jars.packages",
        "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1",
    )
    .getOrCreate()
)

In [27]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType, LongType

In [28]:
# Explicit schema for the vehicle documents: a nested `_id.oid` string plus
# link, vcount, vspeed and Time, every field nullable.
# NOTE(review): this schema is not passed to the reader in the visible cells,
# so the loaded frame uses the connector's inferred schema — confirm intent.
custom_schema1 = StructType([
    StructField(field_name, field_type, nullable=True)
    for field_name, field_type in (
        ("_id", StructType([StructField("oid", StringType(), nullable=True)])),
        ("link", StringType()),
        ("vcount", LongType()),
        ("vspeed", DoubleType()),
        ("Time", TimestampType()),
    )
])

In [29]:
# Load the ProcessedVehiclesData collection of the MyVehiclesData database
# through the MongoDB Spark connector; the schema is left to the connector.
df1 = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("uri", "mongodb://127.0.0.1:27017/MyVehiclesData.ProcessedVehiclesData")
    .load()
)


In [30]:
# Inspect the schema of the loaded frame (no explicit schema was given to the
# reader, so this is whatever the connector produced).
df1.printSchema()

root
 |-- Time: string (nullable = true)
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- link: string (nullable = true)
 |-- vcount: long (nullable = true)
 |-- vspeed: double (nullable = true)



In [31]:
from pyspark.sql.functions import col, to_timestamp

In [32]:
# Parse the string "Time" column into a real timestamp, replacing it in place.
# The target name must be plain-ASCII "Time": the previous literal started with
# a Greek capital tau (U+03A4), so withColumn created a separate look-alike
# column and the following select("Time") still returned the unparsed string.
# The source column is also referenced with its exact case so the lookup does
# not depend on Spark's case-insensitive resolution.
df1 = df1.withColumn("Time", to_timestamp(col("Time"), "yyyy-MM-dd HH:mm:ss"))

In [33]:
# Flatten the nested `_id.oid` field to a top-level `oid` column and keep only
# the columns needed downstream, in display order.
df1 = df1.select(
    "_id.oid",
    "link",
    "vspeed",
    "vcount",
    "Time",
)

In [34]:
# Preview up to 50 rows of the projected frame.
df1.show(50)

+--------------------+----+------------------+------+-------------------+
|                 oid|link|            vspeed|vcount|               Time|
+--------------------+----+------------------+------+-------------------+
|663c97107418a0332...|N1I1|              25.0|     2|2024-05-09 12:27:55|
|663c9725c4f83968b...|N1I1|              27.5|     4|2024-05-09 12:27:55|
|663c9729c4f83968b...|N1I1|              26.0|     5|2024-05-09 12:27:55|
|663c972bc4f83968b...|N1I1|26.666666666666668|     6|2024-05-09 12:27:55|
|663c9730c4f83968b...|N1I1|27.142857142857142|     7|2024-05-09 12:27:55|
|663c9734c4f83968b...|I1S1|              20.0|     1|2024-05-09 12:28:34|
|663c9735c4f83968b...|N1I1|27.142857142857142|     7|2024-05-09 12:27:55|
|663c9739c4f83968b...|I1S1|              25.0|     2|2024-05-09 12:28:34|
|663c973ac4f83968b...|N1I1|27.142857142857142|     7|2024-05-09 12:27:55|
|663c973ec4f83968b...|I1S1|26.666666666666668|     3|2024-05-09 12:28:34|
|663c973fc4f83968b...|N1I1|27.14285714