In [12]:
# findspark must be initialised before any `pyspark` import so that the local
# Spark installation is discoverable from this kernel.
import findspark
findspark.init()

In [13]:
from pyspark.sql import SparkSession

In [14]:
# Create (or reuse) the notebook's Spark session.
# - Runs in local mode on all available cores ("local[*]").
# - Default MongoDB input/output URIs point at the local server; the default
#   input namespace is MyVehiclesData.ProcessedVehiclesData.
# - spark.jars.packages requests the MongoDB Spark connector 3.0.1 built for
#   Scala 2.12.
spark = (
    SparkSession.builder
    .appName("pyspark_notebook1")
    .master("local[*]")
    .config("spark.executor.memory", "1g")
    # MongoDB connection defaults
    .config("spark.mongodb.input.uri", "mongodb://localhost:27017")
    .config("spark.mongodb.output.uri", "mongodb://localhost:27017")
    .config("spark.mongodb.input.database", "MyVehiclesData")
    .config("spark.mongodb.input.collection", "ProcessedVehiclesData")
    # Connector artifact fetched at session start-up
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")
    .getOrCreate()
)

In [15]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType, LongType

In [16]:
# Explicit schema for the processed-vehicles documents: a nested `_id.oid`
# string, the link name, a vehicle count, a vehicle speed and a timestamp.
# Every field is nullable.
# NOTE(review): in the visible cells this schema is not handed to the reader
# that loads `df1` -- confirm whether it is used further down.
custom_schema1 = (
    StructType()
    .add("_id", StructType().add("oid", StringType(), True), True)
    .add("link", StringType(), True)
    .add("vcount", LongType(), True)
    .add("vspeed", DoubleType(), True)
    .add("Time", TimestampType(), True)
)

In [17]:
# Load the ProcessedVehiclesData collection of the MyVehiclesData database
# through the MongoDB Spark connector. No explicit schema is passed here, so
# column types are whatever the connector derives from the stored documents.
df1 = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("uri", "mongodb://127.0.0.1:27017/MyVehiclesData.ProcessedVehiclesData")
    .load()
)


In [18]:
# Inspect the schema of the freshly loaded frame. The captured output shows
# `Time` arriving as a string, not a timestamp.
df1.printSchema()

root
 |-- Time: string (nullable = true)
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- link: string (nullable = true)
 |-- vcount: long (nullable = true)
 |-- vspeed: double (nullable = true)



In [19]:
from pyspark.sql.functions import col, to_timestamp

In [20]:
# Parse the string `Time` column into a real timestamp and overwrite it.
# The output name must be exactly "Time" (plain Latin capital T): withColumn
# only replaces a column when the name matches an existing one, otherwise it
# silently appends a new column and the original string `Time` is what later
# cells keep selecting. The input is referenced with its exact case as well,
# so the lookup does not depend on Spark's case-insensitive name resolution.
df1 = df1.withColumn("Time", to_timestamp(col("Time"), "yyyy-MM-dd HH:mm:ss"))


In [21]:
# Keep only the analysis columns, flattening the nested `_id.oid` field into a
# top-level `oid` column and fixing the column order.
df1 = df1.select(
    "_id.oid",
    "link",
    "vspeed",
    "vcount",
    "Time",
)

In [22]:
# Preview the first 10 rows of the projected frame.
df1.show(10)

+--------------------+--------------------+------------------+------+-------------------+
|                 oid|                link|            vspeed|vcount|               Time|
+--------------------+--------------------+------------------+------+-------------------+
|663b491a65c3f1381...|                S4I4|19.568345323741006|   278|2024-05-07 18:39:50|
|663b491a65c3f1381...|                I1S1|25.597014925373134|   268|2024-05-07 18:40:35|
|663b491a65c3f1381...|                I3S3|25.137931034482758|   145|2024-05-07 18:40:35|
|663b491a65c3f1381...|waiting_at_origin...|              -1.0|   324|2024-05-07 18:39:50|
|663b491a65c3f1381...|                I1W1| 33.40759078818973|   202|2024-05-07 18:40:20|
|663b491a65c3f1381...|                I2N2|25.259067357512954|   193|2024-05-07 18:40:35|
|663b491a65c3f1381...|                N3I3| 19.80654761904762|   336|2024-05-07 18:39:50|
|663b491a65c3f1381...|                S2I2|19.897260273972602|   438|2024-05-07 18:39:50|
|663b491a6