In [4]:
!pip install -q pyspark

In [5]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists; otherwise start a new one
# registered under the application name "Colab PySpark".
spark = (
    SparkSession.builder
    .appName("Colab PySpark")
    .getOrCreate()
)


In [6]:
from pyspark.sql.types import StructType, StringType, IntegerType

# Explicit schema for the order records: every column is a string except
# `quantity`, which is an integer.
schema = (
    StructType()
    .add("order_id", StringType())
    .add("customer_id", StringType())
    .add("product", StringType())
    .add("quantity", IntegerType())
    .add("region", StringType())
)

# Three sample orders, one tuple per row, in the column order of `schema`:
# (order_id, customer_id, product, quantity, region)
initial_data = [
    ("1", "C101", "Laptop", 2, "South"),
    ("2", "C102", "Chair", 6, "North"),
    ("3", "C103", "Mobile", 1, "East"),
]

# Build the DataFrame from the rows, typed by the explicit schema
df = spark.createDataFrame(initial_data, schema)

# Destination directory on the local filesystem of the Colab VM
output_path = "/tmp/stream_orders"

# Write the orders as CSV with a header row, replacing any previous output
csv_writer = df.write.mode("overwrite").option("header", True)
csv_writer.csv(output_path)

print("Saved to:", output_path)


Saved to: /tmp/stream_orders


In [10]:
# Open a streaming reader over the CSV order files saved earlier, parsing
# them with the same explicit schema and header setting used for the write.
# The directory comes from `output_path` rather than a repeated string
# literal, so the reader always points at the location that was written.
orders_stream = (
    spark.readStream
    .schema(schema)
    .option("header", True)
    .csv(output_path)  # local path instead of dbfs
)

In [13]:
from pyspark.sql.functions import when, col

# Flag an order as bulk when its quantity is above 5; every row that does
# not satisfy that condition gets False through the `otherwise` branch.
is_bulk_order = when(col("quantity") > 5, True).otherwise(False)

transformed_orders = orders_stream.withColumn("bulk_order", is_bulk_order)

In [14]:
# Step 3: Create rate stream
# The "rate" source is configured here to produce one row per second.
rate_source = spark.readStream.format("rate").option("rowsPerSecond", 1)
rate_df = rate_source.load()

# Add is_even column: True when `value` is divisible by 2
value_is_even = col("value") % 2 == 0
transformed_df = rate_df.withColumn("is_even", value_is_even)

In [15]:
# Step 4: Write to memory table
# Configure the sink first: results are appended to an in-memory table, and
# the query name "rate_table" is the name used later with spark.sql().
memory_writer = (
    transformed_df.writeStream
    .format("memory")
    .queryName("rate_table")
    .outputMode("append")
)

# Starting the writer launches the streaming query and returns its handle
query = memory_writer.start()

In [16]:
# Step 5: Wait a few seconds for data to accumulate
import time

WARMUP_SECONDS = 5  # pause long enough for the stream to emit some rows
time.sleep(WARMUP_SECONDS)


In [17]:
# Step 6: Query in-memory table
# Read whatever the streaming query has written to rate_table so far
rate_snapshot = spark.sql("SELECT * FROM rate_table")
rate_snapshot.show()


+--------------------+-----+-------+
|           timestamp|value|is_even|
+--------------------+-----+-------+
|2025-08-08 11:35:...|    0|   true|
|2025-08-08 11:35:...|    1|  false|
|2025-08-08 11:35:...|    2|   true|
|2025-08-08 11:35:...|    3|  false|
|2025-08-08 11:35:...|    4|   true|
|2025-08-08 11:35:...|    5|  false|
|2025-08-08 11:35:...|    6|   true|
+--------------------+-----+-------+



In [18]:
# Step 7: Stop the streaming query started in Step 4 so that it no longer
# runs in the background writing to the in-memory rate_table.
query.stop()


In [23]:
# Variation on the rate stream:
#   * flag whether each value is a multiple of 5 and of 10
#   * categorize each value as 'Small' (< 10), 'Medium' (< 50) or 'Large'
import time

# Transform the stream
transformed_df = (
    rate_df
    .withColumn("multiple_of_5", col("value") % 5 == 0)
    .withColumn("multiple_of_10", col("value") % 10 == 0)
    .withColumn(
        "size_category",
        # Conditions are checked in order, so "Medium" covers 10 <= value < 50
        when(col("value") < 10, "Small")
        .when(col("value") < 50, "Medium")
        .otherwise("Large"),
    )
)

# Write stream to in-memory table. This reuses the "rate_table" name of the
# earlier query, which was stopped in Step 7.
query = (
    transformed_df.writeStream
    .format("memory")
    .queryName("rate_table")  # You can query this later
    .outputMode("append")
    .start()
)

try:
    # Wait to accumulate data
    time.sleep(5)

    # Query and display the streaming memory table
    spark.sql("SELECT * FROM rate_table").show()
finally:
    # Always stop the stream, even if the wait or the query above fails;
    # otherwise the query would keep running under the name "rate_table".
    query.stop()

+--------------------+-----+-------------+--------------+-------------+
|           timestamp|value|multiple_of_5|multiple_of_10|size_category|
+--------------------+-----+-------------+--------------+-------------+
|2025-08-08 11:43:...|    0|         true|          true|        Small|
|2025-08-08 11:43:...|    1|        false|         false|        Small|
|2025-08-08 11:43:...|    2|        false|         false|        Small|
|2025-08-08 11:43:...|    3|        false|         false|        Small|
+--------------------+-----+-------------+--------------+-------------+

