<a href="https://colab.research.google.com/github/mdias23i/DE-DataProcessing/blob/main/spark_streaming/examples/example_1_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Usecase 1
- Defining a sample dataset
- Splitting dataset in many CSVs and uploading them in the input folder in async mode
- Use Spark streaming to read from input folder
- Checking results from query in memory

# Setting up PySpark

In [2]:
%pip install pyspark



In [4]:
from pyspark.sql import SparkSession

#SparkSession.stop(spark)
spark = SparkSession.builder.master('local').appName('Test streaming').getOrCreate()
sc = spark.sparkContext

# Reading sample dataset

In [6]:
from pyspark import SparkFiles
from pyspark.sql.types import *
import uuid
from pyspark.sql.functions import udf
from pyspark.sql import DataFrame

def get_data(url: str, filename: str, schema) -> DataFrame:
  # read using SparkFiles
  spark.sparkContext.addFile(url)
  data = sc.textFile(SparkFiles.get(filename))

  # removing header
  header = data.first()
  data = data.filter(lambda line: line != header)

  df = spark.read.csv(data, header=True, schema=schema)
  return df

@udf
def generate_uuid():
  return str(uuid.uuid4())

schema = StructType([
StructField('Area Name',StringType(),True),
StructField('Area ID',StringType(),True),
StructField('Park Name',StringType(),True),
StructField('Park ID', StringType(), True),
StructField('Squirrel ID', StringType(), True),
StructField('Primary Fur Color', StringType(), True),
StructField('Highlights in Fur Color', StringType(), True),
StructField('Color Notes', StringType(), True),
StructField('Location', StringType(), True),
StructField('Above Ground (Height in Feet)', StringType(), True),
StructField('Specific Location', StringType(), True),
StructField('Activities', StringType(), True),
StructField('Interactions with Humans', StringType(), True),
StructField('Squirrel Latitude (DD.DDDDDD)', StringType(), True),
StructField('Squirrel Longitude (-DD.DDDDDD)', StringType(), True),
StructField('hash', StringType(), True)
])

df = get_data(url = "https://raw.githubusercontent.com/lucprosa/dataeng-basic-course/main/data/squirrel-data/squirrel-data.csv", filename = "squirrel-data.csv", schema = schema)
df = df.withColumn("hash", generate_uuid())

df = df.cache()
df.count()

df.show()

+---------------+-------+-------------------+-------+-----------+-----------------+-----------------------+-----------+------------+-----------------------------+-----------------+--------------------+------------------------+-----------------------------+-------------------------------+--------------------+
|      Area Name|Area ID|          Park Name|Park ID|Squirrel ID|Primary Fur Color|Highlights in Fur Color|Color Notes|    Location|Above Ground (Height in Feet)|Specific Location|          Activities|Interactions with Humans|Squirrel Latitude (DD.DDDDDD)|Squirrel Longitude (-DD.DDDDDD)|                hash|
+---------------+-------+-------------------+-------+-----------+-----------------+-----------------------+-----------+------------+-----------------------------+-----------------+--------------------+------------------------+-----------------------------+-------------------------------+--------------------+
|UPPER MANHATTAN|      A|    Fort Tryon Park|     01|    A-01-02|     

# Removing folders

In [7]:
!rm -rf /content/input/
!rm -rf /content/output/
!rm -rf /content/checkpoint/

# Splitting dataset in many CSVs and uploading them in async mode

- split csv files in many small files
- sleep some seconds between each process
- write small file into input folder

In [8]:
from pyspark.sql import DataFrame
import time
import asyncio

# Function to split csv into many files (assync)
async def splitDf(df: DataFrame, weight: float, files: int):
  weights = [weight for i in range(files)]
  dfs = df.randomSplit(weights)
  return dfs

# Function to write data as csv (assync)
async def writeFile(dfs: list[DataFrame], path: str, seconds_per_file: int):
  for i in range(len(dfs)):
    df = dfs[i]
    #print(f"Writing file {path}file_{i}.csv with {df.count()} lines")
    df.write.mode("append").format("csv").save(path)
    await asyncio.sleep(seconds_per_file)

async def main(df):
  files = 20
  seconds_per_file = 10
  dfs = await splitDf(df, 1.0, files)
  asyncio.create_task(writeFile(dfs, "/content/input/", seconds_per_file))


# Start producer



In [9]:
await main(df)

# Defining streaming functions

- streaming_1
  - read streaming data from csv
  - write streaming data using MEMORY
  - query "my_query" to check results

- streaming_2
  - read streaming data from csv
  - write streaming data as PARQUET
      - define checkpoint
      - trigger time of 5 seconds
      - mode append


In [10]:
from pyspark.sql.streaming import StreamingQuery

def streaming_1(queryName: str) -> StreamingQuery:

  # Start read of file stream (csv) from input folder
  stream1 = spark.readStream.format('csv').schema(schema).option('header', False).load('/content/input/')

  # Check if dataframe is streaming
  print(stream1.isStreaming)

  # Start write as streaming into memory
  query = (stream1.writeStream
  .format('memory')
  .queryName(queryName)
  .trigger(processingTime='5 seconds')
  .outputMode('append')
  .start()
  )
  return query

def streaming_2() -> StreamingQuery:

  # Start read of file stream (csv) from input folder
  stream1 = spark.readStream.format('csv').schema(schema).option('header', False).load('/content/input/')

  # Check if dataframe is streaming
  print(stream1.isStreaming)

  # Start write as streaming into memory
  query = (stream1.writeStream
  .format('parquet')
  .option('path', '/content/output')
  .option('checkpointLocation', '/content/checkpoint')
  .trigger(processingTime='5 seconds')
  .outputMode('append')
  .start()
  )
  return query

# Start streaming_1 (memory)

In [11]:
query = streaming_1("qry1")

True


In [15]:
# querying data in memory
spark.sql("select count(1) from qry1").show()

+--------+
|count(1)|
+--------+
|     432|
+--------+



In [16]:
# checking results
df.count() == spark.sql("select * from qry1").count()

True

# Stop writeStreaming

In [17]:
query.stop()

# Start streaming_2 (parquet)



In [18]:
query = streaming_2()

True


In [19]:
query.stop()

# Checking output

In [20]:
df.count()

432

In [21]:
input = spark.read.format("csv").load("/content/input/*")
output = spark.read.format("parquet").schema(schema).load("/content/output/*")

print(f"input - {input.count()}")
print(f"output - {output.count()}")


input - 432
output - 432


In [22]:
query.stop()

{'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False}

In [23]:
# Counting csv files in INPUT folder
!ls -l /content/input | grep .csv | wc -l

20


In [24]:
# Counting csv files in OUTPUT folder
!ls -l /content/output | grep .parquet | wc -l

1


In [25]:
# check streaming query attributes
query.lastProgress
query.lastProgress['numInputRows']
query.recentProgress
query.id
query.name
query.status

{'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False}

In [26]:
# checking input schema
input.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)
 |-- _c12: string (nullable = true)
 |-- _c13: string (nullable = true)
 |-- _c14: string (nullable = true)
 |-- _c15: string (nullable = true)



In [27]:
# checking output schema
output.printSchema()

root
 |-- Area Name: string (nullable = true)
 |-- Area ID: string (nullable = true)
 |-- Park Name: string (nullable = true)
 |-- Park ID: string (nullable = true)
 |-- Squirrel ID: string (nullable = true)
 |-- Primary Fur Color: string (nullable = true)
 |-- Highlights in Fur Color: string (nullable = true)
 |-- Color Notes: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Above Ground (Height in Feet): string (nullable = true)
 |-- Specific Location: string (nullable = true)
 |-- Activities: string (nullable = true)
 |-- Interactions with Humans: string (nullable = true)
 |-- Squirrel Latitude (DD.DDDDDD): string (nullable = true)
 |-- Squirrel Longitude (-DD.DDDDDD): string (nullable = true)
 |-- hash: string (nullable = true)

