<a href="https://colab.research.google.com/github/margaridagomes/dataeng-basic-course/blob/main/spark_streaming/coinbase_consumer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Consumer Coinbase

- To be executed in Google Colab
- Connect to GCLOUD
- Read data from GCS as streaming
- Analyze data

In [43]:
!pip install pyspark

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("GCS Streaming") \
    .config("spark.jars.packages", "com.google.cloud.bigdataoss:gcs-connector:hadoop3-2.2.5") \
    .getOrCreate()



In [44]:
from google.colab import auth
auth.authenticate_user()

project_id = 'data-eng-dev-437916'
!gcloud config set project {project_id}

Updated property [core/project].


In [45]:
!apt-get install openjdk-11-jdk -y
!pip install pyspark gcsfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
openjdk-11-jdk is already the newest version (11.0.27+6~us1-0ubuntu1~22.04).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [74]:
from pyspark.sql import SparkSession

GCS_JAR = "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.9/gcs-connector-hadoop3-2.2.9-shaded.jar"

spark = SparkSession.builder \
    .appName("GCSStreamingDemo") \
    .config("spark.jars", GCS_JAR) \
    .config("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem") \
    .config("spark.hadoop.fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS") \
    .getOrCreate()

In [75]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Inicializa a sessão Spark
spark = SparkSession.builder.appName("BTCPriceBatch").getOrCreate()

# Define o schema
schema = "type STRING, sequence LONG, product_id STRING, price STRING, time STRING"

# Lê os dados JSON em modo batch
df = spark.read.schema(schema).json("gs://edit-data-eng-dev/datalake/landing/btc/")

# Seleciona e transforma as colunas necessárias
transformed_df = df.select("time", "product_id", col("price").cast("double"))

# Grava o resultado como uma tabela temporária em memória (apenas para consulta)
transformed_df.createOrReplaceTempView("btc_price_batch")

In [70]:
transformed_df.show(10, False)

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/socket.py", line 718, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [71]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

schema = "type STRING, sequence LONG, product_id STRING, price STRING, time STRING"

dfs = spark.readStream.schema(schema).json("gs://edit-data-eng-dev/datalake/landing/btc/")

stream = dfs.select("time", "product_id", col("price").cast("double")) \
  .writeStream \
  .outputMode("append") \
  .queryName("btc_price_stream3") \
  .format("memory") \
  .start()

In [72]:
dfs = spark.sql("select * from btc_price_stream3")
dfs.show()

+----+----------+-----+
|time|product_id|price|
+----+----------+-----+
+----+----------+-----+



In [76]:
df = spark.sql("select * from btc_price_stream")
df.show()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/socket.py", line 718, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
# Analysis

# Latest Bitcoin price
# Calculate average BTC price per minute
# Calculate standard deviation of price over time
# How many price tickets per minute?
# Find anomalies (price == nulls or with strange values)

In [73]:
# Latest Bitcoin price

from pyspark.sql.functions import *

# Latest Bitcoin price
df = spark.sql("select * from btc_price_stream where product_id = 'BTC-USD' order by time desc limit 1")
df.show()

import pyspark.sql.functions as f
from pyspark.sql.window import Window
WindowSpec = Window.partitionBy("product_id").orderBy("time")
df2 = df.filter("product_id = 'BTC-USD'").withColumn("last_price", f.first("price").over(WindowSpec))
#df2 = df.filter(f.col("last_price")==f.col("price"))

# Calcula o primeiro preço por produto
#df2 = df.withColumn("last_price", f.first("price").over(window_spec))

# Filtra onde o preço atual é igual ao primeiro preço
#df2 = df2.filter(f.col("last_price") == f.col("price"))

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/socket.py", line 718, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [53]:
# Latest Bitcoin price
df2.show(10,False)

+---------------------------+----------+---------+----------+
|time                       |product_id|price    |last_price|
+---------------------------+----------+---------+----------+
|2025-07-05T10:23:12.016557Z|BTC-USD   |108165.88|108165.88 |
+---------------------------+----------+---------+----------+



In [56]:
# count and average per product_id
df.groupBy("product_id").agg(count(lit("1")).alias("count"), avg("price").alias("avg_price")).show()

+----------+-----+---------+
|product_id|count|avg_price|
+----------+-----+---------+
|   BTC-USD|    1|108179.29|
+----------+-----+---------+



In [58]:
!gsutil ls gs://edit-data-eng-dev/datalake/landing/btc/ | wc

    592     592   43808


In [65]:
df.select(f.min("time"), f.max("time")).show(1,False)

+---------------------------+---------------------------+
|min(time)                  |max(time)                  |
+---------------------------+---------------------------+
|2025-07-05T10:23:30.761387Z|2025-07-05T10:23:30.761387Z|
+---------------------------+---------------------------+



In [None]:
df3 = spark.read.json("gs://edit-data-eng-dev/datalake/landing/btc/btc_20250705102018542731.json")
df3.select(f.min("time"), f.max("time")).show(1,False)

In [None]:
# Calculate average BTC price per minute
df.filter("product_id == 'BTC-USD'").groupBy(window("time","1 min")).agg(avg("price").alias("avg_price")).orderBy("window").show(10, False)

In [None]:
# Calculate standard deviation of price over time
