In [4]:
from pyspark.sql import SparkSession,DataFrame
import os
import numpy as np
from pyspark.sql import functions as f
from pyspark.sql.types import *
import io
import time
from pyspark.sql import Row

In [5]:
# Switch between an in-process master (True) and the Kubernetes cluster (False).
local = True

# Both modes use the same application name and pull the Kafka connector through
# spark.jars.packages so that the "kafka" source/sink format can be used.
# NOTE(review): the previous comment here mentioned raising
# spark.rpc.message.maxSize to 1024 for large CSV writes, but this cell never
# sets that option.
if local:
    # Local mode: driver and executors share this process, using 4 threads.
    spark = (
        SparkSession.builder
        .master("local[4]")
        .appName("SparkArrowCompression")
        .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.1')
        .getOrCreate()
    )
else:
    # Cluster mode: executors run as pods. The service account and namespace
    # are read from the environment, so a missing variable raises KeyError.
    spark = (
        SparkSession.builder
        .master("k8s://https://kubernetes.default.svc:443")
        .appName("SparkArrowCompression")
        .config("spark.kubernetes.container.image", "inseefrlab/jupyter-datascience:master")
        .config("spark.kubernetes.authenticate.driver.serviceAccountName", os.environ['KUBERNETES_SERVICE_ACCOUNT'])
        .config("spark.executor.instances", "4")
        .config("spark.executor.memory", "8g")
        .config("spark.kubernetes.namespace", os.environ['KUBERNETES_NAMESPACE'])
        .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.1')
        .getOrCreate()
    )

In [3]:
# Stop the SparkContext behind `spark`.
# NOTE(review): cells below still use `spark`; after running this cell the
# session-creation cell has to be run again before they will work.
spark.sparkContext.stop()

In [26]:
# Print the Spark version reported by the active SparkContext.
print(spark.sparkContext.version)

3.1.1


In [3]:
# Shell escape: list the pods visible to kubectl with its current configuration.
! kubectl get pods

I0928 14:58:20.035335    2894 request.go:655] Throttling request took 1.171086553s, request: GET:https://kubernetes.default/apis/coordination.k8s.io/v1beta1?timeout=32s
NAME                               READY   STATUS    RESTARTS   AGE
flume-test-agent-df8c5b944-vtjbx   1/1     Running   0          9d
jupyter-266220-5bf4b859f8-wfkz6    1/1     Running   0          7h24m
kafka-client                       1/1     Running   0          82m
kafka-client1                      1/1     Running   0          79m
kafka-server-0                     1/1     Running   0          9d
kafka-server-1                     1/1     Running   0          9d
kafka-server-2                     1/1     Running   0          9d
kafka-server-zookeeper-0           1/1     Running   0          9d
rstudio-625080-8d7b9fbfb-wdmlw     1/1     Running   0          3h10m


In [28]:
! kubectl get pods | grep sparkarrow | awk '{print $1}' | xargs kubectl delete pods

I0928 13:59:34.345375    1999 request.go:655] Throttling request took 1.166803404s, request: GET:https://kubernetes.default/apis/acme.cert-manager.io/v1alpha3?timeout=32s
I0928 13:59:38.077155    2034 request.go:655] Throttling request took 1.181243355s, request: GET:https://kubernetes.default/apis/acme.cert-manager.io/v1beta1?timeout=32s
pod "sparkarrowcompression-77efdb7c26eb6e0b-exec-5" deleted
pod "sparkarrowcompression-77efdb7c26eb6e0b-exec-6" deleted
pod "sparkarrowcompression-77efdb7c26eb6e0b-exec-7" deleted
pod "sparkarrowcompression-77efdb7c26eb6e0b-exec-8" deleted


In [10]:
# Sample (key, value) pairs; both fields are strings.
# The "xiaomi 3GS" row appears twice on purpose-preserving grounds: it is kept
# exactly as in the original data.
data = [
    ("xiaomi", "2007"),
    ("xiaomi 3G", "2008"),
    ("xiaomi 3GS", "2009"),
    ("xiaomi 4", "2010"),
    ("xiaomi 4S", "2011"),
    ("xiaomi 5", "2012"),
    ("xiaomi 8", "2014"),
    ("xiaomi 3GS", "2009"),
    ("xiaomi 10", "2017"),
]

# Column types are inferred from the tuples; only the column names are supplied.
df = spark.createDataFrame(data, schema=["key", "value"])


In [11]:
# Display the rows of `df`, then its schema (column names, types, nullability).
df.show()
df.printSchema()

+----------+-----+
|       key|value|
+----------+-----+
|    xiaomi| 2007|
| xiaomi 3G| 2008|
|xiaomi 3GS| 2009|
|  xiaomi 4| 2010|
| xiaomi 4S| 2011|
|  xiaomi 5| 2012|
|  xiaomi 8| 2014|
|xiaomi 3GS| 2009|
| xiaomi 10| 2017|
+----------+-----+

root
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)



In [12]:
# One-off batch write of `df` to the Kafka topic "test_topic".
# `df` carries string `key` and `value` columns, which the Kafka sink uses as
# the record key and payload (see the Spark Kafka integration guide).
(
    df.write
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka-server.user-pengfei.svc.cluster.local:9092")
    .option("topic", "test_topic")
    .save()
)

In [4]:
# Streaming subscription to a single topic, asking the source to include
# Kafka record headers as well.
# NOTE(review): no streaming query is started on this DataFrame in the visible
# cells, and the batch-read cell that follows rebinds `df`.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka-server.user-pengfei.svc.cluster.local:9092")
    .option("subscribe", "test_topic")
    .option("includeHeaders", "true")
    .load()
)

In [13]:
# Batch (non-streaming) read of the topic "test_topic" from the same brokers.
# This rebinds `df`, replacing whatever DataFrame the name pointed to before.
df = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka-server.user-pengfei.svc.cluster.local:9092")
    .option("subscribe", "test_topic")
    .load()
)

In [7]:
# Inspect the schema of the DataFrame currently bound to `df`.
# NOTE(review): `df` is assigned in several cells above, so which schema is
# printed depends on which of them ran last.
df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [14]:
# The Kafka source delivers `key` and `value` as binary; cast both to strings
# so they are readable, and keep the record metadata columns alongside them.
df2 = df.selectExpr(
    "CAST(key AS STRING)",
    "CAST(value AS STRING)",
    "topic",
    "partition",
    "offset",
    "timestamp",
    "timestampType",
)

# truncate=False prints full cell contents instead of cutting long values.
df2.show(truncate=False)

+----------+-----+----------+---------+------+-----------------------+-------------+
|key       |value|topic     |partition|offset|timestamp              |timestampType|
+----------+-----+----------+---------+------+-----------------------+-------------+
|iphone 3GS|2009 |test_topic|0        |0     |2021-09-28 14:38:06.326|0            |
|iphone 8  |2014 |test_topic|0        |1     |2021-09-28 14:38:06.326|0            |
|iphone 4  |2010 |test_topic|0        |2     |2021-09-28 14:38:06.356|0            |
|iphone    |2007 |test_topic|0        |3     |2021-09-28 14:38:06.326|0            |
|iphone 4S |2011 |test_topic|0        |4     |2021-09-28 14:38:06.326|0            |
|iphone 10 |2017 |test_topic|0        |5     |2021-09-28 14:38:06.357|0            |
|iphone 3G |2008 |test_topic|0        |6     |2021-09-28 14:38:06.357|0            |
|iphone 5  |2012 |test_topic|0        |7     |2021-09-28 14:38:06.357|0            |
|xiaomi    |2007 |test_topic|0        |8     |2021-09-28 15:18:02