In [1]:
import findspark
import os

from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, DoubleType, IntegerType
from pyspark.sql.functions import from_json, col

## Creating Spark context

In [2]:
findspark.init('/home/packages/spark-3.1.2-bin-hadoop3.2/')

In [3]:
spark = SparkSession.builder \
    .master("spark://10.67.22.100:7077")\
    .appName("MAPD Final Project session")\
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")\
    .config("spark.sql.execution.arrow.pyspark.fallback.enabled", "false")\
    .config("spark.jars.packages","org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.1")\
    .getOrCreate()

#non funziona, fai partire a mano

#KAFKA_HOME = "/home/packages/kafka_2.13-2.7.0"
#KAFKA_BOOTSTRAP_SERVERS = "localhost:9092"

#os.system("{0}/bin/zookeeper-server-start.sh {0}/config/zookeeper.properties".format(KAFKA_HOME)) 
#os.system("{0}/bin/kafka-server-start.sh {0}/config/server.properties".format(KAFKA_HOME)) 

In [4]:
#print('{0}/bin/zookeeper-server-start.sh {0}/config/zookeeper.properties'.format(KAFKA_HOME))
#print("{0}/bin/kafka-server-start.sh {0}/config/server.properties".format(KAFKA_HOME)) 

In [4]:
spark

## Trying to include Kafka in the process

In [5]:
KAFKA_BOOTSTRAP_SERVERS = "localhost:9092"

inputDF = spark\
    .readStream\
    .format("kafka")\
    .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS)\
    .option('subscribe', 'topic_stream')\
    .option("startingOffsets", "latest") \
    .load()

In [6]:
inputDF.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [7]:
schema = StructType(
        [StructField("HEAD",        StringType()),
         StructField("FPGA",        StringType()),
         StructField("TDC_CHANNEL", StringType()),
         StructField("ORBIT_CNT",   StringType()),
         StructField("BX_COUNTER",  StringType()),
         StructField("TDC_MEAS",    StringType() )]
)

In [8]:
jsonDF = inputDF.select(from_json(col("value").alias('value').cast("string"), schema).alias('value'))

In [9]:
jsonDF.printSchema()
#a dataframe with one row and complex structures inside

root
 |-- value: struct (nullable = true)
 |    |-- HEAD: string (nullable = true)
 |    |-- FPGA: string (nullable = true)
 |    |-- TDC_CHANNEL: string (nullable = true)
 |    |-- ORBIT_CNT: string (nullable = true)
 |    |-- BX_COUNTER: string (nullable = true)
 |    |-- TDC_MEAS: string (nullable = true)



In [10]:
#I flatten out the dataframe
flatDF = jsonDF.selectExpr("value.HEAD", 
                           "value.FPGA", 
                           "value.TDC_CHANNEL",
                           "value.ORBIT_CNT",
                           "value.BX_COUNTER",
                           "value.TDC_MEAS")

In [11]:
flatDF.printSchema()

root
 |-- HEAD: string (nullable = true)
 |-- FPGA: string (nullable = true)
 |-- TDC_CHANNEL: string (nullable = true)
 |-- ORBIT_CNT: string (nullable = true)
 |-- BX_COUNTER: string (nullable = true)
 |-- TDC_MEAS: string (nullable = true)



In [12]:
#credo che per questo primo stream non si possano avere diversi schema. Negli hints viene proposta una struttura
#ma è 'dopo aver fatto il processamento'

In [None]:
#volendo qui si può provare a scrive su console 
flatDF.writeStream\
    .outputMode("append")\
    .format("console")\
    .start()\
    .awaitTermination()

## Processing the data without streaming with Kafka

In [18]:
# load dataset on dataset/lecture2/dimuon

schema = StructType()                          \
      .add("HEAD",        IntegerType(), True) \
      .add("FPGA",        IntegerType(), True) \
      .add("TDC_CHANNEL", IntegerType(), True) \
      .add("ORBIT_CNT",   IntegerType(), True) \
      .add("BX_COUNTER",  IntegerType(), True) \
      .add("TDC_MEAS",    DoubleType(),  True)

df = spark.read.format("csv") \
      .option("header",True) \
      .schema(schema) \
      .load("/home/data_000019.txt")

In [19]:
df.printSchema()

root
 |-- HEAD: integer (nullable = true)
 |-- FPGA: integer (nullable = true)
 |-- TDC_CHANNEL: integer (nullable = true)
 |-- ORBIT_CNT: integer (nullable = true)
 |-- BX_COUNTER: integer (nullable = true)
 |-- TDC_MEAS: double (nullable = true)



Cleansing

In [20]:
df = df.where("HEAD != 2")

Counting

In [21]:
hits_ch0 = df.filter('(FPGA=0) AND (TDC_CHANNEL > 0) AND (TDC_CHANNEL < 64)').count()
hits_ch1 = df.filter('(FPGA=0) AND (TDC_CHANNEL > 64) AND (TDC_CHANNEL < 127)').count()
hits_ch2 = df.filter('(FPGA=1) AND (TDC_CHANNEL > 0) AND (TDC_CHANNEL < 64)').count()
hits_ch3 = df.filter('(FPGA=1) AND (TDC_CHANNEL > 64) AND (TDC_CHANNEL < 127)').count()

In [22]:
print(hits_ch0,hits_ch1,hits_ch2,hits_ch3)

5842 0 6278 0
