In [16]:
import findspark
import os
import numpy as np

from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, DoubleType, IntegerType
from pyspark.sql.functions import from_json, col
from pyspark.sql.functions import max
from pyspark.sql.functions import min

## Creating Spark context

In [17]:
# Point findspark at the local Spark 3.1.2 installation.
# NOTE(review): findspark.init() is normally called before pyspark is imported, but the
# import cell above already imports pyspark -- confirm this still works on a fresh kernel.
findspark.init('/home/packages/spark-3.1.2-bin-hadoop3.2/')

In [18]:
# Build (or reuse) the SparkSession attached to the standalone cluster master.
# The Kafka connector version should match the Spark version in use: the installation
# passed to findspark.init() is 3.1.2, so the connector is pinned to 3.1.2 as well.
spark = (
    SparkSession.builder
    .master("spark://10.67.22.100:7077")
    .appName("MAPD Final Project session")
    # Enable Arrow-based Spark <-> pandas conversion, and raise instead of silently
    # falling back to the non-Arrow path when Arrow cannot be used.
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.sql.execution.arrow.pyspark.fallback.enabled", "false")
    # Structured Streaming Kafka source, Scala 2.12 build.
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2")
    .getOrCreate()
)

# This does not work from the notebook: start ZooKeeper and Kafka manually.

#KAFKA_HOME = "/home/packages/kafka_2.13-2.7.0"
#KAFKA_BOOTSTRAP_SERVERS = "localhost:9092"

#os.system("{0}/bin/zookeeper-server-start.sh {0}/config/zookeeper.properties".format(KAFKA_HOME)) 
#os.system("{0}/bin/kafka-server-start.sh {0}/config/server.properties".format(KAFKA_HOME)) 

In [19]:
#print('{0}/bin/zookeeper-server-start.sh {0}/config/zookeeper.properties'.format(KAFKA_HOME))
#print("{0}/bin/kafka-server-start.sh {0}/config/server.properties".format(KAFKA_HOME)) 

In [20]:
# Show the SparkSession object as the cell output.
spark

## Trying to include Kafka in the process

In [21]:
# Address of the Kafka broker the streaming reader connects to.
KAFKA_BOOTSTRAP_SERVERS = "localhost:9092"

# Streaming DataFrame subscribed to the 'topic_stream' Kafka topic, starting from
# the latest offsets.
inputDF = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": KAFKA_BOOTSTRAP_SERVERS,
        "subscribe": "topic_stream",
        "startingOffsets": "latest",
    })
    .load()
)

In [22]:
# Inspect the columns of the raw Kafka stream; the message payload is the binary `value` column.
inputDF.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [23]:
# Schema of the JSON messages read from Kafka: five integer fields and one
# double field (TDC_MEAS). Every field is nullable.
schema = (
    StructType()
    .add("HEAD", IntegerType())
    .add("FPGA", IntegerType())
    .add("TDC_CHANNEL", IntegerType())
    .add("ORBIT_CNT", IntegerType())
    .add("BX_COUNTER", IntegerType())
    .add("TDC_MEAS", DoubleType())
)

In [24]:
# The Kafka payload arrives as binary: cast it to a string, parse it as JSON using
# `schema`, and keep the parsed struct in a single column named 'value'.
# (No alias is needed on the inner column; only the outer alias names the result.)
jsonDF = inputDF.select(
    from_json(col("value").cast("string"), schema).alias("value")
)

In [25]:
jsonDF.printSchema()
# The result has a single column, `value`, holding the parsed JSON as a nested struct.

root
 |-- value: struct (nullable = true)
 |    |-- HEAD: integer (nullable = true)
 |    |-- FPGA: integer (nullable = true)
 |    |-- TDC_CHANNEL: integer (nullable = true)
 |    |-- ORBIT_CNT: integer (nullable = true)
 |    |-- BX_COUNTER: integer (nullable = true)
 |    |-- TDC_MEAS: double (nullable = true)



In [26]:
# Flatten the DataFrame: promote each field of the 'value' struct to a top-level column.
flatDF = jsonDF.select(
    *(
        col(f"value.{field}")
        for field in (
            "HEAD",
            "FPGA",
            "TDC_CHANNEL",
            "ORBIT_CNT",
            "BX_COUNTER",
            "TDC_MEAS",
        )
    )
)

In [27]:
# Check that the struct fields are now top-level columns.
flatDF.printSchema()

root
 |-- HEAD: integer (nullable = true)
 |-- FPGA: integer (nullable = true)
 |-- TDC_CHANNEL: integer (nullable = true)
 |-- ORBIT_CNT: integer (nullable = true)
 |-- BX_COUNTER: integer (nullable = true)
 |-- TDC_MEAS: double (nullable = true)



In [28]:
# I think this first stream cannot carry several different schemas. The hints propose a structure,
# but that one applies 'after the processing has been done'.

In [None]:
# Optionally, print the flattened stream to the console to inspect it.
# A handle on the query is kept so that it is stopped when the blocking wait ends
# (for example when the cell is interrupted) instead of being left running.
console_query = (
    flatDF.writeStream
    .outputMode("append")
    .format("console")
    .start()
)
try:
    # Blocks the cell until the query terminates or the wait is interrupted.
    console_query.awaitTermination()
finally:
    console_query.stop()

## Processing the data without streaming with Kafka

In [5]:
# Batch (non-streaming) input: read /home/data_000019.txt as CSV with a header row,
# applying an explicit schema (five integer columns and one double, all nullable).
schema = StructType([
    StructField("HEAD",        IntegerType(), True),
    StructField("FPGA",        IntegerType(), True),
    StructField("TDC_CHANNEL", IntegerType(), True),
    StructField("ORBIT_CNT",   IntegerType(), True),
    StructField("BX_COUNTER",  IntegerType(), True),
    StructField("TDC_MEAS",    DoubleType(),  True),
])

df = spark.read.csv("/home/data_000019.txt", schema=schema, header=True)

In [6]:
# Confirm that the explicit schema was applied to the CSV data.
df.printSchema()

root
 |-- HEAD: integer (nullable = true)
 |-- FPGA: integer (nullable = true)
 |-- TDC_CHANNEL: integer (nullable = true)
 |-- ORBIT_CNT: integer (nullable = true)
 |-- BX_COUNTER: integer (nullable = true)
 |-- TDC_MEAS: double (nullable = true)



### Cleansing

In [7]:
# Keep only the rows whose HEAD value is different from 2.
df = df.filter(col("HEAD") != 2)

### Hits counting

In [8]:
# Total number of hits: the number of rows left after the cleansing step.
hits = df.count()

### Hits counting per chamber

In [36]:
# Hits per chamber. Each chamber is identified by an FPGA value and a block of
# 64 TDC channels: FPGA 0 -> chambers 0 and 1, FPGA 1 -> chambers 2 and 3.
hits_ch0 = df.filter(
    (col("FPGA") == 0) & (col("TDC_CHANNEL") >= 0) & (col("TDC_CHANNEL") < 64)
).count()
hits_ch1 = df.filter(
    (col("FPGA") == 0) & (col("TDC_CHANNEL") >= 64) & (col("TDC_CHANNEL") < 128)
).count()
hits_ch2 = df.filter(
    (col("FPGA") == 1) & (col("TDC_CHANNEL") >= 0) & (col("TDC_CHANNEL") < 64)
).count()
hits_ch3 = df.filter(
    (col("FPGA") == 1) & (col("TDC_CHANNEL") >= 64) & (col("TDC_CHANNEL") < 128)
).count()

In [37]:
# Print the hit counts of chambers 0 to 3, in that order.
print(hits_ch0,hits_ch1,hits_ch2,hits_ch3)

31520 0 35628 0


### Active TDC_Channel counts

In [38]:
# Per-chamber occupancy: number of hits recorded on each TDC channel.
# Chamber c is read out by FPGA c // 2 and owns the 64 channels starting at (c % 2) * 64.
hist = {}
for chamber in [0, 1, 2, 3]:
    first_channel = (chamber % 2) * 64
    end_channel = first_channel + 64  # exclusive upper bound of the chamber's channels

    # RDD.histogram treats every bucket as [lo, hi) except the last one, which is [lo, hi].
    # 64 one-channel-wide buckets therefore need 65 edges; with only 64 edges the two
    # highest channels of the chamber would be merged into the closed final bucket.
    channel_edges = list(range(first_channel, end_channel + 1))

    bins, counts = (
        df.where(
            (col('TDC_CHANNEL') >= first_channel)
            & (col('TDC_CHANNEL') < end_channel)
            & (col('FPGA') == chamber // 2)
        )
        .select('TDC_CHANNEL')
        .rdd.map(lambda x: x.TDC_CHANNEL)
        .histogram(channel_edges)
    )

    # bins holds the 65 edges, counts the 64 per-channel hit counts.
    hist[chamber] = {'bins': bins, 'counts': counts}

### Active TDC_CHANNEL per orbit

In [98]:
# Active channels per orbit: for each chamber, count the distinct
# (ORBIT_CNT, TDC_CHANNEL) pairs falling into consecutive ORBIT_CNT intervals.
ORBIT_BIN_WIDTH = 5000

# The orbit range is the same for every chamber, so it is computed once, in a single
# Spark job, rather than twice per loop iteration.
# NOTE: `min` and `max` here are the pyspark.sql.functions aggregates imported at the
# top of the notebook, not the Python builtins.
orbit_lo, orbit_hi = df.agg(min("ORBIT_CNT"), max("ORBIT_CNT")).first()

# Uniform edges starting at orbit_lo and reaching at least orbit_hi. RDD.histogram
# ignores values outside the edges, so stopping before orbit_hi would silently drop
# the last orbits. At least one bucket is always produced.
n_orbit_bins = -(-(orbit_hi - orbit_lo) // ORBIT_BIN_WIDTH) or 1  # ceiling division
orbit_edges = [orbit_lo + i * ORBIT_BIN_WIDTH for i in range(n_orbit_bins + 1)]

hist2 = {}
for chamber in [0, 1, 2, 3]:
    first_channel = (chamber % 2) * 64
    end_channel = first_channel + 64  # exclusive upper bound of the chamber's channels

    bins, counts = (
        df.where(
            (col('TDC_CHANNEL') >= first_channel)
            & (col('TDC_CHANNEL') < end_channel)
            & (col('FPGA') == chamber // 2)
        )
        # One row per distinct (orbit, channel) pair.
        .groupBy("ORBIT_CNT", "TDC_CHANNEL").count()
        .select('ORBIT_CNT')
        .rdd.map(lambda x: x.ORBIT_CNT)
        .histogram(orbit_edges)
    )

    hist2[chamber] = {'bins': bins, 'counts': counts}

# Loro


In [10]:

from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructField, StructType, StringType, DoubleType, IntegerType

## Schema of the JSON messages used in this example: four string fields and one integer flag.
## NOTE: this rebinds `schema` and `jsonDF`, replacing any earlier definitions with the same names.
schema = StructType([
    StructField("name", StringType()),
    StructField("surname", StringType()),
    StructField("amount", StringType()),
    StructField("delta_t", StringType()),
    StructField("flag", IntegerType()),
])

## Derive a new DataFrame from the Kafka stream: cast the payload to a string, parse it
## as JSON with the schema above, and keep the parsed struct in a column named 'value'.
## (No alias is needed on the inner column; only the outer alias names the result.)
jsonDF = inputDF.select(
    from_json(col("value").cast("string"), schema).alias("value")
)

In [11]:
# Promote each field of the parsed 'value' struct to a top-level column.
flatDF = jsonDF.select(
    *(
        col(f"value.{field}")
        for field in ("name", "surname", "amount", "delta_t", "flag")
    )
)

In [15]:
# Print the flattened stream to the console.
# A handle on the query is kept so that it is stopped when the blocking wait ends
# (for example on KeyboardInterrupt) instead of being left running.
console_query = (
    flatDF.writeStream
    .outputMode('append')
    .format('console')
    .start()
)
try:
    # Blocks the cell until the query terminates or the wait is interrupted.
    console_query.awaitTermination()
finally:
    console_query.stop()

KeyboardInterrupt: 