# Management and Analysis of Physics Dataset - mod.B

## Final project: Streaming processing of cosmic rays using Drift Tubes detectors

The goal of this project is to reproduce the real-time processing of real data collected by a particle physics detector and to publish the results to a dashboard for live monitoring.

### Students:
* Conforto Filippo (2021856)
* Domenichetti Lorenzo (missing)
* Faorlin Tommaso (2021857)

## Structured Streaming notebook

In [14]:
import json
import findspark
import numpy as np

from kafka import KafkaProducer
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.sql.types import StructField, StructType, StringType, DoubleType, IntegerType
from pyspark.sql.functions import from_json, col, max, min

## Creating Spark context

In [2]:
# Location of the Spark distribution that findspark should expose to this kernel.
# NOTE(review): the import cell above already imports pyspark before this cell
# runs; on a fresh kernel that only works if pyspark is importable without
# findspark -- confirm, or call findspark.init before the pyspark imports.
SPARK_HOME = '/home/packages/spark-3.1.2-bin-hadoop3.2'
findspark.init(SPARK_HOME)

In [3]:
# Build (or reuse) the SparkSession attached to the standalone cluster master.
session_builder = (
    SparkSession.builder
    .master("spark://10.67.22.100:7077")
    .appName("MAPD Final Project session")
)

# Session-level settings: the Arrow options toggle Arrow-based conversion and its
# fallback; `spark.jars.packages` requests the Kafka source/sink connector package.
spark_settings = {
    "spark.sql.execution.arrow.pyspark.enabled": "true",
    "spark.sql.execution.arrow.pyspark.fallback.enabled": "false",
    "spark.jars.packages": "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2",
}
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

In [4]:
# Bare expression: lets the notebook render the session object created above,
# as a quick check that the session exists before the stream is defined.
spark

## Trying to include Kafka in the process

In [5]:
# Kafka broker shared by the streaming source below and the results producer.
KAFKA_BOOTSTRAP_SERVERS = "10.67.22.100:9092"

# Source options: subscribe to the input topic and start from the latest offsets.
kafka_source_options = {
    "kafka.bootstrap.servers": KAFKA_BOOTSTRAP_SERVERS,
    "subscribe": "topic_stream",
    "startingOffsets": "latest",
}

# Streaming DataFrame backed by the Kafka topic.
inputDF = (
    spark.readStream
    .format("kafka")
    .options(**kafka_source_options)
    .load()
)

In [6]:
# Schema of one JSON-encoded record; every field is nullable (the default).
# NOTE(review): ORBIT_CNT is declared as a 32-bit IntegerType -- confirm that the
# counter never exceeds 2**31 - 1 in the data, otherwise a LongType is needed.
schema = (
    StructType()
    .add("HEAD", IntegerType())
    .add("FPGA", IntegerType())
    .add("TDC_CHANNEL", IntegerType())
    .add("ORBIT_CNT", IntegerType())
    .add("BX_COUNTER", IntegerType())
    .add("TDC_MEAS", DoubleType())
)

In [7]:
# Cast the Kafka `value` column to a string and parse it as JSON using `schema`;
# the parsed struct is exposed as a single column named `value`.
# (The alias that used to sit between `col("value")` and `.cast(...)` had no
# effect on the result, since only the outer alias names the output column.)
jsonDF = inputDF.select(
    from_json(col("value").cast("string"), schema).alias("value")
)

In [8]:
# Flatten the parsed struct: promote each field of `value` to a top-level column.
record_fields = (
    "HEAD",
    "FPGA",
    "TDC_CHANNEL",
    "ORBIT_CNT",
    "BX_COUNTER",
    "TDC_MEAS",
)
flatDF = jsonDF.selectExpr(*["value." + field_name for field_name in record_fields])

## Cleaning the incoming dataframe

In [9]:
# Keep only the rows whose HEAD differs from 2 (rows with a null HEAD do not
# satisfy the comparison and are dropped as well).
# NOTE(review): confirm against the detector data format that HEAD == 2 is the
# value that must be discarded rather than the one that must be kept.
head_is_not_2 = col("HEAD") != 2
df = flatDF.filter(head_is_not_2)

## Publishing the results to the consumer

In [10]:
# Kafka producer connected to the same broker as the streaming source; it is
# used as a module-level object by `batch_proc` to publish each batch's results.
producer = KafkaProducer(bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS)

In [11]:
_CHAMBERS = (0, 1, 2, 3)        # chamber ids; chambers 0-1 sit on FPGA 0, 2-3 on FPGA 1
_CHANNELS_PER_CHAMBER = 64      # each chamber owns a contiguous block of 64 TDC channels
_ORBIT_BIN_EDGES = 40           # number of bin edges used for the ORBIT_CNT histogram


def _chamber_channel_range(chamber):
    """Return ``(fpga, first_channel, end_channel)`` for ``chamber`` (end exclusive)."""
    first_channel = (chamber % 2) * _CHANNELS_PER_CHAMBER
    return chamber // 2, first_channel, first_channel + _CHANNELS_PER_CHAMBER


def _select_chamber(batch_df, chamber):
    """Return the rows of ``batch_df`` whose FPGA/TDC_CHANNEL belong to ``chamber``."""
    fpga, first_channel, end_channel = _chamber_channel_range(chamber)
    return batch_df.where(
        (col('FPGA') == fpga)
        & (col('TDC_CHANNEL') >= first_channel)
        & (col('TDC_CHANNEL') < end_channel)
    )


def batch_proc(batch_df, epoch_id):
    """Aggregate one micro-batch and publish the summary to ``topic_results``.

    For a non-empty batch the following JSON document is sent through the
    module-level ``producer``:

    * ``hits``: number of rows in the batch;
    * ``hits_per_chamber``: row count for chambers 0..3;
    * ``hist_1``: per chamber, histogram of ``TDC_CHANNEL`` with one bin per
      channel (65 edges, 64 counts);
    * ``hist_2``: per chamber, histogram over ``ORBIT_CNT`` of the distinct
      ``(ORBIT_CNT, TDC_CHANNEL)`` pairs, using bin edges shared by all
      chambers and spanning the batch-wide min/max of ``ORBIT_CNT``.

    Empty batches are skipped and nothing is sent.

    Parameters
    ----------
    batch_df : DataFrame
        Micro-batch handed over by ``foreachBatch``; expected to expose the
        columns ``FPGA``, ``TDC_CHANNEL`` and ``ORBIT_CNT``.
    epoch_id : int
        Micro-batch identifier supplied by Spark (unused).
    """
    # Many independent actions run on the same micro-batch below; caching it
    # avoids recomputing the batch for every one of them.
    batch_df.persist()
    try:
        hits = batch_df.count()
        if hits == 0:
            return

        chamber_dfs = {chamber: _select_chamber(batch_df, chamber) for chamber in _CHAMBERS}
        hits_per_chamber = [chamber_dfs[chamber].count() for chamber in _CHAMBERS]

        # --- hist_1: hits per TDC channel ------------------------------------
        hist = {}
        for chamber, chamber_df in chamber_dfs.items():
            _, first_channel, end_channel = _chamber_channel_range(chamber)
            # N channels need N + 1 edges. RDD.histogram closes the last bucket
            # on the right, so with only N edges the two highest channels of the
            # chamber would be merged into a single bin.
            channel_edges = list(range(first_channel, end_channel + 1))
            bins, counts = (
                chamber_df
                .select('TDC_CHANNEL')
                .rdd.map(lambda row: row.TDC_CHANNEL)
                .histogram(channel_edges)
            )
            hist[chamber] = {
                'bins': list(map(int, bins)),      # plain Python ints for json.dumps
                'counts': list(map(int, counts)),
            }

        # --- hist_2: active channels per orbit --------------------------------
        min_orbit, max_orbit = batch_df.selectExpr(
            "min(ORBIT_CNT)", "max(ORBIT_CNT)"
        ).first()

        if min_orbit is None:
            # Every ORBIT_CNT in the batch is null: there is nothing to bin.
            orbit_edges = None
        else:
            # RDD.histogram rejects duplicated edges, which is what linspace
            # produces when all hits share one orbit; widen the range by one.
            upper_orbit = max_orbit if max_orbit > min_orbit else min_orbit + 1
            orbit_edges = np.linspace(min_orbit, upper_orbit, _ORBIT_BIN_EDGES).tolist()

        hist2 = {}
        for chamber, chamber_df in chamber_dfs.items():
            if orbit_edges is None:
                hist2[chamber] = {'bins': [], 'counts': []}
                continue

            bins, counts = (
                chamber_df
                .groupBy("ORBIT_CNT", "TDC_CHANNEL").count()   # one row per distinct pair
                .select('ORBIT_CNT')
                .rdd.map(lambda row: row.ORBIT_CNT)
                .histogram(orbit_edges)
            )
            hist2[chamber] = {
                # Edges are truncated to integers, as in the messages sent so far.
                'bins': list(map(int, bins)),
                'counts': list(map(int, counts)),
            }

        result = {
            "hits" : hits,
            "hits_per_chamber": hits_per_chamber,
            "hist_1": hist,
            "hist_2": hist2
        }
        producer.send('topic_results', json.dumps(result).encode('utf-8'))
        producer.flush()
    finally:
        batch_df.unpersist()

In [None]:
# Run `batch_proc` on every micro-batch, triggering on a fixed processing-time interval.
stream_writer = (
    df.writeStream
    .trigger(processingTime='5 second')
    .foreachBatch(batch_proc)
)

# Start the query and block this cell until the query terminates.
streaming_query = stream_writer.start()
streaming_query.awaitTermination()