In [1]:
from pyspark import SparkContext, SparkConf, SQLContext, Row
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import json
from os import listdir
from os.path import isfile, join
import platform
import pandas as pd
from itertools import chain
import pyspark.sql.functions as func
from datetime import datetime
import numpy as np
import os
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType, ArrayType

In [2]:
# Spark packages and Cassandra connection settings handed to the PySpark launcher.
# The connection details can be overridden through the CASSANDRA_HOST,
# CASSANDRA_PORT, CASSANDRA_USERNAME and CASSANDRA_PASSWORD environment
# variables so real credentials do not have to be stored in the notebook;
# the fallbacks are the local-development values.
os.environ["PYSPARK_SUBMIT_ARGS"] = " ".join([
    "--packages",
    "com.datastax.spark:spark-cassandra-connector_2.12:3.0.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1",
    "--conf", "spark.cassandra.connection.host=" + os.environ.get("CASSANDRA_HOST", "127.0.0.1"),
    "--conf", "spark.cassandra.connection.port=" + os.environ.get("CASSANDRA_PORT", "9042"),
    "--conf", "spark.cassandra.auth.username=" + os.environ.get("CASSANDRA_USERNAME", "cassandra"),
    "--conf", "spark.cassandra.auth.password=" + os.environ.get("CASSANDRA_PASSWORD", "cassandra"),
    "pyspark-shell",
])

In [3]:
# Address (host:port) of the Kafka broker the stream is read from.
KAFKA_BOOTSTRAP_SERVER = "localhost:9092"

In [4]:
# Create (or reuse) the session for this notebook; "local[*]" is the master URL.
spark = (
    SparkSession.builder
    .appName("LogsAnalysisWithSpark")
    .master("local[*]")
    .getOrCreate()
)

In [5]:
# Use Spark's legacy datetime pattern parser.
# NOTE(review): presumably required for the 'u' pattern passed to date_format
# where week_day_number is derived further down — confirm.
spark.conf.set("spark.sql.legacy.timeParserPolicy","LEGACY")

In [6]:
# Infer the event schema from a static sample file; the streamed Kafka
# payloads are parsed against this schema with from_json further down.
sample = spark.read.json("../../preprocessed/roster_data.json")
schema = sample.schema

In [7]:
def get_stream_for_topics(topics, spark, kafka_server):
    """Open a streaming Kafka source subscribed to the given topics.

    Parameters
    ----------
    topics : iterable of str
        Topic names; they are joined with commas for the "subscribe" option.
    spark
        Object exposing ``readStream`` (the notebook passes its SparkSession).
    kafka_server : str
        Value for the "kafka.bootstrap.servers" option.

    Returns
    -------
    Whatever ``load()`` returns on the configured stream reader.
    """
    reader = spark.readStream.format("kafka")
    reader = reader.option("kafka.bootstrap.servers", kafka_server)
    reader = reader.option("subscribe", ",".join(topics))
    return reader.load()

In [8]:
# Raw Kafka stream for the "rosterUpdate11" topic.
streamingInputDF = get_stream_for_topics(["rosterUpdate11"], spark, KAFKA_BOOTSTRAP_SERVER)

In [9]:
# Keep only the Kafka record `value`, cast to a string so it can be parsed as JSON.
valuesDF = streamingInputDF.selectExpr("CAST(value AS STRING)")

In [10]:
# Parse each JSON string against the sampled schema; the result is a single
# struct column named "event".
eventsDF = valuesDF.select(from_json(valuesDF.value, schema).alias("event"))

In [11]:
eventsDF.printSchema()

root
 |-- event: struct (nullable = true)
 |    |-- call: string (nullable = true)
 |    |-- date: string (nullable = true)
 |    |-- message: struct (nullable = true)
 |    |    |-- messageId: long (nullable = true)
 |    |    |-- subscriptionIndex: long (nullable = true)
 |    |    |-- type: string (nullable = true)
 |    |    |-- updates: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- activeSpeaker: boolean (nullable = true)
 |    |    |    |    |-- canMove: boolean (nullable = true)
 |    |    |    |    |-- direction: string (nullable = true)
 |    |    |    |    |-- endpointRecording: string (nullable = true)
 |    |    |    |    |-- layout: string (nullable = true)
 |    |    |    |    |-- movedParticipant: string (nullable = true)
 |    |    |    |    |-- movedParticipantCallBridge: string (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |    |-- participant: string (nullable = t

In [12]:
# Lift three fields of the parsed event struct to top-level columns,
# in the order message, date, call.
messagesDF = eventsDF.select(
    eventsDF.event.message.alias("message"),
    eventsDF.event.date.alias("date"),
    eventsDF.event.call.alias("call"),
)

In [13]:
# One output row per element of message.updates; the exploded struct lands in
# a column named `col` (see the schema printed in the next cell).
preprocessedDF = messagesDF.select("date", "call", explode(messagesDF.message.updates))

In [14]:
preprocessedDF.printSchema()

root
 |-- date: string (nullable = true)
 |-- call: string (nullable = true)
 |-- col: struct (nullable = true)
 |    |-- activeSpeaker: boolean (nullable = true)
 |    |-- canMove: boolean (nullable = true)
 |    |-- direction: string (nullable = true)
 |    |-- endpointRecording: string (nullable = true)
 |    |-- layout: string (nullable = true)
 |    |-- movedParticipant: string (nullable = true)
 |    |-- movedParticipantCallBridge: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- participant: string (nullable = true)
 |    |-- presenter: boolean (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- updateType: string (nullable = true)
 |    |-- uri: string (nullable = true)



In [15]:
# Shorthand for the exploded struct column (named `col`).
# NOTE(review): this rebinds the name `col`, shadowing pyspark.sql.functions.col
# brought in by the wildcard import in the first cell.
col = preprocessedDF.col

In [16]:
# Flatten the exploded update struct into top-level columns next to call/date.
# The struct's `name` field is not selected.
finalDF = preprocessedDF.select(
    preprocessedDF.call,
    preprocessedDF.date,
    *[
        getattr(col, field_name).alias(field_name)
        for field_name in (
            "activeSpeaker",
            "canMove",
            "direction",
            "endpointRecording",
            "layout",
            "movedParticipant",
            "movedParticipantCallBridge",
            "participant",
            "presenter",
            "state",
            "updateType",
            "uri",
        )
    ]
)

In [17]:
# Number of participants in the initial state -> initial
# Number of participants in the ringing state -> ringing
# Number of participants in the connected state -> connected
# Number of participants in the onHold state -> onhold
# Number of participants with audioMuted -> audiomuted (probably missing from the preprocessed data)
# Number of participants with videoMuted -> videomuted (probably missing from the preprocessed data)
# Number of participants with activeSpeaker -> activespeaker
# Number of presenter participants -> presenter
# Number of endpointRecording participants -> endpointrecording

In [18]:
# For every call, gather one struct per roster update (the fields the stats
# UDF reads) into a single array column named "structArray".
groupedDF = (
    finalDF
    .groupBy("call")
    .agg(
        func.collect_list(
            struct("state", "activeSpeaker", "presenter", "endpointRecording", "participant", "date", "updateType")
        ).alias("structArray")
    )
)

In [19]:
groupedDF.printSchema()

root
 |-- call: string (nullable = true)
 |-- structArray: array (nullable = true)
 |    |-- element: struct (containsNull = false)
 |    |    |-- state: string (nullable = true)
 |    |    |-- activeSpeaker: boolean (nullable = true)
 |    |    |-- presenter: boolean (nullable = true)
 |    |    |-- endpointRecording: string (nullable = true)
 |    |    |-- participant: string (nullable = true)
 |    |    |-- date: string (nullable = true)
 |    |    |-- updateType: string (nullable = true)



In [20]:
def get_last_nonempty_value(values):
    """Return the last value in ``values`` that actually carries information.

    ``None`` and zero-length values (such as ``""``) count as "nothing
    reported" and are skipped. ``False`` and ``0`` are real values and are
    kept: filtering on plain truthiness would discard them, so a flag that
    was ``True`` once could never be observed as switched off again.

    Parameters
    ----------
    values : iterable
        Values in chronological order (oldest first).

    Returns
    -------
    The last informative value, or ``None`` when there is none.
    """
    informative = [
        value
        for value in values
        if value is not None and not (hasattr(value, "__len__") and len(value) == 0)
    ]
    return informative[-1] if informative else None

In [21]:
def get_current_value(events, field):
    """Return the current value of ``field`` across a sequence of events.

    Each event is indexed with ``event[field]``; the collected values are
    handed, in order, to ``get_last_nonempty_value``.
    """
    history = []
    for event in events:
        history.append(event[field])
    return get_last_nonempty_value(history)

In [22]:
def count_true_values(values):
    """Count the truthy entries of ``values`` (``None`` is falsy, so never counted)."""
    return sum(1 for entry in values if entry)

In [23]:
class CallStats:
    """Plain container for the counters computed for one call.

    The attribute names are the same as the field names declared in
    call_stats_schema; note that the ``date`` argument is stored under the
    attribute ``datetime``.
    """

    def __init__(self, initial, connected, onhold, ringing, presenter, active_speaker, endpoint_recording, date):
        # Participants per connection state.
        self.initial, self.connected, self.onhold, self.ringing = initial, connected, onhold, ringing
        # Participants per flag.
        self.presenter, self.active_speaker, self.endpoint_recording = presenter, active_speaker, endpoint_recording
        # Timestamp the counters refer to.
        self.datetime = date

In [24]:
# Return schema of the stats UDF: seven non-nullable integer counters
# followed by a nullable timestamp.
call_stats_schema = StructType(
    [
        StructField(counter_name, IntegerType(), False)
        for counter_name in (
            "initial",
            "connected",
            "onhold",
            "ringing",
            "presenter",
            "active_speaker",
            "endpoint_recording",
        )
    ]
    + [StructField("datetime", TimestampType())]
)

In [25]:
# TODO: this can probably be optimised further.
def get_current_values(struct_array):
    """Reduce all roster events of one call to its current participant counters.

    Parameters
    ----------
    struct_array : list
        Events collected for a single call. Every element must support lookup
        by the keys "date", "participant", "updateType", "state", "presenter",
        "activeSpeaker" and "endpointRecording". The list is not modified.

    Returns
    -------
    CallStats
        Counts of participants per state (initial / connected / onhold /
        ringing), counts of participants whose presenter, activeSpeaker and
        endpointRecording values are truthy, and the latest event date parsed
        to a ``datetime`` (``None`` when no event carries a date).

    Raises
    ------
    ValueError
        If the latest date string does not match '%Y-%m-%dT%H:%M:%S.%f'.
    """
    known_states = ("initial", "connected", "onhold", "ringing")
    tracked_fields = ("state", "presenter", "activeSpeaker", "endpointRecording")
    date_pattern = '%Y-%m-%dT%H:%M:%S.%f'

    # Work on a sorted copy instead of reordering the caller's list. Events
    # without a date sort first, so a missing date can neither break the
    # comparison nor be mistaken for the most recent event.
    events = sorted(struct_array or [], key=lambda event: event["date"] or "")

    reported_dates = [event["date"] for event in events if event["date"]]
    latest = datetime.strptime(reported_dates[-1], date_pattern) if reported_dates else None

    # A participant with any "remove" update is excluded entirely.
    removed = {event["participant"] for event in events if event["updateType"] == "remove"}

    # Chronological history of every remaining participant.
    histories = {}
    for event in events:
        participant = event["participant"]
        if participant not in removed:
            histories.setdefault(participant, []).append(event)

    # One current value per participant for each tracked field.
    current = {
        field: [get_current_value(history, field) for history in histories.values()]
        for field in tracked_fields
    }

    # Count only the states that have a counter. A participant whose state was
    # never reported (None) or is not one of the known states is skipped
    # instead of raising KeyError. Matching is case-insensitive, so a value
    # such as "onHold" feeds the "onhold" counter.
    state_counts = dict.fromkeys(known_states, 0)
    for state in current["state"]:
        key = state.lower() if isinstance(state, str) else state
        if key in state_counts:
            state_counts[key] += 1

    return CallStats(state_counts["initial"],
                     state_counts["connected"],
                     state_counts["onhold"],
                     state_counts["ringing"],
                     count_true_values(current["presenter"]),
                     count_true_values(current["activeSpeaker"]),
                     count_true_values(current["endpointRecording"]),
                     latest)

# Wrap the reducer as a Spark UDF returning a struct shaped by call_stats_schema.
get_current_values_udf = udf(get_current_values, call_stats_schema)

In [26]:
# UDF joining the elements of an array into one string, with "*" standing in
# for every null element.
concat_udf = udf(
    lambda cols: "".join("*" if x is None else x for x in cols),
    StringType(),
)

In [27]:
# Also register both UDFs with the session under their own names.
spark.udf.register("get_current_values_udf", get_current_values_udf)
spark.udf.register("concat_udf", concat_udf)

<function __main__.<lambda>(cols)>

In [28]:
# Apply the stats UDF to each call's event array; the result is a struct
# column ("current_values") shaped by call_stats_schema.
currentDF = groupedDF.withColumn("current_values", get_current_values_udf("structArray"))

In [29]:
currentDF.printSchema()

root
 |-- call: string (nullable = true)
 |-- structArray: array (nullable = true)
 |    |-- element: struct (containsNull = false)
 |    |    |-- state: string (nullable = true)
 |    |    |-- activeSpeaker: boolean (nullable = true)
 |    |    |-- presenter: boolean (nullable = true)
 |    |    |-- endpointRecording: string (nullable = true)
 |    |    |-- participant: string (nullable = true)
 |    |    |-- date: string (nullable = true)
 |    |    |-- updateType: string (nullable = true)
 |-- current_values: struct (nullable = true)
 |    |-- initial: integer (nullable = false)
 |    |-- connected: integer (nullable = false)
 |    |-- onhold: integer (nullable = false)
 |    |-- ringing: integer (nullable = false)
 |    |-- presenter: integer (nullable = false)
 |    |-- active_speaker: integer (nullable = false)
 |    |-- endpoint_recording: integer (nullable = false)
 |    |-- datetime: timestamp (nullable = true)



In [30]:
# Shorthand for the struct column produced by the stats UDF.
values = currentDF.current_values

In [31]:
# Flatten the stats struct into top-level columns, keep the call as "call_id",
# then derive three extra columns:
#   id              - call_id and datetime joined by concat_udf
#   hour            - hour of datetime
#   week_day_number - datetime formatted with pattern 'u', cast to integer
testDF = (
    currentDF
    .select(
        *[
            getattr(values, stat_name).alias(stat_name)
            for stat_name in (
                "initial",
                "connected",
                "onhold",
                "ringing",
                "presenter",
                "active_speaker",
                "endpoint_recording",
                "datetime",
            )
        ],
        currentDF.call.alias("call_id"),
    )
    .withColumn("id", concat_udf(func.array("call_id", "datetime")))
    .withColumn("hour", hour("datetime"))
    .withColumn("week_day_number", date_format("datetime", 'u').cast(IntegerType()))
)

In [32]:
testDF.printSchema()

root
 |-- initial: integer (nullable = true)
 |-- connected: integer (nullable = true)
 |-- onhold: integer (nullable = true)
 |-- ringing: integer (nullable = true)
 |-- presenter: integer (nullable = true)
 |-- active_speaker: integer (nullable = true)
 |-- endpoint_recording: integer (nullable = true)
 |-- datetime: timestamp (nullable = true)
 |-- call_id: string (nullable = true)
 |-- id: string (nullable = true)
 |-- hour: integer (nullable = true)
 |-- week_day_number: integer (nullable = true)



In [33]:
def writeToCassandra(writeDF, epochId):
    """Append one batch to the Cassandra table ``test_part`` in keyspace ``engineering``.

    The two-argument signature is what ``foreachBatch`` is given in the next
    cell; ``epochId`` is accepted but not used. Returns ``None``.
    """
    cassandra_writer = writeDF.write.format("org.apache.spark.sql.cassandra")
    cassandra_writer = cassandra_writer.options(table="test_part", keyspace="engineering")
    cassandra_writer.mode("append").save()

In [34]:
# Streaming writer in "complete" output mode; every batch is passed to
# writeToCassandra.
writer = (
    testDF.writeStream
    .outputMode("complete")
    .foreachBatch(writeToCassandra)
)

In [35]:
# Start the streaming query and block this cell until it terminates
# (the recorded run was ended by interrupting the kernel).
query = writer.start()
query.awaitTermination()

KeyboardInterrupt: 