## To-do list
<ul>
<li>Filter out large gap not alerting field</li>
<li>Fix streaming abort (daily/weekly/monthly)</li>
</ul>

In [1]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime
from pyspark.sql.types import StructType
import json
# Schema used by from_json() to decode the JSON value of each Kafka record.
# A record has two nested structs: "metadata" (transport-level fields) and
# "data" (the log entry itself).
# The chain is wrapped in parentheses instead of using backslash line
# continuations, so no dangling "\" is left at the end of the statement.
# NOTE(review): the name says "email" but the schema is applied to the
# "cmsweb_logs" topic; the name is kept because other cells reference it.
emailSchema = (
    StructType()
    .add(
        "metadata",
        StructType()
        .add("path", StringType())
        .add("_attachment_mimetype", StringType())
        .add("type_prefix", StringType())
        .add("host", StringType())
        .add("json", StringType())
        .add("producer", StringType())
        .add("topic", StringType())
        .add("_id", StringType())
        .add("type", StringType())
        .add("timestamp", LongType()),
    )
    .add(
        "data",
        StructType()
        .add("code", StringType())
        .add("system", StringType())
        .add("uri_path", StringType())
        .add("method", StringType())
        .add("clientip", StringType())
        .add("client", StringType())
        .add("rec_date", StringType())
        .add("dn", StringType())
        .add("api", StringType())
        .add("rec_timestamp", StringType())
        .add("frontend", StringType()),
    )
)

In [2]:

# Live stream: subscribe to the single Kafka topic "cmsweb_logs".
#
# Alternative settings kept for reference:
# .option("kafka.bootstrap.servers", "188.185.79.229:9092")
# .option("startingOffsets", "earliest")
#
# The JSON value is decoded into a struct column ("record"), then flattened
# into the four columns used by the rest of the notebook:
#   host      <- metadata.host
#   timestamp <- the Kafka record timestamp (not a field of the JSON payload)
#   system    <- data.system
#   user      <- data.dn
# Rows whose `system` starts with "%" or "/" are dropped.
raw_data = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "monit-kafka.cern.ch:9092")
    .option("subscribe", "cmsweb_logs")
    .option("failOnDataLoss", False)
    .load()
    .select(
        col("timestamp"),
        from_json(col("value").cast("string"), emailSchema).alias("record"),
    )
    .select(
        col("record.metadata.host").alias("host"),
        col("timestamp").alias("timestamp"),
        col("record.data.system").alias("system"),
        col("record.data.dn").alias("user"),
    )
    .filter(~col("system").rlike("^(%|/)"))
)

raw_data.printSchema()

root
 |-- host: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- system: string (nullable = true)
 |-- user: string (nullable = true)



Exclude `system` values starting with `%` or `/` from the query,
since they do not represent real CMS systems.


In [3]:
# Count records per `system` in 1-minute windows (slide == window length,
# so the windows do not overlap). The watermark delay on `timestamp` is
# 1 minute.
groupped_data = (
    raw_data
    .withWatermark(eventTime="timestamp", delayThreshold="1 minute")
    .groupBy(
        window(timeColumn="timestamp", windowDuration="1 minute", slideDuration="1 minute"),
        "system",
    )
    .agg(count(col("system")).alias("count"))
)
groupped_data.printSchema()

root
 |-- window: struct (nullable = false)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- system: string (nullable = true)
 |-- count: long (nullable = false)



In [4]:
# Start a streaming query that appends the filtered raw records to an
# in-memory table named "email" (queryable with spark.sql).
raw_data_flow = (
    raw_data.writeStream
    .format("memory")
    .queryName("email")
    .outputMode("Append")
    .start()
)

In [5]:
# Start a streaming query that appends the per-window counts to an
# in-memory table named "groupped_email" (queryable with spark.sql).
groupped_data_flow = (
    groupped_data.writeStream
    .format("memory")
    .queryName("groupped_email")
    .outputMode("Append")
    .start()
)

In [6]:
# Persist the windowed per-system counts from `groupped_data` as Parquet
# files. `path` is the output directory; `checkpointLocation` stores the
# query's progress.
# The original chain called .outputMode("append") twice; it is set once here.
hdfs_data_flow = (
    groupped_data.writeStream
    .format("parquet")
    .option("path", "/cms/users/carizapo/ming/groupdata_cmsweb_logs")
    .option("checkpointLocation", "/cms/users/carizapo/ming/checkpoint_cmsweb_logs")
    .outputMode("append")
    .start()
)

In [9]:
# List the streaming queries that are currently active on this Spark session.
spark.streams.active

[<pyspark.sql.streaming.StreamingQuery at 0x7f2720138f98>,
 <pyspark.sql.streaming.StreamingQuery at 0x7f27201381d0>,
 <pyspark.sql.streaming.StreamingQuery at 0x7f2720138278>]

In [25]:
# Is the in-memory "groupped_email" query still running?
groupped_data_flow.isActive

True

In [20]:
# Is the in-memory "email" query still running?
raw_data_flow.isActive

False

In [21]:
# Is the Parquet writer query still running?
hdfs_data_flow.isActive

False

In [12]:
# Optional debugging helpers (uncomment to use):
# groupped_data_flow.lastProgress
# raw_data_flow.processAllAvailable()
# hdfs_data_flow.processAllAvailable()

In [32]:
# Stop the streaming query that feeds the in-memory "email" table.
raw_data_flow.stop()

In [31]:
# Stop the streaming query that feeds the in-memory "groupped_email" table.
groupped_data_flow.stop()

In [33]:
# Stop the streaming query that writes the windowed counts to Parquet.
hdfs_data_flow.stop()

In [63]:
# Read the current contents of the in-memory "email" table (filled by the
# streaming query of the same name) and print a preview.
alerts = spark.sql("select * from email")
alerts.show()

+----------------+--------------------+-------+--------------------+
|            host|           timestamp| system|                user|
+----------------+--------------------+-------+--------------------+
|vocms055.cern.ch|2019-07-10 12:08:...| phedex|/DC=ch/DC=cern/OU...|
|vocms055.cern.ch|2019-07-10 12:08:...|reqmgr2|/DC=ch/DC=cern/OU...|
|vocms055.cern.ch|2019-07-10 12:08:...|couchdb|/DC=ch/DC=cern/OU...|
|vocms055.cern.ch|2019-07-10 12:08:...|couchdb|/DC=ch/DC=cern/OU...|
|vocms055.cern.ch|2019-07-10 12:08:...|reqmgr2|/DC=ch/DC=cern/OU...|
|vocms055.cern.ch|2019-07-10 12:08:...|couchdb|/DC=ch/DC=cern/OU...|
|vocms055.cern.ch|2019-07-10 12:08:...|couchdb|/DC=ch/DC=cern/OU...|
|vocms055.cern.ch|2019-07-10 12:08:...| phedex|/DC=ch/DC=cern/OU...|
|vocms055.cern.ch|2019-07-10 12:08:...| phedex|/DC=ch/DC=cern/OU...|
|vocms055.cern.ch|2019-07-10 12:08:...|couchdb|/DC=ch/DC=cern/OU...|
|vocms055.cern.ch|2019-07-10 12:08:...|couchdb|/DC=ch/DC=cern/OU...|
|vocms055.cern.ch|2019-07-10 12:08

In [77]:
# Read the current contents of the in-memory "groupped_email" table (filled
# by the streaming query of the same name) and print a preview.
# Note: this rebinds `alerts`, which another cell also assigns.
alerts = spark.sql("select * from groupped_email")
alerts.show()

+--------------------+-------------+-----+
|              window|       system|count|
+--------------------+-------------+-----+
|[2019-07-10 12:09...|          dbs|  339|
|[2019-07-10 12:09...|       phedex|  274|
|[2019-07-10 12:08...|      couchdb|22877|
|[2019-07-10 12:08...| t0wmadatasvc|    1|
|[2019-07-10 12:08...|          img|    2|
|[2019-07-10 12:08...|          css|    1|
|[2019-07-10 12:09...|          dqm|   66|
|[2019-07-10 12:09...|wmstatsserver|   23|
|[2019-07-10 12:09...|      reqmgr2|  448|
|[2019-07-10 12:08...|       sitedb|   14|
|[2019-07-10 12:08...|      wmstats|    1|
|[2019-07-10 12:09...|    crabcache|    3|
|[2019-07-10 12:08...|       phedex|13414|
|[2019-07-10 12:08...|          dbs| 9110|
|[2019-07-10 12:08...|          das|   60|
|[2019-07-10 12:08...|          dqm|   29|
|[2019-07-10 12:09...|    scheddmon|    2|
|[2019-07-10 12:08...|wmstatsserver|  301|
|[2019-07-10 12:08...|    crabcache|   94|
|[2019-07-10 12:08...|      reqmgr2| 8250|
+----------

In [None]:
# Replay stream: same topic and same four output columns as the live
# `raw_data` stream, but reading from the earliest available offset
# ("startingOffsets" = "earliest").
#
# Fixes relative to the previous version of this cell:
#   * The filter was built from `raw_data.system`, a column that belongs to a
#     different DataFrame; it now refers to this stream's own `system` column.
#   * The un-prefixed "auto.offset.reset" option was removed: the Spark Kafka
#     source takes its starting position from "startingOffsets".
#
# Output columns:
#   host      <- metadata.host
#   timestamp <- the Kafka record timestamp
#   system    <- data.system
#   user      <- data.dn
# Rows whose `system` starts with "%" or "/" are dropped.
raw_data_old = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "monit-kafka.cern.ch:9092")
    .option("subscribe", "cmsweb_logs")
    .option("startingOffsets", "earliest")
    .load()
    .select(
        col("timestamp"),
        from_json(col("value").cast("string"), emailSchema).alias("record"),
    )
    .select(
        col("record.metadata.host").alias("host"),
        col("timestamp").alias("timestamp"),
        col("record.data.system").alias("system"),
        col("record.data.dn").alias("user"),
    )
    .filter(~col("system").rlike("^(%|/)"))
)

In [None]:
# Per-system record counts over 1-hour windows that slide every 30 minutes
# (consecutive windows overlap). Watermark delay: 15 minutes.
groupped_data_hour = (
    raw_data_old
    .withWatermark(eventTime="timestamp", delayThreshold="15 minutes")
    .groupBy(
        window(timeColumn="timestamp", windowDuration="1 hour", slideDuration="30 minutes"),
        "system",
    )
    .agg(count(col("system")).alias("count"))
)

In [None]:
# Per-system record counts over non-overlapping 1-day windows.
# Watermark delay: 1 hour.
groupped_data_day = (
    raw_data_old
    .withWatermark(eventTime="timestamp", delayThreshold="1 hours")
    .groupBy(
        window(timeColumn="timestamp", windowDuration="1 day", slideDuration="1 day"),
        "system",
    )
    .agg(count(col("system")).alias("count"))
)

In [None]:
# Per-system record counts over non-overlapping 1-week windows.
# Watermark delay: 1 hour.
groupped_data_week = (
    raw_data_old
    .withWatermark(eventTime="timestamp", delayThreshold="1 hours")
    .groupBy(
        window(timeColumn="timestamp", windowDuration="1 week", slideDuration="1 week"),
        "system",
    )
    .agg(count(col("system")).alias("count"))
)

In [None]:
# Per-system record counts for the "monthly" view. Despite the name, the
# window is 4 weeks long and slides every 2 weeks, so consecutive windows
# overlap. Watermark delay: 1 hour.
groupped_data_month = (
    raw_data_old
    .withWatermark(eventTime="timestamp", delayThreshold="1 hours")
    .groupBy(
        window(timeColumn="timestamp", windowDuration="4 weeks", slideDuration="2 weeks"),
        "system",
    )
    .agg(count(col("system")).alias("count"))
)

In [None]:
# Persist the hourly per-system counts as Parquet files.
# `path` is the output directory; `checkpointLocation` stores the query's
# progress. The original chain called .outputMode("append") twice; it is
# set once here.
hdfs_data_hour_flow = (
    groupped_data_hour.writeStream
    .format("parquet")
    .option("path", "/cms/users/carizapo/ming/groupdata_hour_cmsweb_logs")
    .option("checkpointLocation", "/cms/users/carizapo/ming/checkpoint_1a_cmsweb_logs")
    .outputMode("append")
    .start()
)

In [None]:
# Persist the daily per-system counts as Parquet files.
# `path` is the output directory; `checkpointLocation` stores the query's
# progress. The original chain called .outputMode("append") twice; it is
# set once here.
hdfs_data_day_flow = (
    groupped_data_day.writeStream
    .format("parquet")
    .option("path", "/cms/users/carizapo/ming/groupdata_day_cmsweb_logs")
    .option("checkpointLocation", "/cms/users/carizapo/ming/checkpoint_2a_cmsweb_logs")
    .outputMode("append")
    .start()
)

In [None]:
# Persist the weekly per-system counts as Parquet files.
# `path` is the output directory; `checkpointLocation` stores the query's
# progress. The original chain called .outputMode("append") twice; it is
# set once here.
hdfs_data_week_flow = (
    groupped_data_week.writeStream
    .format("parquet")
    .option("path", "/cms/users/carizapo/ming/groupdata_week_cmsweb_logs")
    .option("checkpointLocation", "/cms/users/carizapo/ming/checkpoint_3a_cmsweb_logs")
    .outputMode("append")
    .start()
)

In [None]:
# Persist the "monthly" (4-week window) per-system counts as Parquet files.
# `path` is the output directory; `checkpointLocation` stores the query's
# progress. The original chain called .outputMode("append") twice; it is
# set once here.
hdfs_data_month_flow = (
    groupped_data_month.writeStream
    .format("parquet")
    .option("path", "/cms/users/carizapo/ming/groupdata_month_cmsweb_logs")
    .option("checkpointLocation", "/cms/users/carizapo/ming/checkpoint_4a_cmsweb_logs")
    .outputMode("append")
    .start()
)

In [None]:
# Shut down the hourly, daily, weekly and monthly Parquet writer queries,
# in that order.
# hdfs_data_flow.stop()
for _flow in (
    hdfs_data_hour_flow,
    hdfs_data_day_flow,
    hdfs_data_week_flow,
    hdfs_data_month_flow,
):
    _flow.stop()

In [None]:
# Start a streaming query that appends the replayed records to an in-memory
# table named "old_data" (queryable with spark.sql).
raw_data_old_flow = (
    raw_data_old.writeStream
    .format("memory")
    .queryName("old_data")
    .outputMode("Append")
    .start()
)

In [None]:
# Read the current contents of the in-memory "old_data" table (filled by the
# streaming query of the same name) and print a preview.
alerts = spark.sql("select * from old_data")
alerts.show()