todo-list
=========
<li>Migrate from static to streaming input (if possible)</li>
<li>Find a new, more efficient way to do pattern detection / anomaly detection</li>
<li>Email alert config (now sent to yanisa.sunthornyotin@cern.ch)</li>

In [1]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
# Schema handed to the streaming parquet reader: a `window` struct holding
# start/end timestamps, the `system` name, and the `count` for that window.
# Every field is left at the default nullable=True.
userSchema = StructType([
    StructField(
        "window",
        StructType([
            StructField("start", TimestampType()),
            StructField("end", TimestampType()),
        ]),
    ),
    StructField("system", StringType()),
    StructField("count", LongType()),
])

In [2]:
# Streaming view of the grouped cmsweb log counts: the parquet directory is
# opened as a stream source using the explicit `userSchema`.
raw_data = (
    spark.readStream
    .format("parquet")
    .schema(userSchema)
    .load("/cms/users/carizapo/ming/groupdata_cmsweb_logs")
)
raw_data.printSchema()

root
 |-- window: struct (nullable = true)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- system: string (nullable = true)
 |-- count: long (nullable = true)



In [3]:
# Static (batch) snapshot of the same parquet directory; no schema is
# supplied, so it is taken from the parquet files themselves.
temp_data = (
    spark.read
    .format("parquet")
    .load("/cms/users/carizapo/ming/groupdata_cmsweb_logs")
)
temp_data.printSchema()

root
 |-- window: struct (nullable = false)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- system: string (nullable = true)
 |-- count: long (nullable = false)



In [15]:
# Preview rows of the static snapshot (window, system, count).
temp_data.show()

+--------------------+--------------------+-----+
|              window|              system|count|
+--------------------+--------------------+-----+
|[2019-06-21 10:02...|/das/request?pid=...|    4|
|[2019-06-21 09:29...|/das/request?pid=...|    3|
|[2019-06-21 09:44...|/das/request?pid=...|    9|
|[2019-06-21 09:44...|/das/request?pid=...|    6|
|[2019-06-21 09:43...|/das/request?pid=...|    1|
|[2019-06-21 10:02...|/das/request?inst...|    1|
|[2019-06-21 09:28...|/das/request?inst...|    1|
|[2019-06-21 09:44...|/das/request?inst...|    1|
|[2019-06-21 09:44...|/das/request?inpu...|    1|
|[2019-06-21 09:43...|/das/request?inst...|    1|
|[2019-06-21 09:53...|/das/request?pid=...|    1|
|[2019-06-21 09:23...|/das/request?pid=...|    2|
|[2019-06-21 09:53...|/das/request?view...|    1|
|[2019-06-21 09:23...|/das/request?view...|    1|
|[2019-06-21 09:38...|/dqm/offline/data...|    1|
|[2019-06-21 09:38...|/dqm/offline/data...|    1|
|[2019-06-21 09:42...|/dqm/offline/data...|    1|


In [5]:
from pyspark.sql import Window

# Baseline partitions: rows are grouped by `system` and by the 7-day time
# bucket (pyspark `window` function) that contains the row's `window.start`.
w = Window.partitionBy('system', window("window.start", "7 days"))
# .orderBy(col("window.start").cast('long')).rangeBetween(-days(7), 0)

# Columns added to the static snapshot:
#   avg   - mean `count` over the row's (system, 7-day bucket) partition
#   diff  - signed distance of the row's `count` from that mean
#   label - 1 when |diff| is more than 70% of the mean, otherwise 0
#
# `avg` is identical for every row of a partition of `w`, so it is read
# directly instead of being fetched again through `first('avg').over(w)`,
# which would cost a second window pass for the same value.
# No sort is applied: `w` has no orderBy, and the frame is only consumed
# through a key-based join, so row order never influences the result.
freq_analyze_df = (
    temp_data
    .select('*', avg('count').over(w).alias('avg'))
    .select('*', (col('count') - col('avg')).alias('diff'))
    .select(
        '*',
        when(abs(col('diff')) > col('avg') * 0.7, 1).otherwise(0).alias('label'),
    )
)
# freq_analyze_df.show()

In [6]:
# Stream-static inner join: each streamed row picks up the avg/diff/label
# columns computed on the static snapshot, matched on system+window+count.
# NOTE: `raw_data` is rebound to the joined frame, so re-running this cell
# joins the already-joined frame again.
raw_data = raw_data.join(
    freq_analyze_df,
    on=["system", "window", "count"],
    how="inner",
)
raw_data.printSchema()

root
 |-- system: string (nullable = true)
 |-- window: struct (nullable = true)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- count: long (nullable = true)
 |-- avg: double (nullable = true)
 |-- diff: double (nullable = true)
 |-- label: integer (nullable = false)



In [7]:
# Keep only the rows flagged as anomalous (label is 1 for flagged rows).
filter_alert_data = raw_data.where("label > 0")

In [8]:
# Stream every joined row into an in-memory table named "hdfs" so it can be
# inspected with spark.sql.
raw_data_flow = (
    raw_data.writeStream
    .queryName("hdfs")
    .outputMode("Append")
    .format("memory")
    .start()
)

In [9]:
# Stream only the flagged rows into an in-memory table named "alert".
filter_alert_data_flow = (
    filter_alert_data.writeStream
    .queryName("alert")
    .outputMode("Append")
    .format("memory")
    .start()
)

In [90]:
# Stop the in-memory "hdfs" streaming query.
raw_data_flow.stop()

In [180]:
# Stop the in-memory "alert" streaming query.
filter_alert_data_flow.stop()

In [15]:
# concat_data = raw_data.withColumn('feature', concat(col('system'), col('count'))).writeStream.queryName("concat").outputMode("Append").format("memory").start()

In [None]:
# concat_data.stop()

In [45]:
# List the streaming queries that are currently active on this session.
spark.streams.active

[<pyspark.sql.streaming.StreamingQuery at 0x7f04fb5fee48>,
 <pyspark.sql.streaming.StreamingQuery at 0x7f04fb5fecf8>,
 <pyspark.sql.streaming.StreamingQuery at 0x7f04fb5fe470>,
 <pyspark.sql.streaming.StreamingQuery at 0x7f04fb5fe5f8>]

In [32]:
# Inspect the most recent progress report of the "alert" query; the
# commented lines are the equivalent checks / blocking waits for either query.
# raw_data_flow.lastProgress
filter_alert_data_flow.lastProgress
# raw_data_flow.processAllAvailable()
# filter_alert_data_flow.processAllAvailable()

In [123]:
# Read back the in-memory "hdfs" table. Despite the variable name this holds
# every joined row, flagged or not (label is 0 or 1).
alerts = spark.sql("select * from hdfs")
alerts.show()

+------+--------------------+-----+----+-----+-----+
|system|              window|count| avg| diff|label|
+------+--------------------+-----+----+-----+-----+
|   dqm|[2019-06-21 09:57...|    1|21.5|-20.5|    1|
|   dqm|[2019-06-21 09:58...|    1|21.5|-20.5|    1|
|   dqm|[2019-06-21 09:59...|   24|21.5|  2.5|    0|
|   dqm|[2019-06-21 10:00...|   19|21.5| -2.5|    0|
|   dqm|[2019-06-21 09:24...|    4|21.5|-17.5|    1|
|   dqm|[2019-06-21 09:27...|   12|21.5| -9.5|    0|
|   dqm|[2019-06-21 09:28...|   30|21.5|  8.5|    0|
|   dqm|[2019-06-21 09:29...|   39|21.5| 17.5|    1|
|   dqm|[2019-06-21 09:30...|   14|21.5| -7.5|    0|
|   dqm|[2019-06-21 09:38...|   26|21.5|  4.5|    0|
|   dqm|[2019-06-21 09:39...|   10|21.5|-11.5|    0|
|   dqm|[2019-06-21 09:42...|   22|21.5|  0.5|    0|
|   dqm|[2019-06-21 09:43...|   34|21.5| 12.5|    0|
|   dqm|[2019-06-21 09:44...|   65|21.5| 43.5|    1|
|   dqm|[2019-06-21 09:45...|    5|21.5|-16.5|    1|
|   dqm|[2019-06-21 09:53...|   29|21.5|  7.5|

In [62]:
# Read back the in-memory "alert" table, which holds only the flagged rows.
alerts = spark.sql("select * from alert")
alerts.show()

+--------------+--------------------+-----+------+-------+-----+
|        system|              window|count|   avg|   diff|label|
+--------------+--------------------+-----+------+-------+-----+
|           dqm|[2019-06-21 09:24...|    4|  21.5|  -17.5|    1|
|           dqm|[2019-06-21 09:29...|   39|  21.5|   17.5|    1|
|           dqm|[2019-06-21 09:44...|   65|  21.5|   43.5|    1|
|           dqm|[2019-06-21 09:45...|    5|  21.5|  -16.5|    1|
|           dqm|[2019-06-21 09:57...|    1|  21.5|  -20.5|    1|
|           dqm|[2019-06-21 09:58...|    1|  21.5|  -20.5|    1|
|           dqm|[2019-06-21 10:02...|   57|  21.5|   35.5|    1|
|           dqm|[2019-06-21 10:03...|    2|  21.5|  -19.5|    1|
|           dbs|[2019-06-21 10:02...| 6735|1428.2| 5306.8|    1|
|           dbs|[2019-06-21 10:03...|  273|1428.2|-1155.2|    1|
|           dbs|[2019-06-21 09:58...|  362|1428.2|-1066.2|    1|
|           dbs|[2019-06-21 09:59...| 4514|1428.2| 3085.8|    1|
|           dbs|[2019-06-

In [11]:
from notifier import Notifier
import json

In [21]:
# Inline JSON configuration for the alert notifier, parsed into a dict first
# and then handed to Notifier. It declares:
#   - one named case, "exit_2": status ERROR, targets email and snow
#   - a "default_case": status OK, target email only
#   - the notification endpoint URL
# How Notifier selects between the cases is defined in notifier.py and is
# not visible here.
notifier_settings = json.loads('''
{
  "cases": {
    "exit_2": {
      "alert_name": "cms-htcondor-es-validation",
      "email": {
        "send_ok": true,
        "to": [
          "yanisa.sunthornyotin@cern.ch"
        ]
      },
      "entities": [
        "default entity"
      ],
      "snow": {
        "assignment_level": 3,
        "functional_element": "",
        "grouping": true,
        "service_element": "MONITORING"
      },
      "source": "cms-monit-notifier",
      "status": "ERROR",
      "targets": [
        "email",
        "snow"
      ]
    }
  },
  "default_case": {
    "alert_name": "cms-htcondor-es-validation",
    "email": {
      "send_ok": true,
      "to": [
        "yanisa.sunthornyotin@cern.ch"
      ]
    },
    "entities": [
      "default entity"
    ],
    "source": "cms-monit-notifier",
    "status": "OK",
    "targets": [
      "email"
    ]
  },
  "notification_endpoint": "http://monit-alarms.cern.ch:10011"
}''')
notifier = Notifier(config=notifier_settings)


In [13]:
# Register notifier.py as a Python dependency of this SparkContext
# (presumably so the streaming foreach callback can use Notifier on the
# executors - confirm).
sc.addPyFile('notifier.py')

In [41]:
def _send_alert(alert):
    """Send one flagged row through the notifier.

    The row's `system` becomes the subject; the whole row is serialised to
    JSON for the description, with non-JSON values (e.g. timestamps) turned
    into strings via ``default=str``.
    """
    description = json.dumps(alert.asDict(), default=str)
    return notifier.send_notification(subject=alert.system, description=description)


# Run the callback once for every row of the flagged stream.
alert_flow = filter_alert_data.writeStream.foreach(_send_alert).start()

In [44]:
# Check whether the notification query is still running.
alert_flow.isActive

True

In [42]:
# Block until the notification query has processed all data currently
# available in the source.
alert_flow.processAllAvailable()

In [22]:
# Stop the notification query.
alert_flow.stop()

In [16]:
# Persist the joined stream (counts plus avg/diff/label) as parquet files.
# Append mode is set once (the original chain set it twice); the checkpoint
# directory lets the query keep track of its progress across restarts.
hdfs_data_week_flow = (
    raw_data.writeStream
    .outputMode("append")
    .format("parquet")
    .option("path", "/cms/users/carizapo/ming/moving_avg_cmsweb_logs")
    .option("checkpointLocation", "/cms/users/carizapo/ming/checkpoint_moving_avg_cmsweb_logs")
    .start()
)

In [17]:
# Check whether the parquet-writing query is still running.
hdfs_data_week_flow.isActive

True

In [28]:
# List the streaming queries that are currently active on this session.
spark.streams.active

[<pyspark.sql.streaming.StreamingQuery at 0x7f04fb6b18d0>,
 <pyspark.sql.streaming.StreamingQuery at 0x7f04fb6b19e8>,
 <pyspark.sql.streaming.StreamingQuery at 0x7f04fb6b1a90>,
 <pyspark.sql.streaming.StreamingQuery at 0x7f04fb6b1908>]