todo-list
=========
<li>Migrate static->stream(if possible)</li>
<li>Find new efficient way to do pattern detection/anomaly detection</li>
<li>Email alert config (now sent to yanisa.sunthornyotin@cern.ch)</li>

In [None]:
!pip install --user stldecompose 

In [1]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
userSchema = StructType() \
        .add("window",StructType()\
             .add("start",TimestampType())\
             .add("end",TimestampType()))\
        .add("system", StringType())\
        .add("count", LongType())

In [2]:
raw_data = spark\
.readStream.format("parquet")\
.schema(userSchema)\
.load("/cms/users/carizapo/ming/groupdata_cmsweb_logs");
raw_data.printSchema()

root
 |-- window: struct (nullable = true)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- system: string (nullable = true)
 |-- count: long (nullable = true)



In [3]:
temp_data = spark\
.read.format("parquet")\
.load("/cms/users/carizapo/ming/groupdata_cmsweb_logs");
temp_data.printSchema()

root
 |-- window: struct (nullable = false)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- system: string (nullable = true)
 |-- count: long (nullable = false)



In [4]:
# temp_data.show()

Calculate % Difference according to the mean of the amounth of email within 1 week
-----------

In [4]:
from pyspark.sql import Window
w = Window.partitionBy('system',window("window.start", "7 days"))
# .orderBy(col("window.start").cast('long')).rangeBetween(-days(7), 0)

freq_analyze_df=temp_data.select('*', avg('count').over(w).alias('avg')).sort('system','window')\
.select('*', ((col('count') - first('avg').over(w))).alias('diff'))\
.select('*', ((col('diff')/first('avg').over(w))).alias('%diff'))\
.select('*', when((abs(col('%diff'))>3), 1).otherwise(0).alias('label'))
# freq_analyze_df.filter("system=='popdb'").show()

In [5]:
raw_data=raw_data.join(freq_analyze_df, ["system","window","count"], "inner")
raw_data.printSchema()

root
 |-- system: string (nullable = true)
 |-- window: struct (nullable = true)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- count: long (nullable = true)
 |-- avg: double (nullable = true)
 |-- diff: double (nullable = true)
 |-- %diff: double (nullable = true)
 |-- label: integer (nullable = false)



In [None]:
filter_alert_data = raw_data.filter("label > 0")

In [6]:
raw_data_flow = raw_data.writeStream.queryName("hdfs").outputMode("Append").format("memory").start()

In [None]:
filter_alert_data_flow = filter_alert_data.writeStream.queryName("alert").outputMode("Append").format("memory").start()

In [None]:
raw_data_flow.stop()

In [None]:
filter_alert_data_flow.stop()

In [None]:
spark.streams.active

In [None]:
# raw_data_flow.lastProgress
filter_alert_data_flow.lastProgress
# raw_data_flow.processAllAvailable()
# filter_alert_data_flow.processAllAvailable()

In [73]:
alerts = spark.sql("select * from hdfs")
alerts.show()

+--------------------+--------------------+-----+------------------+--------------------+--------------------+-----+
|              system|              window|count|               avg|                diff|               %diff|label|
+--------------------+--------------------+-----+------------------+--------------------+--------------------+-----+
|          %{request}|[2019-06-29 06:43...|    2|1.1827956989247312|  0.8172043010752688|  0.6909090909090908|    0|
|           %{system}|[2019-06-28 16:16...| 1762|342.41309483631454|  1419.5869051636855|   4.145831238849956|    1|
|           %{system}|[2019-06-28 20:59...|  679|342.41309483631454|  336.58690516368546|  0.9829849098632919|    0|
|           %{system}|[2019-06-29 08:29...|  826|342.41309483631454|  483.58690516368546|  1.4122909212769943|    0|
|           %{system}|[2019-06-30 13:23...|  180|342.41309483631454| -162.41309483631454| -0.4743191696975073|    0|
|           %{system}|[2019-07-01 09:43...|  434|342.41309483631

In [None]:
alerts = spark.sql("select * from alert")
alerts.show()

In [None]:
alerts = spark.sql("select * from concat")
alerts.show()

In [78]:
freq_analyze_df.filter("system=='%{system}' and label==1").show()

+--------------------+---------+-----+------------------+------------------+------------------+-----+
|              window|   system|count|               avg|              diff|             %diff|label|
+--------------------+---------+-----+------------------+------------------+------------------+-----+
|[2019-06-30 03:13...|%{system}| 1389|342.41309483631454|1046.5869051636855| 3.056503740500902|    1|
|[2019-07-01 11:44...|%{system}| 1397|342.41309483631454|1054.5869051636855|3.0798673329587904|    1|
|[2019-07-01 11:59...|%{system}| 3656|342.41309483631454|3313.5869051636855| 9.677161753255074|    1|
|[2019-07-01 12:14...|%{system}| 1482|342.41309483631454|1139.5869051636855|3.3281055028238566|    1|
|[2019-07-01 12:30...|%{system}| 2061|342.41309483631454|1718.5869051636855| 5.019045506963542|    1|
|[2019-07-01 18:29...|%{system}| 1549|342.41309483631454|1206.5869051636855|3.5237755896586735|    1|
|[2019-07-01 18:45...|%{system}| 4742|342.41309483631454|4399.5869051636855|12.848

In [229]:
import numpy as np

from pyspark.ml.linalg import Vectors
w = Window.partitionBy('system',window("window.start","5 hours"))

concat_alert_df=freq_analyze_df\
.drop_duplicates(['%diff','system'])\
.withColumn('features', collect_list(col('%diff')).over(w).cast("array<double>"))\
.withColumn('num_features', size(collect_list(col('%diff')).over(w).cast("array<double>")))\
.withColumn('start', collect_list(col('window.start')).over(w))\
.withColumn('labels', collect_list(col('label')).over(w))\
.drop_duplicates(subset=['features','label'])


Concerns : The size of feature is not equal in each row
-----
The decision tree failed

In [231]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.linalg import Vectors, VectorUDT
to_vector = udf(lambda a: Vectors.dense(a), VectorUDT())
transform=concat_alert_df.select("system","start","labels",to_vector("features").alias("features"))
# transform.show()

In [179]:
transform_pd=transform.toPandas()

In [None]:
transform_pd[transform_pd.label==0].head(10)

In [203]:
pd.set_option('display.max_colwidth', -1)
transform_pd[transform_pd.system=='/server-status'].head(10)

Unnamed: 0,system,start,label,features
282,/server-status,[2019-07-02 11:17:00],0,[2.2560952097172806]
407,/server-status,"[2019-06-21 10:02:00, 2019-06-21 10:01:00, 2019-06-21 10:03:00]",0,"[1.046153846153846, -0.30000000000000004, -0.8384615384615385]"
527,/server-status,"[2019-06-30 23:32:00, 2019-06-30 23:30:00, 2019-06-30 23:57:00, 2019-06-30 23:43:00, 2019-06-30 23:44:00, 2019-06-30 23:38:00, 2019-06-30 23:31:00, 2019-06-30 23:29:00, 2019-06-30 23:42:00, 2019-06-30 23:59:00]",0,"[-0.7495311377140553, 1.204125988116313, -0.49906227542811066, 0.7031882635444238, 0.6530944910872348, 0.502813173715668, 1.304313533030691, -0.5992498203424885, -0.4489685029709217, 1.154032215659124]"
592,/server-status,"[2019-07-01 00:23:00, 2019-07-01 00:38:00, 2019-07-01 01:47:00, 2019-07-01 03:59:00, 2019-07-01 01:14:00, 2019-07-01 01:12:00, 2019-07-01 00:28:00, 2019-07-01 02:01:00, 2019-07-01 03:57:00, 2019-07-01 04:13:00, 2019-07-01 01:29:00, 2019-07-01 03:28:00, 2019-07-01 00:15:00, 2019-07-01 00:00:00, 2019-07-01 00:39:00, 2019-07-01 01:42:00, 2019-07-01 02:29:00, 2019-07-01 02:28:00, 2019-07-01 03:44:00, 2019-07-01 02:44:00, 2019-07-01 02:02:00, 2019-07-01 01:28:00, 2019-07-01 03:43:00, 2019-07-01 00:29:00, 2019-07-01 01:38:00, 2019-07-01 00:43:00, 2019-07-01 01:00:00, 2019-07-01 03:42:00, 2019-07-01 01:39:00, 2019-07-01 00:27:00, 2019-07-01 03:29:00, 2019-07-01 04:27:00, 2019-07-01 01:27:00, 2019-07-01 00:59:00, 2019-07-01 03:58:00]",0,"[0.25234431142972336, 0.45271940125847904, 0.35253185634410117, 0.7532820360016127, 1.1039384432019352, -0.2986871855993549, 0.6030007186300459, -0.3988747305137328, -0.24859341314216601, 1.3544073054878798, 1.5547823953166355, 0.552906946172857, -0.14840586822778815, -0.7996249101712443, -0.9499062275428111, -0.8998124550856221, 0.8534695809159906, 1.0538446707447462, 0.9035633533731795, 1.6549699402310134, 0.10206299405815653, 0.30243808388691223, 0.2022505389725344, 1.4545948504022577, -0.6493435927996775, 0.8033758084588016, -0.8497186826284332, -0.0983120957705992, 0.15215676651534546, -0.5491560478852996, 1.5046886228594467, -0.6994373652568664, -0.34878095805654385, 1.7050637126882024, 1.0037508982875574]"
754,/server-status,[2019-07-03 03:41:00],0,[2.005626347431336]
813,/server-status,"[2019-07-01 06:28:00, 2019-07-01 05:02:00, 2019-07-01 08:58:00, 2019-07-01 05:57:00, 2019-07-01 09:43:00, 2019-07-01 06:58:00, 2019-07-01 05:29:00, 2019-07-01 07:14:00]",0,"[0.9536571258303684, 0.0519692216009676, 1.254219760573502, -0.04821832331341027, 0.0018754491437786625, 0.4026256288012901, 1.4045010779450688, 1.6048761677738246]"
815,/server-status,[2019-06-28 22:29:00],0,[1.9054388025169582]
855,/server-status,"[2019-06-21 09:38:00, 2019-06-21 09:53:00, 2019-06-21 09:42:00, 2019-06-21 09:28:00, 2019-06-21 09:27:00, 2019-06-21 09:39:00, 2019-06-21 09:59:00, 2019-06-21 09:29:00, 2019-06-21 09:44:00, 2019-06-21 09:24:00, 2019-06-21 09:30:00, 2019-06-21 09:23:00]",0,"[0.34615384615384603, 0.6153846153846152, -0.6230769230769231, 0.5076923076923076, -0.5153846153846154, -0.7307692307692308, 0.23846153846153836, 0.723076923076923, 1.4769230769230768, -0.9461538461538461, 0.13076923076923067, 0.5615384615384614]"
923,/server-status,[2019-06-29 11:59:00],0,[1.7551574851453913]
1031,/server-status,[2019-07-01 11:45:00],0,[-0.19849964068497708]


In [233]:
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors
# Normalize each Vector using $L^1$ norm.
# normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
# l1NormData = normalizer.transform(transform)
# print("Normalized using L^1 norm")
# l1NormData.show()

# Normalize each Vector using $L^\infty$ norm.
lInfNormData = normalizer.transform(transform, {normalizer.p: float("inf")})
print("Normalized using L^inf norm")
# lInfNormData.show()

Normalized using L^inf norm


In [235]:
to_array = udf(lambda x: x.toArray().tolist(), ArrayType(DoubleType()))
lInfNormData_new = lInfNormData\
.withColumn('normFeatures_list', to_array(lInfNormData.normFeatures))\
.withColumn('features_list', to_array(lInfNormData.features))\
.drop("normFeatures")

In [236]:
lInfNormData_new.printSchema()

root
 |-- system: string (nullable = true)
 |-- start: array (nullable = true)
 |    |-- element: timestamp (containsNull = true)
 |-- labels: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- features: vector (nullable = true)
 |-- normFeatures_list: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- features_list: array (nullable = true)
 |    |-- element: double (containsNull = true)



In [237]:
from pyspark.sql.functions import udf, explode
from pyspark.sql.types import *
from pyspark.ml.linalg import *
parsed_data=lInfNormData_new.withColumn("tmp", arrays_zip("start", "normFeatures_list","features_list",'labels'))\
    .withColumn("tmp", explode("tmp"))\
    .select("system", col("tmp.start"), col("tmp.normFeatures_list"), col("tmp.labels"),col("tmp.features_list"),"features")
# parsed_data.show()

In [240]:
new_alert=parsed_data.select('*', when((abs(col('normFeatures_list'))>0.9), 1).otherwise(0).alias('new_label'))
parsed_data.printSchema()

root
 |-- system: string (nullable = true)
 |-- start: timestamp (nullable = true)
 |-- normFeatures_list: double (nullable = true)
 |-- labels: integer (nullable = true)
 |-- features_list: double (nullable = true)
 |-- features: vector (nullable = true)



Some data have negative % difference (features) because the email comes less than the mean calculated from 7 days interval
In this case, maybe the former benchmark work better( the newer one will amplify those differences to be 1 or -1 in case there's no abnormal email(+/-peak) coming).
<li>The former benchmart set at %difference = 300% then alert</li>
<li>Maybe we can assume that short lenght dataframe means there's no email coming %difference = 0%</li>

In [241]:
new_alert.filter('new_label==1 and labels==0').toPandas().head(5)

Unnamed: 0,system,start,normFeatures_list,labels,features_list,features,new_label
0,wmdatamining,2019-06-21 09:28:00,1.0,0,0.955556,"[-0.5555555555555556, -0.28888888888888886, 0.6592592592592592, 0.9555555555555556]",1
1,/das/request?pid=9e19d0bbe2c68dc55cc6312812c84273&input=dataset%3D%2FDsToTau_To3Mu_MuFilter_TuneCUEP8M1_13TeV-pythia8%2FRunIIFall17MiniAODv2-PU2017_12Apr2018_94X_mc2017_realistic_v14-v1%2FMINIAODSIM&ajax=1&instance=prod%2Fglobal&view=,2019-07-02 11:45:00,1.0,0,1.5,"[-0.5, 1.5, 0.0]",1
2,newwq,2019-06-29 18:28:00,1.0,0,1.457131,"[0.5912314402585368, 0.6249677958823927, 1.4571312346041718]",1
3,/crabcache/file?hashkey=c39a92b59d396e328ee405998022e54bc6ebd30254d90ee08882b5d6fb916d27,2019-07-03 09:59:00,1.0,0,1.03125,"[0.6249999999999999, -0.59375, 0.21874999999999994, -0.18750000000000006, 1.0312499999999998]",1
4,/confdb/,2019-06-21 09:53:00,1.0,0,1.333333,"[1.3333333333333333, -0.6666666666666666]",1


In [242]:
new_alert.filter('new_label==0 and labels==1').toPandas().head(5)

Unnamed: 0,system,start,normFeatures_list,labels,features_list,features,new_label
0,wmstatsserver,2019-07-03 04:01:00,0.360061,1,3.064622,"[3.0646220871478937, 2.026937846678993, 2.044160821624535, 2.491958170208625, 2.9871186998929553, 0.20991398992432012, 8.511387913675527, 2.8105832067011507, 3.2583805552852407, 1.9537402031604398, 0.8127181130182874, 2.8794751064833184, 1.9967976405242946, 1.9408229719512835, 1.7728989662322496]",0
1,wmstatsserver,2019-07-03 05:14:00,0.382826,1,3.258381,"[3.0646220871478937, 2.026937846678993, 2.044160821624535, 2.491958170208625, 2.9871186998929553, 0.20991398992432012, 8.511387913675527, 2.8105832067011507, 3.2583805552852407, 1.9537402031604398, 0.8127181130182874, 2.8794751064833184, 1.9967976405242946, 1.9408229719512835, 1.7728989662322496]",0
2,acdcserver,2019-07-03 10:29:00,0.521602,1,3.68744,"[3.687440035635965, 1.247327302631579, 0.06858209978070176, 7.069447299890351, -0.1903011924342105, 4.205206620065789, 2.657415021929825, 2.4260725054824563, 1.2858843887061404, 1.7871265076754388, 0.7571014939692983, 6.243224026864035, -0.8898368969298246, 1.3740148711622808, 0.6138894599780702, 1.7045041803728072, 1.3299496299342106, 2.7069884183114037, 0.4541529605263158, 2.40954804002193, 1.946863007127193, 1.0710663377192982, 4.3318941885964914]",0
3,acdcserver,2019-07-03 07:13:00,0.594842,1,4.205207,"[3.687440035635965, 1.247327302631579, 0.06858209978070176, 7.069447299890351, -0.1903011924342105, 4.205206620065789, 2.657415021929825, 2.4260725054824563, 1.2858843887061404, 1.7871265076754388, 0.7571014939692983, 6.243224026864035, -0.8898368969298246, 1.3740148711622808, 0.6138894599780702, 1.7045041803728072, 1.3299496299342106, 2.7069884183114037, 0.4541529605263158, 2.40954804002193, 1.946863007127193, 1.0710663377192982, 4.3318941885964914]",0
4,acdcserver,2019-07-03 10:13:00,0.883128,1,6.243224,"[3.687440035635965, 1.247327302631579, 0.06858209978070176, 7.069447299890351, -0.1903011924342105, 4.205206620065789, 2.657415021929825, 2.4260725054824563, 1.2858843887061404, 1.7871265076754388, 0.7571014939692983, 6.243224026864035, -0.8898368969298246, 1.3740148711622808, 0.6138894599780702, 1.7045041803728072, 1.3299496299342106, 2.7069884183114037, 0.4541529605263158, 2.40954804002193, 1.946863007127193, 1.0710663377192982, 4.3318941885964914]",0


In [181]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
pdAlert= freq_analyze_df.toPandas()

In [54]:
tags = pdAlert['window'].apply(pd.Series)
pdAlert['start']= tags[0]
pdAlert['end']= tags[1]
filter_data = pdAlert.drop(['avg', 'diff','window'],axis=1)
filter_data['start'] = pd.to_datetime(filter_data['start'], errors='coerce')
filter_data=filter_data[(filter_data.system == '/couchdb/workqueue/_bulk_docs/')]
# print(filter_data.head(20))
filter_data.set_index('start', inplace=True)
filter_data=filter_data.loc['2019-06-28':'2019-07-1']
prep=filter_data[['count']]
# prep.head(1000).plot()
# filter_data=filter_data.resample('H')\
#        .mean()\
#        .interpolate('linear')
# print(filter_data.head(20))

In [None]:
import statsmodels.api as sm
import matplotlib.pyplot as plt
%matplotlib inline
# the main library has a small set of functionality
from stldecompose import decompose, forecast
from stldecompose.forecast_funcs import (naive,
                                         drift, 
                                         mean, 
                                         seasonal_naive)

In [None]:
decomp = decompose(prep, period=24)

In [None]:
decomp.plot();
plt.xlim('2019-06-29','2019-07-1');

In [None]:
import math as math
train_factor=math.floor(len(prep)*0.7)
short_prep = prep.head(train_factor)
# apply the decomp to the truncated observation
short_decomp = decompose(short_prep, period=24)

In [None]:
fcast = forecast(short_decomp, steps=8000, fc_func=drift,seasonal=True)

In [None]:
plt.plot(prep, '--', label='truth')
plt.plot(short_prep, '--', label='obs')
plt.plot(short_decomp.trend, ':', label='decomp.trend')
plt.plot(fcast, '-', label=fcast.columns[0])
plt.xlim('2019-06-30','2019-07-3')

plt.ylim(0,100)
plt.legend();

In [14]:
from skmultiflow.data import DataStream
from skmultiflow.trees import HoeffdingTree
from skmultiflow.evaluation import EvaluatePrequential

In [23]:
from pyspark.mllib.regression import LabeledPoint
prep_df=(transform.select(col("label"), col("features"))
  .rdd
  .map(lambda row: LabeledPoint(row.label, as_old(row.features))))

from pyspark.mllib import linalg as mllib_linalg
from pyspark.ml import linalg as ml_linalg
def as_old(v):
    if isinstance(v, ml_linalg.SparseVector):
        return mllib_linalg.SparseVector(v.size, v.indices, v.values)
    if isinstance(v, ml_linalg.DenseVector):
        return mllib_linalg.DenseVector(v.values)
    raise ValueError("Unsupported type {0}".format(type(v)))
 

In [None]:
from notifier import Notifier
import json

In [None]:
# sns.set(style="ticks", color_codes=True)
# sns.relplot(x="start", y="%diff",hue="system", kind="line", data=pdAlert);

In [None]:
notifier = Notifier(config=json.loads(s='''
{
  "cases": {
    "exit_2": {
      "alert_name": "cms-htcondor-es-validation",
      "email": {
        "send_ok": true,
        "to": [
          "yanisa.sunthornyotin@cern.ch"
        ]
      },
      "entities": [
        "default entity"
      ],
      "snow": {
        "assignment_level": 3,
        "functional_element": "",
        "grouping": true,
        "service_element": "MONITORING"
      },
      "source": "cms-monit-notifier",
      "status": "ERROR",
      "targets": [
        "email",
        "snow"
      ]
    }
  },
  "default_case": {
    "alert_name": "cms-htcondor-es-validation",
    "email": {
      "send_ok": true,
      "to": [
        "yanisa.sunthornyotin@cern.ch"
      ]
    },
    "entities": [
      "default entity"
    ],
    "source": "cms-monit-notifier",
    "status": "OK",
    "targets": [
      "email"
    ]
  },
  "notification_endpoint": "http://monit-alarms.cern.ch:10011"
}'''
                                    ))


In [None]:
sc.addPyFile('notifier.py')

In [None]:
alert_flow = filter_alert_data.writeStream\
.foreach(lambda alert: notifier.send_notification(subject=alert.system,description=json.dumps(alert.asDict(), default=str)))\
.start()

In [None]:
alert_flow.isActive

In [None]:
alert_flow.processAllAvailable()

In [None]:
alert_flow.stop()

In [None]:
hdfs_data_week_flow=raw_data.writeStream \
.outputMode("append")\
.format("parquet")\
 .option("path", "/cms/users/carizapo/ming/moving_avg_cmsweb_logs") \
 .option("checkpointLocation", "/cms/users/carizapo/ming/checkpoint_moving_avg_cmsweb_logs") \
 .outputMode("append") \
 .start()

In [None]:
hdfs_data_week_flow.isActive

In [None]:
spark.streams.active