todo-list
=========
<li>Migrate the static (batch) flow to a streaming flow, if possible</li>
<li>Find a new, more efficient way to do pattern/anomaly detection</li>
<li>Email alert config (now sent to yanisa.sunthornyotin@cern.ch)</li>

In [35]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
# Explicit schema handed to the streaming parquet reader. Field names and
# types mirror the printSchema() output of the static read of the same path.
# The chain is parenthesised rather than backslash-continued so that no
# trailing continuation can glue a following line onto this statement.
userSchema = (
    StructType()
    .add("window", StructType()
         .add("start", TimestampType())
         .add("end", TimestampType()))
    .add("system", StringType())
    .add("api", StringType())
    .add("user", StringType())
    .add("count_req", LongType())
    .add("req_load", LongType())
    .add("system_load", LongType())
    .add("api_load", LongType())
    .add("user_load", LongType())
    .add("avg_req", DoubleType())
    .add("%diff_req", DoubleType())
    .add("avg_sys", DoubleType())
    .add("%diff_sys", DoubleType())
    .add("avg_api", DoubleType())
    .add("%diff_api", DoubleType())
    .add("avg_user", DoubleType())
    .add("%diff_user", DoubleType())
)

In [56]:
# Streaming read of the parquet directory, using the explicit userSchema.
# This is the streaming side that is later joined with the static alert labels.
raw_data = (
    spark.readStream
    .format("parquet")
    .schema(userSchema)
    .load("/cms/users/carizapo/ming/fullDiff_cmsweb_logs")
)

In [57]:
# Static (batch) read of the same parquet directory; this frame feeds the
# model-training cells. The schema is taken from the files themselves.
alerts_hist = (
    spark.read
    .format("parquet")
    .load("/cms/users/carizapo/ming/fullDiff_cmsweb_logs")
)
alerts_hist.printSchema()

root
 |-- system: string (nullable = true)
 |-- window: struct (nullable = true)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- api: string (nullable = true)
 |-- user: string (nullable = true)
 |-- count_req: long (nullable = true)
 |-- req_load: long (nullable = true)
 |-- system_load: long (nullable = false)
 |-- api_load: long (nullable = false)
 |-- user_load: long (nullable = false)
 |-- avg_req: double (nullable = true)
 |-- %diff_req: double (nullable = true)
 |-- avg_sys: double (nullable = true)
 |-- %diff_sys: double (nullable = true)
 |-- avg_api: double (nullable = true)
 |-- %diff_api: double (nullable = true)
 |-- avg_user: double (nullable = true)
 |-- %diff_user: double (nullable = true)



In [58]:
# Print a preview of the historical frame.
# NOTE(review): the recorded output contains only the header (no rows), i.e. the
# static read returned nothing when this cell was last executed.
alerts_hist.show()

+------+------+---+----+---------+--------+-----------+--------+---------+-------+---------+-------+---------+-------+---------+--------+----------+
|system|window|api|user|count_req|req_load|system_load|api_load|user_load|avg_req|%diff_req|avg_sys|%diff_sys|avg_api|%diff_api|avg_user|%diff_user|
+------+------+---+----+---------+--------+-----------+--------+---------+-------+---------+-------+---------+-------+---------+--------+----------+
+------+------+---+----+---------+--------+-----------+--------+---------+-------+---------+-------+---------+-------+---------+--------+----------+



In [42]:
# Flatten the window struct: keep its start timestamp as 'date', then drop the struct.
drop_col = ['window']
raw_data_init = (
    alerts_hist
    .withColumn('date', col("window.start"))
    .drop(*drop_col)
)

In [39]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator


# Index the three categorical columns, then one-hot encode the indices.
# NOTE(review): the recorded run of this cell failed with
# "The input column system_hash should have at least two distinct values."
sys_indexer = StringIndexer(inputCol="system", outputCol="system_hash")
user_indexer = StringIndexer(inputCol="user", outputCol="user_hash")
api_indexer = StringIndexer(inputCol="api", outputCol="api_hash")

indexers = [sys_indexer, user_indexer, api_indexer]
inputs = [indexer.getOutputCol() for indexer in indexers]
encoder = OneHotEncoderEstimator(
    inputCols=inputs,
    outputCols=["system_vec", "user_vec", "api_vec"],
)

# Fit and apply on the same frame; 'result' carries the *_hash and *_vec columns.
pipeline = Pipeline(stages=indexers + [encoder])
pipelineModel = pipeline.fit(raw_data_init)
result = pipelineModel.transform(raw_data_init)

IllegalArgumentException: 'requirement failed: The input column system_hash should have at least two distinct values.'

In [6]:
# Execute the helper notebook inside this kernel.
# NOTE(review): DateConverter, HourExtractor, MinExtractor and the other
# *Extractor classes used below are not defined in this notebook — presumably
# they come from this helper notebook; confirm.
%run StoreItemDemand/custom_transformers.ipynb

In [7]:
# Random 80/20 split of the encoded frame with a fixed seed.
train_data, test_data = result.randomSplit([0.8,0.2], seed=1234)

In [8]:
# Tag each split with a 'set' marker (0 = train, 1 = test); the train side also
# gets a constant 'id' column.
df_train = train_data.withColumn('set', lit(0)).withColumn('id', lit(-1))
df_test = test_data.withColumn('set', lit(1))

# Align the train columns to the test column order before the positional union
# (this selection leaves out 'id', which df_test does not have).
train_aligned = df_train.select(*df_test.columns)
joined = df_test.union(train_aligned)

# Recover both splits from the combined frame via the marker column.
train_data = joined.filter('set == 0')
test_data = joined.filter('set == 1')

In [9]:
# Further 80/20 split of the training data into train / validation (fixed seed).
train, validation = train_data.randomSplit([0.8,0.2], seed=1234)

In [10]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import MinMaxScaler,StandardScaler
# Feature extraction: custom transformers working from the 'date' timestamp and
# the converted 'dateFormated' column.
# NOTE(review): presumably these emit the 'hour', 'minute', 'day', 'month',
# 'year', 'weekday', 'weekend', 'monthbegin' and 'monthend' columns consumed by
# the assembler below — confirm against the transformer definitions.
dc = DateConverter(inputCol='date', outputCol='dateFormated')
hrex = HourExtractor(inputCol='date')
minex = MinExtractor(inputCol='date')
dex = DayExtractor(inputCol='dateFormated')
mex = MonthExtractor(inputCol='dateFormated')
yex = YearExtractor(inputCol='dateFormated')
wdex = WeekDayExtractor(inputCol='dateFormated')
wex = WeekendExtractor()
mbex = MonthBeginExtractor()
meex = MonthEndExtractor()

# Data process: pack encoded categoricals, request metrics and calendar
# features into a single 'features' vector.
feature_cols = [
    "system_vec", "user_vec", "api_vec",
    'count_req', '%diff_req', '%diff_sys', '%diff_api', '%diff_user',
    'weekday', 'weekend', 'monthbegin', 'monthend',
    'hour', 'minute', 'day', 'month', 'year',
]
va = VectorAssembler(inputCols=feature_cols, outputCol="features")
# scaler = StandardScaler(inputCol="raw_features", outputCol="features", withStd=True, withMean=True)
# scaler = MinMaxScaler(inputCol="raw_features", outputCol="features")

# Stage order is significant: the date conversion runs first, the assembler last.
stages = [dc, hrex, minex, dex, mex, wdex, wex, mbex, meex, yex, va]
pipeline = Pipeline(stages=stages)

In [11]:
# Fit the feature pipeline on the training split only.
pipeline_model = pipeline.fit(train)

In [12]:
# Apply the fitted feature pipeline to each of the three splits.
train_transformed, validation_transformed, test_transformed = (
    pipeline_model.transform(split)
    for split in (train, validation, test_data)
)

From the graph, we choose the k at which the y-axis value (the cost) stops changing noticeably

# Create the model and train/predict the data

In [13]:
from pyspark.ml.clustering import KMeans
from numpy import array
from math import sqrt
# K-means with 10 clusters and a fixed seed.
# NOTE(review): the model is fitted on the validation split while predictions
# are later computed on the training split — confirm this is intentional.
kmeans = KMeans().setK(10).setSeed(1)
model = kmeans.fit(validation_transformed.select('features'))

In [14]:
# Assign each training row to a cluster (adds the 'prediction' column).
# The distance-labelling cell further down recomputes this same frame.
predictions = model.transform(train_transformed)

Label the outliers in the Spark DataFrame
<li>Add calculated distance column to predictions dataframe</li>
<li>Drop unnecessary column from predictions dataframe</li>
<li>Add Max_distance/Label column to dataframe</li>

In [15]:
from scipy.spatial import distance
# Explicit imports: Window is not provided by `pyspark.sql.functions import *`,
# and qualifying avg/stddev/max through F avoids depending on the star import
# shadowing Python's builtins.
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Assign every training row to its nearest centroid (adds 'prediction').
predictions = model.transform(train_transformed)

# Encoded / calendar helper columns that are dropped once distances exist.
columns_drop=['system_hash','user_hash','api_hash','system_vec','user_vec','api_vec','dateFormated','hour','minute'\
             ,'day','month','year','weekday','weekend','monthbegin','monthend','features']

centers = model.clusterCenters()
fixed_entry = centers  # cluster centres, indexed by the 'prediction' value

# Euclidean distance between a row's feature vector and the centre of the
# cluster it was assigned to. toArray() hands scipy a plain ndarray whether the
# ML vector is dense or sparse.
distance_udf = udf(
    lambda x, y: float(distance.euclidean(x.toArray(), fixed_entry[y])),
    FloatType(),
)

# One window per cluster, shared by the three aggregates below.
per_cluster = Window.partitionBy('prediction')

# For joining static dataframe
predictions = (
    predictions
    .withColumn('distances', distance_udf(col('features'), col('prediction')))
    .select(
        '*',
        F.avg('distances').over(per_cluster).alias('avg_distances'),
        F.stddev('distances').over(per_cluster).alias('std_distances'),
        F.max('distances').over(per_cluster).alias('max_distances'),
    )
    .drop(*columns_drop)
)
# predictions.printSchema()
# predictions.printSchema()

# Two benchmarks for detecting outliers
<li>Define the outlier region as the points whose distance is within 10% of the cluster's maximum distance</li>
<li>Define a data point as an outlier when its distance is 2 or more standard deviations above the mean distance of its cluster</li>

In [16]:
# outlier_factor=0.1
# alert_udf= udf(lambda max_distances,dist: dist>= max_distances-(max_distances*outlier_factor), BooleanType())

# How many standard deviations above the cluster mean a distance must reach
# to be flagged.
OUTLIER_SIGMAS = 2


def _is_outlier(avg_dist, std_dist, dist):
    """Return True when `dist` is at least OUTLIER_SIGMAS std-devs above the mean.

    Any missing input (None) yields False instead of raising TypeError inside
    the executor: without a defined mean/spread no row can be called an outlier.
    """
    if avg_dist is None or std_dist is None or dist is None:
        return False
    return dist >= avg_dist + OUTLIER_SIGMAS * std_dist


alert_udf = udf(_is_outlier, BooleanType())
#Static
alerts = predictions.withColumn(
    'label',
    alert_udf(col('avg_distances'), col('std_distances'), col('distances')),
)

# Publish new alert from Kmeans 

In [17]:
# Static processing
# Keep only the rows flagged as outliers and expose the join keys under
# *_temp names. Each column is aliased explicitly by name, so the result no
# longer depends on the column order of `alerts` (a positional toDF() rename
# would silently mislabel columns if that order ever changed).
alerts_broadcast = (
    alerts
    .where(alerts.label == 1)
    .select(
        col('system').alias('system_temp'),
        col('api').alias('api_temp'),
        col('user').alias('user_temp'),
        col('date'),
        col('label'),
    )
)

In [18]:
# Inner-join the streaming rows with the flagged alerts on window start, user,
# api and system, then drop the helper key columns taken from the alert side.
join_on = [
    alerts_broadcast.date == raw_data.window.start,
    alerts_broadcast.user_temp == raw_data.user,
    alerts_broadcast.api_temp == raw_data.api,
    alerts_broadcast.system_temp == raw_data.system,
]
alerts_email = (
    raw_data
    .join(alerts_broadcast, join_on, "inner")
    .drop('system_temp', 'api_temp', 'user_temp', 'date')
)

In [19]:
from notifier import Notifier
import json
# Build the notifier from an inline JSON config. It defines one named case
# ("exit_2": status ERROR, targets email + snow) and a default case (status OK,
# target email only); both send to the same single recipient and the config
# names one notification endpoint.
# NOTE(review): recipient address and endpoint are hard-coded in the literal;
# the alert_name looks inherited from another alert definition — confirm.
notifier = Notifier(config=json.loads(s='''
{
  "cases": {
    "exit_2": {
      "alert_name": "cms-htcondor-es-validation",
      "email": {
        "send_ok": true,
        "to": [
          "yanisa.sunthornyotin@cern.ch"
        ]
      },
      "entities": [
        "default entity"
      ],
      "snow": {
        "assignment_level": 3,
        "functional_element": "",
        "grouping": true,
        "service_element": "MONITORING"
      },
      "source": "cms-monit-notifier",
      "status": "ERROR",
      "targets": [
        "email",
        "snow"
      ]
    }
  },
  "default_case": {
    "alert_name": "cms-htcondor-es-validation",
    "email": {
      "send_ok": true,
      "to": [
        "yanisa.sunthornyotin@cern.ch"
      ]
    },
    "entities": [
      "default entity"
    ],
    "source": "cms-monit-notifier",
    "status": "OK",
    "targets": [
      "email"
    ]
  },
  "notification_endpoint": "http://monit-alarms.cern.ch:10011"
}'''
                                    ))
# Register notifier.py with the SparkContext.
# NOTE(review): presumably so that executors running the foreach sink can
# import the module — confirm.
sc.addPyFile('notifier.py')

In [20]:
# Streaming sink: one notification per joined alert row.
def _send_alert(alert):
    """Send a notification whose subject is the row's system and whose
    description is the whole row serialised as JSON (non-JSON values via str)."""
    payload = json.dumps(alert.asDict(), default=str)
    return notifier.send_notification(subject=alert.system, description=payload)


alert_flow = (
    alerts_email.writeStream
    .foreach(_send_alert)
    .start()
)

In [32]:
# List the streaming queries currently active on this session.
spark.streams.active

[<pyspark.sql.streaming.StreamingQuery at 0x7fc9451ec9b0>]

In [34]:
# Stop the alerting query.
# NOTE(review): the execution counts show this cell ran (In [34]) after the
# lastProgress cell that follows it (In [33]); on a top-to-bottom run the
# query would already be stopped when lastProgress is read.
alert_flow.stop()

In [33]:
# Show the most recent progress report of the alerting query.
alert_flow.lastProgress
# alert_flow.processAllAvailable()

{'id': '208e9c58-cf5b-4cd9-8c90-399cedc1aa0a',
 'runId': 'a0941b1e-88d6-48b8-b359-d2f23b466405',
 'name': None,
 'timestamp': '2019-08-05T11:55:33.224Z',
 'batchId': 0,
 'numInputRows': 2940,
 'processedRowsPerSecond': 9.398736605201913,
 'durationMs': {'addBatch': 309710,
  'getBatch': 794,
  'getOffset': 1475,
  'queryPlanning': 702,
  'triggerExecution': 312807,
  'walCommit': 49},
 'stateOperators': [{'numRowsTotal': 2988,
   'numRowsUpdated': 2988,
   'memoryUsedBytes': 3103463,
   'customMetrics': {'loadedMapCacheHitCount': 0,
    'loadedMapCacheMissCount': 0,
    'stateOnCurrentVersionSizeBytes': 1931815}}],
 'sources': [{'description': 'FileStreamSource[hdfs://analytix/cms/users/carizapo/ming/fullDiff_cmsweb_logs]',
   'startOffset': None,
   'endOffset': {'logOffset': 0},
   'numInputRows': 2940,
   'processedRowsPerSecond': 9.398736605201913}],
 'sink': {'description': 'ForeachWriterProvider(org.apache.spark.sql.execution.python.PythonForeachWriter@47fe4140,Right(<function1>)

# Kmeans Model evaluation using Silhouette method

# Using Elbow method to determine which 'K' to use

In [None]:
# from pyspark.ml.clustering import KMeans
# from pyspark.ml.evaluation import ClusteringEvaluator
# import matplotlib.pyplot as plt
# import numpy as np

# cost = np.zeros(20)
# for k in range(2,20):
#     kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
#     model = kmeans.fit(train_transformed.sample(False,0.1, seed=1))
#     cost[k] = model.computeCost(train_transformed) # requires Spark 2.0 or later

In [None]:
# fig, ax = plt.subplots(1,1, figsize =(8,6))
# ax.plot(range(2,20),cost[2:20])
# ax.set_xlabel('k')
# ax.set_ylabel('cost')

In [None]:
# # Evaluate clustering by computing Silhouette score
# evaluator = ClusteringEvaluator()
# silhouette = evaluator.evaluate(predictions)
# print("Silhouette with squared euclidean distance = " + str(silhouette))

# Take a glimpse of what the training result looks like

In [None]:
# transform_df=predictions.toPandas()
# print(transform_df['system'].unique())

In [None]:
# import pandas as pd
# filter_data=transform_df[(transform_df.system =='couchdb')]
# filter_data.loc[:,'outlier']=filter_data.apply(lambda x:x['distances']>=x['avg_distances']+x['std_distances'],axis=1)
# filter_data[filter_data.avg_distances>=0].head(10)

In [None]:
# filter_data[filter_data.outlier==True].head(10)

In [None]:
# filter_data.set_index('date', inplace=True)

Calculate the Euclidean distance and label the outlier

Treat the group of points furthest from the cluster center as outliers

In [None]:
# import math
# max_df=(filter_data.groupby('prediction'))['distances'].max()
# outlier_factor=0.4
# filter_data.loc[:,'outlier']=filter_data.apply(lambda x:x['distances']>= max_df.loc[x['prediction']]-(max_df.loc[x['prediction']]*outlier_factor), axis=1)

In [None]:
# prep=filter_data[['%diff_req','%diff_sys','%diff_api','%diff_user']]
# label=filter_data[['outlier']]

In [None]:
# %matplotlib inline

# prep_fix=prep.loc['2019-07-17 07:02:00':'2019-07-18 11:26:00']
# label_fix=label.loc['2019-07-17 07:02:00':'2019-07-18 11:26:00']
# prep_fix.plot()
# plt.plot(prep_fix.index,label_fix.outlier,'o')
# prep.plot()
# plt.plot(prep.index,label.outlier,'o')