
%pip install --upgrade azure-ai-anomalydetector

In [1]:
import os
import time
from datetime import datetime

from azure.ai.anomalydetector import AnomalyDetectorClient
from azure.ai.anomalydetector.models import DetectionRequest
from azure.core.exceptions import HttpResponseError

# Prepare data for Azure Anomaly Detection



Data is obtained from https://archive.ics.uci.edu/ml/datasets/SML2010

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
spark = SparkSession.builder.master("local").appName("AzureData").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-03-13 02:28:49,116 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [12]:
df= spark.read.csv("file:////home/hadoop/lohrasp/data-science/Azure/SML2010.csv",
    inferSchema=True,header=True)

The columns Date and Time will be concatenated to construct the  timestamp column. 

In [110]:
df.select("Date","Time").show(2)

+----------+-----+
|      Date| Time|
+----------+-----+
|13/03/2012|11:45|
|13/03/2012|12:00|
+----------+-----+
only showing top 2 rows



In [13]:
from pyspark.sql.functions import col,concat,lit,to_timestamp,udf
from pyspark.sql.types import StringType

## Each time series should be a CSV file with two (and only two) columns, "timestamp" and "timestamp" (all in lowercase) as the header row.


Although the following code provides timestamp, but for Azure timestamp should have 'iso-8601' 

### Not iso-8601

In [137]:
df1 = df.withColumn("timestamp1", concat(col("Date"),lit(" ") ,col("Time")))
df2 = df1.withColumn("timestamp",to_timestamp("timestamp1","dd/MM/yyyy HH:mm"))
for c in df.drop("Date","time").columns:
    df2.withColumnRenamed(c,"value").select("timestamp","value").toPandas().to_csv(f"data/{c}.csv",sep=",")

### iso-8601
The following ISO 8601 UTC formats are currently accepted by Azure Storage. The date value is required, while the time value is optional:([reference]("https://docs.microsoft.com/en-us/rest/api/storageservices/formatting-datetime-values"))

    YYYY-MM-DD
    YYYY-MM-DDThh:mm<TZDSuffix>
    YYYY-MM-DDThh:mm:ss<TZDSuffix>


In [14]:
def convert_date(datestring):
    d,m,y = datestring.split("/")
    return "-".join([y,m,d])
convert_date_UDF = udf(lambda z: convert_date(z),StringType())

In [15]:
df1 = df.withColumn("Date1",convert_date_UDF("Date"))#
df2 = df1.withColumn("timestamp", concat(col("Date1"),lit("T") ,col("Time")))
df2.select("timestamp").toPandas().head(2)
for c in df.drop("Date","time").columns:
    df2.withColumnRenamed(c,"value").select("timestamp","value").toPandas().to_csv(f"data/{c}.csv",sep=",",index=False)


In [47]:
df1.toPandas().head(2)

Unnamed: 0,Date,avg(Meteo_Exterior_Sol_Oest),avg(Temperature_Comedor_Sensor),avg(Humedad_Comedor_Sensor),avg(Exterior_Entalpic_1),avg(CO2_Comedor_Sensor),avg(Temperature_Exterior_Sensor),avg(Humedad_Exterior_Sensor),avg(Exterior_Entalpic_2),avg(Weather_Temperature),...,avg(Meteo_Exterior_Piranometro),avg(Lighting_Habitacion_Sensor),avg(Meteo_Exterior_Sol_Sud),avg(Temperature_Habitacion_Sensor),avg(Meteo_Exterior_Viento),avg(CO2_Habitacion_Sensor),avg(Meteo_Exterior_Sol_Est),avg(Meteo_Exterior_Crepusculo),avg(Exterior_Entalpic_turbo),avg(Precipitacion)
0,17/03/2012,17330.414958,19.40044,44.252475,0.0,205.698104,17.661935,53.903472,0.0,15.0,...,229.015708,44.591629,28945.614896,18.983603,1.275833,208.025354,14084.089563,321.087069,0.0,0.0
1,20/03/2012,906.229396,14.328148,36.61289,0.0,200.39076,11.266743,71.105419,0.0,11.980556,...,18.248523,17.663475,969.919646,14.127175,2.756861,201.955115,1064.447854,296.263149,0.0,0.926389


In [62]:
df11 = df.withColumn("timestamp" , convert_date_UDF(col("Date"))) 
df1 = df11.groupBy("timestamp").agg({c:'avg' for c in df.drop("Date","Time","timestamp").columns})
for c in df1.drop("timestamp").columns:
    df1.withColumnRenamed(c,"value").select("timestamp","value").toPandas().to_csv(f"data2/{c}.csv",sep=",",index=False)

In [61]:
df1.toPandas().head(2)

Unnamed: 0,timestamp,avg(Meteo_Exterior_Sol_Oest),avg(Temperature_Comedor_Sensor),avg(Humedad_Comedor_Sensor),avg(Exterior_Entalpic_1),avg(CO2_Comedor_Sensor),avg(Temperature_Exterior_Sensor),avg(Humedad_Exterior_Sensor),avg(Exterior_Entalpic_2),avg(Weather_Temperature),...,avg(Meteo_Exterior_Piranometro),avg(Lighting_Habitacion_Sensor),avg(Meteo_Exterior_Sol_Sud),avg(Temperature_Habitacion_Sensor),avg(Meteo_Exterior_Viento),avg(CO2_Habitacion_Sensor),avg(Meteo_Exterior_Sol_Est),avg(Meteo_Exterior_Crepusculo),avg(Exterior_Entalpic_turbo),avg(Precipitacion)
0,2012-03-19,16771.943229,19.478197,34.189458,0.0,201.914021,15.409863,37.14647,0.0,15.0,...,222.65768,40.190433,24604.482771,18.905979,1.184563,204.308469,11141.635458,317.661243,0.0,0.0
1,2012-03-22,18136.550396,15.335876,48.926484,0.0,212.914031,14.01248,58.610003,0.0,11.306947,...,247.061055,45.967877,28481.745604,14.944966,1.230139,219.618667,15032.296188,323.243211,0.0,0.0


## All the CSV files should be zipped into one zip file without any subfolders.

In [74]:
!zip SML2010v4.zip data3/* 

  adding: data3/CO2_Comedor_Sensor.csv (deflated 62%)
  adding: data3/Day_Of_Week.csv (deflated 78%)


## The zip file should be uploaded to Azure Blob storage

https://docs.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-portal

# Anomaly Detector 

Create an anomaly detector in Azure portal

In [3]:
subscription_key = "8b68d766f0af469ba3fb2ca1da3a7e8e"
anomaly_detector_endpoint = "https://anomalydetectormattnaj1.cognitiveservices.azure.com/"


In [4]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.anomalydetector import AnomalyDetectorClient

ad_client = AnomalyDetectorClient(AzureKeyCredential(subscription_key), anomaly_detector_endpoint)

In [4]:
list(ad_client.list_multivariate_model())


[]

In [10]:
Blob_SAS_token = "sp=r&st=2022-03-14T02:48:36Z&se=2022-03-21T10:48:36Z&spr=https&sv=2020-08-04&sr=b&sig=bOfgmnfnS3bBl%2BVLPum%2B6p3LlZZItBLvu0fKOeWa7%2Bw%3D"
Blob_SAS_URL = "https://aifundamentalstorge234.blob.core.windows.net/sml2010/SML2010v3.zip?sp=r&st=2022-03-14T02:48:36Z&se=2022-03-21T10:48:36Z&spr=https&sv=2020-08-04&sr=b&sig=bOfgmnfnS3bBl%2BVLPum%2B6p3LlZZItBLvu0fKOeWa7%2Bw%3D"

In [11]:
from azure.ai.anomalydetector.models import ModelInfo,ModelStatus
start_time = "2012-03-13T11:45:00"
end_time = "2012-04-11T06:30:00" 
data_feed2 = ModelInfo(start_time=start_time, end_time=end_time, source=Blob_SAS_URL)

In [12]:
response_header2 = ad_client.train_multivariate_model(data_feed2, cls=lambda *args: [args[i] for i in range(len(args))])[-1]
trained_model_id = response_header2['Location'].split("/")[-1]
new_model_list = list(ad_client.list_multivariate_model(skip=0, top=10000))


Datetime with no tzinfo will be considered UTC.
Datetime with no tzinfo will be considered UTC.


In [20]:
model_status = None
while model_status != ModelStatus.READY and model_status != ModelStatus.FAILED:
     model_info = ad_client.get_multivariate_model(trained_model_id).model_info
     model_status = model_info.status

if model_status == ModelStatus.READY:
    # Model list after training
    new_model_list = list(ad_client.list_multivariate_model(skip=0, top=10000))
    print("Done.\n--------------------")
    print("{:d} available models after training.".format(len(new_model_list)))


Done.
--------------------
1 available models after training.


Detect anomaly in the same data source (but a different interval)

In [None]:
from azure.ai.anomalydetector.models import DetectionRequest,DetectionStatus

try:
    detection_req = DetectionRequest(source=Blob_SAS_URL, start_time=start_time, end_time=end_time)
    response_header = ad_client.detect_anomaly(trained_model_id, detection_req,
                                                    cls=lambda *args: [args[i] for i in range(len(args))])[-1]
    result_id = response_header['Location'].split("/")[-1]

    # Get results 
    r = ad_client.get_detection_result(result_id)
    while r.summary.status != DetectionStatus.READY and r.summary.status != DetectionStatus.FAILED:
        r = ad_client.get_detection_result(result_id)
        time.sleep(2)
except HttpResponseError as e:
    print('Error code: {}'.format(e.error.code), 'Error message: {}'.format(e.error.message))
except Exception as e:
    raise e