<a href="https://colab.research.google.com/github/marcochisci/Anomaly_detection/blob/main/Anomaly_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Apache Spark 3.0.0

In [1]:
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark3.0.0
!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz
# unzip it
!tar xf spark-3.0.0-bin-hadoop3.2.tgz
# install findspark 
!pip install -q findspark

# Set Environment Variables

In [2]:
import os
# Export the JDK and Spark locations used by this notebook in a single call.
os.environ.update(
    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64",
    SPARK_HOME="/content/spark-3.0.0-bin-hadoop3.2",
)

# Create a local Spark session


In [3]:
import findspark
# Locate the Spark installation before pyspark is imported.
findspark.init()
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

# getOrCreate() reuses an already-running session/context, so re-running this
# cell no longer fails because a SparkContext already exists in the kernel.
spark = SparkSession.builder.master('local').getOrCreate()
# Keep exposing the underlying context under the name `sc`.
sc = spark.sparkContext

Installation test and PySpark version check

In [4]:
# Smoke test: build a tiny two-column DataFrame through the Spark session,
# print its schema and contents, then report the installed pyspark version.
import pyspark
from pyspark.sql.types import *
from pyspark.sql import Row

# One Row per (name, age) pair
rows = [
    Row(name=person_name, age=person_age)
    for person_name, person_age in [('Severin', 33), ('John', 48)]
]
# Explicit schema: a string column followed by an integer column
schema = StructType([
    StructField('name', StringType()),
    StructField('age', IntegerType()),
])
df = spark.createDataFrame(rows, schema)

df.printSchema()
df.show()

# Check the pyspark version
print(pyspark.__version__)


root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)

+-------+---+
|   name|age|
+-------+---+
|Severin| 33|
|   John| 48|
+-------+---+

3.0.0


# AirQino table data

In [11]:
import datetime
import os
from getpass import getpass

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import pandas as pd
import psycopg2

# sensor_id is the sensor identifier taken from the `sensor` table;
# station_id is the station (control unit) identifier taken from the `station` table.
SENSOR_ID = 29510692
STATION_ID = 23284701
# Only readings acquired at or after this instant are loaded.
START_TIMESTAMP = '2021-06-01 00:00:00'

# The database password is no longer stored in the notebook: it is read from
# the AIRQINO_DB_PASSWORD environment variable, or prompted for when unset.
# NOTE(review): the password that used to be committed here should be treated
# as exposed and rotated.
db_password = os.environ.get('AIRQINO_DB_PASSWORD') or getpass('AirQino database password: ')

conn = psycopg2.connect(
    host='playground.magentalab.it',
    port='55432',
    database='airqino',
    user='datareader',
    password=db_password,
)

# Values are passed as driver-side parameters (psycopg2 `%(name)s` style)
# rather than being formatted into the SQL text.
query = """select sd.data_acquired as timestamp, sd.float_value as value
from station_data sd
where sd.sensor_id = %(sensor_id)s
and sd.data_acquired >= to_timestamp(%(start)s, 'YYYY-mm-dd HH24:MI:SS')
and sd.station_id = %(station_id)s order by sd.data_acquired asc;"""

df = pd.read_sql(
    query,
    conn,
    params={
        'sensor_id': SENSOR_ID,
        'start': START_TIMESTAMP,
        'station_id': STATION_ID,
    },
)
# Index the readings by acquisition time
df = df.set_index('timestamp')

display(df.head(5))

Unnamed: 0_level_0,value
timestamp,Unnamed: 1_level_1
2021-06-29 10:40:00,466.0
2021-06-29 10:42:00,470.0
2021-06-29 10:46:00,480.0
2021-06-29 10:48:00,476.0
2021-06-29 11:00:00,458.0


# Testing stationarity 

In [12]:
from statsmodels.tsa.stattools import adfuller

def test_stationarity(ts_data, column='', signif=0.05, series=False):
    """Classify a time series as stationary or not with the ADF test.

    Args:
        ts_data: The data to test. Passed to ``adfuller`` directly when
            ``series`` is true; otherwise ``ts_data[column]`` is tested.
        column: Key used to select the series from ``ts_data`` when
            ``series`` is false.
        signif: Significance threshold compared against the test's p-value.
        series: Whether ``ts_data`` is itself the series to test.

    Returns:
        ``"Stationary"`` when the p-value is less than or equal to
        ``signif``, otherwise ``"Non-Stationary"``.
    """
    sample = ts_data if series else ts_data[column]
    # The p-value is the second element of adfuller's result
    p_value = adfuller(sample, autolag='AIC')[1]
    return "Stationary" if p_value <= signif else "Non-Stationary"

# Run the stationarity check on every column of df, keyed by column name.
adf_test_results = dict(
    (name, test_stationarity(df, column=name)) for name in df.columns
)
adf_test_results

{'value': 'Stationary'}

Converting the series to stationary with first-order differencing

In [13]:
def differencing(data, column, order):
    """Return ``data[column]`` differenced over ``order`` periods.

    Missing values in the differenced series (including the entries that
    have no counterpart ``order`` rows away) are replaced with the mean of
    the differenced series. ``data`` itself is not modified.

    Args:
        data: Table holding the series to difference.
        column: Name of the column to difference.
        order: Number of periods passed to ``diff``.

    Returns:
        The differenced series with missing values filled by its mean.
    """
    deltas = data[column].diff(order)
    return deltas.fillna(deltas.mean())
# Replace every column of df with its first-order difference.
# Note: this overwrites df, so running the cell again differences the data again.
for col in df.columns:
    df[col] = differencing(data=df, column=col, order=1)

Re-test for stationarity after differencing

In [14]:
# Run the stationarity check on every column of df, keyed by column name.
adf_test_results = dict(
    (name, test_stationarity(df, column=name)) for name in df.columns
)
adf_test_results

{'value': 'Stationary'}