In [1]:
# PySpark in Jupyter: findspark is pointed at the cluster's Spark 2 client
# directory so that the pyspark imports in the following cells can be resolved.
import findspark
findspark.init(spark_home="/usr/hdp/current/spark2-client")

In [2]:
# Spark Context Initialization 

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.types import *
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.sql.functions import lit ,row_number,col, monotonically_increasing_id, when
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# getOrCreate() reuses the SparkContext already running in this kernel when the
# cell is executed again; a bare SparkContext() raises ValueError in that case
# because only one context may be active per process.
sc = SparkContext.getOrCreate()
# SQL entry point used below to read the sensor CSV.
sqlc = SQLContext(sc)

In [3]:
# Visualization Tools 
import seaborn as sns
from matplotlib import style
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from matplotlib.gridspec import GridSpec
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
import plotly.graph_objects as go
import plotly.express as px

# Enable plotly's offline notebook rendering for this session.
init_notebook_mode(connected=True)
# Apply seaborn's defaults, then select matplotlib's 'fivethirtyeight' style.
# NOTE(review): both calls appear to adjust matplotlib styling, so their
# relative order likely matters; keep style.use() after sns.set() unless confirmed.
sns.set()
style.use('fivethirtyeight')

In [4]:
import numpy as np
import pandas as pd
import datetime


In [5]:
# Define Constants
# Row-index ranges (start, end) labelling each reading. They are matched with
# Column.between() against the "index" column built in add_new_columns, whose
# first row is numbered 2.
# Day and night ranges alternate and together cover 2..9624 with no gap or
# overlap. Two day entries were mistyped as (81,1445) and (217,2855), which
# overlapped the neighbouring night ranges; they are (811,1445) and (2170,2855).
day = [(2,116), (811,1445), (2170,2855), (3634,4410), (5156,5885), (6554,7189), (7833,8476), (9078,9624)]
night = [(117,810), (1446,2169), (2856,3633), (4411,5155), (5886,6553), (7190,7832), (8477,9077)]
# Dry and rainy ranges likewise cover 2..9624 between them.
dry = [(2,6148), (7462,9624)]
rainy = [(6149,7461)]
# Proportionality constant used by cal_quality to weight every parameter.
proConstant = 1.8615
# Reference ("standard") values that cal_quality divides the constant by.
standardTemp = 23.17
standardPH = 7.0
standardTurb = 198
# Columns iterated over when plotting anomalies.
headers = ['Temperature','pH','Turbidity','Quality']
# Names of the indicator columns created by add_new_columns.
variables = ['Humidity','Night']
# Colour names; not referenced by the visible cells (presumably a chart palette).
colors = ['lightcoral','deepskyblue',
          'orchid',    'tomato',
          'teal',      'darkcyan',
          'limegreen', 'darkorange']

In [6]:
# Water quality index: Quality = sum(w_i * value_i), with w_i = proConstant / S_i (S_i = standard value of parameter i)
def cal_water_quality_index(rawData):
    """Append a 'Quality' column computed row by row with cal_quality.

    Parameters
    ----------
    rawData : Spark DataFrame
        Must expose the columns Temperature, pH and Turbidity.

    Returns
    -------
    Spark DataFrame
        The input plus 'Quality' (FloatType), with the helper columns
        'index', 'sTemp', 'sPH' and 'sTurb' dropped.
    """
    # Expose each reference value as a literal column so it can be passed to the UDF.
    for helper_name, reference in (("sTemp", standardTemp),
                                   ("sPH", standardPH),
                                   ("sTurb", standardTurb)):
        rawData = rawData.withColumn(helper_name, lit(reference))

    quality_udf = F.udf(cal_quality, FloatType())
    udf_inputs = [
        rawData.Temperature, rawData.pH, rawData.Turbidity,
        rawData.sTemp, rawData.sPH, rawData.sTurb,
    ]
    with_quality = rawData.withColumn('Quality', quality_udf(*udf_inputs))

    # The helper columns and the positional 'index' are not needed downstream.
    return with_quality.drop('index', 'sTemp', 'sPH', 'sTurb')

In [7]:
# Row-wise quality computation; wrapped as a PySpark UDF in cal_water_quality_index
def cal_quality(Temp,PH,Trub,sTemp,sPH,sTurb):
    """Return the weighted quality value for one row.

    Each measurement is multiplied by proConstant divided by its reference
    value, and the three products are summed.

    Parameters
    ----------
    Temp, PH, Trub : number or None
        Measured temperature, pH and turbidity for the row.
    sTemp, sPH, sTurb : number or None
        Reference values for the same three parameters.

    Returns
    -------
    float or None
        The quality value, or None when any input is missing.
    """
    # Spark hands SQL NULLs to a Python UDF as None. Arithmetic on None raises
    # TypeError and fails the whole job, so propagate the missing value instead.
    if any(value is None for value in (Temp, PH, Trub, sTemp, sPH, sTurb)):
        return None
    return (Temp * (proConstant / sTemp)
            + PH * (proConstant / sPH)
            + Trub * (proConstant / sTurb))

In [8]:
# Adding Humidity:Dry=0/Rainy=1, Time: Day=0/Night=1, Quality Columns to raw data
def _range_flag(index_col, ranges):
    """Build a column that is 1 where index_col lies in any (start, end) range, else 0."""
    flag = lit(0)
    for start, end in ranges:
        flag = when(index_col.between(int(start), int(end)), lit(1)).otherwise(flag)
    return flag


def add_new_columns(rawData):
    """Add the 'Humidity', 'Night' and 'index' columns to the raw readings.

    'index' numbers the rows consecutively starting at 2 (presumably the
    1-based line number in the CSV, whose first line is the header; confirm).
    'Night' is 1 for rows whose index falls in one of the `night` ranges and
    'Humidity' is 1 for rows in one of the `rainy` ranges; both are 0 otherwise.

    Parameters
    ----------
    rawData : Spark DataFrame
        Readings in their original row order.

    Returns
    -------
    Spark DataFrame
        The input with 'Humidity', 'Night' and 'index' appended, in that order.
    """
    # Local import: Window is not among the notebook's top-level imports.
    from pyspark.sql.window import Window

    # Create the indicator columns first so the column order stays
    # Humidity, Night, index.
    rawData = rawData.withColumn("Humidity", lit(0))
    rawData = rawData.withColumn("Night", lit(0))

    # monotonically_increasing_id() is unique and increasing but NOT consecutive
    # across partitions, so it cannot be compared against fixed row ranges.
    # It is used only as the ordering key; row_number() then assigns 1, 2, 3, ...
    # and the +1 keeps the first row at index 2.
    # NOTE: a window without partitionBy moves all rows onto one partition,
    # which is acceptable for a dataset of this size.
    row_order = Window.orderBy(monotonically_increasing_id())
    rawData = rawData.withColumn("index", row_number().over(row_order) + 1)

    rawData = rawData.withColumn("Night", _range_flag(rawData.index, night))
    rawData = rawData.withColumn("Humidity", _range_flag(rawData.index, rainy))
    return rawData

In [9]:
# Plot Anomalies 
def plot_anomalies(plotData, variable):
    """Show a line chart of one variable over time with anomalies marked in red.

    Parameters
    ----------
    plotData : pandas.DataFrame
        Must contain 'Date_Time', 'Prediction' and the column named by
        `variable`. Rows with Prediction == -1 are drawn as anomalies.
    variable : str
        Column to plot. 'Quality' is skipped and nothing is drawn.

    Returns
    -------
    None
    """
    # Decide from the argument itself. The previous check read the notebook
    # global `head`, which only exists while the calling loop is running and
    # raised NameError (or tested a stale value) on any direct call.
    if variable == 'Quality':
        return

    fig = px.line(plotData, x="Date_Time", y=[variable], title='Water Quality', template='plotly_dark')

    # Overlay the flagged rows as red markers on top of the line.
    anomalyData = plotData.loc[plotData["Prediction"] == -1]
    fig.add_trace(go.Scatter(x=anomalyData["Date_Time"], y=anomalyData[variable],
                             mode='markers',
                             name='Anomaly',
                             marker=dict(color='red', size=10)))

    fig.show()

In [10]:
# Downsample the data into fixed windows of the given number of minutes
def downsample(rawData,minutes):
    """Average the readings over fixed time windows and return a pandas frame.

    Parameters
    ----------
    rawData : Spark DataFrame
        Its first seven columns are collected to pandas and must include
        'Date_Time', 'Humidity' and 'Night'.
        NOTE(review): after preprocessing these are presumably Date_Time,
        Temperature, pH, Turbidity, Humidity, Night, Quality; confirm.
    minutes : int
        Window length in minutes.

    Returns
    -------
    pandas.DataFrame
        Indexed by window start ('Date_Time'), holding the mean of each column
        per window. 'Humidity' and 'Night' are rounded back to integer flags,
        with empty windows set to 0.
    """
    resampledData = rawData.select(rawData.columns[:7]).toPandas()
    resampledData = resampledData.set_index('Date_Time')
    # 'min' is the minute alias accepted by both old and new pandas; the
    # single-letter 'T' alias is deprecated in recent releases.
    resampledData = resampledData.resample(str(minutes) + 'min').mean()
    # Averaging turns the 0/1 indicators into fractions (NaN for empty windows);
    # restore them to integer flags.
    for flag_column in ('Humidity', 'Night'):
        resampledData[flag_column] = resampledData[flag_column].fillna(0).round().astype(int)
    return resampledData

In [11]:
# Preprocessing Steps
def preprocessing(rawData):
    """Normalise column names, then add the indicator and quality columns.

    Columns 0, 1 and 3 are renamed by position to 'Date_Time', 'Temperature'
    and 'Turbidity' (for example 'Temperature (°C)' becomes 'Temperature').
    Column 2 keeps its original name. The frame is then passed through
    add_new_columns and cal_water_quality_index.

    Parameters
    ----------
    rawData : Spark DataFrame
        Raw sensor readings with at least four columns.

    Returns
    -------
    Spark DataFrame
        The renamed frame with Humidity, Night and Quality added.
    """
    positional_names = ((0, 'Date_Time'), (1, 'Temperature'), (3, 'Turbidity'))
    for position, new_name in positional_names:
        rawData = rawData.withColumnRenamed(rawData.columns[position], new_name)

    # Humidity: Dry=0/Rainy=1, Night: Day=0/Night=1, then the Quality column.
    return cal_water_quality_index(add_new_columns(rawData))

In [12]:
# Load the raw 30 cm sensor readings from HDFS.
# Change dataPath to the directory that holds your copy of the data.
dataPath = 'hdfs://node1.sepahtan:8020/data/'
dataPathSens30 = dataPath + "Sensor_data_for_30_cm.csv"
rawDataSens30 = (
    sqlc.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferschema='true')
    .load(dataPathSens30)
)
# Row count, displayed as the cell output.
rawDataSens30.count()


9623

In [13]:
# Run the Spark-side preprocessing, then collapse the result into
# 60-minute averages held in a pandas frame.
rawDataSens30 = preprocessing(rawDataSens30)
downsampleData = downsample(rawDataSens30, 60)
# Model input: one [Temperature, pH, Turbidity] list per 60-minute window.
x_train = downsampleData[['Temperature','pH','Turbidity']].values.tolist()



In [14]:

# Isolation forest used as the anomaly detector; random_state is fixed so
# repeated runs use the same seed.
classifier = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination=0.3,
    random_state=200,
    n_jobs=-1,
)

In [15]:

# Fit the isolation forest on the downsampled Temperature/pH/Turbidity rows.
clf = classifier.fit(x_train)

In [16]:

# Label the same rows the model was fitted on; later cells treat a label of -1
# as an anomaly.
prediction =clf.predict(x_train)

In [17]:
# Attach the model labels to the downsampled frame. The column is written into
# downsampleData itself; no copy is made.
downsampleData['Prediction'] = prediction.tolist()
# reset_index() returns a new frame in which Date_Time is a regular column again.
rawDataDF = downsampleData.reset_index()

In [18]:
# Keep only the columns the anomaly charts need.
plotData = rawDataDF[[
    'Date_Time',
    'Temperature', 'pH', 'Turbidity',
    'Prediction',
]]

In [19]:

# Rows labelled as anomalies (Prediction == -1), indexed by timestamp.
anomalyData = plotData.loc[plotData["Prediction"] == -1].set_index('Date_Time')

In [20]:

# Draw one anomaly chart per entry in headers.
# NOTE(review): 'Quality' is listed in headers but is not a column of plotData;
# this relies on plot_anomalies returning early for it.
for head in headers:
    plot_anomalies(plotData, head)



In [21]:
# 3-D scatter of the three sensor readings, coloured by the model's label.
fig = px.scatter_3d(
    plotData,
    x='Temperature',
    y='pH',
    z='Turbidity',
    title='Water Quality',
    color='Prediction',
)
fig.show()