# Dashboard

In [None]:
import warnings, logging
warnings.filterwarnings("ignore")
logging.getLogger().setLevel(logging.ERROR)
from dash import Dash, html, dcc, Input, Output, State
import plotly.express as px
import os, glob
import pytz
import pandas as pd
import psycopg2, psycopg2.extras
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
import kats
from kats.utils.decomposition import TimeSeriesDecomposition
from kats.detectors.trend_mk import MKDetector
from kats.consts import TimeSeriesData
from kats.detectors.seasonality import FFTDetector
from kats.detectors.cusum_detection import CUSUMDetector
from kats.detectors.outlier import OutlierDetector
import calendar
import plotly.tools as tls
import plotly.graph_objs as go

## Data Retrieval
- Generic data retrieval
- Dataset split based on year
- Dataset retrieval based on a given date range

In [None]:
def retrieve_data(db_name, username, db_host, db_password, db_port, query, column):
    """Run ``query`` on a PostgreSQL database and return all of its rows.

    Args:
        db_name, username, db_host, db_password, db_port: connection
            settings forwarded to ``psycopg2.connect``.
        query: SQL text to execute.
        column: name given to the cursor. Passing a name to
            ``connection.cursor`` creates a named cursor.

    Returns:
        The list produced by ``cursor.fetchall()``; rows are built by
        ``psycopg2.extras.DictCursor``.
    """
    conn = psycopg2.connect(dbname=db_name, user=username, host=db_host, password=db_password, port=db_port)
    try:
        cur = conn.cursor(column, cursor_factory=psycopg2.extras.DictCursor)
        try:
            cur.execute(query)
            return cur.fetchall()
        finally:
            cur.close()
    finally:
        # Every call opens its own connection, so it has to be released here
        # or each dashboard refresh leaks one.
        conn.close()

In [None]:
def split_df(df):
    """Split ``df`` into its 2021 rows and its 2022 rows.

    Args:
        df: DataFrame with a ``data`` column whose values expose a ``year``
            attribute (e.g. pandas timestamps).

    Returns:
        ``(df1, df2)``: the rows whose ``data`` falls in 2021 and in 2022
        respectively, each sorted by ``data``. Rows from any other year are
        dropped. A year without rows yields an empty DataFrame that still
        carries the columns of ``df``.
    """
    # Boolean masks work for any index and also when a year has no rows;
    # building a frame from an empty list and sorting it by 'data' does not.
    years = df['data'].map(lambda stamp: stamp.year)
    df1 = df[years == 2021].sort_values(by='data')
    df2 = df[years == 2022].sort_values(by='data')
    return df1, df2

In [None]:
def get_splitted_df(db_name, username, db_host, db_password, db_port, query, column, atm, sostanza, analysis_column):
    """Fetch a two-column query result and return it split by year.

    The rows returned by ``retrieve_data`` are loaded into a DataFrame with
    the columns ``data`` and ``analysis_column``; ``data`` is parsed as a UTC
    timestamp and the frame is handed to ``split_df``.

    ``atm`` and ``sostanza`` are accepted but not used by this function.

    Returns:
        The ``(df1, df2)`` pair produced by ``split_df``.
    """
    rows = retrieve_data(db_name, username, db_host, db_password, db_port, query, column)
    frame = pd.DataFrame(np.array(rows), columns=['data', analysis_column])
    frame['data'] = pd.to_datetime(frame['data'], format='%Y-%m-%d %H:%M:%S', utc=True)
    frame.sort_index(inplace=True)
    first_year, second_year = split_df(frame)
    return first_year, second_year

In [None]:
def get_df_from_range(df, start_date, end_date):
    """Return the rows of ``df`` whose ``data`` lies in ``[start_date, end_date]``.

    Args:
        df: DataFrame with a timezone-aware ``data`` column.
        start_date: lower bound as a ``"YYYY-MM-DD"`` string (inclusive).
        end_date: upper bound as a ``"YYYY-MM-DD"`` string (inclusive). Both
            bounds are interpreted as midnight UTC.

    Returns:
        The matching rows sorted by ``data``. When nothing matches, an empty
        DataFrame with the same columns as ``df`` is returned (not a bare
        list), so callers can test ``len(result)`` and still select columns.

    Raises:
        ValueError: if a bound does not match the ``"%Y-%m-%d"`` format.
    """
    # Local import: the file only imports the ``datetime`` class itself.
    from datetime import timezone

    start = datetime.strptime(start_date, "%Y-%m-%d").replace(tzinfo=timezone.utc)
    end = datetime.strptime(end_date, "%Y-%m-%d").replace(tzinfo=timezone.utc)

    if len(df) == 0:
        return df

    # ``between`` is inclusive on both ends, matching ``>= start and <= end``.
    in_range = df["data"].between(start, end)
    return df[in_range].sort_values(by="data")

## Functions for chart generation
- Generic chart
- Weekly average chart (necessary for setting odor thresholds)
- Scatter plot chart

In [None]:
#Generic chart
def generate_chart(x_value, y_value, title, chart_type):
    """Build a single-trace figure as a plain dict.

    Args:
        x_value: values for the x axis.
        y_value: values for the y axis.
        title: text of the chart title, anchored on the left.
        chart_type: value stored under the trace's ``type`` key
            (callers in this file pass ``'bar'`` or ``'lines'``).

    Returns:
        A dict with a ``data`` list holding one trace and a ``layout`` dict
        that sets ``fixedrange`` on both axes and a single-colour colorway.
    """
    trace = dict(x=x_value, y=y_value, type=chart_type)
    layout = {
        "title": dict(text=title, x=0.05, xanchor="left"),
        "xaxis": dict(fixedrange=True),
        "yaxis": dict(fixedrange=True),
        "colorway": ["#17B897"],
    }
    return {"data": [trace], "layout": layout}

In [None]:
#Weekly average chart (necessary for setting odor thresholds)
def generate_average_chart(sostanza, x_value, y_value, df, title, chart_type):
    """Build the weekly-average chart for one chemical.

    For ``h2s_ppb`` with a non-empty ``df`` a plotly-express line chart of
    ``weekly_average`` over ``data`` is returned, with three red horizontal
    reference lines at 0.1, 0.41 and 8 (presumably the odor thresholds
    mentioned in the section heading; confirm the values with the domain
    owner). Every other case falls back to ``generate_chart``.

    Args:
        sostanza: name of the chemical column being plotted.
        x_value, y_value: axis values used by the fallback chart.
        df: DataFrame with ``data`` and ``weekly_average`` columns, or any
            empty sequence when there is nothing to plot.
        title: chart title.
        chart_type: trace type used by the fallback chart.

    Returns:
        A plotly figure (H2S case) or the dict built by ``generate_chart``.
    """
    # The emptiness check has to look at the ``df`` argument; this function
    # has no ``df21`` in scope.
    if sostanza != 'h2s_ppb' or len(df) == 0:
        return generate_chart(x_value, y_value, title, chart_type)

    # Pass the title through so the H2S chart is labelled like the others.
    fig = px.line(df, x="data", y="weekly_average", title=title)
    fig.update_traces(line_color='aqua')
    fig.update_layout({
        'plot_bgcolor': 'rgba(0, 0, 0, 0)',
        'paper_bgcolor': 'rgba(0, 0, 0, 0)',
    })
    for threshold in (0.1, 0.41, 8):
        fig.add_hline(y=threshold, line_width=1, line_color="red")
    return fig

In [None]:
#Scatter plot chart
def generate_scatter_plot(x_value, y_value, title):
    """Build a marker-only scatter figure as a dict.

    Args:
        x_value: values for the x axis.
        y_value: values for the y axis.
        title: text of the chart title, anchored on the left.

    Returns:
        A dict whose ``data`` list holds one ``go.Scatter`` trace drawn with
        markers only, and whose ``layout`` sets ``fixedrange`` on both axes.
    """
    marker_style = {
        'size': 15,
        'line': {'width': 0.5, 'color': 'white'},
    }
    points = go.Scatter(
        x=x_value,
        y=y_value,
        mode='markers',
        opacity=0.8,
        marker=marker_style,
    )
    layout = {
        "title": dict(text=title, x=0.05, xanchor="left"),
        "xaxis": dict(fixedrange=True),
        "yaxis": dict(fixedrange=True),
        "colorway": ["#17B897"],
    }
    return {"data": [points], "layout": layout}

## Update chart functions (Dashboard)
- Data ingestion
- Global Null Values
- Weekly Null Values
- Values below minimum threshold detectable by sensors
- Weekly Average
- Seasonality 
- Outliers


### Data Ingestion

In [None]:
def insert_data(f, cur):
    """Bulk-load one CSV file into the table of the station named in it.

    The target table is the first known station name that occurs in ``f``.
    The first line of the file is skipped as a header, the remaining lines
    are passed to ``cur.copy_from`` (comma separated, empty field = NULL),
    and ``f`` is recorded in ``file_caricati``. Files whose name contains no
    known station name are ignored.

    Args:
        f: path of the CSV file; also the value stored in ``file_caricati``.
        cur: open database cursor providing ``copy_from`` and ``execute``.
    """
    # Each station is listed once: a duplicated name would copy the same file
    # into its table twice. 'atm14' is included to match the station list
    # offered by the dashboard's ATM dropdown.
    atm_names = ('atm05', 'atm06', 'atm07', 'atm10', 'atm11', 'atm12', 'atm13', 'atm14')
    table = next((atm for atm in atm_names if atm in f), None)
    if table is None:
        return

    with open(f, 'r') as csv_file:
        next(csv_file, None)  # skip the header line; tolerate an empty file
        cur.copy_from(csv_file, table, sep=',', null='')
    cur.execute("INSERT INTO file_caricati (nome) VALUES (%s);", (f,))

In [None]:
def data_ingestion(dir_path, db_name, username, db_host, db_password, db_port, column='data'):
    """Load every not-yet-imported CSV file in ``dir_path`` into the database.

    File names already present in ``file_caricati`` are skipped; every other
    ``*.csv`` file is handed to ``insert_data``. All inserts are committed
    together at the end; if anything fails the connection is closed without
    committing.

    Args:
        dir_path: directory containing the CSV files.
        db_name, username, db_host, db_password, db_port: connection settings.
        column: cursor name passed to ``retrieve_data``. It has a default so
            callers that omit it keep working.

    Returns:
        ``(None, None)``: this analysis produces no charts.
    """
    query = 'select nome from file_caricati'
    # Argument order must match retrieve_data(db_name, ..., query, column).
    loaded_rows = retrieve_data(db_name, username, db_host, db_password, db_port, query, column)
    already_loaded = {row[0] for row in loaded_rows}

    conn = psycopg2.connect(dbname=db_name, user=username, host=db_host, password=db_password, port=db_port)
    # File names are stored relative to ``dir_path``, so the directory change
    # is kept, but it is undone afterwards: otherwise a relative ``dir_path``
    # would resolve against the wrong directory on the next call.
    previous_dir = os.getcwd()
    try:
        os.chdir(dir_path)
        cur = conn.cursor()
        try:
            for f in glob.glob('*.csv'):
                if f.lower().endswith('.csv') and f not in already_loaded:
                    insert_data(f, cur)
        finally:
            cur.close()
        conn.commit()
    finally:
        os.chdir(previous_dir)
        conn.close()

    chart2021 = None
    chart2022 = None
    return chart2021, chart2022

### Global Null Values

In [None]:
def get_null_values(atm_list, db_name, username, db_host, db_password, db_port, column):
    """Compute the percentage of missing ``trs_ppb`` readings per station.

    Args:
        atm_list: table names to inspect, one per station. The names are
            concatenated into the SQL text, so they must come from a trusted
            list.
        db_name, username, db_host, db_password, db_port: connection settings.
        column: cursor name passed to ``retrieve_data``.

    Returns:
        ``{"atm": atm_list, "valori": [...]}`` where ``valori`` holds, in the
        same order as ``atm_list``, the percentage of null ``trs_ppb`` values
        rounded to one decimal. An empty ``atm_list`` yields empty ``valori``.
    """
    percentages = []
    for atm in atm_list:
        query = "select data, trs_ppb from " + atm
        rows = retrieve_data(db_name, username, db_host, db_password, db_port, query, column)
        df = pd.DataFrame(np.array(rows), columns=['data', 'trs_ppb'])
        # Only the measurement column is inspected; no loop variable may
        # reuse the name ``column``, which is the cursor name for every query.
        null_values = df['trs_ppb'].isna().sum()
        percent_missing_values = (null_values * 100) / len(df['trs_ppb'])
        percentages.append(round(percent_missing_values, 1))
    return {"atm": atm_list, "valori": percentages}

### Weekly Null Values

In [None]:
def weekly_null_values(db_name, username, db_host, db_password, db_port, chemical, atm, start_date21, end_date21, start_date22, end_date22):
    """Build the 2021 and 2022 bar charts of weekly null counts.

    The query groups the rows of table ``atm`` into one-week buckets with
    ``time_bucket`` (presumably the TimescaleDB function; confirm) and counts
    null and non-null ``chemical`` values per bucket.

    Args:
        db_name, username, db_host, db_password, db_port: connection settings.
        chemical: measurement column to inspect.
        atm: table to query. ``chemical`` and ``atm`` are concatenated into
            the SQL text, so they must come from a trusted list.
        start_date21, end_date21: ``"YYYY-MM-DD"`` bounds for the 2021 chart.
        start_date22, end_date22: ``"YYYY-MM-DD"`` bounds for the 2022 chart.

    Returns:
        ``(chart2021, chart2022)`` built with ``generate_chart``. A year with
        no rows in its range gets an empty chart titled
        "No data for the selected items".
    """
    query = "select time_bucket('1 week', data) as bucket, sum(case when " + chemical + " is null then 1 else 0 end) null_values,count(" + chemical + ") not_nulls from " + atm + " GROUP BY bucket"
    df = retrieve_data(db_name, username, db_host, db_password, db_port, query, 'data')
    df = pd.DataFrame(np.array(df), columns=['data', 'null_values', 'not_null_values'])
    df['data'] = pd.to_datetime(df['data'], format='%Y-%m-%d %H:%M:%S', utc=True)
    # One vectorised rounding pass instead of a value-by-value replace loop.
    # The column is computed for reference; the charts plot the raw counts.
    percentage = (df['null_values'] / (df['null_values'] + df['not_null_values'])) * 100
    df['percentage_null_values'] = percentage.astype(float).round(2)

    df21, df22 = split_df(df)
    df21_range = get_df_from_range(df21, start_date21, end_date21)
    df22_range = get_df_from_range(df22, start_date22, end_date22)

    title1 = "Weekly Null Values 2021 " + chemical + " " + atm
    title2 = "Weekly Null Values 2022 " + chemical + " " + atm
    title_empty = "No data for the selected items"

    if len(df21_range) != 0:
        chart2021 = generate_chart(df21_range["data"], df21_range["null_values"], title1, 'bar')
    else:
        chart2021 = generate_chart([], [], title_empty, 'bar')
    if len(df22_range) != 0:
        # The 2022 chart must carry the 2022 title.
        chart2022 = generate_chart(df22_range["data"], df22_range["null_values"], title2, 'bar')
    else:
        chart2022 = generate_chart([], [], title_empty, 'bar')
    return chart2021, chart2022

### Values below minimum threshold detectable by sensors

In [None]:
def count_weekly_threshold(db_name, username, db_host, db_password, db_port, column, column_name, min_value, max_value, atm):
    """Count, per week, the readings below ``min_value`` and above ``max_value``.

    The query groups table ``atm`` into one-week buckets with ``time_bucket``
    (presumably the TimescaleDB function; confirm) and returns the total
    number of rows plus the number of ``column_name`` values outside the two
    bounds.

    Args:
        db_name, username, db_host, db_password, db_port: connection settings.
        column: cursor name passed to ``retrieve_data``.
        column_name: measurement column to test.
        min_value, max_value: lower and upper bounds.
        atm: table to query. ``column_name`` and ``atm`` are concatenated
            into the SQL text, so they must come from a trusted list.

    Returns:
        ``(df1, df2)`` from ``split_df``. Each frame has the columns ``data``,
        ``total_values``, ``min_values``, ``max_values``, ``min_percentage``
        and ``max_percentage`` (percentages rounded to two decimals).
    """
    query = "select time_bucket('1 week', data) as bucket, COUNT(*) AS total_values, COUNT(CASE WHEN " + column_name + " < " + str(min_value) + " THEN 1 END) AS min_values, COUNT(CASE WHEN " + column_name + " > " + str(max_value) + " THEN 1 END) AS max_values from " + atm + " GROUP BY bucket"
    df = retrieve_data(db_name, username, db_host, db_password, db_port, query, column)
    df = pd.DataFrame(np.array(df), columns=['data', 'total_values', 'min_values', 'max_values'])
    df['data'] = pd.to_datetime(df['data'], format='%Y-%m-%d %H:%M:%S', utc=True)

    # One vectorised rounding pass per column instead of replacing values one
    # by one, which is quadratic in the number of weeks.
    df['min_percentage'] = (df['min_values'] / df['total_values'] * 100).astype(float).round(2)
    df['max_percentage'] = (df['max_values'] / df['total_values'] * 100).astype(float).round(2)

    df1, df2 = split_df(df)
    return df1, df2

In [None]:
def show_values_below_threshold(db_name, username, db_host, db_password, db_port, cur_column, chemical, atm, start_date21, end_date21, start_date22, end_date22):
    """Build the 2021 and 2022 bar charts of readings below the sensor minimum.

    Each bar is the weekly percentage of ``chemical`` readings that fall
    below the minimum listed for that chemical in ``dic_min_threshold``.

    Args:
        db_name, username, db_host, db_password, db_port: connection settings.
        cur_column: cursor name passed on to the query helper.
        chemical: measurement column; must be a key of the threshold tables.
        atm: table to query.
        start_date21, end_date21: ``"YYYY-MM-DD"`` bounds for the 2021 chart.
        start_date22, end_date22: ``"YYYY-MM-DD"`` bounds for the 2022 chart.

    Returns:
        ``(chart2021, chart2022)`` built with ``generate_chart``. A year with
        no rows in its range gets an empty chart titled
        "No data for the selected items".

    Raises:
        KeyError: if ``chemical`` has no entry in the threshold tables.
    """
    dic_min_threshold = {"trs_ppb" : 0, "voc_ppm" : 0.6, "c6h6_ppb" : 0.1, "h2s_ppb" : 2, "pidvoc_ppb" : 0}
    dic_max_threshold = {"trs_ppb" : 10000, "voc_ppm" : 25, "c6h6_ppb" : 30, "h2s_ppb" : 3000, "pidvoc_ppb" : 40000}

    df21, df22 = count_weekly_threshold(db_name, username, db_host, db_password, db_port, cur_column, chemical, dic_min_threshold[chemical], dic_max_threshold[chemical], atm)
    df21_range = get_df_from_range(df21, start_date21, end_date21)
    df22_range = get_df_from_range(df22, start_date22, end_date22)

    # Plain strings: a trailing comma here would turn each title into a tuple.
    title1 = "Values below threshold 2021 " + chemical + " " + atm
    title2 = "Values below threshold 2022 " + chemical + " " + atm
    title_empty = "No data for the selected items"

    # Guard the empty case like the other analyses do, so an empty date
    # range produces a placeholder chart instead of failing on column access.
    if len(df21_range) != 0:
        chart2021 = generate_chart(df21_range["data"], df21_range["min_percentage"], title1, "bar")
    else:
        chart2021 = generate_chart([], [], title_empty, "bar")
    if len(df22_range) != 0:
        chart2022 = generate_chart(df22_range["data"], df22_range["min_percentage"], title2, "bar")
    else:
        chart2022 = generate_chart([], [], title_empty, "bar")

    return chart2021, chart2022

### Weekly Average

In [None]:
def weely_average(db_name, username, db_host, db_password, db_port, cur_column, chemical, atm, start_date21, end_date21,  start_date22, end_date22):
    """Build the 2021 and 2022 charts of the weekly average of ``chemical``.

    The query averages ``chemical`` over one-week buckets of table ``atm``
    (``time_bucket`` is presumably the TimescaleDB function; confirm).

    Args:
        db_name, username, db_host, db_password, db_port: connection settings.
        cur_column: cursor name passed on to the query helper.
        chemical: measurement column to average.
        atm: table to query. ``chemical`` and ``atm`` are concatenated into
            the SQL text, so they must come from a trusted list.
        start_date21, end_date21: ``"YYYY-MM-DD"`` bounds for the 2021 chart.
        start_date22, end_date22: ``"YYYY-MM-DD"`` bounds for the 2022 chart.

    Returns:
        ``(chart2021, chart2022)`` built with ``generate_average_chart``. A
        year with no rows in its range gets an empty chart titled
        "No data for the selected items".
    """
    query = "select time_bucket('1 week', data) as bucket, avg(" + chemical + ") FROM " + atm + " GROUP BY bucket"
    df21, df22 = get_splitted_df(db_name, username, db_host, db_password, db_port, query, cur_column, atm, chemical, 'weekly_average')
    df21_range = get_df_from_range(df21, start_date21, end_date21)
    df22_range = get_df_from_range(df22, start_date22, end_date22)
    title1 = "Weekly Average 2021 " + atm + " " + chemical
    title2 = "Weekly Average 2022 " + atm + " " + chemical
    title_empty = "No data for the selected items"

    if len(df21_range) != 0:
        chart2021 = generate_average_chart(chemical, df21_range['data'], df21_range['weekly_average'], df21_range, title1, 'lines')
    else:
        chart2021 = generate_average_chart(chemical, [], [], [], title_empty, 'lines')
    # Test and plot the date-filtered 2022 frame, exactly as for 2021, so the
    # 2022 date picker is honoured.
    if len(df22_range) != 0:
        chart2022 = generate_average_chart(chemical, df22_range['data'], df22_range['weekly_average'], df22_range, title2, 'lines')
    else:
        chart2022 = generate_average_chart(chemical, [], [], [], title_empty, 'lines')
    return chart2021, chart2022

### Outliers

In [None]:
def outliers_prep(db_name, username, db_host, db_password, db_port, query, cur_column, column_name):
    """Fetch a two-column series and prepare it for outlier detection.

    The result of ``query`` is loaded into a DataFrame with the columns
    ``data`` (parsed as UTC timestamps) and ``column_name`` (cast to float).
    Missing values are forward-filled, then the frame is split by year.

    Args:
        db_name, username, db_host, db_password, db_port: connection settings.
        query: SQL returning a timestamp column followed by a value column.
        cur_column: cursor name passed to ``retrieve_data``.
        column_name: name to give the value column.

    Returns:
        The ``(df1, df2)`` pair produced by ``split_df``.
    """
    df = retrieve_data(db_name, username, db_host, db_password, db_port, query, cur_column)
    df = pd.DataFrame(np.array(df), columns=['data', column_name])
    df['data'] = pd.to_datetime(df['data'], format='%Y-%m-%d %H:%M:%S', utc=True)
    df[column_name] = df[column_name].astype(float)
    # Plain forward fill; ``interpolate(method='ffill')`` is deprecated in
    # recent pandas releases in favour of ``ffill``.
    df = df.ffill()
    df1, df2 = split_df(df)
    return df1, df2

In [None]:
def outliers(df, db_name, username, db_host, db_password, db_port, cur_column, chemical, title):
    """Build a scatter chart of the rows Kats flags as outliers.

    Args:
        df: DataFrame with a ``data`` column and a ``chemical`` value column,
            or an empty sequence when the selected range has no rows.
        db_name, username, db_host, db_password, db_port, cur_column:
            accepted but not used by this function.
        chemical: name of the value column to plot.
        title: chart title used when at least one outlier is found.

    Returns:
        The dict built by ``generate_scatter_plot``. When ``df`` is empty or
        nothing is flagged, an empty chart titled
        "No data for the selected items" is returned.
    """
    title_empty = "No data for the selected items"

    # An empty selection cannot be renamed or handed to the detector.
    if len(df) == 0:
        return generate_scatter_plot([], [], title_empty)

    series = TimeSeriesData(df.rename(columns={"data": "time"}))
    outlier_detector = OutlierDetector(series, "additive")
    outlier_detector.detector()
    # Assumed to be the flagged timestamps of the only value column; confirm
    # against the Kats OutlierDetector documentation.
    flagged_times = outlier_detector.outliers[0]

    is_flagged = df['data'].map(lambda stamp: stamp in flagged_times)
    df_out = df[is_flagged]

    if len(df_out) != 0:
        chart = generate_scatter_plot(df_out["data"], df_out[chemical], title)
    else:
        chart = generate_scatter_plot([], [], title_empty)

    return chart

## Dashboard

In [None]:
# Dash application; the layout and the callback are attached further down.
app = Dash(__name__)

# Choices offered by the "Chemical", "ATM" and "Analysis Type" dropdowns.
columns = [
    'trs_ppb',
    'voc_ppm',
    'c6h6_ppb',
    'h2s_ppb',
    'pidvoc_ppb',
]
atm_names = [
    'atm05',
    'atm06',
    'atm07',
    'atm10',
    'atm11',
    'atm12',
    'atm13',
    'atm14',
]
analysis = [
    'Data Ingestion',
    'Global Null Values',
    'Weekly Null Values',
    'Values below threshold',
    'Weekly Average',
    'Outliers',
]

# Start-up query: weekly averages of trs_ppb for atm05, split by year. The
# resulting frames supply the bounds of the two date pickers.
# NOTE(review): the database credentials are hard-coded here; consider
# moving them to configuration.
query = "select time_bucket('1 week', data) as bucket, avg(trs_ppb) FROM atm05 GROUP BY bucket"
df1, df2 = get_splitted_df(
    "db_tesi",
    "postgres",
    "localhost",
    "fall22",
    "5432",
    query,
    'data',
    'atm05',
    'trs_ppb',
    'weekly_average',
)

# Style shared by every cell of the filter menu; only the width differs.
_MENU_ITEM_STYLE = {'font-size': '20px', 'font-weight': 'bold', 'margin-bottom': '6px'}


def _menu_item(label, control, width):
    """Wrap a filter control and its caption in one fixed-width menu cell."""
    return html.Div(
        children=[
            html.Div(children=label, className="menu-title"),
            control,
        ],
        style={'width': width, **_MENU_ITEM_STYLE},
    )


# Page layout: title, a row of five filters, then one graph per year.
app.layout = html.Div(children=[
    html.H1(children='Data analysis of a local oil extraction site', style={'color': 'teal', 'margin': '4px auto', 'text-align': 'center', 'font-size': '40px'}),

    html.Div(
        children=[
            _menu_item(
                "ATM",
                dcc.Dropdown(
                    id="atm-filter",
                    options=[
                        {"label": atm, "value": atm}
                        for atm in atm_names
                    ],
                    value="atm05",
                    clearable=False,
                    className="dropdown",
                ),
                '10%',
            ),
            _menu_item(
                "Chemical",
                dcc.Dropdown(
                    id="substance-filter",
                    options=[
                        {"label": chemical, "value": chemical}
                        for chemical in columns
                    ],
                    value="trs_ppb",
                    clearable=False,
                    searchable=False,
                    className="dropdown",
                ),
                '10%',
            ),
            _menu_item(
                "Analysis Type",
                dcc.Dropdown(
                    id="analysis-filter",
                    options=[
                        {"label": analisi, "value": analisi}
                        for analisi in analysis
                    ],
                    value="Global Null Values",
                    clearable=False,
                    searchable=False,
                    className="dropdown",
                ),
                '20%',
            ),
            # The date pickers are bounded by the data loaded at start-up.
            _menu_item(
                "Date Range 2021",
                dcc.DatePickerRange(
                    id="date-range21",
                    min_date_allowed=df1.data.min().date(),
                    max_date_allowed=df1.data.max().date(),
                    start_date=df1.data.min().date(),
                    end_date=df1.data.max().date(),
                ),
                '20%',
            ),
            _menu_item(
                "Date Range 2022",
                dcc.DatePickerRange(
                    id="date-range22",
                    min_date_allowed=df2.data.min().date(),
                    max_date_allowed=df2.data.max().date(),
                    start_date=df2.data.min().date(),
                    end_date=df2.data.max().date(),
                ),
                '20%',
            ),
        ],
        className="menu",
        style={
            'height': '124px',
            'width': '1224px',
            'display': 'flex',
            'gap': '2rem',
            'justify-content': 'center',
            'margin': '15px auto',
            # A gradient is an image value: browsers discard it when it is
            # assigned to 'background-color'.
            'background-image': 'linear-gradient(to bottom, white, teal)',
            'box-shadow': '0 4px 6px 0 teal',
        },
    ),

    dcc.Graph(id='chart2021'),
    dcc.Graph(id='chart2022'),
])


# Connection settings for every query issued by the callback: database name,
# user, host, password, port. The values mirror the literals used by the
# start-up query that sizes the date pickers.
# NOTE(review): credentials are hard-coded; consider moving them to
# configuration.
_DB_SETTINGS = ("db_tesi", "postgres", "localhost", "fall22", "5432")


@app.callback(
    [Output("chart2021", "figure"), Output("chart2022", "figure")],
    [Input("atm-filter", "value"),
     Input("substance-filter", "value"),
     Input("analysis-filter", "value"),
     Input("date-range21", "start_date"),
     Input("date-range21", "end_date"),
     Input("date-range22", "start_date"),
     Input("date-range22", "end_date")
    ],
)
def update_charts(atm, chemical, analysis_type, start_date21, end_date21, start_date22, end_date22):
    """Recompute both graphs whenever one of the filters changes.

    Args:
        atm: station table chosen in the ATM dropdown.
        chemical: measurement column chosen in the Chemical dropdown.
        analysis_type: entry chosen in the Analysis Type dropdown.
        start_date21, end_date21: bounds of the 2021 date picker.
        start_date22, end_date22: bounds of the 2022 date picker.

    Returns:
        ``(chart2021, chart2022)``, the figures for the two graphs. Either
        may be ``None`` when the selected analysis produces no chart for it.
    """
    db_name, username, host, password, port = _DB_SETTINGS

    # Defaults, so an unrecognised analysis type cannot leave the return
    # values unbound.
    chart2021 = None
    chart2022 = None

    # ``atm`` and ``chemical`` end up inside string-built SQL and the
    # browser can post arbitrary values: accept only the known names.
    if atm not in atm_names or chemical not in columns:
        empty = generate_chart([], [], "No data for the selected items", "bar")
        return empty, empty

    if analysis_type == 'Weekly Average':
        chart2021, chart2022 = weely_average(db_name, username, host, password, port, 'data', chemical, atm, start_date21, end_date21, start_date22, end_date22)

    elif analysis_type == 'Global Null Values':
        dic = get_null_values(atm_names, db_name, username, host, password, port, 'data')
        df = pd.DataFrame.from_dict(dic)
        # One chart covers every station; the second graph is unused.
        chart2022 = None
        chart2021 = generate_chart(df["atm"], df["valori"], "Global Null Values", "bar")

    elif analysis_type == 'Weekly Null Values':
        chart2021, chart2022 = weekly_null_values(db_name, username, host, password, port, chemical, atm, start_date21, end_date21, start_date22, end_date22)

    elif analysis_type == 'Values below threshold':
        chart2021, chart2022 = show_values_below_threshold(db_name, username, host, password, port, 'data', chemical, atm, start_date21, end_date21, start_date22, end_date22)

    elif analysis_type == 'Outliers':
        query = 'SELECT data, ' + chemical + ' FROM ' + atm
        df21, df22 = outliers_prep(db_name, username, host, password, port, query, 'data', chemical)
        df21_range = get_df_from_range(df21, start_date21, end_date21)
        df22_range = get_df_from_range(df22, start_date22, end_date22)
        title21 = "Outliers 2021 " + chemical + " " + atm
        title22 = "Outliers 2022 " + chemical + " " + atm
        chart2021 = outliers(df21_range, db_name, username, host, password, port, 'data', chemical, title21)
        chart2022 = outliers(df22_range, db_name, username, host, password, port, 'data', chemical, title22)

    elif analysis_type == 'Data Ingestion':
        # The cursor name is passed explicitly, like in every other branch.
        chart2021, chart2022 = data_ingestion('../Caricamento Dati/', db_name, username, host, password, port, 'data')

    return chart2021, chart2022



if __name__ == '__main__':
    # Newer Dash releases expose ``app.run`` and Dash 3 removed
    # ``app.run_server``; fall back to the old name only when ``run`` is
    # missing, so the script starts on either version.
    start_server = getattr(app, "run", None) or app.run_server
    start_server(debug=False)
