In [1]:
import pandas as pd
import numpy as np
import altair as alt
from sklearn.preprocessing import StandardScaler

from datetime import datetime

import ipywidgets as widgets
from IPython.display import HTML, clear_output, display
from ipywidgets import HBox, VBox

%load_ext lab_black

In [2]:
# The dataset is produced by the process_data.ipynb notebook, which stores it
# locally in the Machine Learning environment and also uploads it to the
# default blob storage. For the Azure Web App, mount that blob storage under
# the "/ml_blob_storage" mount path so the file below becomes available.
#
# NOTE: pickle files can execute code when loaded - only load files produced
# by your own process_data.ipynb run.

# Load data
try:
    # Web app: the blob storage mount configured in the Azure Web App.
    data = pd.read_pickle("/ml_blob_storage/data_for_plots.pkl")
except FileNotFoundError:
    # Azure Machine Learning (AMLS) environment: this notebook sits in a
    # sub-folder next to its Dockerfile and requirements.txt, while
    # process_data.ipynb and the data/ folder (holding data_for_plots.pkl)
    # live one level up - hence the relative "../data" path.
    data = pd.read_pickle("../data/data_for_plots.pkl")

In [3]:
def scale_values_per_item(data, max_std=5):
    """
    Add a "value_scalled" column holding standardised values, computed
    separately for each type of measurement (CO, SO2, etc.).

    Scaled values are used to pick the colour of the cells in the heat map so
    that the colour scale does not go out of range for the chosen dates. The
    actual, recorded values stay untouched in the "value" column and are
    displayed via the tool tip.

    For every "Item name" group the values are scaled as
    (value - mean) / std (population std, the same formula StandardScaler
    uses). Values that end up more than `max_std` standard deviations above
    the mean are treated as outliers: they are left out of the mean/std
    estimate and the estimate is repeated until no new outlier appears.
    Outliers keep their row and receive the capped scaled value `max_std`,
    i.e. they are shown at the top of the colour range.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "Item name" and "value".
    max_std : float, default 5
        Threshold (in standard deviations above the mean) beyond which a
        value is considered an outlier; also the cap for scaled values.

    Returns
    -------
    pandas.DataFrame
        Copy of the rows of `data`, ordered by "Item name" group, with a
        fresh 0..n-1 index and the extra "value_scalled" column. Empty input
        yields an empty frame that still has the "value_scalled" column.
    """
    scaled_groups = []
    for _, group in data.groupby("Item name"):
        group = group.copy(deep=True)
        values = group["value"].to_numpy(dtype=float)

        # Rows currently used to estimate mean/std; missing values never are.
        keep = ~np.isnan(values)
        while True:
            kept = values[keep]
            if kept.size == 0:
                # Nothing usable to estimate from (all values missing).
                mean, std = 0.0, 1.0
                break
            mean = kept.mean()
            std = kept.std()
            if not std > 0:
                # Constant group: avoid division by zero, scaled values become 0.
                std = 1.0
            outliers = keep & ((values - mean) / std > max_std)
            if not outliers.any():
                break
            # Drop the outliers from the estimate (instead of overwriting them
            # with a sentinel that would itself distort mean and std).
            keep &= ~outliers

        # Scale every row with the outlier-free estimate and cap the top end.
        group["value_scalled"] = np.minimum((values - mean) / std, max_std)
        scaled_groups.append(group)

    if not scaled_groups:
        # pd.concat([]) raises ValueError; return an empty, well-formed frame.
        empty = data.iloc[0:0].copy()
        empty["value_scalled"] = np.array([], dtype=float)
        return empty.reset_index(drop=True)

    return pd.concat(scaled_groups).reset_index(drop=True)

In [4]:
def filter_by_date(data, start_date, end_date):
    """
    Return the rows of `data` whose "DayDate" lies in (start_date, end_date].

    The start bound is exclusive and the end bound is inclusive, so rows
    dated exactly `start_date` are NOT part of the result.

    e.g.
    start_date = "2017-01-01"
    end_date = "2017-03-01"
    """
    day = data["DayDate"]
    after_start = day > start_date
    up_to_end = day <= end_date

    return data[after_start & up_to_end]

### Air Pollution Measurement Information in Seoul, Korea

In [21]:
# Show the first and last day covered by the loaded dataset.
first_day = data["DayDate"].min()
last_day = data["DayDate"].max()
print(f"Data available from {first_day} to {last_day} dates.")

Data available from 2017-01-01 00:00:00 to 2019-12-31 00:00:00 dates.


In [26]:
# Interactive controls for the plot: a start date picker, an end date picker
# and a station dropdown, each paired with a text label.
all_station_names = data["Station name(district)"].unique()

# Input widgets (shown in the right-hand column)
start_date_picker = widgets.DatePicker()
end_date_picker = widgets.DatePicker()
station_name_picker = widgets.Dropdown(
    options=all_station_names,
    value=all_station_names[0],  # first station is pre-selected
)

# Matching captions (shown in the left-hand column)
start_date_label = widgets.Label(value="Start date:")
end_date_label = widgets.Label(value="End date:")
station_name_label = widgets.Label(value="Station name:")

# Two columns; rows line up as start date / end date / station
left = VBox([start_date_label, end_date_label, station_name_label])
right = VBox([start_date_picker, end_date_picker, station_name_picker])

# Explanation of the heat map colours
legend = widgets.Label(
    value="Colour represents recorded values for each measurement from low - blue to high - red."
)

In [27]:
# Output area that every redraw of the heat map is rendered into
out = widgets.Output()
# Control panel: label/picker columns side by side, colour legend underneath
labels = VBox([HBox([left, right]), legend])


def plot(change):
    """
    Redraw the controls and the heat map inside the `out` output widget.

    Reads the current values of the start date, end date and station pickers,
    validates the date range against the dates available in `data` and draws
    one heat map row per measurement for the selected station. If anything is
    wrong with the selection, the controls are shown together with a
    human-readable message instead of the chart.

    Parameters
    ----------
    change : dict or None
        Change notification passed in by `observe`; it is not used because
        the current widget values are read directly.
    """
    # Dates shown before the user has touched the date pickers.
    default_start = "2017-01-01"
    default_end = "2017-03-01"
    # Longest date range (in days) the heat map can show.
    max_range_days = 60

    start_date = start_date_picker.value
    end_date = end_date_picker.value
    station_name = station_name_picker.value
    error_message = ""
    chart = None

    with out:
        try:
            if start_date is None and end_date is None:
                # Initial state: nothing picked yet, fall back to the defaults.
                start_date, end_date = default_start, default_end
            elif start_date is None or end_date is None:
                # Only one date picked: keep it and ask for the other one
                # rather than silently replacing the user's choice.
                raise ValueError("Please choose both start and end dates.")

            start_date = pd.to_datetime(start_date)
            end_date = pd.to_datetime(end_date)
            min_date = data["DayDate"].min()
            max_date = data["DayDate"].max()

            # Error handling, checking if dates are within the range.
            if start_date < min_date:
                raise ValueError(f"Date out of range. Min date available {min_date}")
            if end_date > max_date:
                raise ValueError(f"Date out of range. Max date available {max_date}")

            difference = (end_date - start_date).days
            if difference < 0:
                raise ValueError("Start date must not be after end date.")
            if difference > max_range_days:
                raise ValueError(
                    f"Currently only {max_range_days} day range can be displayed. Your date range is {difference} days"
                )

            station_data = data[data["Station name(district)"] == station_name]
            # filter_by_date keeps rows with start_date < DayDate <= end_date
            station_data = filter_by_date(station_data, start_date, end_date)
            if station_data.empty:
                raise ValueError("No data available for the chosen station and dates.")
            station_data = scale_values_per_item(station_data)

            chart = (
                alt.Chart(station_data)
                .mark_rect()
                .encode(
                    x=alt.X("Date", sort="ascending"),
                    y=alt.Y("Item name:O", title=""),
                    # Colour follows the scaled value; the recorded value is
                    # only shown in the tool tip.
                    color=alt.Color(
                        "value_scalled:Q",
                        scale=alt.Scale(scheme="redyellowblue", domain=[-4, 4]),
                        sort="descending",
                        legend=None,
                    ),
                    tooltip=[
                        alt.Tooltip("Date:O", title="Date"),
                        alt.Tooltip("DayName:O", title="Week day"),
                        alt.Tooltip("Item name:O", title="Measurement"),
                        alt.Tooltip("value:Q", title="Value", format=",.3f"),
                    ],
                )
            ).properties(height=100, width=840)
        except Exception as error:
            # Keep the dashboard responsive: show the problem to the user
            # instead of leaving a traceback in the output area.
            error_message = str(error)

        clear_output(wait=True)
        display(labels)
        # Only render what actually exists - no stray "None" or "''" output.
        if chart is not None:
            display(chart)
        if error_message:
            print(error_message)


# Redraw whenever the value of any of the three controls changes.
for _picker in (start_date_picker, end_date_picker, station_name_picker):
    _picker.observe(plot, names="value")

# Initial render with the default selection, then show the output area.
plot(None)
out

Output()

In [22]:
# Embedded Microsoft Forms page, shown below the dashboard as an iframe.
feedback_form_markup = """
    <iframe width="640px" height= "1080px"
    src= "https://forms.office.com/Pages/ResponsePage.aspx?id=Ah2656Iq1UiBhePca_e4bXaJLBhM1sJFnZ5dBLvC2X9UQVNSSlBTN1I2MVNHUkJUREgwR0FDWVZUNy4u&embed=true"
    frameborder= "0"
    marginwidth= "0"
    marginheight= "0"
    style= "border: none;
    max-width:100%" allowfullscreen webkitallowfullscreen mozallowfullscreen msallowfullscreen> </iframe>
"""
HTML(feedback_form_markup)