In this notebook you will load the data sets, combine them all, remove features which will not be used,<br>
scale data and save it as a pickle file that will be used to produce plots.

In [1]:
import pandas as pd
from azureml.core import Workspace, Datastore, Dataset
import os

%load_ext lab_black

## Get data

In [2]:
def read_csv(csv_path, datastore_name="workspaceblobstore"):
    """
    Read a delimited file from a datastore attached to the Machine Learning
    resource, using the Azure ML SDK, and return it as a pandas dataframe.

    Parameters
    ----------
    csv_path : str
        Path of the file inside the datastore.
    datastore_name : str, optional
        Name of the datastore to read from. Defaults to
        "workspaceblobstore" (the default blob store used in this notebook).

    Returns
    -------
    pandas.DataFrame
        The content of the file.
    """

    # log in to the Machine Learning resource (reads the local workspace config)
    ws = Workspace.from_config()

    # connect to the requested datastore
    datastore = Datastore.get(ws, datastore_name=datastore_name)

    # describe the file as a tabular dataset
    data_paths = [(datastore, csv_path)]
    ds = Dataset.Tabular.from_delimited_files(data_paths)

    # materialize the tabular dataset as a pandas dataframe
    return ds.to_pandas_dataframe()

In [3]:
# Load the three source tables from blob storage as pandas dataframes
measurements, item_info, station_info = (
    read_csv(file_name)
    for file_name in (
        "Measurement_info.csv",
        "Measurement_item_info.csv",
        "Measurement_station_info.csv",
    )
)

## Transform data

In [4]:
# Join the measurements with the item table on "Item code", then join the
# result with the station table on "Station code"
data = pd.merge(
    pd.merge(measurements, item_info, on="Item code"),
    station_info,
    on="Station code",
)

In [5]:
# Keep only the columns that are needed; every other column is dropped
columns_to_keep = [
    "Measurement date",
    "Average value",
    "Item name",
    "Unit of measurement",
    "Station name(district)",
]
data = data[columns_to_keep]

In [6]:
# Extract the date (first 10 characters) from "Measurement date".
# `assign` builds a new frame instead of writing a column onto a slice of the
# merged data, which avoids pandas' SettingWithCopyWarning.
data = data.assign(DayDate=data["Measurement date"].str[:10])

# Aggregate data to average per day instead of average per hour.
# Only "Average value" is averaged: the other remaining columns hold text, and
# recent pandas versions raise a TypeError when asked for the mean of text.
data_avg = data.groupby(["Station name(district)", "Item name", "DayDate"])[
    ["Average value"]
].mean()

# Remove multi index structure from aggregated dataset: the group keys become
# ordinary columns and the averaged measurement is stored in a "value" column
data = data_avg.reset_index().rename(columns={"Average value": "value"})

In [7]:
# Change "DayDate" to datetime format
data["DayDate"] = pd.to_datetime(data["DayDate"], format="%Y-%m-%d")
# Get string representation of date which will be used for x axis labels
# (vectorized `.dt` accessor instead of a row-by-row `apply`)
data["Date"] = data["DayDate"].dt.strftime("%y-%m-%d")
# Get day name
data["DayName"] = data["DayDate"].dt.day_name()

### Save it

In [9]:
# Save it locally
# `exist_ok=True` keeps this cell re-runnable: a plain `os.mkdir` raises
# FileExistsError when the "data" directory is already there
os.makedirs("data", exist_ok=True)
data.to_pickle("data/data_for_plots.pkl")

In [17]:
# Push the locally saved pickle file to the default blob storage

# Log in to the Machine Learning resource
ws = Workspace.from_config()

# Look up the datastore by its name
datastore_name = "workspaceblobstore"
datastore = Datastore.get(ws, datastore_name=datastore_name)

# Upload the content of the local "data" directory to the datastore root,
# replacing files that already exist there
pickle_file_directory = f"{os.getcwd()}/data/"
datastore.upload(
    src_dir=pickle_file_directory,
    target_path="/",
    overwrite=True,
    show_progress=True,
)

Uploading an estimated of 1 files
Uploading /mnt/batch/tasks/shared/LS_root/mounts/clusters/my-example-compute/code/data/data_for_plots.pkl
Uploaded /mnt/batch/tasks/shared/LS_root/mounts/clusters/my-example-compute/code/data/data_for_plots.pkl, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_workspaceblobstore