Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

# Tutorial: Load Open Datasets Data

source: https://azure.microsoft.com/en-us/services/open-datasets/catalog/sample-oj-sales-simulated/#AzureNotebooks

## Install azureml-opendatasets SDK

In [None]:
# !pip uninstall -y azureml-opendatasets
# !pip install azureml-opendatasets

Import the NoaaIsdWeather, OjSalesSimulated, and Diabetes classes from azureml-opendatasets

In [None]:
from azureml.opendatasets import NoaaIsdWeather
from azureml.opendatasets import OjSalesSimulated
from azureml.opendatasets import Diabetes

from datetime import datetime
from dateutil import parser
from dateutil.relativedelta import relativedelta

In [None]:
## Check core SDK version number
import azureml.core

from azureml.core import Workspace

# Print the installed SDK version so the header above is actually fulfilled
# and version-specific problems are easy to diagnose.
print("Azure ML SDK version:", azureml.core.VERSION)

# Connect to the workspace described by the local Azure ML configuration.
ws = Workspace.from_config()

# Join only the workspace details with ' @ ' so that the label is not followed
# by a stray separator (the previous sep=' @ ' also applied to the label).
workspace_details = " @ ".join(
    str(value)
    for value in (ws.name, ws.resource_group, ws.location, ws.subscription_id)
)
print("[SUCCESS] LOGGED IN:", workspace_details)


## Load Diabetes Data

In [None]:
# Get the Diabetes open dataset as a tabular dataset, then load it into a
# pandas DataFrame (`pdf`) that the following cells inspect and split.
diabetes = Diabetes.get_tabular_dataset()
pdf = diabetes.to_pandas_dataframe()

In [None]:
# Summarize the DataFrame (columns, dtypes, non-null counts).
pdf.info()

In [None]:
# Create a Data Directory in local path
import os

data_path = os.path.join("data","download","diabetes")

# makedirs creates the missing parent folders ("data", "data/download") too;
# os.mkdir would raise FileNotFoundError on a fresh checkout. exist_ok=True
# makes the cell safe to re-run.
os.makedirs(data_path, exist_ok=True)

In [None]:
# Split the diabetes DataFrame into consecutive fixed-size row chunks and
# write each chunk to its own CSV file (diabetes0.csv, diabetes1.csv, ...).
rows_in_sample = 50
n_samples = 9
for i in range(n_samples):
    row_start = i * rows_in_sample
    row_end = row_start + rows_in_sample
    print(f"getting data from {row_start} to {row_end}")
    chunk = pdf[row_start:row_end]
    chunk_file = os.path.join(data_path, f"diabetes{i}.csv")
    chunk.to_csv(chunk_file, index=False, header=True)



In [None]:
# Upload the local diabetes CSV folder to the 'diabetes' path of the
# "aml_data" datastore registered in the workspace.
ds = ws.datastores["aml_data"]
ds.upload(
    src_dir=data_path,
    target_path='diabetes',
    overwrite=True,
    show_progress=True,
)

## Load OJ Sales Simulated Data

In [None]:
# Create a Data Directory in local path
import os

oj_sales_path = "data/download/ojs"

# makedirs creates the missing parent folders ("data", "data/download") too;
# os.mkdir would raise FileNotFoundError on a fresh checkout. exist_ok=True
# makes the cell safe to re-run.
os.makedirs(oj_sales_path, exist_ok=True)

In [None]:
# Pull all of the data
# Get a file dataset referencing the OJ Sales Simulated files.
oj_sales_files = OjSalesSimulated.get_file_dataset()


In [None]:
# Show the paths of the first 6 files in the dataset.
oj_sales_files.take(6).to_path()

In [None]:
# Download the first 6 files into the local OJ sales folder, replacing any
# existing copies.
oj_sales_files.take(6).download(oj_sales_path, overwrite=True)

> **WARNING** The data are downloaded to the nested folder `/data/download/ojs/https%3A/%2Fazureopendatastorage.azurefd.net/ojsales-simulatedcontainer/oj_sales_data`, so the files need to be moved first (in a shell).

In [None]:
# Upload the local OJ sales folder to the 'ojs' path of the "aml_data"
# datastore registered in the workspace.
ds = ws.datastores["aml_data"]
ds.upload(
    src_dir=oj_sales_path,
    target_path='ojs',
    overwrite=True,
    show_progress=True,
)

## Load NoaaIsdWeather

> 1. Set start_date and end_date.
> 2. Create an instance of NoaaIsdWeather.
> 3. Call to_pandas_dataframe() method to get a pandas DataFrame.

In [None]:
# Parse the date range and build a NoaaIsdWeather instance for it.
start_date = parser.parse('2018-1-1')
end_date = parser.parse('2018-1-4')
isd = NoaaIsdWeather(start_date, end_date)
# Optional quick check, left disabled; the next cell loads the DataFrame.
# isd.to_pandas_dataframe().info()
# print('isd done')

In [None]:
# Load the weather data into a pandas DataFrame. This rebinds `pdf`, which
# previously held the diabetes data.
pdf = isd.to_pandas_dataframe()

In [None]:
# Preview the first 10 rows.
pdf.head(10)

In [None]:
# Reset the index in place (the old index is kept as a column) before the
# DataFrame is sliced into chunks below.
pdf.reset_index(inplace=True)

In [None]:
# Create a Data Directory in local path
import os

weather_path = os.path.join("data","download","weather")

# makedirs creates the missing parent folders ("data", "data/download") too;
# os.mkdir would raise FileNotFoundError on a fresh checkout. exist_ok=True
# makes the cell safe to re-run.
os.makedirs(weather_path, exist_ok=True)

In [None]:
# Split the weather DataFrame into consecutive fixed-size row chunks and
# write each chunk to its own CSV file (weather0.csv, weather1.csv, ...).
rows_in_sample = 100
n_samples = 6
for i in range(n_samples):
    row_start = i * rows_in_sample
    row_end = row_start + rows_in_sample
    print(f"getting data from {row_start} to {row_end}")
    chunk = pdf[row_start:row_end]
    chunk_file = os.path.join(weather_path, f"weather{i}.csv")
    chunk.to_csv(chunk_file, index=False, header=True)



In [None]:
# Upload the local weather CSV folder to the 'weather' path of the
# "aml_data" datastore registered in the workspace.
ds = ws.datastores["aml_data"]
ds.upload(
    src_dir=weather_path,
    target_path='weather',
    overwrite=True,
    show_progress=True,
)