Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

# Tutorial: Load Open Datasets Data

source: https://azure.microsoft.com/en-us/services/open-datasets/catalog/sample-oj-sales-simulated/#AzureNotebooks

## Install azureml-opendatasets SDK

In [None]:
# !pip uninstall -y azureml-opendatasets
# !pip install azureml-opendatasets

Import NoaaIsdWeather class from azureml-opendatasets

In [2]:
from azureml.opendatasets import NoaaIsdWeather
from azureml.opendatasets import OjSalesSimulated

from datetime import datetime
from dateutil import parser
from dateutil.relativedelta import relativedelta

In [3]:
## Check core SDK version number
import azureml.core

from azureml.core import Workspace

ws = Workspace.from_config()
print("[SUCCESS] LOGGED IN: ",ws.name, ws.resource_group, ws.location, ws.subscription_id, sep=' @ ')


[SUCCESS] LOGGED IN:  @ mlops-demo @ mlops-rg @ westeurope @ 6ee947fa-0d77-4915-bf68-4a83a8bec2a4


## Load OJ Sales Simulated Data

In [4]:
# Create a Data Directory in local path
import os

oj_sales_path = "data/download/ojs"

if not os.path.exists(oj_sales_path):
    os.mkdir(oj_sales_path)

In [5]:
# Pull all of the data
oj_sales_files = OjSalesSimulated.get_file_dataset()


In [6]:
oj_sales_files.take(6).to_path()

['/https%3A/%2Fazureopendatastorage.azurefd.net/ojsales-simulatedcontainer/oj_sales_data/Store1000_dominicks.csv',
 '/https%3A/%2Fazureopendatastorage.azurefd.net/ojsales-simulatedcontainer/oj_sales_data/Store1000_minute.maid.csv',
 '/https%3A/%2Fazureopendatastorage.azurefd.net/ojsales-simulatedcontainer/oj_sales_data/Store1000_tropicana.csv',
 '/https%3A/%2Fazureopendatastorage.azurefd.net/ojsales-simulatedcontainer/oj_sales_data/Store1001_dominicks.csv',
 '/https%3A/%2Fazureopendatastorage.azurefd.net/ojsales-simulatedcontainer/oj_sales_data/Store1001_minute.maid.csv',
 '/https%3A/%2Fazureopendatastorage.azurefd.net/ojsales-simulatedcontainer/oj_sales_data/Store1001_tropicana.csv']

In [7]:
oj_sales_files.take(6).download(oj_sales_path, overwrite=True)

['/mnt/batch/tasks/shared/LS_root/mounts/clusters/ntb-dev2/code/Users/mimarusa/_DEMO_/azureml-demos/data/download/ojs/https%3A/%2Fazureopendatastorage.azurefd.net/ojsales-simulatedcontainer/oj_sales_data/Store1000_dominicks.csv',
 '/mnt/batch/tasks/shared/LS_root/mounts/clusters/ntb-dev2/code/Users/mimarusa/_DEMO_/azureml-demos/data/download/ojs/https%3A/%2Fazureopendatastorage.azurefd.net/ojsales-simulatedcontainer/oj_sales_data/Store1000_minute.maid.csv',
 '/mnt/batch/tasks/shared/LS_root/mounts/clusters/ntb-dev2/code/Users/mimarusa/_DEMO_/azureml-demos/data/download/ojs/https%3A/%2Fazureopendatastorage.azurefd.net/ojsales-simulatedcontainer/oj_sales_data/Store1000_tropicana.csv',
 '/mnt/batch/tasks/shared/LS_root/mounts/clusters/ntb-dev2/code/Users/mimarusa/_DEMO_/azureml-demos/data/download/ojs/https%3A/%2Fazureopendatastorage.azurefd.net/ojsales-simulatedcontainer/oj_sales_data/Store1001_dominicks.csv',
 '/mnt/batch/tasks/shared/LS_root/mounts/clusters/ntb-dev2/code/Users/mimarusa

> **WARNING** data are being donwloaded to folder: `/data/download/ojs/https%3A/%2Fazureopendatastorage.azurefd.net/ojsales-simulatedcontainer/oj_sales_data` -> need to move first (in shell)

In [8]:
ds = ws.datastores["aml_data"]
ds.upload(src_dir= oj_sales_path, target_path='ojs', overwrite=True, show_progress=True)

Uploading an estimated of 9 files
Uploading data/download/ojs/Store140_dominicks.csv
Uploaded data/download/ojs/Store140_dominicks.csv, 1 files out of an estimated total of 9
Uploading data/download/ojs/Store140_tropicana.csv
Uploaded data/download/ojs/Store140_tropicana.csv, 2 files out of an estimated total of 9
Uploading data/download/ojs/Store141_tropicana.csv
Uploaded data/download/ojs/Store141_tropicana.csv, 3 files out of an estimated total of 9
Uploading data/download/ojs/Store142_minute.maid.csv
Uploaded data/download/ojs/Store142_minute.maid.csv, 4 files out of an estimated total of 9
Uploading data/download/ojs/Store142_tropicana.csv
Uploaded data/download/ojs/Store142_tropicana.csv, 5 files out of an estimated total of 9
Uploading data/download/ojs/Store140_minute.maid.csv
Uploaded data/download/ojs/Store140_minute.maid.csv, 6 files out of an estimated total of 9
Uploading data/download/ojs/Store141_dominicks.csv
Uploaded data/download/ojs/Store141_dominicks.csv, 7 files ou

$AZUREML_DATAREFERENCE_dd0ec2809f6348ce9a5b5b5427159716

## Load NoaaIsdWeather

> 1. Set start_date and end_date.
> 2. New an instance of NoaaIsdWeather.
> 3. Call to_pandas_dataframe() method to get a pandas DataFrame.

In [12]:
start_date = parser.parse('2018-1-1')
end_date = parser.parse('2018-1-4')
isd = NoaaIsdWeather(start_date, end_date)
# isd.to_pandas_dataframe().info()
# print('isd done')

In [13]:
pdf = isd.to_pandas_dataframe()

[Info] read from /tmp/tmpymd1x94q/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2018/month=1/part-00000-tid-8399976593945898762-a12919d0-896d-44dd-8706-1d3327044c4e-5126-1.c000.snappy.parquet
[Info] read from /tmp/tmpymd1x94q/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2018/month=1/part-00001-tid-8399976593945898762-a12919d0-896d-44dd-8706-1d3327044c4e-5130-1.c000.snappy.parquet
[Info] read from /tmp/tmpymd1x94q/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2018/month=1/part-00002-tid-8399976593945898762-a12919d0-896d-44dd-8706-1d3327044c4e-5128-1.c000.snappy.parquet
[Info] read from /tmp/tmpymd1x94q/https%3A/%2Fazureopendatastorage.azurefd.net/isdweatherdatacontainer/ISDWeather/year=2018/month=1/part-00003-tid-8399976593945898762-a12919d0-896d-44dd-8706-1d3327044c4e-5132-1.c000.snappy.parquet
[Info] read from /tmp/tmpymd1x94q/https%3A/%2Fazureopendatastorage.azurefd.n

In [14]:
pdf.head(10)

Unnamed: 0,usaf,wban,datetime,latitude,longitude,elevation,windAngle,windSpeed,temperature,seaLvlPressure,...,precipTime,precipDepth,snowDepth,stationName,countryOrRegion,p_k,year,day,version,month
9,29580,99999,2018-01-03 05:20:00,61.045,28.144,106.0,180.0,3.1,1.0,,...,,,,LAPPEENRANTA,FI,029580-99999,2018,3,1.0,1
33,726228,94740,2018-01-02 11:51:00,44.385,-74.207,507.0,0.0,0.0,-28.9,1026.0,...,6.0,0.0,,ADIRONDACK REGIONAL ARPT,US,726228-94740,2018,2,1.0,1
76,999999,94074,2018-01-01 03:55:00,40.807,-104.755,1643.0,,,-12.4,,...,,,,NUNN 7 NNE,US,999999-94074,2018,1,1.0,1
86,999999,3067,2018-01-03 12:55:00,38.87,-100.963,875.0,,,-10.4,,...,,,,OAKLEY 19 SSW,US,999999-03067,2018,3,1.0,1
102,999999,94077,2018-01-02 19:50:00,42.425,-103.736,1343.0,,,-3.8,,...,,,,HARRISON 20 SSE,US,999999-94077,2018,2,1.0,1
108,999999,3067,2018-01-03 15:20:00,38.87,-100.963,875.0,,,-6.8,,...,,,,OAKLEY 19 SSW,US,999999-03067,2018,3,1.0,1
131,11120,99999,2018-01-01 11:00:00,65.45,12.217,9.0,90.0,4.0,3.1,984.2,...,,,,BRONNOY,NO,011120-99999,2018,1,1.0,1
136,999999,94077,2018-01-01 15:20:00,42.425,-103.736,1343.0,,,-16.1,,...,,,,HARRISON 20 SSE,US,999999-94077,2018,1,1.0,1
187,82840,99999,2018-01-03 10:00:00,39.485,-0.475,56.0,260.0,5.7,15.2,1029.2,...,,,,VALENCIA,SP,082840-99999,2018,3,1.0,1
215,486980,99999,2018-01-01 23:30:00,1.35,103.994,7.0,60.0,2.6,25.0,,...,,,,SINGAPORE CHANGI INTL,SN,486980-99999,2018,1,1.0,1


In [15]:
pdf.reset_index(inplace=True)

In [16]:
# Create a Data Directory in local path
import os

weather_path = os.path.join("data","download","weather")

if not os.path.exists(weather_path):
    os.mkdir(weather_path)

In [17]:
rows_in_sample = 100
n_samples = 6
row_start = 0
for i in range(n_samples):
    tmp = pdf[row_start:row_start+rows_in_sample]
    print(f"getting data from {row_start} to {row_start+rows_in_sample}")
    tmp.to_csv(os.path.join(weather_path,f"weather{i}.csv"), index=False, header=True)
    row_start = row_start+rows_in_sample 



getting data from 0 to 100
getting data from 100 to 200
getting data from 200 to 300
getting data from 300 to 400
getting data from 400 to 500
getting data from 500 to 600


In [18]:
ds = ws.datastores["aml_data"]
ds.upload(src_dir=weather_path, target_path='weather', overwrite=True, show_progress=True)

Uploading an estimated of 6 files
Uploading data/download/weather/weather0.csv
Uploaded data/download/weather/weather0.csv, 1 files out of an estimated total of 6
Uploading data/download/weather/weather1.csv
Uploaded data/download/weather/weather1.csv, 2 files out of an estimated total of 6
Uploading data/download/weather/weather2.csv
Uploaded data/download/weather/weather2.csv, 3 files out of an estimated total of 6
Uploading data/download/weather/weather3.csv
Uploaded data/download/weather/weather3.csv, 4 files out of an estimated total of 6
Uploading data/download/weather/weather4.csv
Uploaded data/download/weather/weather4.csv, 5 files out of an estimated total of 6
Uploading data/download/weather/weather5.csv
Uploaded data/download/weather/weather5.csv, 6 files out of an estimated total of 6
Uploaded 6 files


$AZUREML_DATAREFERENCE_cab07397548d419f8952b77fbabe7d31