In [None]:
import azureml.core
from azureml.core import Workspace, Datastore, Dataset

# Obtain the workspace via Workspace.from_config() and its default datastore;
# both handles are reused by the cells below.
mlonazure_ws = Workspace.from_config()
mlonazure_ds = mlonazure_ws.get_default_datastore()

# One "label: value" line per setting, printed as a single newline-joined report.
environment_summary = [
    'Workspace Name: ' + mlonazure_ws.name,
    'Resource Group: ' + mlonazure_ws.resource_group,
    'Default Storage Account Name: ' + mlonazure_ds.account_name,
    'AzureML Core Version: ' + azureml.core.VERSION,
]
print('\n'.join(environment_summary))

In [None]:
# Look up the workspace's default datastore; as the cell's last expression its repr becomes the cell output.
Datastore.get_default(mlonazure_ws)

In [None]:
from azureml.opendatasets import OjSalesSimulated

# Get the OJ Sales Simulated open dataset as a file dataset; later cells mount and upload it.
oj_sales_files = OjSalesSimulated.get_file_dataset()

# Note: the number of files can also be limited by supplying the num_files=x parameter
# (TODO confirm this parameter exists in the installed azureml-opendatasets version).

In [None]:
# Mount the file dataset, copy everything under the mount point into the
# default datastore, and always unmount afterwards.
# To try this on a small sample first, mount oj_sales_files.take(20) instead.
mount_context = oj_sales_files.mount(mount_point=None)

mount_context.start()  # mount file streams
try:
    print("Temp Mount Point: " + mount_context.mount_point)

    data_reference = mlonazure_ds.upload(
        src_dir=mount_context.mount_point,
        target_path='MyDatasets/New/OJSales_All',
        overwrite=False,
        show_progress=False)
finally:
    # Unmount even when the upload raises, so a failed run does not leave a
    # dangling mount behind.
    mount_context.stop()  # unmount file streams

print('Path on Datastore: ' + data_reference.path_on_datastore)
print('DataReference Mode: ' + data_reference.mode)


In [None]:
# Name of the registered dataset that the next cell fetches with Dataset.get_by_name.
datasetName_train = 'OJSales_All'

In [None]:
# Fetch the latest version of the dataset registered under datasetName_train.
# NOTE(review): no cell visible in this notebook registers a dataset with that
# name (the upload cell only copies files to the datastore) — presumably it was
# registered elsewhere; confirm before running on a fresh workspace.
ojsales_ds = Dataset.get_by_name(
    workspace=mlonazure_ws,
    name=datasetName_train,
    version='latest',
)

# Preview: convert only the first 10 records to a pandas DataFrame.
ojsales_preview_ds = ojsales_ds.take(10)
ojsales_preview_ds.to_pandas_dataframe()

In [None]:
# Convert the whole dataset to a pandas DataFrame (reused by the exploration cells below)
# and display its shape.
alldata_pd = ojsales_ds.to_pandas_dataframe()
alldata_pd.shape

In [None]:
# Column dtypes of the full frame.
alldata_pd.dtypes

In [None]:
# Summary statistics (DataFrame.describe) of the full frame.
alldata_pd.describe()

In [None]:
# Distinct values found in the Brand column.
alldata_pd['Brand'].unique()

In [None]:
# Number of rows per Brand value.
alldata_pd['Brand'].value_counts()

In [None]:
# How many distinct Brand values there are: one value_counts entry per brand.
brand_frequencies = alldata_pd['Brand'].value_counts()
brand_frequencies.count()

In [None]:
# How many distinct Store values there are: one value_counts entry per store.
store_frequencies = alldata_pd['Store'].value_counts()
store_frequencies.count()

In [None]:
# Distinct values found in the WeekStarting column.
alldata_pd['WeekStarting'].unique()

In [None]:
# Declare WeekStarting as the dataset's timestamp column; the time_before /
# time_after filters in the next cell are applied to the resulting dataset.
ojsales_ds_withtimestamp = ojsales_ds.with_timestamp_columns(
    timestamp='WeekStarting',
    partition_timestamp=None,
    validate=True,
)

In [None]:
from datetime import datetime

# Single cut-off date for the chronological split, parsed once.
split_date = datetime.strptime("1992-05-28", "%Y-%m-%d")

# Train keeps records up to and including the cut-off (include_boundary=True);
# test keeps records strictly after it (include_boundary=False).
data_train = ojsales_ds_withtimestamp.time_before(split_date, include_boundary=True)
data_test = ojsales_ds_withtimestamp.time_after(split_date, include_boundary=False)

# Both registrations carry the same tags; each call gets its own copy.
split_tags = {'type': 'csv', 'date': 'April 2020'}

# Register each split under its own name, creating a new version on every run.
data_train_ds = data_train.register(
    workspace=mlonazure_ws,
    name='OJSales_Train',
    description='Data on or before 1992-05-28 OJSales_All Files',
    tags=dict(split_tags),
    create_new_version=True,
)

data_test_ds = data_test.register(
    workspace=mlonazure_ws,
    name='OJSales_Validate',
    description='Data after 1992-05-28 OJSales_All Files',
    tags=dict(split_tags),
    create_new_version=True,
)

In [None]:
# Reload the registered train/validate splits by name and convert each to a
# pandas DataFrame. The full 'OJSales_All' data is not reloaded here; it was
# already loaded into alldata_pd in an earlier cell.
ojsales_train_ds = Dataset.get_by_name(
    workspace=mlonazure_ws, name='OJSales_Train', version='latest')
ojsales_train_pd = ojsales_train_ds.to_pandas_dataframe()

ojsales_validate_ds = Dataset.get_by_name(
    workspace=mlonazure_ws, name='OJSales_Validate', version='latest')
ojsales_validate_pd = ojsales_validate_ds.to_pandas_dataframe()

In [None]:
# Print the shape of the full data and of each split.
# The full frame is alldata_pd (loaded earlier from 'OJSales_All'); the name
# ojsales_all_pd is only assigned in commented-out code, so referencing it
# raised NameError on a fresh kernel.
print("Full dataset Shape:", alldata_pd.shape[0:2])
print("Train dataset Shape:", ojsales_train_pd.shape[0:2])
print("Validate dataset Shape:", ojsales_validate_pd.shape[0:2])

#### Create a smaller version of the dataset

In [None]:
# Number of distinct stores in the full data, for comparison with the subsets.
cntAll = (alldata_pd.Store.value_counts()).count()

# Keep only rows whose Store value is >= 1000 and < 1200, in both splits.
store_subset_filter = 'Store>=1000 & Store<1200'
ojsales_train_subset_pd = ojsales_train_pd.query(store_subset_filter)
ojsales_validate_subset_pd = ojsales_validate_pd.query(store_subset_filter)

cntTrainSubset = (ojsales_train_subset_pd.Store.value_counts()).count()
# Count stores in the validate subset (this previously re-counted the train
# subset, so the third printed number could never differ from the second).
cntValidateSubset = (ojsales_validate_subset_pd.Store.value_counts()).count()

# Distinct store counts: full data, train subset, validate subset.
print(cntAll, cntTrainSubset, cntValidateSubset)


In [None]:
import os
# Show the current working directory; the relative 'Dataset/...' paths written in the next cell resolve against it.
os.getcwd()

In [None]:
# Write both subsets as CSV files under ./Dataset, the folder the next cell uploads.
# Create the folder first: to_csv does not create missing parent directories.
# (`os` is imported in the previous cell.)
os.makedirs('Dataset', exist_ok=True)

# index=False keeps the pandas row index out of the files, so it does not come
# back as an extra unnamed column when the CSVs are read as tabular datasets.
ojsales_train_subset_pd.to_csv('Dataset/OJSales_Train_Subset.csv', index=False)
ojsales_validate_subset_pd.to_csv('Dataset/OJSales_Validate_Subset.csv', index=False)

In [None]:
from azureml.data.datapath import DataPath

# Upload everything in the local 'Dataset' folder to the datastore,
# overwriting files that are already there.
data_reference_Subset = mlonazure_ds.upload(
    src_dir='Dataset', target_path='MyDatasets/OJSales_Subset',
    overwrite=True, show_progress=False)

# Datastore locations of the two uploaded CSV files.
train_subset_csv = DataPath(mlonazure_ds, 'MyDatasets/OJSales_Subset/OJSales_Train_Subset.csv')
validate_subset_csv = DataPath(mlonazure_ds, 'MyDatasets/OJSales_Subset/OJSales_Validate_Subset.csv')

mlonazure_ds_train_subset = [train_subset_csv]
mlonazure_ds_validate_subset = [validate_subset_csv]

# Build one tabular dataset per CSV file.
ojsales_train_subset_ds = Dataset.Tabular.from_delimited_files(path=mlonazure_ds_train_subset)
ojsales_validate_subset_ds = Dataset.Tabular.from_delimited_files(path=mlonazure_ds_validate_subset)

In [None]:
# Register both subset datasets in the workspace, creating a new version on
# every run; each name is rebound to the value returned by register().
subset_tags = {'type': 'csv', 'date': 'April 2020'}

ojsales_train_subset_ds = ojsales_train_subset_ds.register(
    workspace=mlonazure_ws,
    name='OJSales_Train_Subset',
    description='200 Stores Data on or before 1992-05-28 OJSales_All Files',
    tags=dict(subset_tags),
    create_new_version=True,
)

ojsales_validate_subset_ds = ojsales_validate_subset_ds.register(
    workspace=mlonazure_ws,
    name='OJSales_Validate_Subset',
    description='200 Stores Data after 1992-05-28 OJSales_All Files',
    tags=dict(subset_tags),
    create_new_version=True,
)

In [None]:
# Reload the registered subset datasets by name and convert each to a pandas
# DataFrame (this rebinds the *_subset_ds and *_subset_pd names).
ojsales_train_subset_ds = Dataset.get_by_name(
    workspace=mlonazure_ws, name='OJSales_Train_Subset', version='latest')
ojsales_train_subset_pd = ojsales_train_subset_ds.to_pandas_dataframe()

ojsales_validate_subset_ds = Dataset.get_by_name(
    workspace=mlonazure_ws, name='OJSales_Validate_Subset', version='latest')
ojsales_validate_subset_pd = ojsales_validate_subset_ds.to_pandas_dataframe()

In [None]:
# Shape of the reloaded train subset.
ojsales_train_subset_pd.shape

In [None]:
# Shape of the reloaded validate subset.
ojsales_validate_subset_pd.shape

In [None]:
# Summary statistics of the full data. Uses alldata_pd (the 'OJSales_All' frame
# loaded earlier); ojsales_all_pd is never assigned in this notebook and raised NameError.
alldata_pd.describe()

In [None]:
# Column dtypes of the full data. Uses alldata_pd (the 'OJSales_All' frame
# loaded earlier); ojsales_all_pd is never assigned in this notebook and raised NameError.
alldata_pd.dtypes

In [None]:
# Distinct Store values in the full data. Uses alldata_pd (the 'OJSales_All' frame
# loaded earlier); ojsales_all_pd is never assigned in this notebook and raised NameError.
alldata_pd.Store.unique()