In [None]:
from azureml.core import Workspace,Dataset

# Load the Azure ML workspace handle from the saved workspace configuration
# (see the Workspace.from_config documentation for where the config is looked up).
ws = Workspace.from_config()
# Uncomment the next line to check which workspace was loaded:
#print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\n')

# Getting raw data
This part only needs to be run once. It downloads the diabetes dataset from Azure Open Datasets and registers it in the workspace as a dataset named 'Diabetes_Raw'.

In [None]:
from azureml.opendatasets import Diabetes

# Get the Diabetes open dataset as a tabular dataset object
diabetes = Diabetes.get_tabular_dataset()
# Convert it to a pandas DataFrame so it can be registered in the workspace
raw_data = diabetes.to_pandas_dataframe()


In [None]:
# Register the raw DataFrame as a named dataset, stored under a folder
# of the workspace's default datastore.
ds = ws.get_default_datastore()
raw_target = (ds, 'diabetes_raw_data')
Dataset.Tabular.register_pandas_dataframe(
    dataframe=raw_data,
    target=raw_target,
    name='Diabetes_Raw',
)
print("Raw dataset registered")

# Getting raw dataset
Load the registered raw dataset, perform some simple data processing, and store the result as a temporary local file.

In [None]:
# Look up the registered raw dataset by name and load it into a DataFrame
dataset = Dataset.get_by_name(workspace=ws, name='Diabetes_Raw')
df = dataset.to_pandas_dataframe()

In [None]:
# Show the incoming column names before renaming
print(df.columns)
# Give the label column ('Y') a more descriptive name
df.rename({'Y': 'Target'}, axis='columns', inplace=True)
df.head()

###### Converting label to binary outcome

In [None]:
import pandas as pd

# Split the continuous target into two bins labelled 0 and 1
target_bins = pd.cut(df['Target'], bins=2, labels=[0, 1])
df['Binary_Target'] = target_bins
# The continuous target is replaced by the binary one, so remove it
del df['Target']

In [None]:
import os

# Create the local output directory. exist_ok=True keeps the cell safe to
# re-run, while genuine failures (e.g. permission errors, or a regular file
# named 'data') now raise instead of being reported as "Dir already exists".
os.makedirs('./data', exist_ok=True)

# Save the processed frame to the local path, without the index column
df.to_csv('./data/train.csv', index=False)

# Register file as dataset

In [None]:
# Version suffix for the datastore folder, and the dataset description
dataset_version = '8'
description = 'Convert target column to binary'

# Push the local CSV into a version-specific folder of the default datastore,
# replacing any file already present at that path.
ds = ws.get_default_datastore()
dataset_path_train = f'diabetes_dataset/train_v{dataset_version}'
ds.upload_files(
    files=['./data/train.csv'],
    target_path=dataset_path_train,
    overwrite=True,
)

In [None]:
# Location of the uploaded CSV inside the datastore
train_csv_path = dataset_path_train + '/train.csv'
datastore_paths_train = [(ds, train_csv_path)]

# Build a tabular dataset object from the delimited file (header=True)
dataset_train = Dataset.Tabular.from_delimited_files(path=datastore_paths_train, header=True)

# Register it in the workspace; create_new_version=True requests a new
# version under the same dataset name.
registration_options = {
    'workspace': ws,
    'name': "Diabetes_Processed",
    'create_new_version': True,
    'tags': {'my_tag': 'tag'},
    'description': description,
}
dataset_train.register(**registration_options)


# Dataset

Docs: <br>
[How to use or create datasets](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-create-register-datasets) <br>
[Code docs](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.dataset.dataset?view=azure-ml-py)


### Register pandas dataframe as dataset

```
Dataset.Tabular.register_pandas_dataframe(dataframe=raw_data,target=(ds, 'diabetes_raw_data'),name='Diabetes_Raw')
```

Docs: <br>
[Register pandas dataframe as dataset](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.dataset_factory.tabulardatasetfactory?view=azure-ml-py#azureml-data-dataset-factory-tabulardatasetfactory-register-pandas-dataframe)
