# Question #143

In [None]:
import os

# Local working directories: one receives the training script, the other the
# artificial CSV files. exist_ok=True makes re-running this cell harmless.
training_folder = 'question-143-training'

for folder in (training_folder, 'question-143-dataset'):
    os.makedirs(folder, exist_ok=True)

### Create an artificial dataset

In [None]:
%%writefile question-143-dataset/file1.csv
col1,col2,col3
1,a,t6t
2,b,u7u
3,c,r8r
4,d,s9s
5,e,h4h

In [None]:
%%writefile question-143-dataset/file2.csv
col1,col2,col3
1,f,k6t
2,g,f7u
3,h,p8r
4,i,f9s
5,j,s4h

### Register artificial dataset

In [None]:
import azureml.core
from azureml.core import Dataset, Workspace

# Connect to the workspace described by the saved config file and grab its
# default datastore
ws = Workspace.from_config()
default_ds = ws.get_default_datastore()

# Upload both local CSV files into a single folder of the datastore,
# replacing any existing files of the same name
local_files = [
    os.path.join('question-143-dataset', file_name)
    for file_name in ('file1.csv', 'file2.csv')
]
default_ds.upload_files(
    files=local_files,
    target_path='question-143-data/',
    overwrite=True,
    show_progress=True,
)

# Build a tabular dataset over every CSV in that datastore folder (this may
# take a short while) and register it as a new version of 'csv_training'
tab_data_set = Dataset.Tabular.from_delimited_files(
    path=(default_ds, 'question-143-data/*.csv')
).register(
    workspace=ws,
    name='csv_training',
    description='csv training - question 143',
    tags={'format': 'CSV'},
    create_new_version=True,
)

## Carry on with the question

In [None]:
%%writefile $training_folder/train.py
import os
import argparse
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from azureml.core import Run

# Handle to the current Azure ML run
run = Run.get_context()

# --data-folder is the directory that holds the training CSV files. It is
# required: without it data_folder would be None and os.listdir(None) would
# silently list the current working directory instead.
parser = argparse.ArgumentParser()
parser.add_argument('--data-folder', type=str, dest='data_folder', required=True, help='data reference')
args = parser.parse_args()

data_folder = args.data_folder

# Keep only CSV files (the folder may contain other entries) and sort them,
# because os.listdir returns names in arbitrary order.
csv_files = sorted(
    file_name for file_name in os.listdir(data_folder)
    if file_name.lower().endswith('.csv')
)
if not csv_files:
    raise FileNotFoundError('No CSV files found in {!r}'.format(data_folder))

# ignore_index=True gives the combined frame one continuous index rather than
# repeating each file's own 0..n-1 row labels.
training_data = pd.concat(
    (pd.read_csv(os.path.join(data_folder, csv_file)) for csv_file in csv_files),
    ignore_index=True,
)

# Code to split the training data and train a logistic regression model
# ...

In [None]:
from azureml.core import Workspace, Datastore, Experiment
from azureml.train.sklearn import SKLearn

# Connect to the workspace and to the experiment that will hold this run
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name='question-143')

# Reference the datastore folder holding the uploaded CSV files
# (default_ds is the default datastore obtained in the registration cell)
data_ref = default_ds.path('question-143-data')

# Hand the folder to train.py as --data-folder; as_download asks for the
# files to be copied onto the compute target under 'csv_files'
script_params = {
    '--data-folder': data_ref.as_download(path_on_compute='csv_files')
}

# Reuse training_folder (the directory train.py was written to) instead of
# repeating its name as a literal, so the two cannot drift apart
estimator = SKLearn(
    source_directory=training_folder,
    script_params=script_params,
    compute_target='local',
    entry_script='train.py'
)

# Submit the run and block until it finishes, streaming its log output
run = exp.submit(config=estimator)
run.wait_for_completion(show_output=True)