In [1]:
import pandas as pd
import repo.repo as repo
import repo.memory_handler as memory_handler
from repo.repo_objects import RepoInfoKey, MeasureConfiguration
from job_runner.job_runner import SimpleJobRunner, JobState
import logging as logging

logging.basicConfig(level=logging.DEBUG)

# Read housing data

In [2]:
data = pd.read_csv('housing.csv')

# Create a new repository

In [3]:
# setting up the repository
handler = memory_handler.RepoObjectMemoryStorage()
numpy_handler = memory_handler.NumpyMemoryStorage()
job_runner = SimpleJobRunner(None)
ml_repo = repo.MLRepo('test_user', handler, numpy_handler, handler, job_runner)
job_runner.set_repo(ml_repo)
ml_repo._job_runner = job_runner

## Adding data
The data in the repository is handled by two different data objects:
- RawData is the object containing real data.
- DataSet is the object conaining the logical data, i.e. a reference to a RawData object together with a specification, which data from the RawData will be used

Normally one will add RawData and then define DataSets which are used to train or test a model which is exactly the way shown in the following.

In [4]:
# Add RawData. A convenient way to add RawData is simply to use the method add_data.
# This method just takes a pandas dataframe and the specification, which columns belong to the input 
#and which to the targets.
ml_repo.add_data('boston_housing', data, input_variables=['RM', 'LSTAT', 'PTRATIO'], target_variables = ['MEDV'])

In [5]:
# create DataSet objects for training and test data
training_data = repo.DataSet('boston_housing', 0, 300, 
                            repo_info = {RepoInfoKey.NAME.value: 'training_data', RepoInfoKey.CATEGORY.value: repo.MLObjectType.TRAINING_DATA})
test_data = repo.DataSet('boston_housing', 301, -1, 
                            repo_info = {RepoInfoKey.NAME.value: 'test_data',  RepoInfoKey.CATEGORY.value: repo.MLObjectType.TEST_DATA})
# add the objects to the repository
ml_repo.add([training_data, test_data], message = 'add training and test data')

## Adding sklearn model
In the following we want to use models from the sklearn package. For the sklearn package, there is a simple module (externals.sklearn_interface) interfacing 
the sklearn package so that this can be used within the repository. This interface prvides a simple method (add_model) to add an arbitrary sklearn model as a model which can be handled by the repository. This method adds a bunch of repo objects to the repository:
- An object defining the function to be called to evaluate the model
- An object defining the function to be called to train the model
- An object defining the model
- An object defining the model parameter

In [6]:
import externals.sklearn_interface as sklearn_interface
from sklearn.tree import DecisionTreeRegressor
sklearn_interface.add_model(ml_repo, DecisionTreeRegressor(), model_param={'max_depth': 5})

## Train the model

In [8]:
job_id = ml_repo.run_training()  

INFO:root:Training job added to jobrunner, job_id: 85ca79a4-bc8e-11e8-a5d7-fc084a6691eb


## Run evaluation
To measure errors and to provide plots the model must be evaluated on all test and training datasets.

In [9]:
job_id = ml_repo.run_evaluation()

INFO:root:Eval job added to jobrunner, job_id: 85d2d3fa-bc8e-11e8-97aa-fc084a6691eb
INFO:root:Eval job added to jobrunner, job_id: 85d2d3fb-bc8e-11e8-bb5f-fc084a6691eb


[UUID('85d2d3fa-bc8e-11e8-97aa-fc084a6691eb'),
 UUID('85d2d3fb-bc8e-11e8-bb5f-fc084a6691eb')]

## Add and compute measures

In [10]:
ml_repo.add_measure(MeasureConfiguration.MAX)

In [20]:
job_ids = ml_repo.run_measures()

INFO:root:Measure job max added to jobrunner, job_id: e2e6557a-bc93-11e8-9360-fc084a6691eb
INFO:root:Measure job max added to jobrunner, job_id: e2e6f18c-bc93-11e8-9f6d-fc084a6691eb


## Working with the repository

In [21]:
for k in repo.MLObjectType:
    names = ml_repo.get_names(k.value)
    for n in names: 
        print(n + '  ' + k.value)

DecisionTreeRegressor/eval/test_data  eval_data
DecisionTreeRegressor/eval/training_data  eval_data
boston_housing  raw_data
training_data  training_data
test_data  test_data
DecisionTreeRegressor/model_param  model_param
train_sklearn  training_function
eval_sklearn  model_eval_function
DecisionTreeRegressor  model
DecisionTreeRegressor/model  calib_model
CommitInfo  commit_info
repo_mapping  mapping
DecisionTreeRegressor/test_data/max  measure
DecisionTreeRegressor/training_data/max  measure
measure_config  measure_config


In [22]:
for k in ml_repo.get_commits():
    print(str(k))

time: 2018-09-20 06:34:54.266914, author: test_user, message: data boston_housing added to repository, objects: {'boston_housing': 0, 'repo_mapping': 0}
time: 2018-09-20 06:34:54.318845, author: test_user, message: add training and test data, objects: {'training_data': 0, 'test_data': 0, 'repo_mapping': 1}
time: 2018-09-20 06:34:55.024663, author: test_user, message: add model evaluation function eval_sklearn, objects: {'eval_sklearn': 0, 'repo_mapping': 2}
time: 2018-09-20 06:34:55.024663, author: test_user, message: add model training function train_sklearn, objects: {'train_sklearn': 0, 'repo_mapping': 3}
time: 2018-09-20 06:34:55.025784, author: test_user, message: adding model and training parameter, objects: {'DecisionTreeRegressor/model_param': 0, 'repo_mapping': 4}
time: 2018-09-20 06:34:55.025784, author: test_user, message: add model DecisionTreeRegressor, objects: {'DecisionTreeRegressor': 0, 'repo_mapping': 5}
time: 2018-09-20 06:34:55.081718, author: test_user, message: tr

In [14]:
job_runner._job_status

{UUID('85ca79a4-bc8e-11e8-a5d7-fc084a6691eb'): <JobState.SUCCESSFULLY_FINISHED: 'successfully_finished'>,
 UUID('85d2d3fa-bc8e-11e8-97aa-fc084a6691eb'): <JobState.SUCCESSFULLY_FINISHED: 'successfully_finished'>,
 UUID('85d2d3fb-bc8e-11e8-bb5f-fc084a6691eb'): <JobState.SUCCESSFULLY_FINISHED: 'successfully_finished'>,
 UUID('85e21118-bc8e-11e8-b626-fc084a6691eb'): <JobState.SUCCESSFULLY_FINISHED: 'successfully_finished'>,
 UUID('85e2ad1c-bc8e-11e8-8621-fc084a6691eb'): <JobState.SUCCESSFULLY_FINISHED: 'successfully_finished'>}

In [15]:
param = ml_repo._get('DecisionTreeRegressor/model_param')

In [16]:
ml_repo.add(param, message='change max_depth to 3')

1

In [17]:
ml_repo.add(param, message='change max_depth to 3')

2

In [18]:
ml_repo.get_names(repo.MLObjectType.MEASURE_CONFIGURATION)

['measure_config']

In [19]:
job_runner.get_error_message(depp[0])
job_runner.get_trace_back(depp[0])

KeyError: UUID('85e21118-bc8e-11e8-b626-fc084a6691eb')

In [None]:
ml_repo.get_names(repo.MLObjectType.CALIBRATED_MODEL)
ml_repo.get_names(repo.MLObjectType.MODEL)