In [1]:
import pandas as pd
import repo.repo as repo
import repo.memory_handler as memory_handler
from repo.repo_objects import RepoInfoKey, MeasureConfiguration
from job_runner.job_runner import SimpleJobRunner, JobState
import logging as logging

# Optional alternative: a more detailed log format that also shows file name,
# line number and function name. To use it, uncomment the next two lines and
# remove the plain basicConfig call below.
#FORMAT = "[%(filename)s:%(lineno)s - %(funcName)20s() ] %(message)s"
#logging.basicConfig(format=FORMAT, level=logging.DEBUG)
# Log at DEBUG level so that the INFO/DEBUG messages emitted while running jobs
# show up in the cell outputs further down.
logging.basicConfig(level=logging.DEBUG)

# Read housing data

In [2]:
# Load the housing data from a CSV file given by a relative path, i.e. it is
# looked up in the current working directory. Later cells use the columns
# 'RM', 'LSTAT', 'PTRATIO' (inputs) and 'MEDV' (target).
data = pd.read_csv('housing.csv')

# Create a new repository

In [3]:
# Set up the repository with in-memory storage (nothing is written to disk).
# Judging by the class names, one handler stores the repo objects and the
# other one the numpy data.
handler = memory_handler.RepoObjectMemoryStorage()
numpy_handler = memory_handler.NumpyMemoryStorage()
# The job runner is created without a repository (None) because the repository
# itself takes the job runner as a constructor argument; the link back to the
# repository is established with set_repo() below.
job_runner = SimpleJobRunner(None)
# 'test_user' shows up as the author in the commit list printed later.
# Note that `handler` is passed twice (second and fourth argument).
ml_repo = repo.MLRepo('test_user', handler, numpy_handler, handler, job_runner)
job_runner.set_repo(ml_repo)
# NOTE(review): job_runner was already handed to the MLRepo constructor, so
# this assignment to a private attribute looks redundant — confirm before removing.
ml_repo._job_runner = job_runner

## Adding data
The data in the repository is handled by two different data objects:
- RawData is the object containing real data.
- DataSet is the object containing the logical data, i.e. a reference to a RawData object together with a specification of which data from the RawData will be used.

Normally, one adds RawData first and then defines DataSets that are used to train or test a model; this is exactly the approach shown in the following.

In [4]:
# Register the housing DataFrame as RawData under the name 'boston_housing'.
# add_data is a convenience method: it takes the pandas DataFrame plus the
# lists of columns that serve as model inputs and as targets.
ml_repo.add_data(
    'boston_housing',
    data,
    input_variables=['RM', 'LSTAT', 'PTRATIO'],
    target_variables=['MEDV'],
)

In [5]:
# Define the training and test DataSets as index ranges on the
# 'boston_housing' RawData; name and category are attached via repo_info.
# NOTE(review): training ends at index 300 and test starts at 301 with end
# index -1. Whether row 300 and the very last row belong to a set depends on
# how DataSet interprets the end index (inclusive vs. exclusive) — confirm.
training_info = {
    RepoInfoKey.NAME.value: 'training_data',
    RepoInfoKey.CATEGORY.value: repo.MLObjectType.TRAINING_DATA,
}
test_info = {
    RepoInfoKey.NAME.value: 'test_data',
    RepoInfoKey.CATEGORY.value: repo.MLObjectType.TEST_DATA,
}
training_data = repo.DataSet('boston_housing', 0, 300, repo_info=training_info)
test_data = repo.DataSet('boston_housing', 301, -1, repo_info=test_info)

# Commit both objects in one call. As the last expression of the cell, the
# return value of add() is displayed below.
ml_repo.add([training_data, test_data], message='add training and test data')

{'repo_mapping': 1, 'test_data': 0, 'training_data': 0}

## Adding sklearn model
In the following we want to use models from the sklearn package. A simple module (externals.sklearn_interface) interfaces 
the sklearn package so that its models can be used within the repository. This interface provides a simple method (add_model) to add an arbitrary sklearn model as a model which can be handled by the repository. This method adds a bunch of repo objects to the repository:
- An object defining the function to be called to evaluate the model
- An object defining the function to be called to train the model
- An object defining the model
- An object defining the model parameter

In [6]:
# NOTE(review): these imports sit in the middle of the notebook; consider
# moving them to the import cell at the top so all dependencies are visible
# in one place.
import externals.sklearn_interface as sklearn_interface
from sklearn.tree import DecisionTreeRegressor
# Register a DecisionTreeRegressor in the repository, with max_depth=5 passed
# as the model parameter.
sklearn_interface.add_model(ml_repo, DecisionTreeRegressor(), model_param={'max_depth': 5})

## Train the model

In [7]:
# Submit a training job through the repository and keep the returned job id
# (the log output below shows the job being added to the job runner).
job_id = ml_repo.run_training()

INFO:root:Training job added to jobrunner, job_id: b8cb9d64-c1af-11e8-8a8b-fc084a6691eb


## Run evaluation
To measure errors and to provide plots, the model must be evaluated on all test and training datasets.

In [8]:
# Submit the evaluation of the model; the log output below shows two eval
# jobs being added. This overwrites the training job id stored in `job_id`.
# NOTE(review): despite the singular name, the return value may hold more than
# one job id — confirm the return type of run_evaluation().
job_id = ml_repo.run_evaluation()

INFO:root:Eval job added to jobrunner, job_id: b8d4894c-c1af-11e8-bccd-fc084a6691eb
INFO:root:Eval job added to jobrunner, job_id: b8d4d6d4-c1af-11e8-9f09-fc084a6691eb


## Add and compute measures

In [9]:
# Register the measures to be computed on the evaluation results: first the
# maximum error, then R2.
for measure_type in (MeasureConfiguration.MAX, MeasureConfiguration.R2):
    ml_repo.add_measure(measure_type)

In [10]:
# Compute all configured measures. The log output below shows one measure job
# per measure (max, r2) and evaluated dataset (test_data, training_data).
job_ids = ml_repo.run_measures()

INFO:repo:run MeasureJob on data test_data:0, DecisionTreeRegressor/eval/test_data:0
DEBUG:repo:computing maximum error
INFO:root:Measure job max added to jobrunner, job_id: b8e5d0e6-c1af-11e8-b244-fc084a6691eb
INFO:repo:run MeasureJob on data test_data:0, DecisionTreeRegressor/eval/test_data:0
INFO:root:Measure job r2 added to jobrunner, job_id: b8e693a4-c1af-11e8-8bc7-fc084a6691eb
INFO:repo:run MeasureJob on data training_data:0, DecisionTreeRegressor/eval/training_data:0
DEBUG:repo:computing maximum error
INFO:root:Measure job max added to jobrunner, job_id: b8e72f80-c1af-11e8-944c-fc084a6691eb
INFO:repo:run MeasureJob on data training_data:0, DecisionTreeRegressor/eval/training_data:0
INFO:root:Measure job r2 added to jobrunner, job_id: b8e7cb50-c1af-11e8-a6a3-fc084a6691eb


In [11]:
# Print the maximum error of the model, first on the training data and then
# on the test data.
for measure_name in (
    'DecisionTreeRegressor/measure/training_data/max',
    'DecisionTreeRegressor/measure/test_data/max',
):
    max_measure = ml_repo._get(measure_name)
    print(max_measure.value)


223200.0
387530.769231


## Working with the repository

In [12]:
# List every object name in the repository together with its category:
# iterate over all object types and print "<name>\t  <category>" per object.
for object_type in repo.MLObjectType:
    category = object_type.value
    for object_name in ml_repo.get_names(category):
        print(object_name + '\t  ' + category)

DecisionTreeRegressor/eval/test_data	  eval_data
DecisionTreeRegressor/eval/training_data	  eval_data
boston_housing	  raw_data
training_data	  training_data
test_data	  test_data
DecisionTreeRegressor/model_param	  model_param
train_sklearn	  training_function
eval_sklearn	  model_eval_function
DecisionTreeRegressor	  model
DecisionTreeRegressor/model	  calib_model
CommitInfo	  commit_info
repo_mapping	  mapping
DecisionTreeRegressor/measure/test_data/max	  measure
DecisionTreeRegressor/measure/test_data/r2	  measure
DecisionTreeRegressor/measure/training_data/max	  measure
DecisionTreeRegressor/measure/training_data/r2	  measure
measure_config	  measure_config


In [13]:
# Print the commit history of the repository, one commit per line
# (print() applies str() to each commit).
for commit in ml_repo.get_commits():
    print(commit)

time: 2018-09-26 19:15:09.153685, author: test_user, message: data boston_housing added to repository, objects: {'boston_housing': 0, 'repo_mapping': 0}
time: 2018-09-26 19:15:09.212734, author: test_user, message: add training and test data, objects: {'training_data': 0, 'test_data': 0, 'repo_mapping': 1}
time: 2018-09-26 19:15:09.843144, author: test_user, message: add model evaluation function eval_sklearn, objects: {'eval_sklearn': 0, 'repo_mapping': 2}
time: 2018-09-26 19:15:09.843144, author: test_user, message: add model training function train_sklearn, objects: {'train_sklearn': 0, 'repo_mapping': 3}
time: 2018-09-26 19:15:09.843144, author: test_user, message: adding model and training parameter, objects: {'DecisionTreeRegressor/model_param': 0, 'repo_mapping': 4}
time: 2018-09-26 19:15:09.843144, author: test_user, message: add model DecisionTreeRegressor, objects: {'DecisionTreeRegressor': 0, 'repo_mapping': 5}
time: 2018-09-26 19:15:09.858954, author: test_user, message: tr

In [14]:
# Print the status record of every job the job runner has handled so far.
# Only the records are needed, so iterate over the values of the mapping.
for job_info in job_runner._job_info.values():
    print(job_info)

test_user, successfully_finished, started 2018-09-26 19:15:09.858954, finished 2018-09-26 19:15:09.858954
test_user, successfully_finished, started 2018-09-26 19:15:09.917422, finished 2018-09-26 19:15:09.918414
test_user, successfully_finished, started 2018-09-26 19:15:09.919406, finished 2018-09-26 19:15:09.921401
test_user, successfully_finished, started 2018-09-26 19:15:10.030666, finished 2018-09-26 19:15:10.033657
test_user, successfully_finished, started 2018-09-26 19:15:10.035651, finished 2018-09-26 19:15:10.037646
test_user, successfully_finished, started 2018-09-26 19:15:10.039641, finished 2018-09-26 19:15:10.042632
test_user, successfully_finished, started 2018-09-26 19:15:10.043630, finished 2018-09-26 19:15:10.046622


## Change model parameter, check consistency and train

In [15]:
# Fetch the model parameter object, set the tree depth to 2 (the model was
# added with max_depth 5) and add the modified object back to the repository.
param = ml_repo._get('DecisionTreeRegressor/model_param')
param.sklearn_params['max_depth'] = 2
# As the last expression of the cell, the return value of add() is displayed;
# presumably this is the new version number of the object — confirm.
ml_repo.add(param)

1

In [16]:
import repo.tools as tools
# Check the model for inconsistencies after the parameter change.
# Judging by the output below, the result maps an object name to the version
# the model was built with ('modifier version') and the latest version in the
# repository — exact semantics of check_model to be confirmed.
results = tools.check_model(ml_repo, 'DecisionTreeRegressor')
print(results)

{'DecisionTreeRegressor/model_param': {'modifier version': 0, 'latest version': 1}}


In [25]:
# Retrain the model after the parameter change. As the last expression of the
# cell, the returned job id is displayed below.
ml_repo.run_training()

INFO:root:Training job added to jobrunner, job_id: ee7910e4-c1af-11e8-8186-fc084a6691eb


UUID('ee7910e4-c1af-11e8-8186-fc084a6691eb')

In [26]:
# Repeat the consistency check after retraining; the output below is an empty
# dict, i.e. nothing is reported any more.
results = tools.check_model(ml_repo, 'DecisionTreeRegressor')
print(results)

{}


In [27]:
# Re-evaluate the retrained model and recompute the measures. As the last
# expression of the cell, the list of job ids returned by run_measures() is
# displayed below.
ml_repo.run_evaluation()
ml_repo.run_measures()

INFO:root:Eval job added to jobrunner, job_id: 61628f12-c1b0-11e8-b604-fc084a6691eb
INFO:root:Eval job added to jobrunner, job_id: 61630406-c1b0-11e8-a385-fc084a6691eb
INFO:repo:run MeasureJob on data test_data:0, DecisionTreeRegressor/eval/test_data:1
DEBUG:repo:computing maximum error
INFO:root:Measure job max added to jobrunner, job_id: 616378d8-c1b0-11e8-9028-fc084a6691eb
INFO:repo:run MeasureJob on data test_data:0, DecisionTreeRegressor/eval/test_data:1
INFO:root:Measure job r2 added to jobrunner, job_id: 61643bae-c1b0-11e8-831f-fc084a6691eb
INFO:repo:run MeasureJob on data training_data:0, DecisionTreeRegressor/eval/training_data:1
DEBUG:repo:computing maximum error
INFO:root:Measure job max added to jobrunner, job_id: 6164d792-c1b0-11e8-a042-fc084a6691eb
INFO:repo:run MeasureJob on data training_data:0, DecisionTreeRegressor/eval/training_data:1
INFO:root:Measure job r2 added to jobrunner, job_id: 61657352-c1b0-11e8-a186-fc084a6691eb


[UUID('616378d8-c1b0-11e8-9028-fc084a6691eb'),
 UUID('61643bae-c1b0-11e8-831f-fc084a6691eb'),
 UUID('6164d792-c1b0-11e8-a042-fc084a6691eb'),
 UUID('61657352-c1b0-11e8-a186-fc084a6691eb')]

In [42]:
# Fetch several versions of the R2 measure on the test data and print the
# repo_info of each one.
# NOTE(review): (0, 100) presumably means "versions 0 up to 100", with 100 as
# an arbitrary upper bound — confirm against the _get documentation.
measure = ml_repo._get('DecisionTreeRegressor/measure/test_data/r2', version=(0, 100))
for measure_version in measure:
    print(measure_version.repo_info)

{'version': 0, 'name': 'DecisionTreeRegressor/measure/test_data/r2', 'history': None, 'classname': 'repo.repo_objects.Measure', 'modification_info': {'DecisionTreeRegressor/eval/test_data': 0, 'DecisionTreeRegressor/model': 0, 'DecisionTreeRegressor/model_param': 0, 'training_data': 0, 'DecisionTreeRegressor': 0, 'test_data': 0}, 'description': None, 'category': 'measure', 'big_objects': [], 'commit_message': 'computing  measure r2 on data test_data'}
{'version': 1, 'name': 'DecisionTreeRegressor/measure/test_data/r2', 'history': None, 'classname': 'repo.repo_objects.Measure', 'modification_info': {'DecisionTreeRegressor/eval/test_data': 1, 'DecisionTreeRegressor/model': 1, 'DecisionTreeRegressor/model_param': 3, 'training_data': 0, 'DecisionTreeRegressor': 0, 'test_data': 0}, 'description': None, 'category': 'measure', 'big_objects': [], 'commit_message': 'computing  measure r2 on data test_data'}


In [19]:
# Display the names of all measure configuration objects in the repository.
# Here the enum member itself is passed, whereas the listing loop earlier in
# the notebook passes the member's .value.
ml_repo.get_names(repo.MLObjectType.MEASURE_CONFIGURATION)

['measure_config']

In [20]:
# Fetch the R2 measure of the model on the training data (no explicit version
# is requested).
m = ml_repo._get('DecisionTreeRegressor/measure/training_data/r2')

In [21]:
# Display the repo_info of the measure (version, name, modification_info, ...)
# as a string.
# NOTE(review): the saved output shows version 0 although the model is
# retrained earlier in the notebook; the execution counts suggest this cell
# was run before the retraining cells — re-run the notebook top to bottom.
str(m.repo_info)

"{'version': 0, 'name': 'DecisionTreeRegressor/measure/training_data/r2', 'history': None, 'classname': 'repo.repo_objects.Measure', 'modification_info': {'DecisionTreeRegressor/eval/training_data': 0, 'DecisionTreeRegressor/model': 0, 'DecisionTreeRegressor/model_param': 0, 'training_data': 0, 'DecisionTreeRegressor': 0}, 'description': None, 'category': 'measure', 'big_objects': [], 'commit_message': 'computing  measure r2 on data training_data'}"

In [22]:
# NOTE(review): the saved output below is stale. It shows the result of the
# consistency check from before retraining, while the check after retraining
# printed {}. The execution counts indicate this cell was run out of order;
# on a fresh top-to-bottom run it would presumably print {}.
print(results)

{'DecisionTreeRegressor/model_param': {'modifier version': 0, 'latest version': 1}}


In [23]:
# Fetch the object stored under 'boston_housing' from the repository.
# NOTE(review): this rebinds `data`, which until here held the pandas
# DataFrame read from housing.csv. Re-running the earlier add_data cell after
# this one would pass the repository object instead of the DataFrame —
# consider using a distinct name.
data = ml_repo._get('boston_housing')

In [24]:
# Print the repo_info of the fetched object; print() already converts its
# argument with str().
print(data.repo_info)

{'version': 0, 'name': 'boston_housing', 'history': None, 'classname': 'repo.repo_objects.RawData', 'modification_info': {}, 'description': None, 'category': <MLObjectType.RAW_DATA: 'raw_data'>, 'big_objects': ['x_data', 'y_data'], 'commit_message': 'data boston_housing added to repository'}
