In [1]:
import pandas as pd
import repo.repo as repo
import repo.memory_handler as memory_handler
from repo.repo_objects import RepoInfoKey, MeasureConfiguration
from job_runner.job_runner import SimpleJobRunner, JobState
import logging as logging

# Verbose alternative for debugging (prefixes each record with file name, line number and function name):
#FORMAT = "[%(filename)s:%(lineno)s - %(funcName)20s() ] %(message)s"
#logging.basicConfig(format=FORMAT, level=logging.DEBUG)
# Active setting: only records of level ERROR and above are emitted.
logging.basicConfig(level=logging.ERROR)

# Read housing data

In [2]:
# Load the housing table from the working directory; the columns RM, LSTAT, PTRATIO
# and MEDV are the ones handed to the repository further below.
data = pd.read_csv('housing.csv')
#data.head()

# Create a new repository

In [3]:
# setting up the repository
# Two storage objects from memory_handler: one for repo objects, one for numpy data
# (judging by the class names, both keep everything in memory).
handler = memory_handler.RepoObjectMemoryStorage()
numpy_handler = memory_handler.NumpyMemoryStorage()
# The runner is created with None first because the repository takes the runner
# as a constructor argument; set_repo() below closes the circle.
job_runner = SimpleJobRunner(None)
# `handler` is passed as both the 2nd and the 4th argument; the parameter meanings are not visible from here.
ml_repo = repo.MLRepo('test_user', handler, numpy_handler, handler, job_runner)
job_runner.set_repo(ml_repo)
# NOTE(review): writes a private attribute with the runner already passed to the
# constructor above; presumably redundant, confirm against MLRepo.__init__.
ml_repo._job_runner = job_runner

## Adding data
The data in the repository is handled by two different data objects:
- RawData is the object containing real data.
- DataSet is the object containing the logical data, i.e. a reference to a RawData object together with a specification of which data from the RawData will be used.

Normally one adds RawData first and then defines DataSets which are used to train or test a model, which is exactly the approach shown in the following.

In [4]:
# Add RawData. A convenient way to add RawData is simply to use the method add_data.
# This method just takes a pandas dataframe and the specification of which columns belong to the inputs
# and which to the targets.
ml_repo.add_data('boston_housing', data, input_variables=['RM', 'LSTAT', 'PTRATIO'], target_variables = ['MEDV'])

In [5]:
# create DataSet objects for training and test data
# Both reference the RawData 'boston_housing' by name; the two integers are the
# start and end index into that raw data.
# NOTE(review): training ends at 300 and test starts at 301. If end_index is
# exclusive, row 300 belongs to neither set; confirm DataSet's index convention
# (same question for what an end index of -1 includes).
training_data = repo.DataSet('boston_housing', 0, 300, 
                            repo_info = {RepoInfoKey.NAME.value: 'training_data', RepoInfoKey.CATEGORY.value: repo.MLObjectType.TRAINING_DATA})
test_data = repo.DataSet('boston_housing', 301, -1, 
                            repo_info = {RepoInfoKey.NAME.value: 'test_data',  RepoInfoKey.CATEGORY.value: repo.MLObjectType.TEST_DATA})
# add the objects to the repository
# (a single commit; the displayed return value maps each touched object name to a version number)
ml_repo.add([training_data, test_data], message = 'add training and test data')

{'repo_mapping': 1, 'test_data': 0, 'training_data': 0}

## Adding sklearn model
In the following we want to use models from the sklearn package. For this purpose there is a simple module (externals.sklearn_interface) that interfaces 
the sklearn package so that it can be used within the repository. This interface provides a simple method (add_model) to add an arbitrary sklearn model as a model which can be handled by the repository. This method adds a bunch of repo objects to the repository:
- An object defining the function to be called to evaluate the model
- An object defining the function to be called to train the model
- An object defining the model
- An object defining the model parameter

In [6]:
import externals.sklearn_interface as sklearn_interface
from sklearn.tree import DecisionTreeRegressor
# Register a decision tree regressor with the repository. model_param holds the
# sklearn parameters (here max_depth=5); presumably they are applied to the
# estimator at training time, confirm in sklearn_interface.
# The resulting repo objects are named after the estimator class, e.g.
# 'DecisionTreeRegressor' and 'DecisionTreeRegressor/model_param'.
sklearn_interface.add_model(ml_repo, DecisionTreeRegressor(), model_param={'max_depth': 5})

## Train the model

In [7]:
# Start the training job. The returned job id is stored but never read again
# in this notebook (the next cell overwrites job_id).
job_id = ml_repo.run_training()  

## Run evaluation
To measure errors and to provide plots the model must be evaluated on all test and training datasets.

In [8]:
# Evaluate the trained model on the datasets in the repository; job_id is
# overwritten here and not used afterwards.
job_id = ml_repo.run_evaluation()

## Add and compute measures

In [9]:
# Register the measures to compute: MAX and R2. Their results show up later
# under names ending in '/max' and '/r2'.
ml_repo.add_measure(MeasureConfiguration.MAX)
ml_repo.add_measure(MeasureConfiguration.R2)

In [10]:
# Compute the registered measures. The return value is a collection of job ids
# (a later call in this notebook displays it as a list of UUIDs).
job_ids = ml_repo.run_measures()

In [11]:
# Print the value of the 'max' measure, first for the training data and then
# for the test data. max_measure ends up bound to the test-data measure.
for measure_name in ('DecisionTreeRegressor/measure/training_data/max',
                     'DecisionTreeRegressor/measure/test_data/max'):
    max_measure = ml_repo._get(measure_name)
    print(max_measure.value)


223200.0
387530.769231


# Working with the repository

In [12]:
# List every object name stored in the repository next to its category,
# walking through all members of MLObjectType.
for object_type in repo.MLObjectType:
    category = object_type.value
    for object_name in ml_repo.get_names(category):
        print(object_name, category, sep='\t  ')

DecisionTreeRegressor/eval/test_data	  eval_data
DecisionTreeRegressor/eval/training_data	  eval_data
boston_housing	  raw_data
training_data	  training_data
test_data	  test_data
DecisionTreeRegressor/model_param	  model_param
train_sklearn	  training_function
eval_sklearn	  model_eval_function
DecisionTreeRegressor	  model
DecisionTreeRegressor/model	  calib_model
CommitInfo	  commit_info
repo_mapping	  mapping
DecisionTreeRegressor/measure/test_data/max	  measure
DecisionTreeRegressor/measure/test_data/r2	  measure
DecisionTreeRegressor/measure/training_data/max	  measure
DecisionTreeRegressor/measure/training_data/r2	  measure
measure_config	  measure_config


In [13]:
# Print the commit history of the repository, one commit per line.
for commit in ml_repo.get_commits():
    print(commit)

time: 2018-10-07 10:18:58.659458, author: test_user, message: data boston_housing added to repository, objects: {'boston_housing': 0, 'repo_mapping': 0}
time: 2018-10-07 10:18:58.699408, author: test_user, message: add training and test data, objects: {'training_data': 0, 'test_data': 0, 'repo_mapping': 1}
time: 2018-10-07 10:19:01.430049, author: test_user, message: add model evaluation function eval_sklearn, objects: {'eval_sklearn': 0, 'repo_mapping': 2}
time: 2018-10-07 10:19:01.430049, author: test_user, message: add model training function train_sklearn, objects: {'train_sklearn': 0, 'repo_mapping': 3}
time: 2018-10-07 10:19:01.430049, author: test_user, message: adding model and training parameter, objects: {'DecisionTreeRegressor/model_param': 0, 'repo_mapping': 4}
time: 2018-10-07 10:19:01.430049, author: test_user, message: add model DecisionTreeRegressor, objects: {'DecisionTreeRegressor': 0, 'repo_mapping': 5}
time: 2018-10-07 10:19:01.445833, author: test_user, message: tr

In [14]:
# Dump the job runner's bookkeeping: one line per job id followed by its info.
# NOTE: this reads the private attribute _job_info; the original cell hinted at
# job_runner.get_info(<job id>) for looking up a single job.
for job_key, job_info in job_runner._job_info.items():
    print(job_key, job_info, sep=':  ')

a5788fdc-ca09-11e8-b29a-fc084a6691eb:  test_user, successfully_finished, started 2018-10-07 10:19:01.445833, finished 2018-10-07 10:19:01.445833
a57da4a4-ca09-11e8-a8cc-fc084a6691eb:  test_user, successfully_finished, started 2018-10-07 10:19:01.479133, finished 2018-10-07 10:19:01.479133
a57da4a5-ca09-11e8-9544-fc084a6691eb:  test_user, successfully_finished, started 2018-10-07 10:19:01.479133, finished 2018-10-07 10:19:01.479133
a58c45ae-ca09-11e8-a2db-fc084a6691eb:  test_user, successfully_finished, started 2018-10-07 10:19:01.575006, finished 2018-10-07 10:19:01.575006
a58c45af-ca09-11e8-80c4-fc084a6691eb:  test_user, successfully_finished, started 2018-10-07 10:19:01.575006, finished 2018-10-07 10:19:01.575006
a58c45b0-ca09-11e8-a1b9-fc084a6691eb:  test_user, successfully_finished, started 2018-10-07 10:19:01.575006, finished 2018-10-07 10:19:01.579030
a58ce2e4-ca09-11e8-a1b3-fc084a6691eb:  test_user, successfully_finished, started 2018-10-07 10:19:01.579030, finished 2018-10-07 1

## Change model parameter, check consistency and train

In [15]:
# Fetch the model parameter object, lower max_depth to 2 and add it back to the
# repository; the check in the next cell reports it as latest version 1.
param = ml_repo._get('DecisionTreeRegressor/model_param')
param.sklearn_params['max_depth'] = 2
version = ml_repo.add(param)

In [16]:
import repo.tools as tools
#depp = ml_repo._get('DecisionTreeRegressor/model_param')
# Consistency check for the model. Per the printed result, each entry names an
# object together with the version recorded as 'modifier version' and the
# object's 'latest version'.
results = tools.check_model(ml_repo, 'DecisionTreeRegressor')
print(results)

{'DecisionTreeRegressor/model_param': {'modifier version': 0, 'latest version': 1}}


In [17]:
# Retrain so the model picks up the changed parameter; the cell displays the returned job id.
ml_repo.run_training()

UUID('a5b78ca8-ca09-11e8-abc7-fc084a6691eb')

In [18]:
# Repeat the consistency check after retraining; `results` is overwritten
# (the printed dict is empty, i.e. nothing is reported any more).
results = tools.check_model(ml_repo, 'DecisionTreeRegressor')
print(results)

{}


In [19]:
# Refresh evaluations and measures for the retrained model. Only the return
# value of the last call (run_measures) is displayed by the cell.
ml_repo.run_evaluation()
ml_repo.run_measures()

[UUID('a5c59190-ca09-11e8-bbb9-fc084a6691eb'),
 UUID('a5c59191-ca09-11e8-8d5e-fc084a6691eb'),
 UUID('a5c62d98-ca09-11e8-b140-fc084a6691eb'),
 UUID('a5c62d99-ca09-11e8-9f42-fc084a6691eb')]

In [20]:
# Request the test-data R2 measure with a version range (0, 100); the result is
# iterable and the repo_info of each returned version is printed.
measure = ml_repo._get('DecisionTreeRegressor/measure/test_data/r2', version=(0, 100))
for measure_version in measure:
    print(measure_version.repo_info)

{'version': 0, 'name': 'DecisionTreeRegressor/measure/test_data/r2', 'history': None, 'classname': 'repo.repo_objects.Measure', 'modification_info': {'DecisionTreeRegressor/eval/test_data': 0, 'DecisionTreeRegressor/model': 0, 'DecisionTreeRegressor/model_param': 0, 'training_data': 0, 'DecisionTreeRegressor': 0, 'test_data': 0}, 'description': None, 'category': 'measure', 'big_objects': [], 'commit_message': 'computing  measure r2 on data test_data'}
{'version': 1, 'name': 'DecisionTreeRegressor/measure/test_data/r2', 'history': None, 'classname': 'repo.repo_objects.Measure', 'modification_info': {'DecisionTreeRegressor/eval/test_data': 1, 'DecisionTreeRegressor/model': 1, 'DecisionTreeRegressor/model_param': 1, 'training_data': 0, 'DecisionTreeRegressor': 0, 'test_data': 0}, 'description': None, 'category': 'measure', 'big_objects': [], 'commit_message': 'computing  measure r2 on data test_data'}


In [21]:
# Names of all measure configurations. The enum member itself is passed here,
# while the listing loop earlier in the notebook passes `.value`; the outputs
# show that get_names accepts both forms.
ml_repo.get_names(repo.MLObjectType.MEASURE_CONFIGURATION)

['measure_config']

In [22]:
# Fetch the training-data R2 measure without a version argument (the repo_info
# shown in the next cell reports version 1).
m = ml_repo._get('DecisionTreeRegressor/measure/training_data/r2')

In [23]:
# The cell value is the str() of repo_info, so it is displayed as a quoted string rather than printed.
str(m.repo_info)

"{'version': 1, 'name': 'DecisionTreeRegressor/measure/training_data/r2', 'history': None, 'classname': 'repo.repo_objects.Measure', 'modification_info': {'DecisionTreeRegressor/eval/training_data': 1, 'DecisionTreeRegressor/model': 1, 'DecisionTreeRegressor/model_param': 1, 'training_data': 0, 'DecisionTreeRegressor': 0}, 'description': None, 'category': 'measure', 'big_objects': [], 'commit_message': 'computing  measure r2 on data training_data'}"

In [24]:
# NOTE: `results` still holds the outcome of the most recent check_model call
# above; nothing is recomputed here.
print(results)

{}


In [25]:
# NOTE(review): this rebinds `data`, which until now held the DataFrame read
# from housing.csv, to the RawData repo object. Re-running the earlier add_data
# cell after this one would pass the wrong object; a distinct name would be safer.
data = ml_repo._get('boston_housing')

In [26]:
# Show the repository metadata attached to the RawData object fetched above.
print(data.repo_info)

{'version': 0, 'name': 'boston_housing', 'history': None, 'classname': 'repo.repo_objects.RawData', 'modification_info': {}, 'description': None, 'category': <MLObjectType.RAW_DATA: 'raw_data'>, 'big_objects': ['x_data', 'y_data'], 'commit_message': 'data boston_housing added to repository'}


## Append RawData

In [27]:
# Show the training DataSet (requested with full_object=False) and every test
# DataSet together with its version number.
train_data = ml_repo.get_training_data(full_object=False)
print(train_data.repo_info[RepoInfoKey.NAME], train_data, sep=': ')
# NOTE: from here on `test_data` is a list of object names, no longer the
# DataSet instance created near the top of the notebook.
test_data = ml_repo.get_names(repo.MLObjectType.TEST_DATA)
for test_name in test_data:
    test_set = ml_repo._get(test_name)
    print(test_set, 'Version:', test_set.repo_info[RepoInfoKey.VERSION])

training_data: {'raw_data': 'boston_housing', 'start_index': 0, 'end_index': 300, 'raw_data_version': 'last', 'x_coord_names': ['RM', 'LSTAT', 'PTRATIO'], 'y_coord_names': ['MEDV'], 'n_data': 489}
{'raw_data': 'boston_housing', 'start_index': 301, 'end_index': -1, 'raw_data_version': 'last', 'x_coord_names': ['RM', 'LSTAT', 'PTRATIO'], 'y_coord_names': ['MEDV'], 'n_data': 489} Version: 0


In [None]:
from numpy import array
# Append a single row to the raw data 'boston_housing': one x row with three
# values and one y row with one value (presumably in the column order given to
# add_data, i.e. RM, LSTAT, PTRATIO -> MEDV; confirm).
ml_repo.append_raw_data('boston_housing', x_data = array([[ 6.575, 4.98, 15.3]]), y_data =array([[504000.0]]))

In [None]:
# Print the same overview again after the append. `train_data` is the object
# fetched earlier (it is not re-read here); the test DataSets are fetched anew
# by name.
print(train_data.repo_info[RepoInfoKey.NAME], train_data, sep=': ')
for test_name in test_data:
    test_set = ml_repo._get(test_name)
    print(test_set, 'Version:', test_set.repo_info[RepoInfoKey.VERSION])

In [None]:
# Run the consistency check once more, now that the raw data has been appended to.
results = tools.check_model(ml_repo, 'DecisionTreeRegressor')
print(results)

# Repo-Analysis

In [None]:
import repo.plot as plot

In [None]:
# Build a history of model versions for the plots below: enlarge the training
# set twice by 50 rows and, for each size, train/evaluate/measure decision trees
# with max_depth 6..11.
# NOTE(review): end_index grows from 300 to 350 and 400 while 'test_data' starts
# at index 301 of the same raw data, so the enlarged training range overlaps the
# test range. Acceptable for a plotting demo, not for a real model assessment.
for j in range(2):
    training_data = ml_repo._get('training_data')
    training_data.end_index += 50
    ml_repo.add(training_data, message='add 50 datapoints to end_index')
    for i in range(6,12):
        param = ml_repo._get('DecisionTreeRegressor/model_param')
        param.sklearn_params['max_depth'] = i
        # Add the changed parameter exactly once. The original cell called
        # ml_repo.add(param) twice in a row, committing the same parameter
        # object a second time on every sweep step.
        version = ml_repo.add(param)
        ml_repo.run_training()
        ml_repo.run_evaluation()
        ml_repo.run_measures()
    

## Plotting
### Plot measures by parameter

In [None]:
# Disabled alternative: show the data behind the plot as a table via the helper module.
#import repo.plot_helper as plt_helper
#print(pd.DataFrame(
#plt_helper.get_measure_by_model_parameter(ml_repo, 'DecisionTreeRegressor/measure/test_data/r2', 'max_depth')
#))
# Plot the test-data R2 measure by the model parameter 'max_depth'.
plot.measure_by_model_parameter(ml_repo, 'DecisionTreeRegressor/measure/test_data/r2', 'max_depth')


### Plot histograms

In [None]:
# Histogram over the 'test_data' DataSet using PTRATIO as x coordinate; the
# trailing comment keeps an optional y_coordinate argument disabled.
plot.histogram(ml_repo, 'test_data', x_coordinate = 'PTRATIO') #, y_coordinate='MEDV')

In [None]:
# Request 'training_data' with a version range (0, 100) and print every
# returned DataSet followed by its version number.
depp = ml_repo._get('training_data', version=(0, 100))
for dataset_version in depp:
    print(dataset_version, dataset_version.repo_info[RepoInfoKey.VERSION], sep=', version: ')

In [None]:
# Print the whole collection of training_data versions fetched in the previous cell.
print(depp)

In [None]:
# NOTE(review): scratch cell. It only evaluates to the string 'None' and does not
# touch the repository; candidate for removal.
str(None)