In [0]:
# Ask the user to upload files into the Colab runtime and keep the result in
# `uploaded`. Presumably this is how the .npy datasets loaded by the training
# cell further down get into the working directory — TODO confirm.
from google.colab import files
uploaded = files.upload()

In [0]:
from pymongo import MongoClient
import numpy as np
import pickle
import math 
import datetime

class DataProvider:
    """Reads example documents and converts them into numeric model rows.

    An example is a mapping with an ``'input'`` and an ``'output'`` sub-mapping
    (see the accessor methods below for the exact keys that are read). The
    class can stream examples from a local MongoDB, encode them as flat
    feature/target lists, persist arrays with NumPy and split them into
    train/test partitions.
    """

    # Number of values yielded per example by
    # get_example_input_array_without_occupation:
    #   26 one-hot flags (4 + 9 + 3 + 10 options listed below)
    #   + 4 scalar inputs + 1 smoker flag + 2 * 5 datetime attributes.
    NO_OCCUPATION_INPUT_ROW_LENGTH = 41

    # Accepted values of example['input']['familyStatus'] (one-hot order).
    FAMILY_STATUS_OPTIONS = [
        'Keine Angabe',
        'Paar mit Kind(ern)',
        'Single mit Kind(ern)',
        'Keine Kinder',
    ]

    # Accepted values of example['input']['educationType'] (one-hot order).
    EDUCATION_TYPE_OPTIONS = [
        'Promotion',
        'Studium (Master oder Diplom)',
        'Studium (Bachelor)',
        'Meister',
        'Berufsausbildung',
        'Abitur',
        'Realschulabschluss',
        'Hauptschulabschluss',
        'Kein Abschluss',
    ]

    # Accepted values of example['input']['jobSituation'] (one-hot order).
    JOB_SITUATION_OPTIONS = [
        'Angestellt/Selbstständig',
        'Beamter',
        'Sonstiges',
    ]

    # Accepted values of example['input']['industry'] (one-hot order).
    INDUSTRY_OPTIONS = [
        'Elektro',
        'Gesundheitswesen',
        'Holz',
        'IT',
        'Kunststoff',
        'Metall',
        'Stahl',
        'Textil',
        'Sonstige Branche',
        'Sonstige',
    ]

    def load_from_db(self):
        """Yield every document of collection ``set_1151`` in database ``bu-config``.

        Connects to MongoDB on ``localhost:27017``. The total document count is
        printed up front and a percentage is printed after every 1000 yielded
        documents. The connection stays open until the generator is exhausted
        or closed.
        """
        with MongoClient('localhost', 27017) as client:
            db = client['bu-config']
            examples_collection = db['set_1151']
            # Cursor.count() was deprecated in PyMongo 3.7 and removed in 4.0;
            # count_documents({}) is the supported way to count all documents.
            count = examples_collection.count_documents({})
            print('Reading from database ', count)
            for progress, example in enumerate(examples_collection.find(), start=1):
                yield example
                if progress % 1000 == 0:
                    print('Progress', math.floor(1.0 * progress / count * 100.0))

    def save_to_file(self, file_name, array):
        """Persist ``array`` to ``file_name`` using ``numpy.save``."""
        np.save(file_name, array)

    def load_from_file(self, file_name):
        """Return the array stored in ``file_name`` (read with ``numpy.load``)."""
        return np.load(file_name)

    def split_examples(self, inputs, outputs, train_fraction=0.8):
        """Shuffle inputs/outputs with one shared permutation and split them.

        Args:
            inputs: Sequence or array of input rows.
            outputs: Sequence or array of output rows, same length as ``inputs``.
            train_fraction: Share of the examples placed in the training
                partition (default 0.8, the previously hard-coded value).

        Returns:
            Tuple ``(train_inputs, train_outputs, test_inputs, test_outputs)``
            of NumPy arrays; row ``i`` of an inputs array still corresponds to
            row ``i`` of the matching outputs array.

        Raises:
            AssertionError: If ``inputs`` and ``outputs`` differ in length.
            ValueError: If ``train_fraction`` is outside ``[0, 1]``.
        """
        assert len(inputs) == len(outputs)
        if not 0.0 <= train_fraction <= 1.0:
            raise ValueError('train_fraction must be between 0 and 1')

        # Permutation indexing only works on arrays; coerce so that the plain
        # lists returned by the read_examples_* helpers are accepted as well.
        inputs = np.asarray(inputs)
        outputs = np.asarray(outputs)

        p = np.random.permutation(len(inputs))
        shuffled_inputs = inputs[p]
        shuffled_outputs = outputs[p]
        train_len = int(len(inputs) * train_fraction)

        train_inputs = shuffled_inputs[:train_len]
        train_outputs = shuffled_outputs[:train_len]

        test_inputs = shuffled_inputs[train_len:]
        test_outputs = shuffled_outputs[train_len:]

        return train_inputs, train_outputs, test_inputs, test_outputs

    def read_examples_inputs_arrays_without_occupations(self, examples):
        """Return one input row (list) per example, without occupation features."""
        return [list(self.get_example_input_array_without_occupation(example))
                for example in examples]

    def read_examples_exists_outputs_arrays(self, examples):
        """Return ``[[exists], ...]`` — one single-element row per example."""
        return [list(self._get_example_exists_output_array(example))
                for example in examples]

    def read_examples_net_outputs_arrays(self, examples):
        """Return ``[[net], ...]`` — one single-element row per example."""
        return [list(self._get_example_net_output_array(example))
                for example in examples]

    def get_example_input_array_without_occupation(self, example):
        """Yield the flat feature values of ``example['input']``.

        Order: one-hot flags for familyStatus, educationType, jobSituation and
        industry; then benefitAgeLimit, benefitAmount, fractionOfficeWork,
        staffResponsibility; then smoker; then five datetime attributes each
        for birthday and insuranceStart. That is
        ``NO_OCCUPATION_INPUT_ROW_LENGTH`` values in total.
        """
        fields = example['input']

        yield from self._generate_one_hot(fields['familyStatus'], self.FAMILY_STATUS_OPTIONS)
        yield from self._generate_one_hot(fields['educationType'], self.EDUCATION_TYPE_OPTIONS)
        yield from self._generate_one_hot(fields['jobSituation'], self.JOB_SITUATION_OPTIONS)
        yield from self._generate_one_hot(fields['industry'], self.INDUSTRY_OPTIONS)

        yield fields['benefitAgeLimit']
        yield fields['benefitAmount']
        yield fields['fractionOfficeWork']
        yield fields['staffResponsibility']

        yield fields['smoker']

        yield from self._generate_datetime_attributes(fields['birthday'])
        yield from self._generate_datetime_attributes(fields['insuranceStart'])

    def _generate_one_hot(self, value, possible_values):
        """Yield one bool per entry of ``possible_values``: True where it equals ``value``.

        A ``value`` that is not among ``possible_values`` produces all False.
        """
        for possible_value in possible_values:
            yield possible_value == value

    def _generate_datetime_attributes(self, dt):
        """Yield year, month, day, day-of-year and days since 1970-01-01 of ``dt``.

        ``dt`` is subtracted from a naive ``datetime.datetime``, so it must be a
        naive ``datetime.datetime`` itself.
        """
        yield dt.year
        yield dt.month
        yield dt.day
        yield dt.timetuple().tm_yday
        yield (dt - datetime.datetime(1970, 1, 1)).days

    def _get_example_output_array(self, example):
        """Return ``[exists, net, gross]`` read from ``example['output']``."""
        return [
            example['output']['exists'],
            example['output']['net'],
            example['output']['gross']
        ]

    def _get_example_exists_output_array(self, example):
        """Return ``[exists]`` read from ``example['output']``."""
        return [
            example['output']['exists']
        ]

    def _get_example_net_output_array(self, example):
        """Return ``[net]`` read from ``example['output']``."""
        return [
            example['output']['net']
        ]


In [0]:
# First attempt at installing auto-sklearn and pyrfr with pip. The same two
# packages are installed again in a later cell, after swig has been added via
# conda — presumably because this first attempt cannot build them; TODO confirm.
!pip install auto-sklearn
!pip install pyrfr
# Download the Anaconda 5.1.0 installer (-c resumes a partial download), make it
# executable and run it with the prefix /usr/local.
# NOTE(review): -b / -f are expected to mean "batch mode, no prompts" and
# "do not fail if the prefix already exists" — confirm against the installer docs.
!wget -c https://repo.continuum.io/archive/Anaconda3-5.1.0-Linux-x86_64.sh
!chmod +x Anaconda3-5.1.0-Linux-x86_64.sh
!bash ./Anaconda3-5.1.0-Linux-x86_64.sh -b -f -p /usr/local


In [0]:
# Install swig through conda; --yes answers the confirmation prompt so the cell
# runs unattended. Presumably a build prerequisite for the pip installs in the
# next cell — TODO confirm.
!conda install swig --yes

In [0]:
# Install pyrfr and auto-sklearn (again, now that swig is available) plus
# tensorboardcolab. Versions are not pinned, so the result depends on whatever
# pip resolves at run time.
!pip install pyrfr
!pip install auto-sklearn
!pip install tensorboardcolab

In [0]:
import sys

# Make packages installed under the /usr/local prefix importable by this kernel.
# Guarded so that re-running the cell does not append the same entry repeatedly.
_site_packages = '/usr/local/lib/python3.6/site-packages/'
if _site_packages not in sys.path:
    sys.path.append(_site_packages)

In [0]:
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics
from joblib import dump
from google.colab import files

import autosklearn.regression

from keras.callbacks import TensorBoard
import time


# Load the pre-computed feature matrix and regression targets from the working
# directory.
data_provider = DataProvider()

X = data_provider.load_from_file('net_price_model_inputs.npy')
y = data_provider.load_from_file('net_price_model_outputs.npy')

# Per-column feature kinds for auto-sklearn. The layout mirrors
# DataProvider.get_example_input_array_without_occupation (presumably the
# function that produced X — TODO confirm): one-hot flags for the four option
# lists, 4 numeric inputs, the smoker flag, then 2 * 5 datetime attributes.
# The one-hot width is derived from the option lists instead of hard-coding 26,
# so it cannot silently drift when an option is added.
n_one_hot_columns = sum(len(options) for options in (
    DataProvider.FAMILY_STATUS_OPTIONS,
    DataProvider.EDUCATION_TYPE_OPTIONS,
    DataProvider.JOB_SITUATION_OPTIONS,
    DataProvider.INDUSTRY_OPTIONS,
))
feature_types = (
    ['categorical'] * n_one_hot_columns
    + ['numerical'] * 4
    + ['categorical']
    + ['numerical'] * 10
)
assert len(feature_types) == DataProvider.NO_OCCUPATION_INPUT_ROW_LENGTH, \
    'feature_types is out of sync with DataProvider.NO_OCCUPATION_INPUT_ROW_LENGTH'

# Fixed random_state keeps the train/test partition the same across runs.
X_train, X_test, y_train, y_test = \
    sklearn.model_selection.train_test_split(X, y, random_state=1)


# Search budget: 120 s overall, at most 30 s per candidate run.
automl = autosklearn.regression.AutoSklearnRegressor(
    time_left_for_this_task=120,
    per_run_time_limit=30,
    tmp_folder='tmp/autosklearn_regression_example_tmp3',
    output_folder='tmp/autosklearn_regression_example_out3',
)
automl.fit(X_train, y_train, dataset_name='set_1151',
           feat_type=feature_types)

# Report the fitted models and score them on the held-out partition.
print(automl.show_models())
predictions = automl.predict(X_test)
print("mean_absolute_error:", sklearn.metrics.mean_absolute_error(y_test, predictions))
print("median_absolute_error:", sklearn.metrics.median_absolute_error(y_test, predictions))
print("explained_variance_score:", sklearn.metrics.explained_variance_score(y_test, predictions))

# Serialize the fitted estimator and hand it to the browser as a download.
dump(automl, 'model.joblib')
files.download('model.joblib')


In [0]:
# Re-trigger the browser download of the model file written by the training
# cell above (that cell already calls files.download once). Presumably kept as
# a manual retry — TODO confirm or remove.
files.download('model.joblib')