In [0]:
# Colab-only cell: ask the user to upload files into the runtime.
# NOTE(review): presumably these are the .npy arrays that
# DataProvider.load_from_file() reads further down — confirm.
# `uploaded` is not referenced again in the cells visible here.
from google.colab import files
uploaded = files.upload()

In [0]:
from pymongo import MongoClient
import numpy as np
import pickle
import math 
import datetime

class DataProvider:
    """Reads example documents and converts them into model-ready rows.

    An example is a mapping with an ``'input'`` and an ``'output'``
    sub-mapping. Categorical inputs are one-hot encoded against the option
    lists below, scalar inputs are passed through unchanged, and the two
    datetime inputs are expanded into five numeric attributes each.
    """

    # Width of one row produced by _get_example_input_array_without_occupation:
    # 4 + 9 + 3 + 10 one-hot flags, 5 scalar fields and 2 x 5 date attributes.
    NO_OCCUPATION_INPUT_ROW_LENGTH = 41

    # File names shared by save_to_file() and load_from_file().
    INPUTS_FILE_NAME = 'net_price_model_inputs.npy'
    OUTPUTS_FILE_NAME = 'net_price_model_outputs.npy'

    FAMILY_STATUS_OPTIONS = [
        'Keine Angabe',
        'Paar mit Kind(ern)',
        'Single mit Kind(ern)',
        'Keine Kinder',
    ]

    EDUCATION_TYPE_OPTIONS = [
        'Promotion',
        'Studium (Master oder Diplom)',
        'Studium (Bachelor)',
        'Meister',
        'Berufsausbildung',
        'Abitur',
        'Realschulabschluss',
        'Hauptschulabschluss',
        'Kein Abschluss',
    ]

    JOB_SITUATION_OPTIONS = [
        'Angestellt/Selbstständig',
        'Beamter',
        'Sonstiges',
    ]

    INDUSTRY_OPTIONS = [
        'Elektro',
        'Gesundheitswesen',
        'Holz',
        'IT',
        'Kunststoff',
        'Metall',
        'Stahl',
        'Textil',
        'Sonstige Branche',
        'Sonstige',
    ]

    def load_from_db(self):
        """Yield every document of collection ``set_1151`` in ``bu-config``.

        Connects to MongoDB on ``localhost:27017``. This is a generator, so
        the client stays open until iteration finishes (or the generator is
        closed). The total and a percentage every 1000 documents are printed.
        """
        with MongoClient('localhost', 27017) as client:
            db = client['bu-config']
            examples_collection = db['set_1151']
            # Cursor.count() was deprecated in PyMongo 3.7 and removed in 4.0;
            # Collection.count_documents() is the supported replacement.
            count = examples_collection.count_documents({})
            print('Reading from database ', count)
            for progress, example in enumerate(examples_collection.find(), start=1):
                yield example
                # `count > 0` guards the division if documents appear after
                # the count was taken on an initially empty collection.
                if count > 0 and progress % 1000 == 0:
                    print('Progress', math.floor(1.0 * progress / count * 100.0))

    def save_to_file(self, input_np_array, output_np_array):
        """Write both arrays as ``.npy`` files into the working directory."""
        np.save(self.INPUTS_FILE_NAME, input_np_array)
        np.save(self.OUTPUTS_FILE_NAME, output_np_array)

    def load_from_file(self):
        """Return ``(inputs, outputs)`` loaded from the files save_to_file() writes."""
        input_np_array = np.load(self.INPUTS_FILE_NAME)
        output_np_array = np.load(self.OUTPUTS_FILE_NAME)
        return input_np_array, output_np_array

    def split_examples(self, inputs, outputs, train_fraction=0.8, seed=None):
        """Shuffle inputs/outputs with one permutation and split them in two.

        Args:
            inputs: Array-like of input rows.
            outputs: Array-like of output rows, same length as ``inputs``.
            train_fraction: Share of rows in the first (training) part;
                must lie in ``[0, 1]``. Defaults to the original 0.8.
            seed: Optional seed for a reproducible shuffle. With ``None``
                the global ``np.random`` state is used, as before.

        Returns:
            ``(train_inputs, train_outputs, test_inputs, test_outputs)``.

        Raises:
            AssertionError: If the two sequences differ in length.
            ValueError: If ``train_fraction`` is outside ``[0, 1]``.
        """
        assert len(inputs) == len(outputs)
        if not 0.0 <= train_fraction <= 1.0:
            raise ValueError('train_fraction must be between 0 and 1')

        # Index-array selection below needs ndarrays; plain lists would fail.
        inputs = np.asarray(inputs)
        outputs = np.asarray(outputs)

        if seed is None:
            p = np.random.permutation(len(inputs))
        else:
            p = np.random.RandomState(seed).permutation(len(inputs))

        # The same permutation is applied to both arrays so rows stay paired.
        shuffled_inputs = inputs[p]
        shuffled_outputs = outputs[p]
        train_len = int(len(inputs) * train_fraction)

        train_inputs = shuffled_inputs[0:train_len]
        train_outputs = shuffled_outputs[0:train_len]

        test_inputs = shuffled_inputs[train_len:]
        test_outputs = shuffled_outputs[train_len:]

        return train_inputs, train_outputs, test_inputs, test_outputs

    def read_examples_inputs_arrays_without_occupations(self, examples):
        """Return one input row (a list) per example."""
        return [
            list(self._get_example_input_array_without_occupation(example))
            for example in examples
        ]

    def read_examples_exists_outputs_arrays(self, examples):
        """Return ``[exists]`` for each example."""
        return [
            list(self._get_example_exists_output_array(example))
            for example in examples
        ]

    def read_examples_net_outputs_arrays(self, examples):
        """Return ``[net]`` for each example."""
        return [
            list(self._get_example_net_output_array(example))
            for example in examples
        ]

    def _get_example_input_array_without_occupation(self, example):
        """Yield the NO_OCCUPATION_INPUT_ROW_LENGTH values of one input row."""
        fields = example['input']

        yield from self._generate_one_hot(fields['familyStatus'], self.FAMILY_STATUS_OPTIONS)
        yield from self._generate_one_hot(fields['educationType'], self.EDUCATION_TYPE_OPTIONS)
        yield from self._generate_one_hot(fields['jobSituation'], self.JOB_SITUATION_OPTIONS)
        yield from self._generate_one_hot(fields['industry'], self.INDUSTRY_OPTIONS)

        yield fields['benefitAgeLimit']
        yield fields['benefitAmount']
        yield fields['fractionOfficeWork']
        yield fields['staffResponsibility']

        yield fields['smoker']

        yield from self._generate_datetime_attributes(fields['birthday'])
        yield from self._generate_datetime_attributes(fields['insuranceStart'])

    def _generate_one_hot(self, value, possible_values):
        """Yield one bool per option; all are False when ``value`` is not listed."""
        yield from (possible_value == value for possible_value in possible_values)

    def _generate_datetime_attributes(self, dt):
        """Yield year, month, day, day-of-year and whole days since 1970-01-01.

        NOTE(review): the subtraction uses a naive epoch, so ``dt`` is
        assumed to be a naive ``datetime.datetime`` — confirm with the data.
        """
        yield dt.year
        yield dt.month
        yield dt.day
        yield dt.timetuple().tm_yday
        yield (dt - datetime.datetime(1970, 1, 1)).days

    def _get_example_output_array(self, example):
        """Return ``[exists, net, gross]`` from the example's output."""
        return [
            example['output']['exists'],
            example['output']['net'],
            example['output']['gross']
        ]

    def _get_example_exists_output_array(self, example):
        """Return ``[exists]`` from the example's output."""
        return [
            example['output']['exists']
        ]

    def _get_example_net_output_array(self, example):
        """Return ``[net]`` from the example's output."""
        return [
            example['output']['net']
        ]


In [0]:
from keras.layers import Input, Dense, BatchNormalization, Activation
from keras.layers.advanced_activations import LeakyReLU
from keras.models import Model
from keras import optimizers
import keras.backend as K
from keras import regularizers

def max_abs_error(y_true, y_pred):
    """Keras metric: the largest absolute difference between target and prediction."""
    residual = y_true - y_pred
    absolute_residual = K.abs(residual)
    return K.max(absolute_residual)

def error_power_8(y_true, y_pred):
    """Keras loss/metric: mean of the prediction error raised to the 8th power."""
    residual = y_true - y_pred
    eighth_power = residual ** 8
    return K.mean(eighth_power)

class NoOccupationNetModelProvider:
    """Builds and holds the compiled Keras model for rows without occupation data.

    The compiled model is available as ``self.model`` right after construction.
    """

    def __init__(self):
        # Declared up front so the attribute exists before init_model() sets it.
        self.model = None
        self.init_model()

    def init_model(self):
        """Create and compile the network, storing it in ``self.model``.

        Architecture: input of DataProvider.NO_OCCUPATION_INPUT_ROW_LENGTH
        values -> batch normalisation -> 1000 sigmoid units (L2 kernel and L1
        activity regularisation, both 0.01) -> single unit -> LeakyReLU.
        Compiled with Adam (lr=0.01) and mean squared error loss; mean
        absolute error and max_abs_error are tracked as metrics.
        """
        features = Input((DataProvider.NO_OCCUPATION_INPUT_ROW_LENGTH,))

        normalized = BatchNormalization()(features)
        hidden = Dense(
            1000,
            activation='sigmoid',
            kernel_regularizer=regularizers.l2(0.01),
            activity_regularizer=regularizers.l1(0.01),
        )(normalized)

        linear_output = Dense(1)(hidden)
        prediction = LeakyReLU()(linear_output)

        self.model = Model(inputs=features, outputs=prediction)
        self.model.compile(
            optimizer=optimizers.Adam(lr=0.01),
            loss='mean_squared_error',
            metrics=['mean_absolute_error', max_abs_error],
        )



In [0]:
# Install the tensorboardcolab package imported by the training cell below.
# NOTE(review): the version is not pinned, so a re-run may pick up a different release.
!pip install tensorboardcolab

In [0]:
from tensorboardcolab import *
from keras.callbacks import TensorBoard
import time

# Training hyperparameters, named so they are easy to find and tune.
EPOCHS = 100000
BATCH_SIZE = 512
MODEL_FILE_NAME = 'model.h5'

# Load the cached arrays and split them into a training and a held-out part.
data_provider = DataProvider()
inputs, outputs = data_provider.load_from_file()
(train_inputs, train_outputs,
 test_inputs, test_outputs) = data_provider.split_examples(inputs, outputs)

model_provider = NoOccupationNetModelProvider()

# TensorBoardColab is created before fit() so the callback can log into it.
tbc = TensorBoardColab()
model_provider.model.fit(
    train_inputs,
    train_outputs,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=0,
    # The held-out part is used as validation data during training.
    validation_data=(test_inputs, test_outputs),
    callbacks=[TensorBoardColabCallback(tbc)],
)

# The model is only written once fit() has returned.
model_provider.model.save(MODEL_FILE_NAME)

In [0]:
# Colab-only cell: send the model file written by the training cell
# (model_provider.model.save('model.h5')) to the user's browser.
from google.colab import files
files.download('model.h5')

In [0]:
# Colab-only cell: download the TensorBoard logs of the training run.
import os
import shutil

from google.colab import files

for log_path in ('Graph/training', 'Graph/validation'):
    if os.path.isdir(log_path):
        # files.download() transfers a single file, so a log directory is
        # bundled into '<Graph_training|Graph_validation>.zip' first.
        # NOTE(review): assumes tensorboardcolab writes these paths as
        # directories — confirm against the runtime's file listing.
        archive_path = shutil.make_archive(log_path.replace('/', '_'), 'zip', log_path)
        files.download(archive_path)
    else:
        # Regular file (or missing path): keep the original direct download.
        files.download(log_path)