In [None]:
# Import the necessary libraries

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
%matplotlib inline
import matplotlib.pyplot as plt  # Matlab-style plotting
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')
import warnings
def ignore_warn(*args, **kwargs):
    """Accept any arguments and do nothing; silent stand-in for the warning emitters."""
    return None


# Route both warning entry points of the `warnings` module to the no-op so that
# the noisy warnings (from sklearn and seaborn) are not shown in the notebook.
warnings.warn = warnings.warn_explicit = ignore_warn

import pickle
from scipy import stats
from scipy.stats import norm, skew #for some statistics

# Limit floats in pandas output to 3 decimal places.
pd.set_option('display.float_format', '{:.3f}'.format)

from keras.callbacks import ModelCheckpoint, History, EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from keras.metrics import RootMeanSquaredError, MeanSquaredError
from matplotlib import pyplot as plt
import keras_tuner as kt

# Daten aus vorigem Schritt laden

In [None]:
def _load_pickle(path):
    """Return the object unpickled from the binary file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Feature matrix and target of the training data, plus the test feature matrix.
X_preprocessed_train = _load_pickle('../data/house-prices-advanced-regression-techniques/x_preprocessed_train.pkl')
y_preprocessed_train = _load_pickle('../data/house-prices-advanced-regression-techniques/y_train.pkl')
X_preprocessed_test = _load_pickle('../data/house-prices-advanced-regression-techniques/x_test.pkl')


# Train Test Split

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
# random_state is a seed, so that the same pseudo-random sequence is used on every run.
seed = 4354

# Hold out 20 % of the loaded training data as a local test set.
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed_train,
    y_preprocessed_train,
    test_size=0.2,
    random_state=seed,
)


In [None]:
from keras_tuner import HyperParameters as hp

In [None]:
# see https://www.tensorflow.org/tutorials/keras/keras_tuner
def model_builder(hp):
    """Build and compile the tunable dense regression network.

    Args:
        hp: Hyperparameter container supplied by the tuner. It is queried via
            ``hp.Choice`` for the width of each of the four dense layers
            (``units1`` .. ``units4``) and for the Adam ``learning_rate``.

    Returns:
        A compiled ``Sequential`` model with four ReLU dense layers followed by
        a single linear output unit, using mean squared error as the loss and
        as the tracked metric.
    """
    model = Sequential()

    # Input layer: the number of input features is read from the global X_train.
    model.add(
        Dense(
            hp.Choice('units1', [32, 64, 128, 256]),
            kernel_initializer='normal',
            input_dim=X_train.shape[1],
            activation='relu',
        )
    )

    # Hidden layers: three layers whose widths are tuned independently.
    for units_name in ('units2', 'units3', 'units4'):
        model.add(
            Dense(
                hp.Choice(units_name, [32, 64, 128, 256, 1024]),
                kernel_initializer='normal',
                activation='relu',
            )
        )

    # Output layer: one linear unit.
    model.add(Dense(1, kernel_initializer='normal', activation='linear'))

    # Learning rate for the optimizer: one of 0.01, 0.001, or 0.0001.
    optimizer = Adam(learning_rate=hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4]))

    model.compile(
        loss='mean_squared_error',
        optimizer=optimizer,
        metrics=[MeanSquaredError()],
    )
    return model

In [None]:
# Hyperband search over the hyperparameters declared in model_builder.
# The objective is the validation loss (the MSE on the data held out via
# validation_split during the search), so trials are ranked by how well they
# generalize instead of by how closely they fit the training data. This is also
# the quantity the EarlyStopping callback monitors.
# NOTE(review): trial results already stored under tuner_dir by a run with a
# different objective may be reloaded - presumably that directory should be
# cleared before searching again; confirm.
tuner = kt.Hyperband(model_builder,
                     objective=kt.Objective('val_loss', direction='min'),
                     max_epochs=20,
                     factor=3,
                     directory='tuner_dir',
                     project_name='tune deep house prices')

In [None]:
# Early-stopping callback for the search: watches val_loss with a patience of 5 epochs.
stop_early = EarlyStopping(
    monitor='val_loss',
    patience=5,
)

In [None]:
# Run the hyperparameter search, holding out 20 % of the training data for validation.
# NOTE(review): Hyperband appears to set the per-trial epoch count itself
# (bounded by max_epochs), so epochs=50 likely has no effect here - confirm.
tuner.search(
    X_train,
    y_train,
    epochs=50,
    validation_split=0.2,
    callbacks=[stop_early],
)

In [None]:
# Get the optimal hyperparameters: ask the tuner for its single best
# hyperparameter set and take that one entry.
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

# Report the selected value of each hyperparameter declared in model_builder
# (units1..units4 and learning_rate).
print(f"""
The hyperparameter search is complete.
The optimal number of units in the first densely-connected layer is {best_hps.get('units1')}.
The optimal number of units in the second densely-connected layer is {best_hps.get('units2')}.
The optimal number of units in the third densely-connected layer is {best_hps.get('units3')}.
The optimal number of units in the fourth densely-connected layer is {best_hps.get('units4')}.
The optimal learning_rate for Adam is {best_hps.get('learning_rate')}.

""")

In [None]:
# Build the model with the optimal hyperparameters.
# Nothing is trained here; training is done by the separate model.fit call.
model = tuner.hypermodel.build(best_hps)

In [None]:
# Alternative per-epoch file name pattern (disabled):
#   'Weights-{epoch:03d}--{val_loss:.5f}.hdf5'
checkpoint_name = 'model.weights.best.hdf5'

# Checkpoint callback: monitors val_loss and only saves when it is the best so far.
checkpoint = ModelCheckpoint(
    checkpoint_name,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='auto',
)

# Explicit History callback; later cells read the recorded curves from it.
history = History()

callbacks_list = [checkpoint, history]

In [None]:
# Train the tuned model: 500 epochs, batches of 32, 20 % of the training data
# held out for validation, with the checkpoint and history callbacks attached.
result = model.fit(
    X_train,
    y_train,
    epochs=500,
    batch_size=32,
    validation_split=0.2,
    callbacks=callbacks_list,
)


In [None]:
# Display the training mean_squared_error values recorded by the History
# callback that was passed to model.fit.
history.history['mean_squared_error']

In [None]:
# RMSE curves: square root of the MSE metric recorded for training and validation.
root_mean_squared_error = np.sqrt(result.history['mean_squared_error'])
val_root_mean_squared_error = np.sqrt(result.history['val_mean_squared_error'])

# Locate the epoch with the lowest validation RMSE in a single NaN-aware pass.
# Epochs whose validation RMSE is NaN are skipped instead of breaking the lookup;
# np.nanargmin raises ValueError only if every epoch is NaN.
best_epoch_index = np.nanargmin(val_root_mean_squared_error)
best_epoch = int(best_epoch_index) + 1  # epochs are reported 1-based
print('Best epoch: %d' % (best_epoch,))
print (f'Best RMSE: {val_root_mean_squared_error[best_epoch_index]}')

In [None]:
# Left axis: training and validation loss (dashed lines).
fig, ax1 = plt.subplots()
ax1.plot(history.history['loss'], color='red', linestyle='--')
ax1.plot(history.history['val_loss'], color='green', linestyle='--')
ax1.set_title('model performance')
ax1.set_ylabel('loss')
ax1.set_xlabel('epoch')
# Only the two loss curves live on this axis, so only they are labelled here;
# the RMSE curves get their own legend on the secondary axis.
ax1.legend(['train loss', 'val loss'], loc='upper left')

# Right axis (shared x): training and validation RMSE.
ax2 = ax1.twinx()
ax2.plot(root_mean_squared_error)
ax2.plot(val_root_mean_squared_error)
ax2.set_ylabel('RMSE')
ax2.legend(['train RMSE', 'val RMSE'], loc='upper right');  # trailing ';' hides the Legend repr

In [None]:
# Load the checkpointed weights. The checkpoint callback monitors val_loss with
# save_best_only=True, so this file holds the best validation loss (not accuracy).
model.load_weights('model.weights.best.hdf5')

In [None]:
# Evaluate the model on the held-out test split.
score = model.evaluate(X_test, y_test, verbose=0)

# Entry 1 of the result is treated as the MSE metric; its square root is the test RMSE.
test_rmse = np.sqrt(score[1])
print('\n', 'Test RMSE:', test_rmse)

In [None]:
# Display the model's metric names.
# NOTE(review): presumably used to check which position of `score` holds the MSE - confirm.
model.metrics_names 

In [None]:
# Predict on the held-out split, then collapse the result to a 1-D array
# with one entry per predicted row.
raw_predictions = model.predict(X_test)
n_predictions = raw_predictions.shape[0]
y_hat = raw_predictions.reshape(n_predictions)

In [None]:
# Pick up to 15 distinct row positions of the test split for a spot check.
# A generator seeded with the notebook's `seed` makes the selection reproducible
# across runs, and capping the sample size avoids a ValueError when the split
# has fewer than 15 rows.
n_test_rows = X_test.shape[0]
rand_vals = np.random.default_rng(seed).choice(n_test_rows, size=min(15, n_test_rows), replace=False)
rand_vals

In [None]:
# Print prediction vs. true target for each sampled row position.
# The targets are converted to a NumPy array once, outside the loop, and the
# unused enumerate counter is gone.
y_true_values = y_test.to_numpy()
for index in rand_vals:
    predict_index = y_hat[index]
    true_index = y_true_values[index]
    print (f'{index}: predict={predict_index} / true={true_index}')

In [None]:
# Plot the per-row difference between the true targets and the predictions.
fig, ax1 = plt.subplots()
ax1.set_title('Prediction Delta')
ax1.set_ylabel('Delta')
ax1.set_xlabel('Index')
prediction_delta = y_test.to_numpy() - y_hat
ax1.plot(prediction_delta, color='red', linestyle='-')

# Weitere Optimierungsmöglichkeiten
## GPU Support

https://lifewithdata.com/2022/01/16/how-to-install-tensorflow-and-keras-with-gpu-support-on-windows/

## Anregungen zur Hyperparameter-Suche

https://machinelearningmastery.com/grid-search-hyperparameters-deep-learning-models-python-keras/

## Keras Tuner Dokumentation

https://keras.io/api/keras_tuner/