Install Keras and SciKeras (the scikit-learn wrapper for Keras) with:  
```!conda install keras ```  
```!pip install tensorflow==2.17.0```  
```!pip install scikeras```

In [1]:
import warnings

# Silence all warnings for the rest of the session.
warnings.filterwarnings("ignore")
# Explicit "ignore" filters for the UserWarning and FutureWarning categories.
warnings.simplefilter(action="ignore",category=UserWarning)
warnings.simplefilter(action="ignore",category=FutureWarning)

# Suppress valuewarning when fitting ARIMA model.
from statsmodels.tools.sm_exceptions import ValueWarning
warnings.simplefilter('ignore', ValueWarning)


# Interactive plots embedded within the notebook
#%matplotlib notebook 
# Static images of plots embedded within the notebook
# %matplotlib inline   
# Render inline figures in both PNG and high-resolution ('retina') formats.
%config InlineBackend.figure_formats = {'png', 'retina'}

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

from platform import python_version
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import statsmodels as stm
import sklearn 
import keras 
import scikeras
import tensorflow as tf

# Report the interpreter and library versions in use so that results can be
# reproduced. Labels use each project's official spelling.
print('Python version', python_version())
print('Numpy version', np.__version__)
print('Scipy version', sp.__version__)
print('Pandas version', pd.__version__)
print('Matplotlib version', mpl.__version__)
print('Seaborn version', sns.__version__)
print('Statsmodels version', stm.__version__)
print('TensorFlow version', tf.__version__)
print('Scikit-learn version', sklearn.__version__)
print('Keras version', keras.__version__)
print('SciKeras version', scikeras.__version__)

from matplotlib import rc
# Set the default matplotlib font: 'serif' family, with Helvetica listed as
# the preferred face for that family.
rc('font',**{'family':'serif','serif':['Helvetica']})

ModuleNotFoundError: No module named 'scikeras'

## Neural Network Regression

###  Explore the house sale dataset and prepare the input

We will use a house price dataset

In [None]:
# Import dataset
# Load the tab-separated house sales file into a DataFrame.
house_df = pd.read_csv('data/house_sales.csv', delimiter='\t')
# Preview the first rows, then list the columns with dtypes and non-null counts.
house_df.head()
house_df.info()

Select columns to be used 

In [None]:
# Numeric features, categorical features and the prediction target.
num_columns = [
    "SqFtTotLiving",
    "SqFtLot",
    "Bathrooms",
    "Bedrooms",
    "BldgGrade",
    "NbrLivingUnits",
    "SqFtFinBasement",
]
cat_columns = ['PropertyType', 'NewConstruction']
target_column = ['AdjSalePrice']

# Keep only the selected columns: numeric first, then categorical, then target.
house_df = house_df.loc[:, [*num_columns, *cat_columns, *target_column]]

Split train and test set

In [None]:
from sklearn.model_selection import train_test_split 

# Shuffle the rows and hold out 25% of them as the test set; the fixed
# random_state makes the split repeatable.
house_train, house_test = train_test_split(
    house_df,
    test_size=0.25,
    shuffle=True,
    random_state=0,
)

In [None]:
# Show summary statistics (pandas `describe` defaults) for the training split.
house_train.describe()

## Data preparation

We need to scale the inputs because gradient descent (GD) is highly sensitive to differences in feature scale.  
The output should also be scaled when the range of y runs into the hundreds or thousands, as    
a large magnitude will affect the convergence. 

### Transformation Pipeline

Define the components in the pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numeric branch: standardise the features. More steps can be added to the list.
num_pipeline = Pipeline(steps=[('std_scaler', StandardScaler())])

# Categorical branch: dense integer one-hot encoding that drops the first
# level of each feature and ignores categories not seen during fitting.
cat_encoder = OneHotEncoder(
    drop='first',
    handle_unknown='ignore',
    dtype=int,
    sparse_output=False,
)

# Apply each branch to its own columns; any other column is dropped.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_columns),
        ("cat", cat_encoder, cat_columns),
    ],
    remainder='drop',
)

Apply the pipeline to transform the features

In [None]:
# Fit the preprocessing on the training split and transform it, then wrap the
# result in a DataFrame labelled with the pipeline's output feature names.
X_transformed = full_pipeline.fit_transform(house_train)
feature_names = full_pipeline.get_feature_names_out()
X_train = pd.DataFrame(data=X_transformed, columns=feature_names)

# The target is the natural log of the sale price; its index is reset to a
# fresh 0..n-1 range.
log_price = np.log(house_train[target_column])
y_train = log_price.reset_index(drop=True)

In [None]:
# Preview the first rows of the transformed features and of the log-price target.
X_train.head()
y_train.head()

Look at shape and dimension of data

In [None]:
# Report the shape and the number of dimensions of the training features and target.
for _label, _frame in (('Feature', X_train), ('Target', y_train)):
    print(f'Train set {_label} shape: {_frame.shape}, Dim: {_frame.ndim}')

## Model building

### Create a feed-forward neural network 

Network with one input layer, two hidden layers and one output layer <br>
https://keras.io/layers/core/

- The hidden layers respectively have 16 and 4 nodes, all with tanh activation. 
  We will call it **tanh(16,4)** hidden layer configuration.
- The output layer has 1 node with ReLU activation.   

The warnings will be resolved by updating tf1.14 to 1.5

In [None]:
from keras.layers import Dense, Input
from keras.models import Sequential

# tanh(16,4) feed-forward network with a single ReLU output node.
# The input shape is declared with an explicit Input layer: Keras 3 warns
# when `input_shape` is passed to a layer inside a Sequential model.
network = Sequential([
    Input(shape=X_train.shape[1:]),
    Dense(16, activation='tanh'),
    Dense(4, activation='tanh'),
    Dense(1, activation='relu')
])

<font color='blue'>**Configure the model for training and check its configuration**</font>
- Optimization algorithm
- Loss function to optimize
- Metric to visualize the training performance  

### Train model

Fit the model and let it validate on the test data in each training epoch. <br>
`fit()` returns a `History` object; its `history` attribute is a dictionary containing the metrics monitored during training and validation. <br>

<font color='blue'>Train the model using `.fit()`</font>

### Save pipeline and trained model for later use.

We can use the `pickle` package to save the transformation pipeline and the fitted model, which can later be retrieved.  
The model weights can also be saved to and loaded from a file by using `save_weights('file.weights.h5')` and `load_weights('file.weights.h5')` (Keras 3 requires the `.weights.h5` suffix).

In [None]:
import pickle

# Persist the fitted preprocessing pipeline and the network.
# The `with` blocks make sure each file is flushed and closed as soon as the
# dump finishes, instead of leaving the handle open until garbage collection.
with open("house_pipeline.pickle", "wb") as pipeline_file:
    pickle.dump(full_pipeline, pipeline_file)
with open("house_nn_model.pickle", "wb") as model_file:
    pickle.dump(network, model_file)

### Evaluate the train loss and train metric

In [None]:
# Evaluate on the training data. The result is unpacked into two values,
# which assumes the model was compiled with exactly one metric -- confirm
# against the compile step.
train_scores = network.evaluate(X_train, y_train)
train_loss, train_metric = train_scores
print(f'Train Loss: {train_loss}\nTrain Metric: {train_metric}') 

## Learning curve

Let's take the metrics recorded in history for the training data and validation data. <br>
In the case of neural networks, the loss is usually negative log-likelihood and residual sum of squares <br>
for classification and regression respectively.  

Plot the training and validation losses over epochs.

In [None]:
# Tabulate the recorded metrics (assumes `history` is the object returned by
# the earlier `fit()` call).
history_df = pd.DataFrame(history.history)

# Training loss as blue dotted squares, validation loss as red solid circles.
loss_ax = history_df[['loss','val_loss']].plot(
    style=['bs:','ro-'],
    ms=3,
    lw=1.5,
    alpha=0.5,
    figsize=(6, 4),
)
loss_ax.grid(True);
loss_ax.set_ylim(0, 1) 
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training and Validation Loss')
plt.tight_layout();

## Performance evaluation

### Training performance

In [None]:
from sklearn.metrics import mean_absolute_percentage_error as MAPE
from sklearn.metrics import mean_squared_error as MSE

# Predictions are on the log-price scale, matching y_train.
y_pred = network.predict(X_train)

# scikit-learn returns MAPE as a fraction (0.05 means 5%), so it is formatted
# with the '%' presentation type, which multiplies by 100 and appends the sign.
# MSE uses an explicit precision; the previous ':2f' only set a minimum width.
print(f'MSE on train data: {MSE(y_train,y_pred):.4f}')
print(f'MAPE on train data: {MAPE(y_train,y_pred):.3%}\n')
# Same metrics after mapping targets and predictions back with exp().
print(f'MSE on train data (unscaled): {MSE(np.exp(y_train),np.exp(y_pred)):.2f}')
print(f'MAPE on train data (unscaled): {MAPE(np.exp(y_train),np.exp(y_pred)):.3%}')

### Test performance

Load the saved transformation pipeline and model

In [None]:
import pickle

# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# you created yourself or otherwise trust.
# The `with` blocks close each file as soon as it has been read.
with open("house_pipeline.pickle", "rb") as pipeline_file:
    loaded_pipeline = pickle.load(pipeline_file)
with open("house_nn_model.pickle", "rb") as model_file:
    loaded_model = pickle.load(model_file)

# Only transform() here: the test data must be processed with the loaded,
# already-fitted pipeline rather than refitted.
X_test_transformed = loaded_pipeline.transform(house_test)
# Log-transform the test target, the same transform applied to the training target.
y_test = np.log(house_test[target_column])

Apply the model to test data and determine the test performance.

In [None]:
from sklearn.metrics import mean_absolute_percentage_error as MAPE
from sklearn.metrics import mean_squared_error as MSE

# Predictions are on the log-price scale, matching y_test.
y_test_pred = loaded_model.predict(X_test_transformed)

# scikit-learn returns MAPE as a fraction (0.05 means 5%), so it is formatted
# with the '%' presentation type, which multiplies by 100 and appends the sign.
# MSE uses an explicit precision; the previous ':2f' only set a minimum width.
print(f'MSE on test data: {MSE(y_test, y_test_pred):.4f}')
print(f'MAPE on test data: {MAPE(y_test, y_test_pred):.3%}\n')

# Same metrics after mapping targets and predictions back with exp().
# These are test-set figures, so the labels say "test", not "train".
print(f'MSE on test data (unscaled): {MSE(np.exp(y_test),np.exp(y_test_pred)):.2f}')
print(f'MAPE on test data (unscaled): {MAPE(np.exp(y_test),np.exp(y_test_pred)):.3%}')

## Weight Regularization and Dropout

A dropout layer randomly switches off neurons with some probability at each training step, so that their weights are not updated in that step.  
Weight regularization pushes redundant weights towards zero by adding a penalty on the weights to the loss function.

Example: Consider a network with two hidden layers. To add dropout after each hidden layer and L2 weight regularization to the first hidden layer:
```
from keras.layers import Dense
from keras.models import Sequential
from keras.layers import Dropout
from keras import regularizers

# Two hidden layers (16 tanh units, then 8 ReLU units), each followed by a
# Dropout layer with rate 0.2. Only the first hidden layer carries an L2
# penalty (factor 0.02) on its kernel weights.
reg_network = Sequential([
    Dense(16, activation='tanh', input_shape=(X_train.shape[1],), kernel_regularizer=regularizers.l2(0.02)),
    Dropout(rate=0.2),
    Dense(8, activation='relu'),
    Dropout(rate=0.2),
    Dense(1, activation='relu')
])

# NOTE(review): `wape` is not defined in this example -- presumably a custom
# metric function defined elsewhere; confirm before running.
reg_network.compile(loss="mean_squared_error", optimizer="adam", metrics=[wape])
reg_network.summary()
```

## Hyperparameter Tuning

### Define the hyperparameter range

In [None]:
# Candidate values for each hyperparameter explored by the search.
HIDDENLAYER1 = [8, 12]
HIDDENLAYER2 = [4, 8]
LEARN_RATE = [1e-2, 1e-3, 1e-4]
DROPOUT_RATE = [0.2, 0.3, 0.4]
BATCH_SIZE = [8, 16, 32]
EPOCHS = [100, 200]


# Search space keyed by estimator parameter name.
# - hiddenLayerOne / hiddenLayerTwo / dropout / learnRate are arguments of the
#   model-building function.
# - batch_size / epochs are parameters of the SciKeras wrapper itself and are
#   passed on to fit(). The 'optimizer__' prefix would route them to the
#   optimizer instead, so they would never reach training.
grid_params = dict(
    hiddenLayerOne=HIDDENLAYER1,
    hiddenLayerTwo=HIDDENLAYER2,
    dropout=DROPOUT_RATE,
    learnRate=LEARN_RATE,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS
)

### Define the baseline model

In [None]:
from keras.layers import Dense, Dropout, Input
from keras.models import Sequential
from keras.optimizers import Adam

# define baseline model
# Other parameters such as the activation function and loss function can also be tuned.
def baseline_model(inShape, hiddenLayerOne, hiddenLayerTwo, learnRate, dropout):
    """Build and compile a two-hidden-layer regression network.

    Args:
        inShape: Shape of one input sample, without the batch dimension.
        hiddenLayerOne: Number of units in the first hidden layer.
        hiddenLayerTwo: Number of units in the second hidden layer.
        learnRate: Learning rate passed to the Adam optimizer.
        dropout: Dropout rate applied after each hidden layer.

    Returns:
        The compiled Sequential model (MSE loss, Adam optimizer).
    """
    # The input shape is declared with an explicit Input layer: Keras 3 warns
    # when `input_shape` is passed to a layer inside a Sequential model.
    model = Sequential([
        Input(shape=inShape),
        Dense(hiddenLayerOne, activation='tanh'),
        Dropout(rate=dropout),
        Dense(hiddenLayerTwo, activation='tanh'),
        Dropout(rate=dropout),
        Dense(1, activation='relu')
    ])

    # compile model
    model.compile(optimizer=Adam(learning_rate=learnRate), loss='mse')
    # Print the layer summary each time a model is built.
    model.summary()

    return model 

### Perform Randomized Grid search

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scikeras.wrappers import KerasClassifier, KerasRegressor

# Wrap the model-building function in a scikit-learn compatible regressor.
# `model=` is the current SciKeras argument name; `build_fn=` is deprecated.
# The extra keyword arguments are defaults for the build function, and the
# search overrides the ones listed in grid_params.
base_estimator = KerasRegressor(model=baseline_model, verbose=1, inShape=X_train.shape[1:],
                                hiddenLayerOne=4, hiddenLayerTwo=4, learnRate=0.1, dropout=0.2)

print("Performing random search...")
# 3-fold cross-validated random search scored by negated MAPE (higher is better).
# random_state fixes which parameter combinations are sampled, so reruns are
# comparable.
searcher = RandomizedSearchCV(estimator=base_estimator, cv=3,
                              param_distributions=grid_params,
                              scoring='neg_mean_absolute_percentage_error',
                              random_state=0)
searchResults = searcher.fit(X_train, y_train)



### Evaluate the tuned model on the training data

In [None]:
# summarize grid search information
# summarize grid search information
bestScore = searchResults.best_score_
bestParams = searchResults.best_params_
# best_score_ is the negated MAPE, and scikit-learn reports MAPE as a fraction.
# Negate it back and use the '%' presentation type so that 0.05 prints as 5.00%.
print(f'Best MAPE is {-bestScore:.2%} using \n{bestParams}') 

In [None]:
from sklearn.metrics import mean_absolute_percentage_error as MAPE
from sklearn.metrics import mean_squared_error as MSE

# Use the best estimator found by the search to predict on the training data.
tuned_model = searchResults.best_estimator_
y_pred = tuned_model.predict(X_train)

# scikit-learn returns MAPE as a fraction (0.05 means 5%), so it is formatted
# with the '%' presentation type, which multiplies by 100 and appends the sign.
# MSE uses an explicit precision; the previous ':2f' only set a minimum width.
print(f'MSE on train data: {MSE(y_train,y_pred):.4f}')
print(f'MAPE on train data: {MAPE(y_train,y_pred):.3%}')