In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import scipy

In [None]:
# Notebook-wide matplotlib defaults: square 10 x 10 figures and enlarged
# fonts for tick labels, axis labels, titles and legends.
plt.rcParams.update({
    'figure.figsize': [10., 10.],
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'axes.labelsize': 16,
    'axes.titlesize': 16,
    'legend.fontsize': 14,
})

# Regression

Here we explore different regression algorithms using the Boston Housing dataset.

In [None]:
from sklearn.datasets import load_boston
# Load the Boston Housing data through scikit-learn's `load_boston` helper.
# Later cells read its 'data', 'target', 'DESCR' and 'feature_names' entries.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell presumably needs scikit-learn < 1.2 — confirm the
# environment this notebook is meant to run in.
boston_dataset = load_boston()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Show the description text stored under the dataset's 'DESCR' key.
print(boston_dataset['DESCR'])

In [None]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the split
# identical between runs.
boston_features = boston_dataset['data']
boston_targets = boston_dataset['target']
X_train, X_test, y_train, y_test = train_test_split(
    boston_features,
    boston_targets,
    test_size=0.2,
    random_state=5,
)

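We can now try out different regression models. Each of the following cells defines a model called `reg`; uncomment the one you want to use (the random forest is the one active by default).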

In [None]:
# from tensorflow import keras

# inputs = keras.Input(shape=(X_train.shape[1],))
# h = keras.layers.Dense(32, activation="relu")(inputs)
# outputs = keras.layers.Dense(1, activation='linear')(h)
# reg = keras.Model(inputs=inputs, outputs=outputs)
# optimizer = keras.optimizers.Adam(0.01)

# reg.compile(loss='mse', optimizer=optimizer)

In [None]:
# from sklearn.linear_model import LinearRegression
# reg = LinearRegression()

In [None]:
# from sklearn.neural_network import MLPRegressor
# reg = MLPRegressor(hidden_layer_sizes=(50,), activation='relu', max_iter=1000, tol=0.000001)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 20 trees, each limited to depth 4.
# * `criterion` is left at its default, which is mean squared error. Passing
#   the literal 'mse' is rejected by scikit-learn >= 1.2 (the option was
#   renamed to 'squared_error'), whereas the default works on every version.
# * `random_state` fixes the forest's internal randomness so that the fit,
#   the plots and the feature importances below are the same on every
#   Restart & Run All.
reg = RandomForestRegressor(n_estimators=20, max_depth=4, random_state=0)

In [None]:
# Train whichever model `reg` currently refers to on the training split.
# The leftover `epochs=50, verbose=0` arguments that used to trail this call
# appear to belong to the commented-out Keras model; add them back only when
# that model is selected.
reg.fit(X_train, y_train)

In [None]:
# Predicted vs. true target for both splits. The grey diagonal marks where
# prediction equals truth.
ax = plt.gca()
ax.plot(y_test, reg.predict(X_test), '.', label='test')
ax.plot(y_train, reg.predict(X_train), '.', label='train')
ax.plot([0, 55], [0, 55], c='grey')
ax.set(xlabel='True price (k$)', ylabel='Predicted price (k$)')
ax.legend()
#plt.savefig('boston_DT.png', bbox_inches='tight')

## Feature Importance

We can analyze the importance of the various input features, here done via two separate methods: permutation importance (with `eli5`) and SHAP values (with `shap`). In both cases the results are comparable and `LSTAT` is the most important variable, while `ZN`, `CHAS` and `RAD` carry virtually no information.

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

You may need to `pip install eli5` for the import above to work,
and later `pip install shap` for the SHAP cells below.

In [None]:
# Permutation importance of the fitted model, evaluated on the training
# split; the seed makes the shuffling repeatable.
perm = PermutationImportance(reg, random_state=1)
perm.fit(X_train, y_train)

# Render the importances as a table labelled with the dataset's feature names.
eli5.show_weights(perm, feature_names=boston_dataset['feature_names'])

In [None]:
import shap

In [None]:
import pandas as pd

# Load the JavaScript needed by the interactive force plots below.
shap.initjs()

# Explain the model on the dataset that is already loaded, instead of fetching
# a second copy through `shap.datasets.boston()`. That helper is an extra
# download and is not available in newer shap releases. Using `boston_dataset`
# guarantees the explanations refer to exactly the samples the model was
# trained and tested on.
# X is a DataFrame so the plots can label features and so `X.iloc[...]`
# works in the following cells.
X = pd.DataFrame(boston_dataset['data'], columns=boston_dataset['feature_names'])
y = boston_dataset['target']

# TreeExplainer is shap's explainer for tree-based models such as the random
# forest; with another `reg` selected this line presumably needs a different
# explainer — verify before switching models.
explainer = shap.TreeExplainer(reg)
shap_values = explainer.shap_values(X)

In [None]:
# Force plot for a single sample: row 0 of the SHAP values together with
# row 0 of X, relative to the explainer's expected value.
shap.force_plot(explainer.expected_value, shap_values[0,:], X.iloc[0,:])

In [None]:
# Force plot built from the SHAP values of every row of X at once.
shap.force_plot(explainer.expected_value, shap_values, X)

In [None]:
# Summary plot of the SHAP values for all rows and features of X.
shap.summary_plot(shap_values, X)

In [None]:
# Summary plot of the same SHAP values, drawn with plot_type="bar".
shap.summary_plot(shap_values, X, plot_type="bar")