In [None]:
# ML_in_Finance-Interpretability
# Author: Matthew Dixon
# Version: 1.0 (08.09.2019)
# License: MIT
# Email: matthew.dixon@iit.edu
# Notes: tested on Mac OS X with Python 3.6.9 and the following packages:
# numpy=1.18.1, keras=2.3.1, tensorflow=2.0.0, statsmodels=0.10.1, scikit-learn=0.22.1
# Citation: Please cite the following reference if this notebook is used for research purposes:
# Dixon M.F., I. Halperin and P. Bilokon, Machine Learning in Finance: From Theory to Practice, Springer Graduate textbook Series, 2020. 

# Overview
The purpose of this notebook is to illustrate a neural network interpretability method which is compatible with linear regression. 

In linear regression, provided the independent variables are scaled, one can view the regression coefficients as a measure of importance of the variables. Equivalently, the dependent variable can be differentiated w.r.t. the inputs to give the coefficients.

Similarly, the derivatives of the network w.r.t. the inputs are a non-linear generalization of interpretability in linear regression. Moreover, we should expect the neural network gradients to approximate the linear regression coefficients when the data is generated by a linear regression model. 

Various simple experimental tests, corresponding to Section 3 of Chapter 5, are performed to illustrate the properties of network interpretability.

In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from keras.wrappers.scikit_learn import KerasRegressor
import statsmodels.api as sm
import sklearn

## Simple Data Generation Process (DGP)


Let us generate data from the following linear regression model

$Y=X_1+X_2 + \epsilon~, ~~X_1, X_2 \sim N(0,1)~, ~~\epsilon \sim N(0,1)$

In [3]:
M = 5000  # total number of samples (second half mirrors the first)
np.random.seed(7)  # seed NumPy's global generator so the draws are repeatable
half = M // 2

# Draw the first half of both regressors. Column 0 is drawn before column 1,
# so the random stream is consumed in a fixed order.
X_half = np.column_stack([np.random.randn(half), np.random.randn(half)])

# Antithetic sampling: stack the negated draws underneath to reduce the bias in the mean
X = np.vstack([X_half, -X_half])

# The noise term gets the same antithetic treatment
eps_half = np.random.randn(half, 1)
eps = np.vstack([eps_half, -eps_half])

# Linear DGP: Y = X_1 + X_2 + eps
Y = X[:, 0] + X[:, 1] + eps[:, 0]

## Use ordinary least squares to fit a linear model to the data
For a baseline, let us compare the neural network with OLS regression. 

We fit statsmodels' OLS model to the data

In [5]:
# Augment the regressors with a constant column so the fit includes an intercept,
# then estimate Y on [const, x1, x2] by ordinary least squares.
ols_design = sm.add_constant(X)
ols_results = sm.OLS(Y, ols_design).fit()

For each input, get the predicted $Y$ value according to the model

In [6]:
# In-sample fitted values: predict on the same inputs (with the constant column added) used for the fit
y_ols = ols_results.predict(sm.add_constant(X))

View characteristics of the resulting model. You should observe that the intercept is close to zero and the other coefficients are close to one.

In [7]:
# Last expression of the cell, so the summary tables are rendered as the cell output
ols_results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.678
Model:,OLS,Adj. R-squared:,0.677
Method:,Least Squares,F-statistic:,5249.0
Date:,"Mon, 18 May 2020",Prob (F-statistic):,0.0
Time:,16:36:00,Log-Likelihood:,-7020.4
No. Observations:,5000,AIC:,14050.0
Df Residuals:,4997,BIC:,14070.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.1e-18,0.014,2.94e-16,1.000,-0.027,0.027
x1,0.9858,0.014,70.409,0.000,0.958,1.013
x2,1.0190,0.014,72.695,0.000,0.992,1.047

0,1,2,3
Omnibus:,0.8,Durbin-Watson:,1.941
Prob(Omnibus):,0.67,Jarque-Bera (JB):,0.75
Skew:,0.0,Prob(JB):,0.687
Kurtosis:,3.06,Cond. No.,1.02


## Compare with a feedforward NN with no hidden layers

Recall that the feedforward network with no hidden layers or activation function is a linear regression model.

Create a build function for the linear perceptron, which transforms the inputs directly to a single output

In [8]:
def linear_NN0_model(l1_reg=0.0):
    """Build and compile a network with no hidden layer (2 inputs -> 1 output).

    The only layer is a Dense layer with a single unit and no activation
    argument, so the model maps the two inputs directly to one output.

    Args:
        l1_reg: accepted for interface compatibility; not used by this builder.

    Returns:
        The compiled Sequential model (MSE loss, Adam optimizer, MAE/MSE metrics).
    """
    output_layer = Dense(1, input_dim=2, kernel_initializer='normal')
    model = Sequential([output_layer])
    model.compile(
        loss='mean_squared_error',
        optimizer='adam',
        metrics=['mae', 'mse'],
    )
    return model

Create an early stopping callback that terminates training once the training loss has stopped improving, i.e. when it has not decreased for 10 consecutive epochs.

In [9]:
# Watch the training loss ('loss', lower is better) and stop once it has gone
# 10 epochs (patience) without improving; verbose=1 prints the stopping epoch.
es = EarlyStopping(monitor='loss', mode='min', verbose=1, patience=10)

Pass the build function for our model, together with the training parameters, to the `KerasRegressor` constructor to create a Scikit-learn-compatible regression model. This allows you to take advantage of the library's built-in tools and estimator methods, and to incorporate the model into Scikit-learn pipelines.

In [10]:
# Wrap the no-hidden-layer builder as a scikit-learn style regressor:
# at most 40 epochs, mini-batches of 10 samples, early stopping via `es`.
lm = KerasRegressor(build_fn=linear_NN0_model, epochs=40, batch_size=10, verbose=1, callbacks=[es])

Train the model

In [11]:
# Train on the full simulated sample; the returned training History is shown as the cell output
lm.fit(X, Y)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 00026: early stopping


<keras.callbacks.callbacks.History at 0x1a3e26dc50>

### Check that the weights are close to one
Both weights should be close to one. The bias term is the second array returned by `get_weights()` and should be close to zero.

In [12]:
# Fetch the layer parameters once: entry 0 is the weight matrix, entry 1 the bias vector
layer_params = lm.model.layers[0].get_weights()
print("weights: " + str(layer_params[0]))
print("bias: " + str(layer_params[1]))

weights: [[0.9869313]
 [1.0236046]]
bias: [0.00017654]


## Compare with a FFW Neural Network with one hidden layer (unactivated)


This time we create a neural network with a hidden layer with 10 units.

In [13]:
n = 10 # number of hidden units; read as a global by linear_NN1_model when the model is built

In [14]:
def linear_NN1_model(l1_reg=0.0):
    """Build and compile a network with one unactivated hidden layer.

    Architecture: 2 inputs -> Dense(n) with no activation argument ->
    Dense(1) with a linear activation. The hidden width `n` is read from
    the notebook's global namespace when the builder is called.

    Args:
        l1_reg: accepted for interface compatibility; not used by this builder.

    Returns:
        The compiled Sequential model (MSE loss, Adam optimizer, MAE/MSE metrics).
    """
    hidden_layer = Dense(n, input_dim=2, kernel_initializer='normal')
    output_layer = Dense(1, kernel_initializer='normal', activation='linear')
    model = Sequential([hidden_layer, output_layer])
    model.compile(
        loss='mean_squared_error',
        optimizer='adam',
        metrics=['mae', 'mse'],
    )
    return model

In [15]:
# Rebind `lm` to a regressor built from the one-hidden-layer (unactivated) model:
# at most 50 epochs, mini-batches of 10 samples, same early stopping callback `es`.
lm = KerasRegressor(build_fn=linear_NN1_model, epochs=50, batch_size=10, verbose=1, callbacks=[es])

In [16]:
# Train the one-hidden-layer model on the same simulated data
lm.fit(X, Y)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 00015: early stopping


<keras.callbacks.callbacks.History at 0x1a3eb94fd0>

In [17]:
# Pull the fitted parameters in one call, in layer order:
# hidden-layer weights and bias (W1, b1), then output-layer weights and bias (W2, b2).
W1, b1, W2, b2 = lm.model.get_weights()[:4]
print(W1, W2)

[[ 0.21191542 -0.3510564   0.32653695  0.30687252  0.27739337  0.30587277
   0.26292333  0.24803898  0.250718   -0.3381098 ]
 [ 0.33603936 -0.28845066  0.25735727  0.3006563   0.32002118  0.28473574
   0.35539797  0.28512466  0.28092325 -0.26294112]] [[ 0.34959066]
 [-0.33088237]
 [ 0.29290077]
 [ 0.3498653 ]
 [ 0.26861414]
 [ 0.3326019 ]
 [ 0.35045084]
 [ 0.37188262]
 [ 0.40377927]
 [-0.34295896]]


### Check that the coefficients are close to one and the intercept is close to zero

In [18]:
# With no activation on the hidden layer the network collapses to a linear model:
#   Y_hat = W2^T (W1^T x + b1) + b2
# so the implied intercept is W2^T b1 + b2 and the implied slope on input j is W2^T W1[j].
W2_row = np.transpose(W2)
beta_0 = np.dot(W2_row, b1) + b2
beta_1, beta_2 = (np.dot(W2_row, W1[j]) for j in (0, 1))

In [19]:
# Implied intercept followed by the implied coefficients on the first and second input
print(beta_0, beta_1, beta_2)

[-0.04406428] [0.97107023] [1.0083461]


## Compare with a feedforward NN with one hidden layer ($tanh$ activated)

Finally, we create another model with a 10 unit hidden layer, this time with a $tanh$ activation function.

In [20]:
# number of hidden neurons in the tanh-activated layer (read as a global by linear_NN1_model_act)
n = 10

In [21]:
# one hidden layer with a non-linear (tanh) activation
def linear_NN1_model_act(l1_reg=0.0):
    """Build and compile a network with one tanh-activated hidden layer.

    Architecture: 2 inputs -> Dense(n, tanh) -> Dense(1) with no activation
    argument. The hidden width `n` is read from the notebook's global
    namespace when the builder is called.

    Args:
        l1_reg: accepted for interface compatibility; not used by this builder.

    Returns:
        The compiled Sequential model (MSE loss, Adam optimizer, MAE/MSE metrics).
    """
    hidden_layer = Dense(n, input_dim=2, kernel_initializer='normal', activation='tanh')
    output_layer = Dense(1, kernel_initializer='normal')
    model = Sequential([hidden_layer, output_layer])
    model.compile(
        loss='mean_squared_error',
        optimizer='adam',
        metrics=['mae', 'mse'],
    )
    return model

In [22]:
# Rebind `lm` to a regressor built from the tanh-activated model:
# at most 100 epochs, mini-batches of 10 samples, same early stopping callback `es`.
lm = KerasRegressor(build_fn=linear_NN1_model_act, epochs=100, batch_size=10, verbose=1, callbacks=[es])

Train the model

In [23]:
# Train the tanh-activated model on the same simulated data
lm.fit(X, Y)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 00039: early stopping


<keras.callbacks.callbacks.History at 0x1a3eb43160>

### Compute the Sensitivities

In [24]:
# Assumes that the activation function is tanh
def sensitivities(lm, X):
    """Evaluate the input sensitivities of a one-hidden-layer tanh network.

    The network is F(x) = W2^T tanh(W1^T x + b1) + b2 with a single output.
    Its gradient with respect to input j at x is

        dF/dx_j = W2^T diag(1 - tanh^2(W1^T x + b1)) W1[j]

    which is evaluated here for every row of X at once.

    Args:
        lm: fitted estimator exposing `lm.model.get_weights()`, whose first
            four entries are W1 (p x n), b1 (n,), W2 (n x 1) and b2 (1,).
        X: array of shape (M, p) holding the points at which to evaluate
            the gradient.

    Returns:
        float32 array of shape (M, p + 1). Column 0 holds the intercept
        beta_0 = F(0) (the same value in every row); column j + 1 holds
        dF/dx_j evaluated at the corresponding row of X.
    """
    # Read the parameters once instead of once per weight array.
    weights = lm.model.get_weights()
    W1, b1, W2, b2 = weights[0], weights[1], weights[2], weights[3]

    X = np.asarray(X)
    M, p = np.shape(X)[0], np.shape(X)[1]

    # Flatten the (n, 1) output weights to a vector so the products below are
    # true scalars/rows rather than shape-(1,) arrays; assigning a shape-(1,)
    # array into a scalar slot is deprecated in recent NumPy releases.
    w2 = np.ravel(W2)

    beta = np.zeros((M, p + 1), dtype='float32')

    # intercept \beta_0 = F_{W,b}(0) = W2^T tanh(b1) + b2
    beta[:, 0] = np.dot(w2, np.tanh(b1)) + np.ravel(b2)[0]

    # Hidden activations for all samples at once: shape (M, n)
    Z1 = np.tanh(np.dot(X, W1) + b1)

    # (1 - Z1**2) is the diagonal of the tanh Jacobian. Scaling it by w2 and
    # projecting onto the rows of W1 gives W2^T D W1[j] for every sample and
    # input, without building an n x n diagonal matrix per sample.
    beta[:, 1:] = np.dot((1.0 - Z1**2) * w2, np.transpose(W1))

    return beta

In [25]:
# One row per sample: column 0 is the intercept, columns 1..p are the sensitivities to each input
beta = sensitivities(lm, X)

### Check that the intercept is close to zero and the coefficients are close to one

In [26]:
# Column-wise average over all samples: [intercept, sensitivity to X_1, sensitivity to X_2]
print(np.mean(beta, axis=0))

[-0.01284499  0.9332601   1.05014   ]


In [27]:
# Column-wise dispersion across samples; the intercept column holds one repeated value, so its spread is ~0
print(np.std(beta, axis=0))

[5.578546e-07 6.809256e-02 7.658932e-02]
