In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
import keras
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
#from sklearn import dataset
from sklearn.model_selection import train_test_split
import sklearn
import torch
import tensorflow.compat.v1 as tf
# Switch TensorFlow to TF1-style behaviour: the low-level TensorFlow section
# of this notebook builds a graph and evaluates it inside tf.Session().
# NOTE(review): this is a process-wide switch, so it presumably also affects
# how the Keras models in this notebook execute — confirm.
tf.disable_v2_behavior()

In [None]:
# Display which backend this Keras installation reports.
keras.backend.backend()

## Data

In [None]:
# Synthetic two-class problem: N points drawn uniformly from the square
# [-1, 1) x [-1, 1), labelled by whether they fall inside the unit circle.
N = 500
np.random.seed(12345)  # fixed seed so the sample is reproducible
train_x = 2.0 * np.random.random((N, 2)) - 1.0

# Squared distance from the origin for every sample.
radius_sq = train_x[:, 0] ** 2 + train_x[:, 1] ** 2
inside = (radius_sq < 1).astype(int)

# One-hot style targets: column 0 = inside the circle, column 1 = outside.
train_y = np.array([inside, 1 - inside]).T

In [None]:
# Scatter the training points, coloured by the first target column
# (1 = inside the unit circle, 0 = outside).
plt.scatter(train_x[:,0],train_x[:,1],c=train_y[:,0],edgecolors='black')

## Scikit-learn

In [None]:
# Multi-layer perceptron with one hidden layer of 5 logistic units, trained
# with the L-BFGS solver for at most 100 iterations.
# hidden_layer_sizes is written as a one-element tuple: `(5)` is just the
# integer 5, so the trailing comma is required to spell "one layer of 5 units"
# in the tuple form the parameter is documented to take.
# NOTE(review): learning_rate_init is documented as applying to the 'sgd' and
# 'adam' solvers only, so it presumably has no effect with 'lbfgs' — confirm
# and drop it if so.
mlp = MLPClassifier(hidden_layer_sizes=(5,), max_iter=100, alpha=1e-4,
                    solver='lbfgs', verbose=10, tol=1e-4, random_state=1,
                    learning_rate_init=.1, activation='logistic')

In [None]:
# Fit the classifier on the full training set, then report its score on that
# same data (there is no held-out split here).
mlp.fit(train_x, train_y)
train_score = mlp.score(train_x, train_y)
print("Training set score: %f" % train_score)

In [None]:
# Predict labels for the same points the classifier was fitted on.
predictions = mlp.predict(train_x)

In [None]:
# Scatter the training points, coloured by the first column of the predictions.
plt.scatter(train_x[:,0],train_x[:,1],c=predictions[:,0],edgecolors='black')

In [None]:
# Evaluate the fitted classifier on a regular NG x NG grid covering
# [-1, 1] x [-1, 1] and draw the first predicted column as filled contours.
NG = 100
axis_ticks = np.linspace(-1, 1, NG)
gx, gy = np.meshgrid(axis_ticks, axis_ticks)

# Flatten the mesh into one (x, y) row per grid node.
fgx = gx.ravel()
fgy = gy.ravel()
grid = np.stack((fgx, fgy), axis=1)

# Predict on every node, then fold the first column back into mesh shape.
grid_pred = mlp.predict(grid)
C = grid_pred[:, 0].reshape(NG, NG)

plt.contourf(gx, gy, C)
plt.show()

In [None]:
# First entries of the fitted coefficient and intercept lists
# (used below to draw one line per hidden unit).
w, b = mlp.coefs_[0], mlp.intercepts_[0]

In [None]:
# Second row of the first-layer weight matrix; the plot below divides by these
# values, i.e. presumably the weights applied to the second input feature.
w[1]

In [None]:
# For each hidden unit draw the line w[0]*x + w[1]*y + b = 0, i.e. where the
# unit's pre-activation is zero, solved for y over x in [-1, 1].
x = np.linspace(-1, 1)
plt.axis((-1, 1, -1, 1))  # clip the view to the data square
for w_x, w_y, bias in zip(w[0], w[1], b):
    boundary_y = -(x * w_x + bias) / w_y
    plt.plot(x, boundary_y)

## Keras (tensorflow interface)

In [None]:
# Two-layer network: 2 inputs -> 10 ReLU units -> 2 sigmoid outputs.
# The hidden layer is kept in `D` so its weights can be inspected later.
D = Dense(10, input_shape=(2,), activation='relu')
output_layer = Dense(2, activation='sigmoid')
model = Sequential([D, output_layer])

In [None]:
# Configure training: binary cross-entropy loss, Adam optimizer, and accuracy
# reported as an additional metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train on the full training set for 150 epochs with mini-batches of 10 samples.
model.fit(train_x, train_y, epochs=150, batch_size=10)

In [None]:
# Network outputs for the training points themselves.
predictions = model.predict(train_x)

In [None]:
# Scatter the training points, coloured by the first output column of the network.
plt.scatter(train_x[:,0],train_x[:,1],c=predictions[:,0],edgecolors='black')

In [None]:
# Sample the trained network on an NG x NG lattice over [-1, 1] x [-1, 1]
# and show its first output as a filled contour map.
NG = 100
ticks = np.linspace(-1, 1, NG)
gx, gy = np.meshgrid(ticks, ticks)
fgx, fgy = gx.flatten(), gy.flatten()

# One (x, y) row per lattice node.
grid = np.array([fgx, fgy]).T

first_output = model.predict(grid)[:, 0]
C = first_output.reshape(NG, NG)
plt.contourf(gx, gy, C)
plt.show()

In [None]:
# Display all weight arrays of the model.
model.get_weights()

In [None]:
# Unpack the two arrays returned for the hidden layer D and print the first
# (presumably the kernel, with the bias second — confirm).
w,b = D.get_weights()
print(w)

## TensorFlow
Low-level API, TensorFlow 1.x style (explicit graph and session)

In [None]:
def _sigmoid_layer(inputs, n_in, n_out):
    """Create one fully connected sigmoid layer.

    Weights (n_in x n_out) and bias (n_out) are drawn uniformly between
    -0.9 and 0.9. Returns (weights, bias, activations).
    """
    weights = tf.Variable(tf.random_uniform([n_in, n_out], minval=-0.9, maxval=0.9, dtype=tf.float32))
    bias = tf.Variable(tf.random_uniform([n_out], minval=-0.9, maxval=0.9, dtype=tf.float32))
    return weights, bias, tf.sigmoid(tf.matmul(inputs, weights) + bias)


# Inputs and targets are baked into the graph as float32 constants.
x0 = tf.constant(train_x, dtype=tf.float32)
y0 = tf.constant(train_y, dtype=tf.float32)

# Three stacked sigmoid layers: 2 -> 20 -> 20 -> 2.
m1, b1, h1 = _sigmoid_layer(x0, 2, 20)
m2, b2, h2 = _sigmoid_layer(h1, 20, 20)
m3, b3, y_out = _sigmoid_layer(h2, 20, 2)

# Loss: mean of the squared differences between targets and network output.
loss = tf.reduce_mean(tf.square(y0 - y_out))

# Training step: plain gradient descent with learning rate 1.0.
train = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

# Run 10000 full-batch descent steps, then fetch the trained parameters,
# the network output and the final loss as NumPy values.
C = []
labels = ["m1", "b1", "m2", "b2", "m3", "b3", "y_out", "loss"]
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(10000):
        sess.run(train)
    results = sess.run([m1, b1, m2, b2, m3, b3, y_out, loss])
    # To inspect every fetched value, iterate over zip(labels, results).
    C = sess.run(y_out)

print("")


In [None]:
# Re-evaluate the network on the training inputs using the *trained* parameters.
# The variables m1..b3 hold their trained values only inside the session that
# ran the optimisation. Opening a fresh session and running
# global_variables_initializer() re-draws them at random, so the output would
# come from an untrained network. The trained values were fetched as NumPy
# arrays into `results` (order: m1, b1, m2, b2, m3, b3, y_out, loss), so the
# forward pass is rebuilt here from those arrays as constants.
m1_v, b1_v, m2_v, b2_v, m3_v, b3_v = results[:6]

X0 = tf.constant( train_x, dtype=tf.float32 )
H1 = tf.sigmoid( tf.matmul( X0, tf.constant(m1_v) ) + tf.constant(b1_v) )
H2 = tf.sigmoid( tf.matmul( H1, tf.constant(m2_v) ) + tf.constant(b2_v) )
y_est = tf.sigmoid( tf.matmul( H2, tf.constant(m3_v) ) + tf.constant(b3_v) )

with tf.Session() as sess:
    # Only constants are involved, so no variable initializer has to run.
    # Evaluate y_est (the graph built in this cell), not the training graph's y_out.
    y_est_v = sess.run(y_est)
    #y_est_v = y_est.eval()
    


In [None]:
# Peek at the first ten rows of the evaluated network output.
y_est_v[0:10]

In [None]:
# Scatter the training points, coloured by the first column of C
# (the y_out values evaluated at the end of the training session).
plt.scatter(train_x[:,0],train_x[:,1],c=C[:,0])
plt.show()

## PyTorch
by Justin Johnson

In [None]:
# Two-layer ReLU network (no biases) trained with hand-written backprop and
# full-batch gradient descent on the summed squared error.
dtype = torch.double
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

H = 10  # number of hidden units

# Put inputs and targets on the same device and dtype as the weights.
# Without this, switching `device` to CUDA would fail in x.mm(w1) because the
# data stayed on the CPU. On CPU the values used in the arithmetic are unchanged
# (the integer targets were already promoted to double in y_pred - y).
x = torch.from_numpy(train_x).to(device=device, dtype=dtype)
y = torch.from_numpy(train_y).to(device=device, dtype=dtype)

# Randomly initialize weights
w1 = torch.randn(2, H, device=device, dtype=dtype)
w2 = torch.randn(H, 2, device=device, dtype=dtype)

learning_rate = 1e-3
for t in range(10000):
    # Forward pass: compute predicted y
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Compute the summed squared error; print it every 1000 iterations
    loss = (y_pred - y).pow(2).sum().item()
    if t % 1000 == 999:
        print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0  # ReLU passes no gradient where its input was negative
    grad_w1 = x.t().mm(grad_h)

    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

In [None]:
# Scatter the training points, coloured by the first output column of the last
# forward pass. .cpu() is a no-op for CPU tensors and makes the NumPy conversion
# work if the training cell was switched to a CUDA device.
plt.scatter(train_x[:,0],train_x[:,1],c=y_pred.detach().cpu().numpy()[:,0])
plt.show()

In [None]:
# Network dimensions: D_in inputs -> H hidden units -> D_out outputs.
D_in, H, D_out = 2, 10, 2

# torch.nn layers default to float32, so cast the NumPy data accordingly.
x = torch.from_numpy(train_x).float()
y = torch.from_numpy(train_y).float()

# Model: Linear -> ReLU -> Linear, built with the nn package.
nn_layers = [
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
]
model = torch.nn.Sequential(*nn_layers)

# Summed (not averaged) squared error over all samples and outputs.
loss_fn = torch.nn.MSELoss(reduction='sum')

# Adam updates exactly the tensors handed to it: here, all model parameters.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for t in range(10000):
    # Forward pass on the full batch, then the loss against the targets.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)

    # Report progress every 1000 iterations.
    if (t + 1) % 1000 == 0:
        print(t, loss.item())

    # Gradients accumulate across backward() calls, so clear them first;
    # then backpropagate and let the optimizer apply one update.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [None]:
# Scatter the training points, coloured by the first output column of y_pred.
# y_pred comes from the final loop iteration's forward pass, i.e. it was
# computed before that iteration's optimizer step.
plt.scatter(train_x[:,0],train_x[:,1],c=y_pred.detach().numpy()[:,0])
plt.show()

# Regularization

## Keras

In [None]:
# Same uniform sample of the square as before, now labelled by membership in
# the ring 0.25 < x^2 + y^2 < 1 (a harder, non-convex class region).
N = 500
np.random.seed(12345)  # identical seed -> identical points as the circle data
train_x = 2.0 * np.random.random((N, 2)) - 1.0

radius_sq = train_x[:, 0] ** 2 + train_x[:, 1] ** 2
cond = np.logical_and(radius_sq < 1, radius_sq > 0.25)

# Column 0 = inside the ring, column 1 = its complement.
in_ring = cond * 1
train_y = np.array([in_ring, 1 - in_ring]).T

In [None]:
# Scatter the training points, coloured by the first target column
# (1 = inside the ring, 0 = elsewhere).
plt.scatter(train_x[:,0],train_x[:,1],c=train_y[:,0],edgecolors='black')

In [None]:
# Network 2 -> 10 -> 10 -> 2 where both hidden layers carry L1+L2 kernel
# penalties plus L2 penalties on the bias and on the layer activations
# (all with coefficient 1e-2).
# NOTE(review): the next cell rebinds `model` before any fit() call, so this
# heavily regularized network is never trained as the notebook stands.
model = Sequential()
D = Dense(10, input_shape=(2,), activation='relu',
            kernel_regularizer=keras.regularizers.l1_l2(l1=1e-2, l2=1e-2),
            bias_regularizer=keras.regularizers.l2(1e-2),
            activity_regularizer=keras.regularizers.l2(1e-2))
model.add(D)
model.add(Dense(10, activation='relu',
            kernel_regularizer=keras.regularizers.l1_l2(l1=1e-2, l2=1e-2),
            bias_regularizer=keras.regularizers.l2(1e-2),
            activity_regularizer=keras.regularizers.l2(1e-2)))
model.add(Dense(2, activation='sigmoid'))
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and rejected by newer Keras optimizers.
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.1))

In [None]:
# Baseline without any regularization: 2 -> 20 ReLU -> 20 ReLU -> 2 sigmoid.
model = Sequential()
model.add(Dense(20, input_shape=(2,),activation='relu'))
model.add(Dense(20,activation='relu'))
model.add(Dense(2, activation='sigmoid'))
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and rejected by newer Keras optimizers.
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.11))

In [None]:
# Train for 1000 epochs with mini-batches of 100 samples.
model.fit(train_x, train_y, epochs=1000, batch_size=100)

In [None]:
# Network outputs for the training points, shown as a scatter coloured by the
# first output column.
predictions = model.predict(train_x)
plt.scatter(train_x[:,0],train_x[:,1],c=predictions[:,0],edgecolors='black')

In [None]:
# Contour map of the model's first output over an NG x NG grid that covers
# the data square [-1, 1] x [-1, 1].
NG = 100
coords = np.linspace(-1, 1, NG)
gx, gy = np.meshgrid(coords, coords)

# Grid nodes as rows of (x, y).
fgx = gx.reshape(-1)
fgy = gy.reshape(-1)
grid = np.column_stack((fgx, fgy))

grid_output = model.predict(grid)
C = grid_output[:, 0].reshape(NG, NG)

plt.contourf(gx, gy, C)
plt.show()

In [None]:
# Same 2 -> 20 -> 20 -> 2 architecture as the baseline, but every layer's
# kernel carries a small L1+L2 penalty (1e-4 each).
model = Sequential()
model.add(Dense(20, input_shape=(2,),activation='relu',kernel_regularizer=keras.regularizers.l1_l2(l1=1e-4, l2=1e-4)))
model.add(Dense(20,activation='relu',kernel_regularizer=keras.regularizers.l1_l2(l1=1e-4, l2=1e-4)))
model.add(Dense(2, activation='sigmoid',kernel_regularizer=keras.regularizers.l1_l2(l1=1e-4, l2=1e-4)))
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and rejected by newer Keras optimizers.
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.11))

In [None]:
# Train for 1000 epochs with mini-batches of 100 samples.
model.fit(train_x, train_y, epochs=1000, batch_size=100)

In [None]:
# Network outputs for the training points, shown as a scatter coloured by the
# first output column.
predictions = model.predict(train_x)
plt.scatter(train_x[:,0],train_x[:,1],c=predictions[:,0],edgecolors='black')

In [None]:
# Filled-contour view of the model's first output, evaluated at every node of
# an NG x NG grid spanning [-1, 1] in both coordinates.
NG = 100
span = np.linspace(-1, 1, NG)
gx, gy = np.meshgrid(span, span)
fgx, fgy = gx.ravel(), gy.ravel()

# Pair the flattened coordinates into (x, y) rows for prediction.
grid = np.stack([fgx, fgy], axis=-1)

C = model.predict(grid)[:, 0]
C = C.reshape(NG, NG)
plt.contourf(gx, gy, C)
plt.show()