In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

# Intro to Tensorflow

### Goals
- Gain a basic understanding of the what/how/why of Tensorflow
- Implement a simple multi-layer perceptron 

## Tensorflow Basics

Tensorflow (and other 'deep learning' libraries) are really good at gradient descent. 

There are three types of objects:
- Placeholders: stand-ins that are filled with real data at run time.
- Variables: the model parameters, which can be updated using gradient descent.
- Constants: fixed values that do not change.

Use these objects to construct a loss function. Then use gradient descent to find the best parameters, given the data.

### Constants

In [2]:
# Two scalar constant nodes; the first spells out its dtype explicitly.
node1 = tf.constant(3.0, dtype=tf.float32)
node2 = tf.constant(4.0)

In [3]:
# Adding tensors only builds a graph node: the displayed value is a Tensor handle, not a number.
node1 + node2

<tf.Tensor 'add:0' shape=() dtype=float32>

In [4]:
# Open an interactive session; it is used below to evaluate graph nodes.
sess = tf.InteractiveSession()

In [5]:
# Running the node inside the session evaluates the sum to a concrete value.
sess.run(node1 + node2)

7.0

### Placeholders
Placeholders are the objects that will be filled with real data at runtime

In [6]:
# Two float inputs whose values are supplied when the graph is run.
a = tf.placeholder(dtype=tf.float32)
b = tf.placeholder(dtype=tf.float32)

# Node that adds whatever is fed into a and b.
adder_node = a + b

In [7]:
# Feed concrete lists into both placeholders and evaluate the element-wise sum.
sess.run(
    adder_node,
    feed_dict={
        a: [3, 4],
        b: [7, 7],
    },
)

array([ 10.,  11.], dtype=float32)

### Variables

Think about the linear equation
$$
y = 3 x - 3
$$

In [8]:
# Define the data: x is fed with real values when the graph is run.
x = tf.placeholder(tf.float32)

# Define a linear model, W * x + b, starting at W = 3 and b = -3.
# The dtype has to be passed by keyword: the second positional argument of
# tf.Variable is `trainable`, so a positional tf.float32 would be taken as
# the trainable flag instead of the dtype.
W = tf.Variable([3.], dtype=tf.float32)
b = tf.Variable([-3.], dtype=tf.float32)
linear_model = W * x + b


Variables need to be initialized

In [9]:
# Run the initializer op so the variables hold their starting values before they are used.
sess.run(tf.global_variables_initializer())

In [10]:
# Evaluate the linear model for three input values at once.
sess.run(linear_model, feed_dict={x: [4, 2, 1]})

array([ 9.,  3.,  0.], dtype=float32)

Or we could define some y values and see how well the model fits them

In [11]:
# Observed targets, supplied at run time.
y = tf.placeholder(dtype=tf.float32)

# Element-wise squared difference between the model output and the targets.
error = tf.square(linear_model - y)

In [12]:
# Squared error of the model at each x against a target of 1.
sess.run(
    error,
    feed_dict={
        x: [4, 2, 1],
        y: [1, 1, 1],
    },
)

array([ 64.,   4.,   1.], dtype=float32)

# Linear Regression

## Crime Data

In [13]:
from sklearn.model_selection import train_test_split

# Column names: read comm_names.txt and keep the second whitespace-separated
# token of every entry.
headers = pd.read_csv('comm_names.txt', squeeze=True).apply(lambda entry: entry.split()[1])

# Crime table from the UCI repository; the file has no header row and uses
# '?' for missing values.
crime = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data',
    header=None,
    na_values=['?'],
    names=headers,
)
# Skip the first five columns, then drop every row that still has a missing value.
crime = crime.iloc[:, 5:].dropna()

# Target column; every other remaining column is a predictor
target = 'ViolentCrimesPerPop'
predictors = [col for col in crime.columns if col != target]

# Hold out a test set (fixed random_state so the split is repeatable)
X = crime[predictors]
y = crime[[target]]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

### Define the model

In [14]:
# Problem dimensions: one input per predictor column, a single output
dim_input = X_train.shape[1]
dim_output = 1

# Placeholder for a batch of feature rows (batch size left open)
x = tf.placeholder(dtype=tf.float32, shape=[None, dim_input])

# Placeholder for the matching observed targets
y_ = tf.placeholder(dtype=tf.float32, shape=[None, 1])

# Trainable parameters, randomly initialized
W = tf.Variable(tf.random_normal(shape=[dim_input, dim_output]))
b = tf.Variable(tf.random_normal(shape=[1]))

# Linear prediction
y = tf.matmul(x, W) + b

# Mean squared error between prediction and target
mse = tf.reduce_mean(tf.square(y - y_))

# Adam optimizer (adapts its step sizes automatically) minimizing the MSE
optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
train_step = optimizer.minimize(mse)

Initialize

In [15]:
# Close the session opened earlier before replacing it: an InteractiveSession
# keeps its resources until close() is called, so rebinding `sess` alone
# would leave the old session alive.
sess.close()
sess = tf.InteractiveSession()

# Initialize every variable in the graph inside the new session.
tf.global_variables_initializer().run()

View loss

In [16]:
# Mean squared error on the training set with the current (untrained) parameters.
sess.run(mse, feed_dict={x: X_train, y_: y_train})

98.40609

Parameters

In [17]:
# Current value of the bias variable.
sess.run(b)

array([ 0.34602281], dtype=float32)

In [18]:
# Take one optimizer step, feeding the whole training set.
sess.run(train_step, feed_dict={x: X_train, y_: y_train})

### Exercise

1: Run 10000 gradient descent steps of the model above. Every 500 iterations, note the train error and the test error.

In [19]:
# Full-batch training: every step feeds the whole training set.
# The feed dictionaries never change, so build them once outside the loop.
train_feed = {x: X_train, y_: y_train}
test_feed = {x: X_test, y_: y_test}

for i in range(10000):
    if i % 500 == 0:
        # Report both losses before taking step i.
        train_mse = sess.run(mse, feed_dict=train_feed)
        test_mse = sess.run(mse, feed_dict=test_feed)
        # print() with a single argument behaves the same on Python 2 and 3.
        print('Iteration: {:04} \t Train Loss: {:.3} \t Test Loss: {:.3}'.format(i, train_mse, test_mse))
    sess.run(train_step, feed_dict=train_feed)


Iteration: 0000 	 Train Loss: 89.5 	 Test Loss: 93.0
Iteration: 0500 	 Train Loss: 0.679 	 Test Loss: 0.617
Iteration: 1000 	 Train Loss: 0.304 	 Test Loss: 0.332
Iteration: 1500 	 Train Loss: 0.186 	 Test Loss: 0.227
Iteration: 2000 	 Train Loss: 0.13 	 Test Loss: 0.177
Iteration: 2500 	 Train Loss: 0.097 	 Test Loss: 0.151
Iteration: 3000 	 Train Loss: 0.0739 	 Test Loss: 0.131
Iteration: 3500 	 Train Loss: 0.0569 	 Test Loss: 0.113
Iteration: 4000 	 Train Loss: 0.0445 	 Test Loss: 0.0966
Iteration: 4500 	 Train Loss: 0.0356 	 Test Loss: 0.0833
Iteration: 5000 	 Train Loss: 0.0294 	 Test Loss: 0.0729
Iteration: 5500 	 Train Loss: 0.0251 	 Test Loss: 0.0651
Iteration: 6000 	 Train Loss: 0.022 	 Test Loss: 0.0596
Iteration: 6500 	 Train Loss: 0.0197 	 Test Loss: 0.0559
Iteration: 7000 	 Train Loss: 0.0181 	 Test Loss: 0.0534
Iteration: 7500 	 Train Loss: 0.0169 	 Test Loss: 0.0516
Iteration: 8000 	 Train Loss: 0.016 	 Test Loss: 0.0501
Iteration: 8500 	 Train Loss: 0.0153 	 Test Loss: 

2: Compare your results above to LinearRegression in scikit-learn.

In [37]:
from sklearn.linear_model import LinearRegression, Ridge

# Ordinary least squares baseline; swap in the Ridge line below to try
# a regularized fit instead.
lr = LinearRegression()
# lr = Ridge(alpha=11.7)
lr.fit(X_train, y_train)
train_pred = lr.predict(X_train)
test_pred = lr.predict(X_test)

# Mean squared error on each split. Subtracting the underlying arrays
# (.values) rather than the DataFrames makes np.mean return one scalar,
# directly comparable with the TensorFlow losses above.
train_mse = np.mean((train_pred - y_train.values) ** 2)
test_mse = np.mean((test_pred - y_test.values) ** 2)

print(train_mse)

print(test_mse)

3: In Week 5, we found that the best ridge regularization parameter for this data was alpha=11.8. Try to add the same amount of regularization to the tensorflow model above, then compare with ridge regression in scikit-learn.

# Multi-layer Perceptron (MLP)

![](mlp.png)

### Exercise

Build a multi-layer perceptron to predict crime rates.

Start with two hidden units. You should be able to define one matrix that transforms the inputs to the hidden layer, and a second matrix that transforms the hidden layer to the output.

Don't forget to add a bias at each step and to apply a nonlinear transformation to the hidden layer (e.g. tf.nn.sigmoid()).

In [23]:
# Number of units in the single hidden layer
dim_hidden = 2

# Placeholder for a batch of feature rows
x = tf.placeholder(dtype=tf.float32, shape=[None, dim_input])

# Placeholder for the observed targets
y_ = tf.placeholder(dtype=tf.float32, shape=[None, 1])

# First layer parameters: inputs -> hidden units
W = tf.Variable(tf.random_normal(shape=[dim_input, dim_hidden]))
b1 = tf.Variable(tf.random_normal(shape=[dim_hidden]))

# Second layer parameters: hidden units -> output
V = tf.Variable(tf.random_normal(shape=[dim_hidden, dim_output]))
b2 = tf.Variable(tf.random_normal(shape=[1]))

# Forward pass: sigmoid hidden layer followed by a linear output
H = tf.nn.sigmoid(tf.matmul(x, W) + b1)
y = tf.matmul(H, V) + b2

# Objective: data-fit term plus a penalty on the squared weights of both
# matrices (biases are not penalized); lam scales the penalty
mse = tf.reduce_mean(tf.square(y - y_))
lam = 1
reg = (
    tf.reduce_mean(lam * tf.square(W))
    + tf.reduce_mean(lam * tf.square(V))
)
loss = mse + reg

# One Adam step on the regularized loss
train_step = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)

Once you have something working, it is time to tune your network to find the right number of hidden layers and amount of regularization.

1. Use your code block from above that performs gradient descent steps and records intermediate results.
2. You might want to force the optimizer to be stochastic. That is, feed it 100 random training examples at each step instead of the whole training dataset.
3. Start with two hidden units and try to get the regularization right. Then slowly increase the number of hidden units and continue tuning the regularization.
4. If the training error is high, you have too much bias. If the training and testing errors are very different, you have too much variance. If the training or testing errors are jumping all over the place, your step size is too high.

In [24]:
# Close the previous session before replacing it: an InteractiveSession keeps
# its resources until close() is called, so rebinding `sess` alone would
# leave the old session alive.
sess.close()
sess = tf.InteractiveSession()

In [25]:
# Initialize every variable in the graph in the current default session.
tf.global_variables_initializer().run()

In [27]:
for i in range(10000):
    # Draw a random mini-batch of 100 training rows (sampled with replacement).
    idx = np.random.choice(X_train.shape[0], 100, replace=True)
    X_batch = X_train.iloc[idx, :]
    y_batch = y_train.iloc[idx, :]
    batch_feed = {x: X_batch, y_: y_batch}
    if i % 1000 == 0:
        # The reported "Train Loss" is measured on the current mini-batch only;
        # the test loss uses the full test set.
        train_mse = sess.run(mse, feed_dict=batch_feed)
        test_mse = sess.run(mse, feed_dict={x: X_test, y_: y_test})
        # print() with a single argument behaves the same on Python 2 and 3.
        print('Iteration: {:04} \t Train Loss: {:.3} \t Test Loss: {:.3}'.format(i, train_mse, test_mse))
    sess.run(train_step, feed_dict=batch_feed)


Iteration: 0000 	 Train Loss: 0.831 	 Test Loss: 0.635
Iteration: 1000 	 Train Loss: 0.034 	 Test Loss: 0.0325
Iteration: 2000 	 Train Loss: 0.0305 	 Test Loss: 0.0257
Iteration: 3000 	 Train Loss: 0.0275 	 Test Loss: 0.0255
Iteration: 4000 	 Train Loss: 0.0236 	 Test Loss: 0.024
Iteration: 5000 	 Train Loss: 0.018 	 Test Loss: 0.0236
Iteration: 6000 	 Train Loss: 0.0238 	 Test Loss: 0.0236
Iteration: 7000 	 Train Loss: 0.0238 	 Test Loss: 0.0237
Iteration: 8000 	 Train Loss: 0.02 	 Test Loss: 0.0232
Iteration: 9000 	 Train Loss: 0.02 	 Test Loss: 0.0237


# Bonus: Add _another_ hidden layer.

Can you decrease the MSE on the test set even further?

In [28]:
# Width of each of the two hidden layers
dim_h1 = 8
dim_h2 = 8

# Placeholder for a batch of feature rows
x = tf.placeholder(dtype=tf.float32, shape=[None, dim_input])

# Placeholder for the observed targets
y_ = tf.placeholder(dtype=tf.float32, shape=[None, 1])

# Layer 1 parameters: inputs -> first hidden layer
W1 = tf.Variable(tf.random_normal(shape=[dim_input, dim_h1]))
b1 = tf.Variable(tf.random_normal(shape=[dim_h1]))

# Layer 2 parameters: first hidden layer -> second hidden layer
W2 = tf.Variable(tf.random_normal(shape=[dim_h1, dim_h2]))
b2 = tf.Variable(tf.random_normal(shape=[dim_h2]))

# Output parameters: second hidden layer -> prediction
W3 = tf.Variable(tf.random_normal(shape=[dim_h2, dim_output]))
b3 = tf.Variable(tf.random_normal(shape=[1]))

# Forward pass: two tanh hidden layers followed by a linear output
H1 = tf.nn.tanh(tf.matmul(x, W1) + b1)
H2 = tf.nn.tanh(tf.matmul(H1, W2) + b2)
y = tf.matmul(H2, W3) + b3

# Objective: data-fit term plus a penalty on the squared weights of all three
# matrices (biases are not penalized); lam scales the penalty
mse = tf.reduce_mean(tf.square(y - y_))
lam = .4
reg = (
    tf.reduce_mean(lam * tf.square(W1))
    + tf.reduce_mean(lam * tf.square(W2))
    + tf.reduce_mean(lam * tf.square(W3))
)
loss = mse + reg

# One Adam step on the regularized loss
train_step = tf.train.AdamOptimizer(learning_rate=0.0005).minimize(loss)

In [29]:
# Close the previous session before replacing it: an InteractiveSession keeps
# its resources until close() is called, so rebinding `sess` alone would
# leave the old session alive.
sess.close()
sess = tf.InteractiveSession()

In [30]:
# Initialize every variable in the graph in the current default session.
tf.global_variables_initializer().run()

In [33]:
for i in range(10000):
    # Draw a random mini-batch of 100 training rows (sampled with replacement).
    idx = np.random.choice(X_train.shape[0], 100, replace=True)
    X_batch = X_train.iloc[idx, :]
    y_batch = y_train.iloc[idx, :]
    batch_feed = {x: X_batch, y_: y_batch}
    if i % 1000 == 0:
        # The reported "Train Loss" is measured on the current mini-batch only;
        # the test loss uses the full test set.
        train_mse = sess.run(mse, feed_dict=batch_feed)
        test_mse = sess.run(mse, feed_dict={x: X_test, y_: y_test})
        # print() with a single argument behaves the same on Python 2 and 3.
        print('Iteration: {:04} \t Train Loss: {:.3} \t Test Loss: {:.3}'.format(i, train_mse, test_mse))
    sess.run(train_step, feed_dict=batch_feed)


Iteration: 0000 	 Train Loss: 0.0186 	 Test Loss: 0.0252
Iteration: 1000 	 Train Loss: 0.0204 	 Test Loss: 0.0242
Iteration: 2000 	 Train Loss: 0.0177 	 Test Loss: 0.0244
Iteration: 3000 	 Train Loss: 0.0219 	 Test Loss: 0.0244
Iteration: 4000 	 Train Loss: 0.017 	 Test Loss: 0.0246
Iteration: 5000 	 Train Loss: 0.0223 	 Test Loss: 0.024
Iteration: 6000 	 Train Loss: 0.0224 	 Test Loss: 0.0239
Iteration: 7000 	 Train Loss: 0.018 	 Test Loss: 0.0243
Iteration: 8000 	 Train Loss: 0.0162 	 Test Loss: 0.024
Iteration: 9000 	 Train Loss: 0.0163 	 Test Loss: 0.024
