In [79]:
#### TensorFlow vs. PyTorch
# TensorFlow is a Google Brain project
# PyTorch is based off lua Torch framework developed at FB (not just wrapper; rewritten to run fast & natively)

# To compare them, code an approximator that will find
# unknown parameter phi given data X: f(x) = x^phi
# using stochastic gradient descent (obviously can find
# an analytic solution for this, but just an example).

# pytorch's optimize module contains optimization

## Static vs. Dynamic graphs

# TensorFlow uses static computational graphs -- defined once and
# is executed over and over again -- optimized up front (makes it easy
# to distribute/amortize it among different machines)
# PyTorch uses dynamic computational graphs -- each forward pass defines
# a new graph. Provides more granular control over the control flow.
# For example, might want to perform a different computation for each
# data point. In TF, need to use TF.scan to embed this in the graph's
# loop. In Torch, can control the computation that differs for each 
# input in your own normal imperative loop. In TF everything works
# via the session. Different architectures might want to use static or
# dynamic graphs -- for example, RNNs require dynamic inputs (like changing
# sentence length). Since define TF graphs once, at the beginning, will
# need to pick a max input length and pad the rest of the input sentences
# by 0. 

## TF also has better visualization & debugging.

## Deployment

# TF is much better for deployment -- has TensorFLow Serving which
# is a framework for deploying models on a specialized server (also
# supports mobile!) Though can still use PyTorch with flask or whatever.

# TF also supports distributed training.

## Framework

# TF feels more like a low-level library than a framework. But things like
# Keras can run on top of TF.

## Adoption

# TF: well-documented, go-to, tons of tutorials
# Torch: still in beta as of summer 2017

# Adam optimization algorithm: the best current performance
# optimizer out there. Is invarient to scaling the gradient,
# which means that when you do k * f(x) on an iteration,
# there is no effect on the performance. Will converge 
# even if f(x) changes with time. Don't need to decrease
# step size after a certain number of epochs -- naturally does
# step size annealing.
# MORE HERE: http://ruder.io/optimizing-gradient-descent/index.html
# and here: https://medium.com/@nishantnikhil/adam-optimizer-notes-ddac4fd7218

In [31]:
import torch
from torch.autograd import Variable
import numpy as np

In [32]:
def rmse(y, y_hat):
    return torch.sqrt(torch.mean((y - y_hat).pow(2).sum()))

# performs a forward pass on the function
# takes in two pytorch Variables. Variable is a wrapper
# around a tensor that is a node around a computational graph.
def forward(x, e):
    return x.pow(e.repeat(x.size(0)))

In [54]:
# number of datapoints
n = 1000

# learning rate
learning_rate = 5e6

# create random datapoints & define model
x = Variable(torch.rand(n) * 10, requires_grad=False)

# model parameters - the exponent and the exponent squared
exp = Variable(torch.FloatTensor([2.0]), requires_grad=False)
exp_hat = Variable(torch.FloatTensor([4.0]), requires_grad=True)

In [43]:
x

Variable containing:
 7.8744
 6.9353
 6.6861
 6.8646
 3.5993
 1.5355
 5.8961
 5.6437
 3.1669
 0.0022
[torch.FloatTensor of size 10]

In [35]:
exp

Variable containing:
 2
[torch.FloatTensor of size 1]

In [36]:
exp_hat

Variable containing:
 4
[torch.FloatTensor of size 1]

In [37]:
# returns torch.Size([10])
x.size()
# returns 10
x.size(0)

10

In [38]:
# repeats exp 10 times
exp.repeat(10)

Variable containing:
 2
 2
 2
 2
 2
 2
 2
 2
 2
 2
[torch.FloatTensor of size 10]

In [55]:
# exploit our forward pass function to merely compute x^exp
# and get our output data
y = forward(x, exp)
# in this case x^2
# y

In [56]:
loss_history = []
exp_history = []

# OPTIONAL: use an optimizer (stochastic gradient descent)
opt = torch.optim.SGD([exp_hat], lr=learning_rate, momentum=0.9)

# training loop
for i in range(200):
    # IF using optimizer
    opt.zero_grad()
    
    # Compute the current estimate
    y_hat = forward(x, exp_hat)
    
    # Compute the loss
    loss = rmse(y, y_hat)
    
    # Compute gradients
    loss.backward()
    
    # IF using optimizer
    opt.step()
    
    print('Iteration', i, '\t', loss.data[0], exp_hat.data[0])
    
    # Update params
    exp_hat.data -= learning_rate * exp_hat.grad.data
    exp_hat.grad.data.zero_()


Iteration 0 	 107778.8359375 -1195734859776.0
Iteration 1 	 inf nan
Iteration 2 	 nan nan
Iteration 3 	 nan nan
Iteration 4 	 nan nan
Iteration 5 	 nan nan
Iteration 6 	 nan nan
Iteration 7 	 nan nan
Iteration 8 	 nan nan
Iteration 9 	 nan nan
Iteration 10 	 nan nan
Iteration 11 	 nan nan
Iteration 12 	 nan nan
Iteration 13 	 nan nan
Iteration 14 	 nan nan
Iteration 15 	 nan nan
Iteration 16 	 nan nan
Iteration 17 	 nan nan
Iteration 18 	 nan nan
Iteration 19 	 nan nan
Iteration 20 	 nan nan
Iteration 21 	 nan nan
Iteration 22 	 nan nan
Iteration 23 	 nan nan
Iteration 24 	 nan nan
Iteration 25 	 nan nan
Iteration 26 	 nan nan
Iteration 27 	 nan nan
Iteration 28 	 nan nan
Iteration 29 	 nan nan
Iteration 30 	 nan nan
Iteration 31 	 nan nan
Iteration 32 	 nan nan
Iteration 33 	 nan nan
Iteration 34 	 nan nan
Iteration 35 	 nan nan
Iteration 36 	 nan nan
Iteration 37 	 nan nan
Iteration 38 	 nan nan
Iteration 39 	 nan nan
Iteration 40 	 nan nan
Iteration 41 	 nan nan
Iteration 42 	 nan n

In [27]:
# Not sure why I'm only getting nan/inf

In [74]:
# Using TensorFlow
import tensorflow as tf

def rmse(y, y_hat):
    return tf.sqrt(tf.reduce_mean(tf.square((y - y_hat))))

def forward(x, e):
    return tf.pow(x, e)

In [75]:
# placeholders for inputs/outputs
x = tf.placeholder(tf.float32)
y = tf.placeholder(tf.float32)

exp = tf.constant(2.0)
exp_hat = tf.Variable(4.0, name='exp_hat')

y_hat = forward(x, exp_hat)

loss = rmse(y, y_hat)
opt = tf.train.GradientDescentOptimizer(learning_rate)

# Perform a single training step -- equivalent to opt.step() in torch
# This also updates model parameters (since it's using a static graph,
# just performing the operations on the graph, don't need to update it
# ourselves on a new graph).
train_op = opt.minimize(loss)

In [76]:
# training data
x_train = np.random.rand(n) + 10
y_train = x_train ** 2

In [77]:
x_train.shape, y_train.shape

((1000,), (1000,))

In [78]:
with tf.Session() as session:
    tf.initialize_all_variables().run()
    
    # Training loop
    for i in range(500):
        curr_loss, curr_exp, _ = session.run([loss, exp_hat, train_op], feed_dict={x: x_train, y: y_train})
        
        print(i, curr_loss, curr_exp)

Instructions for updating:
Use `tf.global_variables_initializer` instead.
0 12161.8 -1.44604e+11
1 110.464 -1.44604e+11
2 110.464 -1.44604e+11
3 110.464 -1.44604e+11
4 110.464 -1.44604e+11
5 110.464 -1.44604e+11
6 110.464 -1.44604e+11
7 110.464 -1.44604e+11
8 110.464 -1.44604e+11
9 110.464 -1.44604e+11
10 110.464 -1.44604e+11
11 110.464 -1.44604e+11
12 110.464 -1.44604e+11
13 110.464 -1.44604e+11
14 110.464 -1.44604e+11
15 110.464 -1.44604e+11
16 110.464 -1.44604e+11
17 110.464 -1.44604e+11
18 110.464 -1.44604e+11
19 110.464 -1.44604e+11
20 110.464 -1.44604e+11
21 110.464 -1.44604e+11
22 110.464 -1.44604e+11
23 110.464 -1.44604e+11
24 110.464 -1.44604e+11
25 110.464 -1.44604e+11
26 110.464 -1.44604e+11
27 110.464 -1.44604e+11
28 110.464 -1.44604e+11
29 110.464 -1.44604e+11
30 110.464 -1.44604e+11
31 110.464 -1.44604e+11
32 110.464 -1.44604e+11
33 110.464 -1.44604e+11
34 110.464 -1.44604e+11
35 110.464 -1.44604e+11
36 110.464 -1.44604e+11
37 110.464 -1.44604e+11
38 110.464 -1.44604e+11


In [None]:
# wtf?